{
"best_global_step": 72000,
"best_metric": 3.5296003818511963,
"best_model_checkpoint": "/scratch/cl5625/exceptions/models/last_to_carry_frequency_2128/checkpoint-40000",
"epoch": 29.129041654529566,
"eval_steps": 1000,
"global_step": 100000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014564520827264784,
"grad_norm": 0.8700253963470459,
"learning_rate": 0.000294,
"loss": 8.4648,
"step": 50
},
{
"epoch": 0.029129041654529567,
"grad_norm": 0.8772715330123901,
"learning_rate": 0.0005939999999999999,
"loss": 6.735,
"step": 100
},
{
"epoch": 0.043693562481794346,
"grad_norm": 0.5721131563186646,
"learning_rate": 0.0005998286213931798,
"loss": 6.3403,
"step": 150
},
{
"epoch": 0.058258083309059135,
"grad_norm": 0.42677509784698486,
"learning_rate": 0.0005996537452637714,
"loss": 6.1411,
"step": 200
},
{
"epoch": 0.07282260413632391,
"grad_norm": 0.477642685174942,
"learning_rate": 0.0005994788691343632,
"loss": 5.9982,
"step": 250
},
{
"epoch": 0.08738712496358869,
"grad_norm": 0.49641019105911255,
"learning_rate": 0.0005993039930049548,
"loss": 5.8842,
"step": 300
},
{
"epoch": 0.10195164579085349,
"grad_norm": 0.43664422631263733,
"learning_rate": 0.0005991291168755465,
"loss": 5.7548,
"step": 350
},
{
"epoch": 0.11651616661811827,
"grad_norm": 0.4933745861053467,
"learning_rate": 0.0005989542407461382,
"loss": 5.6359,
"step": 400
},
{
"epoch": 0.13108068744538304,
"grad_norm": 0.48850634694099426,
"learning_rate": 0.0005987793646167297,
"loss": 5.5233,
"step": 450
},
{
"epoch": 0.14564520827264782,
"grad_norm": 0.4590892493724823,
"learning_rate": 0.0005986044884873214,
"loss": 5.4179,
"step": 500
},
{
"epoch": 0.1602097290999126,
"grad_norm": 0.4702446162700653,
"learning_rate": 0.0005984296123579131,
"loss": 5.3419,
"step": 550
},
{
"epoch": 0.17477424992717738,
"grad_norm": 0.43737664818763733,
"learning_rate": 0.0005982547362285047,
"loss": 5.2567,
"step": 600
},
{
"epoch": 0.18933877075444216,
"grad_norm": 0.5059506893157959,
"learning_rate": 0.0005980798600990964,
"loss": 5.1911,
"step": 650
},
{
"epoch": 0.20390329158170697,
"grad_norm": 0.46494734287261963,
"learning_rate": 0.0005979049839696881,
"loss": 5.1499,
"step": 700
},
{
"epoch": 0.21846781240897176,
"grad_norm": 0.5374208688735962,
"learning_rate": 0.0005977301078402798,
"loss": 5.0895,
"step": 750
},
{
"epoch": 0.23303233323623654,
"grad_norm": 0.3784054219722748,
"learning_rate": 0.0005975552317108715,
"loss": 5.0231,
"step": 800
},
{
"epoch": 0.24759685406350132,
"grad_norm": 0.4124829173088074,
"learning_rate": 0.0005973803555814631,
"loss": 4.9945,
"step": 850
},
{
"epoch": 0.2621613748907661,
"grad_norm": 0.41545918583869934,
"learning_rate": 0.0005972054794520547,
"loss": 4.9248,
"step": 900
},
{
"epoch": 0.27672589571803086,
"grad_norm": 0.48763376474380493,
"learning_rate": 0.0005970306033226464,
"loss": 4.8882,
"step": 950
},
{
"epoch": 0.29129041654529564,
"grad_norm": 0.4686114192008972,
"learning_rate": 0.0005968557271932381,
"loss": 4.8401,
"step": 1000
},
{
"epoch": 0.29129041654529564,
"eval_accuracy": 0.2521537178997339,
"eval_loss": 4.771047115325928,
"eval_runtime": 178.0535,
"eval_samples_per_second": 93.461,
"eval_steps_per_second": 5.847,
"step": 1000
},
{
"epoch": 0.3058549373725604,
"grad_norm": 0.5042315721511841,
"learning_rate": 0.0005966808510638297,
"loss": 4.7859,
"step": 1050
},
{
"epoch": 0.3204194581998252,
"grad_norm": 0.42784640192985535,
"learning_rate": 0.0005965059749344214,
"loss": 4.75,
"step": 1100
},
{
"epoch": 0.33498397902709,
"grad_norm": 0.4303656816482544,
"learning_rate": 0.0005963310988050131,
"loss": 4.7097,
"step": 1150
},
{
"epoch": 0.34954849985435477,
"grad_norm": 0.55189049243927,
"learning_rate": 0.0005961562226756047,
"loss": 4.6761,
"step": 1200
},
{
"epoch": 0.36411302068161955,
"grad_norm": 0.45820021629333496,
"learning_rate": 0.0005959813465461965,
"loss": 4.6322,
"step": 1250
},
{
"epoch": 0.37867754150888433,
"grad_norm": 0.45095351338386536,
"learning_rate": 0.000595806470416788,
"loss": 4.6107,
"step": 1300
},
{
"epoch": 0.39324206233614917,
"grad_norm": 0.42550787329673767,
"learning_rate": 0.0005956315942873797,
"loss": 4.5928,
"step": 1350
},
{
"epoch": 0.40780658316341395,
"grad_norm": 0.4250703454017639,
"learning_rate": 0.0005954567181579714,
"loss": 4.5615,
"step": 1400
},
{
"epoch": 0.42237110399067873,
"grad_norm": 0.4031296670436859,
"learning_rate": 0.000595281842028563,
"loss": 4.5317,
"step": 1450
},
{
"epoch": 0.4369356248179435,
"grad_norm": 0.4141838848590851,
"learning_rate": 0.0005951069658991547,
"loss": 4.5138,
"step": 1500
},
{
"epoch": 0.4515001456452083,
"grad_norm": 0.3957858383655548,
"learning_rate": 0.0005949320897697464,
"loss": 4.492,
"step": 1550
},
{
"epoch": 0.4660646664724731,
"grad_norm": 0.42144542932510376,
"learning_rate": 0.0005947572136403381,
"loss": 4.4686,
"step": 1600
},
{
"epoch": 0.48062918729973786,
"grad_norm": 0.3965926766395569,
"learning_rate": 0.0005945823375109297,
"loss": 4.4541,
"step": 1650
},
{
"epoch": 0.49519370812700264,
"grad_norm": 0.46729621291160583,
"learning_rate": 0.0005944074613815215,
"loss": 4.4285,
"step": 1700
},
{
"epoch": 0.5097582289542674,
"grad_norm": 0.3873760998249054,
"learning_rate": 0.000594232585252113,
"loss": 4.4088,
"step": 1750
},
{
"epoch": 0.5243227497815321,
"grad_norm": 0.475016325712204,
"learning_rate": 0.0005940577091227047,
"loss": 4.403,
"step": 1800
},
{
"epoch": 0.5388872706087969,
"grad_norm": 0.407781183719635,
"learning_rate": 0.0005938828329932964,
"loss": 4.3943,
"step": 1850
},
{
"epoch": 0.5534517914360617,
"grad_norm": 0.4060550332069397,
"learning_rate": 0.000593707956863888,
"loss": 4.3651,
"step": 1900
},
{
"epoch": 0.5680163122633265,
"grad_norm": 0.4468280076980591,
"learning_rate": 0.0005935330807344797,
"loss": 4.359,
"step": 1950
},
{
"epoch": 0.5825808330905913,
"grad_norm": 0.3966878652572632,
"learning_rate": 0.0005933582046050714,
"loss": 4.3331,
"step": 2000
},
{
"epoch": 0.5825808330905913,
"eval_accuracy": 0.2995227523184138,
"eval_loss": 4.283666133880615,
"eval_runtime": 177.3551,
"eval_samples_per_second": 93.829,
"eval_steps_per_second": 5.87,
"step": 2000
},
{
"epoch": 0.5971453539178561,
"grad_norm": 0.39190909266471863,
"learning_rate": 0.000593183328475663,
"loss": 4.3321,
"step": 2050
},
{
"epoch": 0.6117098747451208,
"grad_norm": 0.38748636841773987,
"learning_rate": 0.0005930084523462546,
"loss": 4.3227,
"step": 2100
},
{
"epoch": 0.6262743955723856,
"grad_norm": 0.351399302482605,
"learning_rate": 0.0005928335762168463,
"loss": 4.2934,
"step": 2150
},
{
"epoch": 0.6408389163996504,
"grad_norm": 0.42470622062683105,
"learning_rate": 0.000592658700087438,
"loss": 4.2908,
"step": 2200
},
{
"epoch": 0.6554034372269152,
"grad_norm": 0.3766157329082489,
"learning_rate": 0.0005924838239580297,
"loss": 4.2909,
"step": 2250
},
{
"epoch": 0.66996795805418,
"grad_norm": 0.5269715189933777,
"learning_rate": 0.0005923089478286214,
"loss": 4.2651,
"step": 2300
},
{
"epoch": 0.6845324788814448,
"grad_norm": 0.40443506836891174,
"learning_rate": 0.000592134071699213,
"loss": 4.2846,
"step": 2350
},
{
"epoch": 0.6990969997087095,
"grad_norm": 0.3809632360935211,
"learning_rate": 0.0005919591955698047,
"loss": 4.2389,
"step": 2400
},
{
"epoch": 0.7136615205359743,
"grad_norm": 0.3910612463951111,
"learning_rate": 0.0005917843194403964,
"loss": 4.2332,
"step": 2450
},
{
"epoch": 0.7282260413632391,
"grad_norm": 0.3787578046321869,
"learning_rate": 0.000591609443310988,
"loss": 4.2352,
"step": 2500
},
{
"epoch": 0.7427905621905039,
"grad_norm": 0.36433854699134827,
"learning_rate": 0.0005914345671815796,
"loss": 4.2098,
"step": 2550
},
{
"epoch": 0.7573550830177687,
"grad_norm": 0.3667093515396118,
"learning_rate": 0.0005912596910521713,
"loss": 4.22,
"step": 2600
},
{
"epoch": 0.7719196038450336,
"grad_norm": 0.36228933930397034,
"learning_rate": 0.0005910848149227629,
"loss": 4.2054,
"step": 2650
},
{
"epoch": 0.7864841246722983,
"grad_norm": 0.3999609649181366,
"learning_rate": 0.0005909099387933547,
"loss": 4.2007,
"step": 2700
},
{
"epoch": 0.8010486454995631,
"grad_norm": 0.3630334734916687,
"learning_rate": 0.0005907350626639463,
"loss": 4.1834,
"step": 2750
},
{
"epoch": 0.8156131663268279,
"grad_norm": 0.3720364272594452,
"learning_rate": 0.000590560186534538,
"loss": 4.1863,
"step": 2800
},
{
"epoch": 0.8301776871540927,
"grad_norm": 0.4026776850223541,
"learning_rate": 0.0005903853104051297,
"loss": 4.1729,
"step": 2850
},
{
"epoch": 0.8447422079813575,
"grad_norm": 0.42495641112327576,
"learning_rate": 0.0005902104342757214,
"loss": 4.1673,
"step": 2900
},
{
"epoch": 0.8593067288086222,
"grad_norm": 0.3550451397895813,
"learning_rate": 0.000590035558146313,
"loss": 4.1517,
"step": 2950
},
{
"epoch": 0.873871249635887,
"grad_norm": 0.37605759501457214,
"learning_rate": 0.0005898606820169046,
"loss": 4.1526,
"step": 3000
},
{
"epoch": 0.873871249635887,
"eval_accuracy": 0.31542069895270813,
"eval_loss": 4.094066619873047,
"eval_runtime": 177.1428,
"eval_samples_per_second": 93.941,
"eval_steps_per_second": 5.877,
"step": 3000
},
{
"epoch": 0.8884357704631518,
"grad_norm": 0.34033265709877014,
"learning_rate": 0.0005896858058874963,
"loss": 4.1369,
"step": 3050
},
{
"epoch": 0.9030002912904166,
"grad_norm": 0.3450336456298828,
"learning_rate": 0.0005895109297580879,
"loss": 4.1231,
"step": 3100
},
{
"epoch": 0.9175648121176814,
"grad_norm": 0.35001909732818604,
"learning_rate": 0.0005893360536286797,
"loss": 4.1407,
"step": 3150
},
{
"epoch": 0.9321293329449462,
"grad_norm": 0.365225225687027,
"learning_rate": 0.0005891611774992713,
"loss": 4.1225,
"step": 3200
},
{
"epoch": 0.9466938537722109,
"grad_norm": 0.3506869077682495,
"learning_rate": 0.000588986301369863,
"loss": 4.1155,
"step": 3250
},
{
"epoch": 0.9612583745994757,
"grad_norm": 0.3621656000614166,
"learning_rate": 0.0005888114252404547,
"loss": 4.1049,
"step": 3300
},
{
"epoch": 0.9758228954267405,
"grad_norm": 0.33093249797821045,
"learning_rate": 0.0005886365491110463,
"loss": 4.0921,
"step": 3350
},
{
"epoch": 0.9903874162540053,
"grad_norm": 0.3546348810195923,
"learning_rate": 0.000588461672981638,
"loss": 4.1038,
"step": 3400
},
{
"epoch": 1.00495193708127,
"grad_norm": 0.33640843629837036,
"learning_rate": 0.0005882867968522296,
"loss": 4.0615,
"step": 3450
},
{
"epoch": 1.0195164579085347,
"grad_norm": 0.33964186906814575,
"learning_rate": 0.0005881119207228212,
"loss": 4.0277,
"step": 3500
},
{
"epoch": 1.0340809787357996,
"grad_norm": 0.34980398416519165,
"learning_rate": 0.0005879370445934129,
"loss": 4.003,
"step": 3550
},
{
"epoch": 1.0486454995630643,
"grad_norm": 0.3941769599914551,
"learning_rate": 0.0005877621684640046,
"loss": 4.0155,
"step": 3600
},
{
"epoch": 1.0632100203903292,
"grad_norm": 0.34226739406585693,
"learning_rate": 0.0005875872923345963,
"loss": 4.0147,
"step": 3650
},
{
"epoch": 1.0777745412175939,
"grad_norm": 0.3588155508041382,
"learning_rate": 0.000587412416205188,
"loss": 4.0108,
"step": 3700
},
{
"epoch": 1.0923390620448588,
"grad_norm": 0.403024286031723,
"learning_rate": 0.0005872375400757797,
"loss": 4.0115,
"step": 3750
},
{
"epoch": 1.1069035828721234,
"grad_norm": 0.3496304154396057,
"learning_rate": 0.0005870626639463713,
"loss": 3.9869,
"step": 3800
},
{
"epoch": 1.1214681036993883,
"grad_norm": 0.3462716042995453,
"learning_rate": 0.0005868877878169629,
"loss": 4.0138,
"step": 3850
},
{
"epoch": 1.136032624526653,
"grad_norm": 0.3597177565097809,
"learning_rate": 0.0005867129116875546,
"loss": 3.9896,
"step": 3900
},
{
"epoch": 1.1505971453539179,
"grad_norm": 0.3294402062892914,
"learning_rate": 0.0005865380355581462,
"loss": 3.9925,
"step": 3950
},
{
"epoch": 1.1651616661811826,
"grad_norm": 0.34170660376548767,
"learning_rate": 0.0005863631594287379,
"loss": 4.0002,
"step": 4000
},
{
"epoch": 1.1651616661811826,
"eval_accuracy": 0.32493390114318127,
"eval_loss": 3.989036798477173,
"eval_runtime": 177.4875,
"eval_samples_per_second": 93.759,
"eval_steps_per_second": 5.865,
"step": 4000
},
{
"epoch": 1.1797261870084474,
"grad_norm": 0.35232222080230713,
"learning_rate": 0.0005861882832993296,
"loss": 3.9896,
"step": 4050
},
{
"epoch": 1.1942907078357121,
"grad_norm": 0.3442701995372772,
"learning_rate": 0.0005860134071699212,
"loss": 3.9966,
"step": 4100
},
{
"epoch": 1.208855228662977,
"grad_norm": 0.3539937436580658,
"learning_rate": 0.000585838531040513,
"loss": 3.9818,
"step": 4150
},
{
"epoch": 1.2234197494902417,
"grad_norm": 0.36053037643432617,
"learning_rate": 0.0005856636549111046,
"loss": 3.9763,
"step": 4200
},
{
"epoch": 1.2379842703175066,
"grad_norm": 0.35037800669670105,
"learning_rate": 0.0005854887787816963,
"loss": 3.978,
"step": 4250
},
{
"epoch": 1.2525487911447715,
"grad_norm": 0.3158578872680664,
"learning_rate": 0.0005853139026522879,
"loss": 3.9718,
"step": 4300
},
{
"epoch": 1.2671133119720361,
"grad_norm": 0.35837191343307495,
"learning_rate": 0.0005851390265228796,
"loss": 3.968,
"step": 4350
},
{
"epoch": 1.2816778327993008,
"grad_norm": 0.3676842749118805,
"learning_rate": 0.0005849641503934712,
"loss": 3.9758,
"step": 4400
},
{
"epoch": 1.2962423536265657,
"grad_norm": 0.3639284670352936,
"learning_rate": 0.0005847892742640629,
"loss": 3.9694,
"step": 4450
},
{
"epoch": 1.3108068744538306,
"grad_norm": 0.35421234369277954,
"learning_rate": 0.0005846143981346546,
"loss": 3.9627,
"step": 4500
},
{
"epoch": 1.3253713952810953,
"grad_norm": 0.3280850648880005,
"learning_rate": 0.0005844395220052462,
"loss": 3.9495,
"step": 4550
},
{
"epoch": 1.33993591610836,
"grad_norm": 0.34968069195747375,
"learning_rate": 0.000584264645875838,
"loss": 3.9585,
"step": 4600
},
{
"epoch": 1.3545004369356248,
"grad_norm": 0.36847731471061707,
"learning_rate": 0.0005840897697464296,
"loss": 3.9568,
"step": 4650
},
{
"epoch": 1.3690649577628897,
"grad_norm": 0.3364753723144531,
"learning_rate": 0.0005839148936170212,
"loss": 3.9476,
"step": 4700
},
{
"epoch": 1.3836294785901544,
"grad_norm": 0.330708384513855,
"learning_rate": 0.0005837400174876129,
"loss": 3.9338,
"step": 4750
},
{
"epoch": 1.398193999417419,
"grad_norm": 0.34026676416397095,
"learning_rate": 0.0005835651413582045,
"loss": 3.9356,
"step": 4800
},
{
"epoch": 1.412758520244684,
"grad_norm": 0.35521167516708374,
"learning_rate": 0.0005833902652287962,
"loss": 3.9481,
"step": 4850
},
{
"epoch": 1.4273230410719489,
"grad_norm": 0.3294624388217926,
"learning_rate": 0.0005832153890993879,
"loss": 3.9449,
"step": 4900
},
{
"epoch": 1.4418875618992135,
"grad_norm": 0.3308870792388916,
"learning_rate": 0.0005830405129699796,
"loss": 3.93,
"step": 4950
},
{
"epoch": 1.4564520827264782,
"grad_norm": 0.3472626209259033,
"learning_rate": 0.0005828656368405712,
"loss": 3.9325,
"step": 5000
},
{
"epoch": 1.4564520827264782,
"eval_accuracy": 0.3317897428968204,
"eval_loss": 3.913301467895508,
"eval_runtime": 182.7411,
"eval_samples_per_second": 91.063,
"eval_steps_per_second": 5.697,
"step": 5000
},
{
"epoch": 1.471016603553743,
"grad_norm": 0.32565367221832275,
"learning_rate": 0.0005826907607111629,
"loss": 3.9295,
"step": 5050
},
{
"epoch": 1.485581124381008,
"grad_norm": 0.32511332631111145,
"learning_rate": 0.0005825158845817546,
"loss": 3.9292,
"step": 5100
},
{
"epoch": 1.5001456452082726,
"grad_norm": 0.3272392451763153,
"learning_rate": 0.0005823410084523462,
"loss": 3.9286,
"step": 5150
},
{
"epoch": 1.5147101660355373,
"grad_norm": 0.35094988346099854,
"learning_rate": 0.0005821661323229379,
"loss": 3.9151,
"step": 5200
},
{
"epoch": 1.5292746868628022,
"grad_norm": 0.3307012617588043,
"learning_rate": 0.0005819912561935295,
"loss": 3.9205,
"step": 5250
},
{
"epoch": 1.543839207690067,
"grad_norm": 0.32005444169044495,
"learning_rate": 0.0005818163800641212,
"loss": 3.9183,
"step": 5300
},
{
"epoch": 1.5584037285173318,
"grad_norm": 0.339083194732666,
"learning_rate": 0.0005816415039347129,
"loss": 3.9148,
"step": 5350
},
{
"epoch": 1.5729682493445964,
"grad_norm": 0.33372458815574646,
"learning_rate": 0.0005814666278053045,
"loss": 3.9079,
"step": 5400
},
{
"epoch": 1.5875327701718613,
"grad_norm": 0.31862950325012207,
"learning_rate": 0.0005812917516758962,
"loss": 3.9085,
"step": 5450
},
{
"epoch": 1.6020972909991262,
"grad_norm": 0.3127332329750061,
"learning_rate": 0.0005811168755464879,
"loss": 3.9125,
"step": 5500
},
{
"epoch": 1.616661811826391,
"grad_norm": 0.33630993962287903,
"learning_rate": 0.0005809419994170794,
"loss": 3.9024,
"step": 5550
},
{
"epoch": 1.6312263326536556,
"grad_norm": 0.33405524492263794,
"learning_rate": 0.0005807671232876712,
"loss": 3.8963,
"step": 5600
},
{
"epoch": 1.6457908534809205,
"grad_norm": 0.3463149070739746,
"learning_rate": 0.0005805922471582628,
"loss": 3.8897,
"step": 5650
},
{
"epoch": 1.6603553743081854,
"grad_norm": 0.34034085273742676,
"learning_rate": 0.0005804173710288545,
"loss": 3.8894,
"step": 5700
},
{
"epoch": 1.67491989513545,
"grad_norm": 0.32425183057785034,
"learning_rate": 0.0005802424948994462,
"loss": 3.8879,
"step": 5750
},
{
"epoch": 1.6894844159627147,
"grad_norm": 0.3245879113674164,
"learning_rate": 0.0005800676187700379,
"loss": 3.8917,
"step": 5800
},
{
"epoch": 1.7040489367899796,
"grad_norm": 0.3300783336162567,
"learning_rate": 0.0005798927426406295,
"loss": 3.8913,
"step": 5850
},
{
"epoch": 1.7186134576172445,
"grad_norm": 0.32403889298439026,
"learning_rate": 0.0005797178665112212,
"loss": 3.8974,
"step": 5900
},
{
"epoch": 1.7331779784445092,
"grad_norm": 0.3420887887477875,
"learning_rate": 0.0005795429903818129,
"loss": 3.8882,
"step": 5950
},
{
"epoch": 1.7477424992717738,
"grad_norm": 0.3141533136367798,
"learning_rate": 0.0005793681142524044,
"loss": 3.8836,
"step": 6000
},
{
"epoch": 1.7477424992717738,
"eval_accuracy": 0.3370036823439996,
"eval_loss": 3.8556597232818604,
"eval_runtime": 180.6605,
"eval_samples_per_second": 92.112,
"eval_steps_per_second": 5.762,
"step": 6000
},
{
"epoch": 1.7623070200990387,
"grad_norm": 0.31102555990219116,
"learning_rate": 0.0005791932381229961,
"loss": 3.8717,
"step": 6050
},
{
"epoch": 1.7768715409263036,
"grad_norm": 0.32457712292671204,
"learning_rate": 0.0005790183619935878,
"loss": 3.878,
"step": 6100
},
{
"epoch": 1.7914360617535683,
"grad_norm": 0.3170512914657593,
"learning_rate": 0.0005788434858641795,
"loss": 3.8658,
"step": 6150
},
{
"epoch": 1.806000582580833,
"grad_norm": 0.3156622350215912,
"learning_rate": 0.0005786686097347712,
"loss": 3.8749,
"step": 6200
},
{
"epoch": 1.8205651034080979,
"grad_norm": 0.331449955701828,
"learning_rate": 0.0005784937336053628,
"loss": 3.8718,
"step": 6250
},
{
"epoch": 1.8351296242353627,
"grad_norm": 0.33456477522850037,
"learning_rate": 0.0005783188574759545,
"loss": 3.8733,
"step": 6300
},
{
"epoch": 1.8496941450626274,
"grad_norm": 0.34169068932533264,
"learning_rate": 0.0005781439813465462,
"loss": 3.8603,
"step": 6350
},
{
"epoch": 1.864258665889892,
"grad_norm": 0.33886536955833435,
"learning_rate": 0.0005779691052171379,
"loss": 3.8503,
"step": 6400
},
{
"epoch": 1.878823186717157,
"grad_norm": 0.32133013010025024,
"learning_rate": 0.0005777942290877294,
"loss": 3.8587,
"step": 6450
},
{
"epoch": 1.8933877075444219,
"grad_norm": 0.3214126527309418,
"learning_rate": 0.0005776193529583211,
"loss": 3.8627,
"step": 6500
},
{
"epoch": 1.9079522283716865,
"grad_norm": 0.3037171959877014,
"learning_rate": 0.0005774444768289128,
"loss": 3.8568,
"step": 6550
},
{
"epoch": 1.9225167491989512,
"grad_norm": 0.3018639385700226,
"learning_rate": 0.0005772696006995045,
"loss": 3.8433,
"step": 6600
},
{
"epoch": 1.937081270026216,
"grad_norm": 0.3456135094165802,
"learning_rate": 0.0005770947245700962,
"loss": 3.8554,
"step": 6650
},
{
"epoch": 1.951645790853481,
"grad_norm": 0.30263465642929077,
"learning_rate": 0.0005769198484406878,
"loss": 3.8617,
"step": 6700
},
{
"epoch": 1.9662103116807457,
"grad_norm": 0.33179566264152527,
"learning_rate": 0.0005767449723112795,
"loss": 3.845,
"step": 6750
},
{
"epoch": 1.9807748325080103,
"grad_norm": 0.32268857955932617,
"learning_rate": 0.0005765700961818712,
"loss": 3.8517,
"step": 6800
},
{
"epoch": 1.9953393533352752,
"grad_norm": 0.30953162908554077,
"learning_rate": 0.0005763952200524627,
"loss": 3.8508,
"step": 6850
},
{
"epoch": 2.00990387416254,
"grad_norm": 0.3232676088809967,
"learning_rate": 0.0005762203439230544,
"loss": 3.775,
"step": 6900
},
{
"epoch": 2.024468394989805,
"grad_norm": 0.33479437232017517,
"learning_rate": 0.0005760454677936461,
"loss": 3.7414,
"step": 6950
},
{
"epoch": 2.0390329158170695,
"grad_norm": 0.3089365065097809,
"learning_rate": 0.0005758705916642378,
"loss": 3.7659,
"step": 7000
},
{
"epoch": 2.0390329158170695,
"eval_accuracy": 0.3411992237125408,
"eval_loss": 3.8152074813842773,
"eval_runtime": 179.851,
"eval_samples_per_second": 92.527,
"eval_steps_per_second": 5.788,
"step": 7000
},
{
"epoch": 2.0535974366443344,
"grad_norm": 0.32421961426734924,
"learning_rate": 0.0005756957155348294,
"loss": 3.7506,
"step": 7050
},
{
"epoch": 2.0681619574715993,
"grad_norm": 0.31613728404045105,
"learning_rate": 0.0005755208394054211,
"loss": 3.747,
"step": 7100
},
{
"epoch": 2.082726478298864,
"grad_norm": 0.34031516313552856,
"learning_rate": 0.0005753459632760128,
"loss": 3.7435,
"step": 7150
},
{
"epoch": 2.0972909991261286,
"grad_norm": 0.33439844846725464,
"learning_rate": 0.0005751710871466045,
"loss": 3.7575,
"step": 7200
},
{
"epoch": 2.1118555199533935,
"grad_norm": 0.30961179733276367,
"learning_rate": 0.0005749962110171962,
"loss": 3.7542,
"step": 7250
},
{
"epoch": 2.1264200407806584,
"grad_norm": 0.340189665555954,
"learning_rate": 0.0005748213348877877,
"loss": 3.7555,
"step": 7300
},
{
"epoch": 2.1409845616079233,
"grad_norm": 0.3085797131061554,
"learning_rate": 0.0005746464587583794,
"loss": 3.7475,
"step": 7350
},
{
"epoch": 2.1555490824351877,
"grad_norm": 0.31862226128578186,
"learning_rate": 0.0005744715826289711,
"loss": 3.7449,
"step": 7400
},
{
"epoch": 2.1701136032624526,
"grad_norm": 0.326460599899292,
"learning_rate": 0.0005742967064995627,
"loss": 3.7634,
"step": 7450
},
{
"epoch": 2.1846781240897175,
"grad_norm": 0.32189229130744934,
"learning_rate": 0.0005741218303701544,
"loss": 3.7557,
"step": 7500
},
{
"epoch": 2.1992426449169824,
"grad_norm": 0.33258241415023804,
"learning_rate": 0.0005739469542407461,
"loss": 3.7553,
"step": 7550
},
{
"epoch": 2.213807165744247,
"grad_norm": 0.32828521728515625,
"learning_rate": 0.0005737720781113378,
"loss": 3.7561,
"step": 7600
},
{
"epoch": 2.2283716865715117,
"grad_norm": 0.3204036355018616,
"learning_rate": 0.0005735972019819295,
"loss": 3.7508,
"step": 7650
},
{
"epoch": 2.2429362073987766,
"grad_norm": 0.3126814365386963,
"learning_rate": 0.000573422325852521,
"loss": 3.7471,
"step": 7700
},
{
"epoch": 2.2575007282260415,
"grad_norm": 0.3180292546749115,
"learning_rate": 0.0005732474497231127,
"loss": 3.7549,
"step": 7750
},
{
"epoch": 2.272065249053306,
"grad_norm": 0.34320777654647827,
"learning_rate": 0.0005730725735937044,
"loss": 3.7495,
"step": 7800
},
{
"epoch": 2.286629769880571,
"grad_norm": 0.2991611063480377,
"learning_rate": 0.0005728976974642961,
"loss": 3.7613,
"step": 7850
},
{
"epoch": 2.3011942907078358,
"grad_norm": 0.3215004503726959,
"learning_rate": 0.0005727228213348877,
"loss": 3.7665,
"step": 7900
},
{
"epoch": 2.3157588115351007,
"grad_norm": 0.3161200284957886,
"learning_rate": 0.0005725479452054794,
"loss": 3.7515,
"step": 7950
},
{
"epoch": 2.330323332362365,
"grad_norm": 0.3195878267288208,
"learning_rate": 0.0005723730690760711,
"loss": 3.7532,
"step": 8000
},
{
"epoch": 2.330323332362365,
"eval_accuracy": 0.3439830019247253,
"eval_loss": 3.782982349395752,
"eval_runtime": 181.9405,
"eval_samples_per_second": 91.464,
"eval_steps_per_second": 5.722,
"step": 8000
},
{
"epoch": 2.34488785318963,
"grad_norm": 0.31494781374931335,
"learning_rate": 0.0005721981929466627,
"loss": 3.7528,
"step": 8050
},
{
"epoch": 2.359452374016895,
"grad_norm": 0.3217615783214569,
"learning_rate": 0.0005720233168172545,
"loss": 3.7411,
"step": 8100
},
{
"epoch": 2.37401689484416,
"grad_norm": 0.3436919152736664,
"learning_rate": 0.000571848440687846,
"loss": 3.7428,
"step": 8150
},
{
"epoch": 2.3885814156714242,
"grad_norm": 0.33180883526802063,
"learning_rate": 0.0005716735645584377,
"loss": 3.7483,
"step": 8200
},
{
"epoch": 2.403145936498689,
"grad_norm": 0.3267768919467926,
"learning_rate": 0.0005714986884290294,
"loss": 3.7527,
"step": 8250
},
{
"epoch": 2.417710457325954,
"grad_norm": 0.3256717920303345,
"learning_rate": 0.000571323812299621,
"loss": 3.7446,
"step": 8300
},
{
"epoch": 2.432274978153219,
"grad_norm": 0.29885855317115784,
"learning_rate": 0.0005711489361702127,
"loss": 3.7641,
"step": 8350
},
{
"epoch": 2.4468394989804834,
"grad_norm": 0.33645039796829224,
"learning_rate": 0.0005709740600408044,
"loss": 3.7423,
"step": 8400
},
{
"epoch": 2.4614040198077483,
"grad_norm": 0.34129101037979126,
"learning_rate": 0.0005707991839113961,
"loss": 3.7658,
"step": 8450
},
{
"epoch": 2.475968540635013,
"grad_norm": 0.3134017884731293,
"learning_rate": 0.0005706243077819877,
"loss": 3.7417,
"step": 8500
},
{
"epoch": 2.490533061462278,
"grad_norm": 0.34077146649360657,
"learning_rate": 0.0005704494316525793,
"loss": 3.7468,
"step": 8550
},
{
"epoch": 2.505097582289543,
"grad_norm": 0.3154885470867157,
"learning_rate": 0.000570274555523171,
"loss": 3.7407,
"step": 8600
},
{
"epoch": 2.5196621031168074,
"grad_norm": 0.30456680059432983,
"learning_rate": 0.0005700996793937627,
"loss": 3.7523,
"step": 8650
},
{
"epoch": 2.5342266239440723,
"grad_norm": 0.3212231397628784,
"learning_rate": 0.0005699248032643544,
"loss": 3.7513,
"step": 8700
},
{
"epoch": 2.548791144771337,
"grad_norm": 0.32085394859313965,
"learning_rate": 0.000569749927134946,
"loss": 3.7364,
"step": 8750
},
{
"epoch": 2.5633556655986016,
"grad_norm": 0.32111606001853943,
"learning_rate": 0.0005695750510055377,
"loss": 3.7442,
"step": 8800
},
{
"epoch": 2.5779201864258665,
"grad_norm": 0.3163037896156311,
"learning_rate": 0.0005694001748761294,
"loss": 3.7447,
"step": 8850
},
{
"epoch": 2.5924847072531314,
"grad_norm": 0.3073652386665344,
"learning_rate": 0.000569225298746721,
"loss": 3.7537,
"step": 8900
},
{
"epoch": 2.6070492280803963,
"grad_norm": 0.3179319500923157,
"learning_rate": 0.0005690504226173127,
"loss": 3.7442,
"step": 8950
},
{
"epoch": 2.621613748907661,
"grad_norm": 0.32027360796928406,
"learning_rate": 0.0005688755464879043,
"loss": 3.7549,
"step": 9000
},
{
"epoch": 2.621613748907661,
"eval_accuracy": 0.34702843553240287,
"eval_loss": 3.7536637783050537,
"eval_runtime": 182.0423,
"eval_samples_per_second": 91.413,
"eval_steps_per_second": 5.718,
"step": 9000
},
{
"epoch": 2.6361782697349256,
"grad_norm": 0.31084582209587097,
"learning_rate": 0.000568700670358496,
"loss": 3.7455,
"step": 9050
},
{
"epoch": 2.6507427905621905,
"grad_norm": 0.32904064655303955,
"learning_rate": 0.0005685257942290877,
"loss": 3.7334,
"step": 9100
},
{
"epoch": 2.6653073113894554,
"grad_norm": 0.3119734227657318,
"learning_rate": 0.0005683509180996793,
"loss": 3.7421,
"step": 9150
},
{
"epoch": 2.67987183221672,
"grad_norm": 0.3344007730484009,
"learning_rate": 0.000568176041970271,
"loss": 3.7414,
"step": 9200
},
{
"epoch": 2.6944363530439848,
"grad_norm": 0.31935930252075195,
"learning_rate": 0.0005680011658408627,
"loss": 3.7334,
"step": 9250
},
{
"epoch": 2.7090008738712497,
"grad_norm": 0.29859659075737,
"learning_rate": 0.0005678262897114544,
"loss": 3.73,
"step": 9300
},
{
"epoch": 2.7235653946985146,
"grad_norm": 0.3266841769218445,
"learning_rate": 0.000567651413582046,
"loss": 3.7266,
"step": 9350
},
{
"epoch": 2.7381299155257794,
"grad_norm": 0.31048107147216797,
"learning_rate": 0.0005674765374526377,
"loss": 3.7315,
"step": 9400
},
{
"epoch": 2.752694436353044,
"grad_norm": 0.32837024331092834,
"learning_rate": 0.0005673016613232293,
"loss": 3.734,
"step": 9450
},
{
"epoch": 2.767258957180309,
"grad_norm": 0.3099216818809509,
"learning_rate": 0.0005671267851938209,
"loss": 3.7351,
"step": 9500
},
{
"epoch": 2.7818234780075737,
"grad_norm": 0.32197684049606323,
"learning_rate": 0.0005669519090644127,
"loss": 3.7335,
"step": 9550
},
{
"epoch": 2.796387998834838,
"grad_norm": 0.32030460238456726,
"learning_rate": 0.0005667770329350043,
"loss": 3.7219,
"step": 9600
},
{
"epoch": 2.810952519662103,
"grad_norm": 0.31780993938446045,
"learning_rate": 0.000566602156805596,
"loss": 3.719,
"step": 9650
},
{
"epoch": 2.825517040489368,
"grad_norm": 0.3219031095504761,
"learning_rate": 0.0005664272806761877,
"loss": 3.7344,
"step": 9700
},
{
"epoch": 2.840081561316633,
"grad_norm": 0.3055329918861389,
"learning_rate": 0.0005662524045467793,
"loss": 3.721,
"step": 9750
},
{
"epoch": 2.8546460821438977,
"grad_norm": 0.32618221640586853,
"learning_rate": 0.000566077528417371,
"loss": 3.7255,
"step": 9800
},
{
"epoch": 2.869210602971162,
"grad_norm": 0.30417823791503906,
"learning_rate": 0.0005659026522879626,
"loss": 3.7293,
"step": 9850
},
{
"epoch": 2.883775123798427,
"grad_norm": 0.30620038509368896,
"learning_rate": 0.0005657277761585543,
"loss": 3.7166,
"step": 9900
},
{
"epoch": 2.898339644625692,
"grad_norm": 0.29888835549354553,
"learning_rate": 0.0005655529000291459,
"loss": 3.7218,
"step": 9950
},
{
"epoch": 2.9129041654529564,
"grad_norm": 0.3025401830673218,
"learning_rate": 0.0005653780238997376,
"loss": 3.7257,
"step": 10000
},
{
"epoch": 2.9129041654529564,
"eval_accuracy": 0.3495604365752613,
"eval_loss": 3.72790265083313,
"eval_runtime": 179.1187,
"eval_samples_per_second": 92.905,
"eval_steps_per_second": 5.812,
"step": 10000
},
{
"epoch": 2.9274686862802213,
"grad_norm": 0.300513356924057,
"learning_rate": 0.0005652031477703293,
"loss": 3.7143,
"step": 10050
},
{
"epoch": 2.942033207107486,
"grad_norm": 0.3117719292640686,
"learning_rate": 0.000565028271640921,
"loss": 3.7157,
"step": 10100
},
{
"epoch": 2.956597727934751,
"grad_norm": 0.30454131960868835,
"learning_rate": 0.0005648533955115127,
"loss": 3.7216,
"step": 10150
},
{
"epoch": 2.971162248762016,
"grad_norm": 0.3001880943775177,
"learning_rate": 0.0005646785193821043,
"loss": 3.7289,
"step": 10200
},
{
"epoch": 2.9857267695892804,
"grad_norm": 0.3038688898086548,
"learning_rate": 0.000564503643252696,
"loss": 3.7216,
"step": 10250
},
{
"epoch": 3.0002912904165453,
"grad_norm": 0.3145761787891388,
"learning_rate": 0.0005643287671232876,
"loss": 3.7028,
"step": 10300
},
{
"epoch": 3.01485581124381,
"grad_norm": 0.312094509601593,
"learning_rate": 0.0005641538909938792,
"loss": 3.6102,
"step": 10350
},
{
"epoch": 3.0294203320710746,
"grad_norm": 0.32154324650764465,
"learning_rate": 0.0005639790148644709,
"loss": 3.6075,
"step": 10400
},
{
"epoch": 3.0439848528983395,
"grad_norm": 0.37198418378829956,
"learning_rate": 0.0005638041387350626,
"loss": 3.6255,
"step": 10450
},
{
"epoch": 3.0585493737256044,
"grad_norm": 0.3222768008708954,
"learning_rate": 0.0005636292626056543,
"loss": 3.6191,
"step": 10500
},
{
"epoch": 3.0731138945528693,
"grad_norm": 0.3282540738582611,
"learning_rate": 0.000563454386476246,
"loss": 3.6232,
"step": 10550
},
{
"epoch": 3.087678415380134,
"grad_norm": 0.31507760286331177,
"learning_rate": 0.0005632795103468376,
"loss": 3.6289,
"step": 10600
},
{
"epoch": 3.1022429362073987,
"grad_norm": 0.32736721634864807,
"learning_rate": 0.0005631046342174293,
"loss": 3.6296,
"step": 10650
},
{
"epoch": 3.1168074570346636,
"grad_norm": 0.30472615361213684,
"learning_rate": 0.000562929758088021,
"loss": 3.6449,
"step": 10700
},
{
"epoch": 3.1313719778619284,
"grad_norm": 0.31732335686683655,
"learning_rate": 0.0005627548819586126,
"loss": 3.6378,
"step": 10750
},
{
"epoch": 3.145936498689193,
"grad_norm": 0.3036065697669983,
"learning_rate": 0.0005625800058292042,
"loss": 3.6246,
"step": 10800
},
{
"epoch": 3.160501019516458,
"grad_norm": 0.30610308051109314,
"learning_rate": 0.0005624051296997959,
"loss": 3.6416,
"step": 10850
},
{
"epoch": 3.1750655403437227,
"grad_norm": 0.3098933696746826,
"learning_rate": 0.0005622302535703876,
"loss": 3.6409,
"step": 10900
},
{
"epoch": 3.1896300611709876,
"grad_norm": 0.3232332170009613,
"learning_rate": 0.0005620553774409792,
"loss": 3.6316,
"step": 10950
},
{
"epoch": 3.2041945819982525,
"grad_norm": 0.32577645778656006,
"learning_rate": 0.000561880501311571,
"loss": 3.631,
"step": 11000
},
{
"epoch": 3.2041945819982525,
"eval_accuracy": 0.3517711600718335,
"eval_loss": 3.7140092849731445,
"eval_runtime": 177.1297,
"eval_samples_per_second": 93.948,
"eval_steps_per_second": 5.877,
"step": 11000
},
{
"epoch": 3.218759102825517,
"grad_norm": 0.3147296607494354,
"learning_rate": 0.0005617056251821626,
"loss": 3.631,
"step": 11050
},
{
"epoch": 3.233323623652782,
"grad_norm": 0.31632447242736816,
"learning_rate": 0.0005615307490527543,
"loss": 3.6358,
"step": 11100
},
{
"epoch": 3.2478881444800467,
"grad_norm": 0.3221932649612427,
"learning_rate": 0.000561355872923346,
"loss": 3.6407,
"step": 11150
},
{
"epoch": 3.262452665307311,
"grad_norm": 0.33630186319351196,
"learning_rate": 0.0005611809967939375,
"loss": 3.6311,
"step": 11200
},
{
"epoch": 3.277017186134576,
"grad_norm": 0.3187567889690399,
"learning_rate": 0.0005610061206645292,
"loss": 3.6442,
"step": 11250
},
{
"epoch": 3.291581706961841,
"grad_norm": 0.31802159547805786,
"learning_rate": 0.0005608312445351209,
"loss": 3.6407,
"step": 11300
},
{
"epoch": 3.306146227789106,
"grad_norm": 0.3117535412311554,
"learning_rate": 0.0005606563684057126,
"loss": 3.6412,
"step": 11350
},
{
"epoch": 3.3207107486163707,
"grad_norm": 0.3003256022930145,
"learning_rate": 0.0005604814922763042,
"loss": 3.6494,
"step": 11400
},
{
"epoch": 3.335275269443635,
"grad_norm": 0.3591480255126953,
"learning_rate": 0.0005603066161468959,
"loss": 3.6408,
"step": 11450
},
{
"epoch": 3.3498397902709,
"grad_norm": 0.3271133005619049,
"learning_rate": 0.0005601317400174876,
"loss": 3.6469,
"step": 11500
},
{
"epoch": 3.364404311098165,
"grad_norm": 0.3171232342720032,
"learning_rate": 0.0005599568638880793,
"loss": 3.6423,
"step": 11550
},
{
"epoch": 3.3789688319254294,
"grad_norm": 0.3135507106781006,
"learning_rate": 0.0005597819877586709,
"loss": 3.6472,
"step": 11600
},
{
"epoch": 3.3935333527526943,
"grad_norm": 0.30243971943855286,
"learning_rate": 0.0005596071116292625,
"loss": 3.6556,
"step": 11650
},
{
"epoch": 3.408097873579959,
"grad_norm": 0.3211742043495178,
"learning_rate": 0.0005594322354998542,
"loss": 3.636,
"step": 11700
},
{
"epoch": 3.422662394407224,
"grad_norm": 0.3331124484539032,
"learning_rate": 0.0005592573593704459,
"loss": 3.6303,
"step": 11750
},
{
"epoch": 3.437226915234489,
"grad_norm": 0.32489416003227234,
"learning_rate": 0.0005590824832410375,
"loss": 3.638,
"step": 11800
},
{
"epoch": 3.4517914360617534,
"grad_norm": 0.31111645698547363,
"learning_rate": 0.0005589076071116292,
"loss": 3.644,
"step": 11850
},
{
"epoch": 3.4663559568890183,
"grad_norm": 0.31853461265563965,
"learning_rate": 0.0005587327309822209,
"loss": 3.6332,
"step": 11900
},
{
"epoch": 3.480920477716283,
"grad_norm": 0.31134849786758423,
"learning_rate": 0.0005585578548528126,
"loss": 3.6513,
"step": 11950
},
{
"epoch": 3.495484998543548,
"grad_norm": 0.31197866797447205,
"learning_rate": 0.0005583829787234043,
"loss": 3.6437,
"step": 12000
},
{
"epoch": 3.495484998543548,
"eval_accuracy": 0.35321032354601034,
"eval_loss": 3.6977498531341553,
"eval_runtime": 177.4665,
"eval_samples_per_second": 93.77,
"eval_steps_per_second": 5.866,
"step": 12000
},
{
"epoch": 3.5100495193708126,
"grad_norm": 0.3007756173610687,
"learning_rate": 0.0005582081025939958,
"loss": 3.6602,
"step": 12050
},
{
"epoch": 3.5246140401980774,
"grad_norm": 0.29941409826278687,
"learning_rate": 0.0005580332264645875,
"loss": 3.6426,
"step": 12100
},
{
"epoch": 3.5391785610253423,
"grad_norm": 0.31950122117996216,
"learning_rate": 0.0005578583503351792,
"loss": 3.6384,
"step": 12150
},
{
"epoch": 3.5537430818526072,
"grad_norm": 0.30377620458602905,
"learning_rate": 0.0005576834742057709,
"loss": 3.6427,
"step": 12200
},
{
"epoch": 3.5683076026798717,
"grad_norm": 0.343955397605896,
"learning_rate": 0.0005575085980763625,
"loss": 3.6349,
"step": 12250
},
{
"epoch": 3.5828721235071366,
"grad_norm": 0.3224445879459381,
"learning_rate": 0.0005573337219469542,
"loss": 3.6349,
"step": 12300
},
{
"epoch": 3.5974366443344015,
"grad_norm": 0.3116309344768524,
"learning_rate": 0.0005571588458175459,
"loss": 3.6422,
"step": 12350
},
{
"epoch": 3.612001165161666,
"grad_norm": 0.31105419993400574,
"learning_rate": 0.0005569839696881374,
"loss": 3.6441,
"step": 12400
},
{
"epoch": 3.626565685988931,
"grad_norm": 0.3085189461708069,
"learning_rate": 0.0005568090935587292,
"loss": 3.6405,
"step": 12450
},
{
"epoch": 3.6411302068161957,
"grad_norm": 0.29982301592826843,
"learning_rate": 0.0005566342174293208,
"loss": 3.6503,
"step": 12500
},
{
"epoch": 3.6556947276434606,
"grad_norm": 0.31388601660728455,
"learning_rate": 0.0005564593412999125,
"loss": 3.6528,
"step": 12550
},
{
"epoch": 3.6702592484707255,
"grad_norm": 0.305575430393219,
"learning_rate": 0.0005562844651705042,
"loss": 3.6421,
"step": 12600
},
{
"epoch": 3.68482376929799,
"grad_norm": 0.32143914699554443,
"learning_rate": 0.0005561095890410958,
"loss": 3.646,
"step": 12650
},
{
"epoch": 3.699388290125255,
"grad_norm": 0.3150655925273895,
"learning_rate": 0.0005559347129116875,
"loss": 3.637,
"step": 12700
},
{
"epoch": 3.7139528109525197,
"grad_norm": 0.31396007537841797,
"learning_rate": 0.0005557598367822792,
"loss": 3.647,
"step": 12750
},
{
"epoch": 3.728517331779784,
"grad_norm": 0.3067757189273834,
"learning_rate": 0.0005555849606528709,
"loss": 3.6471,
"step": 12800
},
{
"epoch": 3.743081852607049,
"grad_norm": 0.313796728849411,
"learning_rate": 0.0005554100845234624,
"loss": 3.6308,
"step": 12850
},
{
"epoch": 3.757646373434314,
"grad_norm": 0.3006753921508789,
"learning_rate": 0.0005552352083940541,
"loss": 3.649,
"step": 12900
},
{
"epoch": 3.772210894261579,
"grad_norm": 0.3247009217739105,
"learning_rate": 0.0005550603322646458,
"loss": 3.633,
"step": 12950
},
{
"epoch": 3.7867754150888437,
"grad_norm": 0.30989664793014526,
"learning_rate": 0.0005548854561352375,
"loss": 3.6449,
"step": 13000
},
{
"epoch": 3.7867754150888437,
"eval_accuracy": 0.3550483792006422,
"eval_loss": 3.677523612976074,
"eval_runtime": 176.8362,
"eval_samples_per_second": 94.104,
"eval_steps_per_second": 5.887,
"step": 13000
},
{
"epoch": 3.8013399359161086,
"grad_norm": 0.31539955735206604,
"learning_rate": 0.0005547105800058292,
"loss": 3.6357,
"step": 13050
},
{
"epoch": 3.815904456743373,
"grad_norm": 0.2919875979423523,
"learning_rate": 0.0005545357038764208,
"loss": 3.6256,
"step": 13100
},
{
"epoch": 3.830468977570638,
"grad_norm": 0.3210577070713043,
"learning_rate": 0.0005543608277470125,
"loss": 3.636,
"step": 13150
},
{
"epoch": 3.845033498397903,
"grad_norm": 0.3022839426994324,
"learning_rate": 0.0005541859516176042,
"loss": 3.6284,
"step": 13200
},
{
"epoch": 3.8595980192251673,
"grad_norm": 0.315828800201416,
"learning_rate": 0.0005540110754881958,
"loss": 3.6304,
"step": 13250
},
{
"epoch": 3.874162540052432,
"grad_norm": 0.30515676736831665,
"learning_rate": 0.0005538361993587874,
"loss": 3.6442,
"step": 13300
},
{
"epoch": 3.888727060879697,
"grad_norm": 0.3042634427547455,
"learning_rate": 0.0005536613232293791,
"loss": 3.6366,
"step": 13350
},
{
"epoch": 3.903291581706962,
"grad_norm": 0.3206663727760315,
"learning_rate": 0.0005534864470999708,
"loss": 3.6317,
"step": 13400
},
{
"epoch": 3.917856102534227,
"grad_norm": 0.3056301772594452,
"learning_rate": 0.0005533115709705625,
"loss": 3.6391,
"step": 13450
},
{
"epoch": 3.9324206233614913,
"grad_norm": 0.3067208230495453,
"learning_rate": 0.0005531366948411541,
"loss": 3.6312,
"step": 13500
},
{
"epoch": 3.9469851441887562,
"grad_norm": 0.3234356641769409,
"learning_rate": 0.0005529618187117458,
"loss": 3.6221,
"step": 13550
},
{
"epoch": 3.961549665016021,
"grad_norm": 0.31787794828414917,
"learning_rate": 0.0005527869425823375,
"loss": 3.6323,
"step": 13600
},
{
"epoch": 3.9761141858432856,
"grad_norm": 0.2994586229324341,
"learning_rate": 0.0005526120664529292,
"loss": 3.6276,
"step": 13650
},
{
"epoch": 3.9906787066705505,
"grad_norm": 0.31581565737724304,
"learning_rate": 0.0005524371903235207,
"loss": 3.6349,
"step": 13700
},
{
"epoch": 4.005243227497815,
"grad_norm": 0.32180607318878174,
"learning_rate": 0.0005522623141941124,
"loss": 3.5938,
"step": 13750
},
{
"epoch": 4.01980774832508,
"grad_norm": 0.32863345742225647,
"learning_rate": 0.0005520874380647041,
"loss": 3.522,
"step": 13800
},
{
"epoch": 4.034372269152345,
"grad_norm": 0.3115909695625305,
"learning_rate": 0.0005519125619352957,
"loss": 3.5133,
"step": 13850
},
{
"epoch": 4.04893678997961,
"grad_norm": 0.3063027858734131,
"learning_rate": 0.0005517376858058875,
"loss": 3.5327,
"step": 13900
},
{
"epoch": 4.063501310806874,
"grad_norm": 0.3200002908706665,
"learning_rate": 0.0005515628096764791,
"loss": 3.5327,
"step": 13950
},
{
"epoch": 4.078065831634139,
"grad_norm": 0.32157978415489197,
"learning_rate": 0.0005513879335470708,
"loss": 3.5368,
"step": 14000
},
{
"epoch": 4.078065831634139,
"eval_accuracy": 0.356187785549825,
"eval_loss": 3.672633171081543,
"eval_runtime": 179.4019,
"eval_samples_per_second": 92.758,
"eval_steps_per_second": 5.803,
"step": 14000
},
{
"epoch": 4.092630352461404,
"grad_norm": 0.3131697475910187,
"learning_rate": 0.0005512130574176625,
"loss": 3.5373,
"step": 14050
},
{
"epoch": 4.107194873288669,
"grad_norm": 0.3397645652294159,
"learning_rate": 0.000551038181288254,
"loss": 3.5416,
"step": 14100
},
{
"epoch": 4.121759394115934,
"grad_norm": 0.31081974506378174,
"learning_rate": 0.0005508633051588457,
"loss": 3.5646,
"step": 14150
},
{
"epoch": 4.1363239149431985,
"grad_norm": 0.3408530354499817,
"learning_rate": 0.0005506884290294374,
"loss": 3.5485,
"step": 14200
},
{
"epoch": 4.150888435770463,
"grad_norm": 0.3124403655529022,
"learning_rate": 0.0005505135529000291,
"loss": 3.5518,
"step": 14250
},
{
"epoch": 4.165452956597728,
"grad_norm": 0.3036765456199646,
"learning_rate": 0.0005503386767706207,
"loss": 3.5706,
"step": 14300
},
{
"epoch": 4.180017477424992,
"grad_norm": 0.3298614025115967,
"learning_rate": 0.0005501638006412124,
"loss": 3.5419,
"step": 14350
},
{
"epoch": 4.194581998252257,
"grad_norm": 0.3148026764392853,
"learning_rate": 0.0005499889245118041,
"loss": 3.5606,
"step": 14400
},
{
"epoch": 4.209146519079522,
"grad_norm": 0.3329651653766632,
"learning_rate": 0.0005498140483823958,
"loss": 3.5666,
"step": 14450
},
{
"epoch": 4.223711039906787,
"grad_norm": 0.3147095739841461,
"learning_rate": 0.0005496391722529875,
"loss": 3.5674,
"step": 14500
},
{
"epoch": 4.238275560734052,
"grad_norm": 0.319242388010025,
"learning_rate": 0.000549464296123579,
"loss": 3.5579,
"step": 14550
},
{
"epoch": 4.252840081561317,
"grad_norm": 0.3104586601257324,
"learning_rate": 0.0005492894199941707,
"loss": 3.5685,
"step": 14600
},
{
"epoch": 4.267404602388582,
"grad_norm": 0.32091012597084045,
"learning_rate": 0.0005491145438647624,
"loss": 3.5587,
"step": 14650
},
{
"epoch": 4.2819691232158466,
"grad_norm": 0.3283713161945343,
"learning_rate": 0.000548939667735354,
"loss": 3.5648,
"step": 14700
},
{
"epoch": 4.2965336440431106,
"grad_norm": 0.3096083700656891,
"learning_rate": 0.0005487647916059457,
"loss": 3.5531,
"step": 14750
},
{
"epoch": 4.3110981648703754,
"grad_norm": 0.3022560775279999,
"learning_rate": 0.0005485899154765374,
"loss": 3.5714,
"step": 14800
},
{
"epoch": 4.32566268569764,
"grad_norm": 0.30840808153152466,
"learning_rate": 0.0005484150393471291,
"loss": 3.5691,
"step": 14850
},
{
"epoch": 4.340227206524905,
"grad_norm": 0.3090154528617859,
"learning_rate": 0.0005482401632177208,
"loss": 3.5772,
"step": 14900
},
{
"epoch": 4.35479172735217,
"grad_norm": 0.3259793221950531,
"learning_rate": 0.0005480652870883124,
"loss": 3.5707,
"step": 14950
},
{
"epoch": 4.369356248179435,
"grad_norm": 0.29390519857406616,
"learning_rate": 0.000547890410958904,
"loss": 3.5699,
"step": 15000
},
{
"epoch": 4.369356248179435,
"eval_accuracy": 0.3573489475161612,
"eval_loss": 3.6574037075042725,
"eval_runtime": 177.1658,
"eval_samples_per_second": 93.929,
"eval_steps_per_second": 5.876,
"step": 15000
},
{
"epoch": 4.3839207690067,
"grad_norm": 0.3094480037689209,
"learning_rate": 0.0005477155348294957,
"loss": 3.563,
"step": 15050
},
{
"epoch": 4.398485289833965,
"grad_norm": 0.31205040216445923,
"learning_rate": 0.0005475406587000874,
"loss": 3.5634,
"step": 15100
},
{
"epoch": 4.41304981066123,
"grad_norm": 0.31506818532943726,
"learning_rate": 0.000547365782570679,
"loss": 3.5656,
"step": 15150
},
{
"epoch": 4.427614331488494,
"grad_norm": 0.32489773631095886,
"learning_rate": 0.0005471909064412707,
"loss": 3.5617,
"step": 15200
},
{
"epoch": 4.442178852315759,
"grad_norm": 0.30814430117607117,
"learning_rate": 0.0005470160303118624,
"loss": 3.5657,
"step": 15250
},
{
"epoch": 4.4567433731430235,
"grad_norm": 0.32689669728279114,
"learning_rate": 0.000546841154182454,
"loss": 3.5538,
"step": 15300
},
{
"epoch": 4.471307893970288,
"grad_norm": 0.29945799708366394,
"learning_rate": 0.0005466662780530458,
"loss": 3.5746,
"step": 15350
},
{
"epoch": 4.485872414797553,
"grad_norm": 0.30545687675476074,
"learning_rate": 0.0005464914019236374,
"loss": 3.5712,
"step": 15400
},
{
"epoch": 4.500436935624818,
"grad_norm": 0.3214097023010254,
"learning_rate": 0.000546316525794229,
"loss": 3.572,
"step": 15450
},
{
"epoch": 4.515001456452083,
"grad_norm": 0.31878820061683655,
"learning_rate": 0.0005461416496648207,
"loss": 3.5719,
"step": 15500
},
{
"epoch": 4.529565977279347,
"grad_norm": 0.30244311690330505,
"learning_rate": 0.0005459667735354123,
"loss": 3.5655,
"step": 15550
},
{
"epoch": 4.544130498106612,
"grad_norm": 0.3169194161891937,
"learning_rate": 0.000545791897406004,
"loss": 3.5598,
"step": 15600
},
{
"epoch": 4.558695018933877,
"grad_norm": 0.30807802081108093,
"learning_rate": 0.0005456170212765957,
"loss": 3.5783,
"step": 15650
},
{
"epoch": 4.573259539761142,
"grad_norm": 0.32763341069221497,
"learning_rate": 0.0005454421451471874,
"loss": 3.5838,
"step": 15700
},
{
"epoch": 4.587824060588407,
"grad_norm": 0.3086375892162323,
"learning_rate": 0.000545267269017779,
"loss": 3.5667,
"step": 15750
},
{
"epoch": 4.6023885814156715,
"grad_norm": 0.3195725977420807,
"learning_rate": 0.0005450923928883708,
"loss": 3.5764,
"step": 15800
},
{
"epoch": 4.616953102242936,
"grad_norm": 0.3060102164745331,
"learning_rate": 0.0005449175167589623,
"loss": 3.5688,
"step": 15850
},
{
"epoch": 4.631517623070201,
"grad_norm": 0.317550390958786,
"learning_rate": 0.000544742640629554,
"loss": 3.5628,
"step": 15900
},
{
"epoch": 4.646082143897466,
"grad_norm": 0.3243100047111511,
"learning_rate": 0.0005445677645001457,
"loss": 3.5821,
"step": 15950
},
{
"epoch": 4.66064666472473,
"grad_norm": 0.34374845027923584,
"learning_rate": 0.0005443928883707373,
"loss": 3.5808,
"step": 16000
},
{
"epoch": 4.66064666472473,
"eval_accuracy": 0.35801655096794266,
"eval_loss": 3.6481385231018066,
"eval_runtime": 178.0131,
"eval_samples_per_second": 93.482,
"eval_steps_per_second": 5.848,
"step": 16000
},
{
"epoch": 4.675211185551995,
"grad_norm": 0.3434811532497406,
"learning_rate": 0.000544218012241329,
"loss": 3.5719,
"step": 16050
},
{
"epoch": 4.68977570637926,
"grad_norm": 0.3100740313529968,
"learning_rate": 0.0005440431361119207,
"loss": 3.5787,
"step": 16100
},
{
"epoch": 4.704340227206525,
"grad_norm": 0.3187655508518219,
"learning_rate": 0.0005438682599825123,
"loss": 3.5632,
"step": 16150
},
{
"epoch": 4.71890474803379,
"grad_norm": 0.3257180452346802,
"learning_rate": 0.000543693383853104,
"loss": 3.5716,
"step": 16200
},
{
"epoch": 4.733469268861055,
"grad_norm": 0.32113736867904663,
"learning_rate": 0.0005435185077236957,
"loss": 3.5782,
"step": 16250
},
{
"epoch": 4.74803378968832,
"grad_norm": 0.31867560744285583,
"learning_rate": 0.0005433436315942873,
"loss": 3.5806,
"step": 16300
},
{
"epoch": 4.762598310515584,
"grad_norm": 0.32009127736091614,
"learning_rate": 0.000543168755464879,
"loss": 3.5803,
"step": 16350
},
{
"epoch": 4.7771628313428485,
"grad_norm": 0.31877410411834717,
"learning_rate": 0.0005429938793354706,
"loss": 3.5814,
"step": 16400
},
{
"epoch": 4.791727352170113,
"grad_norm": 0.3448466360569,
"learning_rate": 0.0005428190032060623,
"loss": 3.5634,
"step": 16450
},
{
"epoch": 4.806291872997378,
"grad_norm": 0.3082084059715271,
"learning_rate": 0.000542644127076654,
"loss": 3.5878,
"step": 16500
},
{
"epoch": 4.820856393824643,
"grad_norm": 0.33161383867263794,
"learning_rate": 0.0005424692509472457,
"loss": 3.5829,
"step": 16550
},
{
"epoch": 4.835420914651908,
"grad_norm": 0.3231264650821686,
"learning_rate": 0.0005422943748178373,
"loss": 3.5867,
"step": 16600
},
{
"epoch": 4.849985435479173,
"grad_norm": 0.2988690137863159,
"learning_rate": 0.000542119498688429,
"loss": 3.5869,
"step": 16650
},
{
"epoch": 4.864549956306438,
"grad_norm": 0.31382545828819275,
"learning_rate": 0.0005419446225590207,
"loss": 3.5744,
"step": 16700
},
{
"epoch": 4.879114477133703,
"grad_norm": 0.3041090965270996,
"learning_rate": 0.0005417697464296122,
"loss": 3.5798,
"step": 16750
},
{
"epoch": 4.893678997960967,
"grad_norm": 0.32487449049949646,
"learning_rate": 0.000541594870300204,
"loss": 3.5786,
"step": 16800
},
{
"epoch": 4.908243518788232,
"grad_norm": 0.3080225884914398,
"learning_rate": 0.0005414199941707956,
"loss": 3.5735,
"step": 16850
},
{
"epoch": 4.9228080396154965,
"grad_norm": 0.3145715594291687,
"learning_rate": 0.0005412451180413873,
"loss": 3.5699,
"step": 16900
},
{
"epoch": 4.937372560442761,
"grad_norm": 0.29266849160194397,
"learning_rate": 0.000541070241911979,
"loss": 3.5733,
"step": 16950
},
{
"epoch": 4.951937081270026,
"grad_norm": 0.3222334086894989,
"learning_rate": 0.0005408953657825706,
"loss": 3.5669,
"step": 17000
},
{
"epoch": 4.951937081270026,
"eval_accuracy": 0.35965457254269423,
"eval_loss": 3.634514808654785,
"eval_runtime": 178.122,
"eval_samples_per_second": 93.425,
"eval_steps_per_second": 5.844,
"step": 17000
},
{
"epoch": 4.966501602097291,
"grad_norm": 0.3042418360710144,
"learning_rate": 0.0005407204896531623,
"loss": 3.589,
"step": 17050
},
{
"epoch": 4.981066122924556,
"grad_norm": 0.30115336179733276,
"learning_rate": 0.000540545613523754,
"loss": 3.5776,
"step": 17100
},
{
"epoch": 4.99563064375182,
"grad_norm": 0.3397320508956909,
"learning_rate": 0.0005403707373943456,
"loss": 3.5842,
"step": 17150
},
{
"epoch": 5.010195164579085,
"grad_norm": 0.3029438555240631,
"learning_rate": 0.0005401958612649372,
"loss": 3.51,
"step": 17200
},
{
"epoch": 5.02475968540635,
"grad_norm": 0.31321457028388977,
"learning_rate": 0.000540020985135529,
"loss": 3.4687,
"step": 17250
},
{
"epoch": 5.039324206233615,
"grad_norm": 0.31178945302963257,
"learning_rate": 0.0005398461090061206,
"loss": 3.4781,
"step": 17300
},
{
"epoch": 5.05388872706088,
"grad_norm": 0.34495481848716736,
"learning_rate": 0.0005396712328767123,
"loss": 3.4682,
"step": 17350
},
{
"epoch": 5.0684532478881446,
"grad_norm": 0.30439019203186035,
"learning_rate": 0.000539496356747304,
"loss": 3.4708,
"step": 17400
},
{
"epoch": 5.0830177687154094,
"grad_norm": 0.33205610513687134,
"learning_rate": 0.0005393214806178956,
"loss": 3.4734,
"step": 17450
},
{
"epoch": 5.097582289542674,
"grad_norm": 0.32609128952026367,
"learning_rate": 0.0005391466044884873,
"loss": 3.4818,
"step": 17500
},
{
"epoch": 5.112146810369939,
"grad_norm": 0.31185203790664673,
"learning_rate": 0.000538971728359079,
"loss": 3.4836,
"step": 17550
},
{
"epoch": 5.126711331197203,
"grad_norm": 0.32222872972488403,
"learning_rate": 0.0005387968522296705,
"loss": 3.4734,
"step": 17600
},
{
"epoch": 5.141275852024468,
"grad_norm": 0.3166067898273468,
"learning_rate": 0.0005386219761002622,
"loss": 3.4808,
"step": 17650
},
{
"epoch": 5.155840372851733,
"grad_norm": 0.3085167109966278,
"learning_rate": 0.0005384470999708539,
"loss": 3.4861,
"step": 17700
},
{
"epoch": 5.170404893678998,
"grad_norm": 0.3201219439506531,
"learning_rate": 0.0005382722238414456,
"loss": 3.5014,
"step": 17750
},
{
"epoch": 5.184969414506263,
"grad_norm": 0.3096485137939453,
"learning_rate": 0.0005380973477120373,
"loss": 3.5066,
"step": 17800
},
{
"epoch": 5.199533935333528,
"grad_norm": 0.3200867474079132,
"learning_rate": 0.000537922471582629,
"loss": 3.4955,
"step": 17850
},
{
"epoch": 5.214098456160793,
"grad_norm": 0.31873369216918945,
"learning_rate": 0.0005377475954532206,
"loss": 3.4958,
"step": 17900
},
{
"epoch": 5.2286629769880575,
"grad_norm": 0.3236185312271118,
"learning_rate": 0.0005375727193238123,
"loss": 3.5043,
"step": 17950
},
{
"epoch": 5.2432274978153215,
"grad_norm": 0.31578707695007324,
"learning_rate": 0.000537397843194404,
"loss": 3.4856,
"step": 18000
},
{
"epoch": 5.2432274978153215,
"eval_accuracy": 0.3600538175169409,
"eval_loss": 3.636204481124878,
"eval_runtime": 178.2632,
"eval_samples_per_second": 93.351,
"eval_steps_per_second": 5.84,
"step": 18000
},
{
"epoch": 5.257792018642586,
"grad_norm": 0.3020099997520447,
"learning_rate": 0.0005372229670649955,
"loss": 3.5085,
"step": 18050
},
{
"epoch": 5.272356539469851,
"grad_norm": 0.33753785490989685,
"learning_rate": 0.0005370480909355872,
"loss": 3.5011,
"step": 18100
},
{
"epoch": 5.286921060297116,
"grad_norm": 0.3076134920120239,
"learning_rate": 0.0005368732148061789,
"loss": 3.5104,
"step": 18150
},
{
"epoch": 5.301485581124381,
"grad_norm": 0.3213876485824585,
"learning_rate": 0.0005366983386767705,
"loss": 3.5087,
"step": 18200
},
{
"epoch": 5.316050101951646,
"grad_norm": 0.30576950311660767,
"learning_rate": 0.0005365234625473623,
"loss": 3.5067,
"step": 18250
},
{
"epoch": 5.330614622778911,
"grad_norm": 0.3095839023590088,
"learning_rate": 0.0005363485864179539,
"loss": 3.5164,
"step": 18300
},
{
"epoch": 5.345179143606176,
"grad_norm": 0.31150195002555847,
"learning_rate": 0.0005361737102885456,
"loss": 3.506,
"step": 18350
},
{
"epoch": 5.35974366443344,
"grad_norm": 0.3282429575920105,
"learning_rate": 0.0005359988341591373,
"loss": 3.5064,
"step": 18400
},
{
"epoch": 5.374308185260705,
"grad_norm": 0.32314422726631165,
"learning_rate": 0.000535823958029729,
"loss": 3.5114,
"step": 18450
},
{
"epoch": 5.3888727060879695,
"grad_norm": 0.3226771652698517,
"learning_rate": 0.0005356490819003205,
"loss": 3.515,
"step": 18500
},
{
"epoch": 5.403437226915234,
"grad_norm": 0.30987077951431274,
"learning_rate": 0.0005354742057709122,
"loss": 3.5127,
"step": 18550
},
{
"epoch": 5.418001747742499,
"grad_norm": 0.32695272564888,
"learning_rate": 0.0005352993296415039,
"loss": 3.5208,
"step": 18600
},
{
"epoch": 5.432566268569764,
"grad_norm": 0.32112646102905273,
"learning_rate": 0.0005351244535120955,
"loss": 3.5216,
"step": 18650
},
{
"epoch": 5.447130789397029,
"grad_norm": 0.32543012499809265,
"learning_rate": 0.0005349495773826873,
"loss": 3.5198,
"step": 18700
},
{
"epoch": 5.461695310224294,
"grad_norm": 0.31456059217453003,
"learning_rate": 0.0005347747012532789,
"loss": 3.5269,
"step": 18750
},
{
"epoch": 5.476259831051558,
"grad_norm": 0.2968471050262451,
"learning_rate": 0.0005345998251238706,
"loss": 3.5041,
"step": 18800
},
{
"epoch": 5.490824351878823,
"grad_norm": 0.2999701499938965,
"learning_rate": 0.0005344249489944623,
"loss": 3.5057,
"step": 18850
},
{
"epoch": 5.505388872706088,
"grad_norm": 0.31823596358299255,
"learning_rate": 0.0005342500728650538,
"loss": 3.5208,
"step": 18900
},
{
"epoch": 5.519953393533353,
"grad_norm": 0.3265931010246277,
"learning_rate": 0.0005340751967356455,
"loss": 3.5322,
"step": 18950
},
{
"epoch": 5.534517914360618,
"grad_norm": 0.3369291424751282,
"learning_rate": 0.0005339003206062372,
"loss": 3.5372,
"step": 19000
},
{
"epoch": 5.534517914360618,
"eval_accuracy": 0.3610927952334266,
"eval_loss": 3.6238300800323486,
"eval_runtime": 178.1149,
"eval_samples_per_second": 93.428,
"eval_steps_per_second": 5.845,
"step": 19000
},
{
"epoch": 5.5490824351878825,
"grad_norm": 0.3080558776855469,
"learning_rate": 0.0005337254444768288,
"loss": 3.5148,
"step": 19050
},
{
"epoch": 5.563646956015147,
"grad_norm": 0.3292733132839203,
"learning_rate": 0.0005335505683474205,
"loss": 3.5206,
"step": 19100
},
{
"epoch": 5.578211476842412,
"grad_norm": 0.2844310700893402,
"learning_rate": 0.0005333756922180122,
"loss": 3.5198,
"step": 19150
},
{
"epoch": 5.592775997669676,
"grad_norm": 0.32660531997680664,
"learning_rate": 0.0005332008160886039,
"loss": 3.5196,
"step": 19200
},
{
"epoch": 5.607340518496941,
"grad_norm": 0.3126123249530792,
"learning_rate": 0.0005330259399591956,
"loss": 3.5265,
"step": 19250
},
{
"epoch": 5.621905039324206,
"grad_norm": 0.32773616909980774,
"learning_rate": 0.0005328510638297873,
"loss": 3.5149,
"step": 19300
},
{
"epoch": 5.636469560151471,
"grad_norm": 0.33990636467933655,
"learning_rate": 0.0005326761877003788,
"loss": 3.5284,
"step": 19350
},
{
"epoch": 5.651034080978736,
"grad_norm": 0.310086190700531,
"learning_rate": 0.0005325013115709705,
"loss": 3.5261,
"step": 19400
},
{
"epoch": 5.665598601806001,
"grad_norm": 0.3189842402935028,
"learning_rate": 0.0005323264354415622,
"loss": 3.5141,
"step": 19450
},
{
"epoch": 5.680163122633266,
"grad_norm": 0.3437648117542267,
"learning_rate": 0.0005321515593121538,
"loss": 3.5307,
"step": 19500
},
{
"epoch": 5.6947276434605305,
"grad_norm": 0.3275405168533325,
"learning_rate": 0.0005319766831827455,
"loss": 3.5203,
"step": 19550
},
{
"epoch": 5.709292164287795,
"grad_norm": 0.33150243759155273,
"learning_rate": 0.0005318018070533372,
"loss": 3.5261,
"step": 19600
},
{
"epoch": 5.723856685115059,
"grad_norm": 0.31431785225868225,
"learning_rate": 0.0005316269309239288,
"loss": 3.5317,
"step": 19650
},
{
"epoch": 5.738421205942324,
"grad_norm": 0.310982882976532,
"learning_rate": 0.0005314520547945206,
"loss": 3.5258,
"step": 19700
},
{
"epoch": 5.752985726769589,
"grad_norm": 0.3376849591732025,
"learning_rate": 0.0005312771786651121,
"loss": 3.5293,
"step": 19750
},
{
"epoch": 5.767550247596854,
"grad_norm": 0.3104221522808075,
"learning_rate": 0.0005311023025357038,
"loss": 3.5352,
"step": 19800
},
{
"epoch": 5.782114768424119,
"grad_norm": 0.3133756220340729,
"learning_rate": 0.0005309274264062955,
"loss": 3.5305,
"step": 19850
},
{
"epoch": 5.796679289251384,
"grad_norm": 0.3189453184604645,
"learning_rate": 0.0005307525502768872,
"loss": 3.5303,
"step": 19900
},
{
"epoch": 5.811243810078649,
"grad_norm": 0.3235698342323303,
"learning_rate": 0.0005305776741474788,
"loss": 3.5243,
"step": 19950
},
{
"epoch": 5.825808330905913,
"grad_norm": 0.33478856086730957,
"learning_rate": 0.0005304027980180705,
"loss": 3.5283,
"step": 20000
},
{
"epoch": 5.825808330905913,
"eval_accuracy": 0.36207003403636906,
"eval_loss": 3.6125800609588623,
"eval_runtime": 177.938,
"eval_samples_per_second": 93.521,
"eval_steps_per_second": 5.85,
"step": 20000
},
{
"epoch": 5.840372851733178,
"grad_norm": 0.3260256052017212,
"learning_rate": 0.0005302279218886622,
"loss": 3.5366,
"step": 20050
},
{
"epoch": 5.8549373725604426,
"grad_norm": 0.32015252113342285,
"learning_rate": 0.0005300530457592538,
"loss": 3.5379,
"step": 20100
},
{
"epoch": 5.8695018933877074,
"grad_norm": 0.3291431665420532,
"learning_rate": 0.0005298781696298456,
"loss": 3.538,
"step": 20150
},
{
"epoch": 5.884066414214972,
"grad_norm": 0.30976444482803345,
"learning_rate": 0.0005297032935004371,
"loss": 3.5296,
"step": 20200
},
{
"epoch": 5.898630935042237,
"grad_norm": 0.315712034702301,
"learning_rate": 0.0005295284173710288,
"loss": 3.5213,
"step": 20250
},
{
"epoch": 5.913195455869502,
"grad_norm": 0.32749900221824646,
"learning_rate": 0.0005293535412416205,
"loss": 3.5263,
"step": 20300
},
{
"epoch": 5.927759976696767,
"grad_norm": 0.30342087149620056,
"learning_rate": 0.0005291786651122121,
"loss": 3.5256,
"step": 20350
},
{
"epoch": 5.942324497524032,
"grad_norm": 0.2914330065250397,
"learning_rate": 0.0005290037889828038,
"loss": 3.5342,
"step": 20400
},
{
"epoch": 5.956889018351296,
"grad_norm": 0.32604503631591797,
"learning_rate": 0.0005288289128533955,
"loss": 3.545,
"step": 20450
},
{
"epoch": 5.971453539178561,
"grad_norm": 0.36326587200164795,
"learning_rate": 0.0005286540367239872,
"loss": 3.5389,
"step": 20500
},
{
"epoch": 5.986018060005826,
"grad_norm": 0.31661051511764526,
"learning_rate": 0.0005284791605945788,
"loss": 3.529,
"step": 20550
},
{
"epoch": 6.000582580833091,
"grad_norm": 0.30371156334877014,
"learning_rate": 0.0005283042844651704,
"loss": 3.5235,
"step": 20600
},
{
"epoch": 6.0151471016603555,
"grad_norm": 0.31312742829322815,
"learning_rate": 0.0005281294083357621,
"loss": 3.4146,
"step": 20650
},
{
"epoch": 6.02971162248762,
"grad_norm": 0.31279024481773376,
"learning_rate": 0.0005279545322063538,
"loss": 3.419,
"step": 20700
},
{
"epoch": 6.044276143314885,
"grad_norm": 0.30267471075057983,
"learning_rate": 0.0005277796560769455,
"loss": 3.4232,
"step": 20750
},
{
"epoch": 6.058840664142149,
"grad_norm": 0.3136891722679138,
"learning_rate": 0.0005276047799475371,
"loss": 3.4228,
"step": 20800
},
{
"epoch": 6.073405184969414,
"grad_norm": 0.3055378198623657,
"learning_rate": 0.0005274299038181288,
"loss": 3.4312,
"step": 20850
},
{
"epoch": 6.087969705796679,
"grad_norm": 0.32942959666252136,
"learning_rate": 0.0005272550276887205,
"loss": 3.4293,
"step": 20900
},
{
"epoch": 6.102534226623944,
"grad_norm": 0.34652891755104065,
"learning_rate": 0.0005270801515593121,
"loss": 3.431,
"step": 20950
},
{
"epoch": 6.117098747451209,
"grad_norm": 0.3244026005268097,
"learning_rate": 0.0005269052754299037,
"loss": 3.4499,
"step": 21000
},
{
"epoch": 6.117098747451209,
"eval_accuracy": 0.3623327478132371,
"eval_loss": 3.6183032989501953,
"eval_runtime": 177.6206,
"eval_samples_per_second": 93.688,
"eval_steps_per_second": 5.861,
"step": 21000
},
{
"epoch": 6.131663268278474,
"grad_norm": 0.31809812784194946,
"learning_rate": 0.0005267303993004954,
"loss": 3.4428,
"step": 21050
},
{
"epoch": 6.146227789105739,
"grad_norm": 0.3030714988708496,
"learning_rate": 0.000526555523171087,
"loss": 3.4458,
"step": 21100
},
{
"epoch": 6.1607923099330035,
"grad_norm": 0.34124982357025146,
"learning_rate": 0.0005263806470416788,
"loss": 3.4431,
"step": 21150
},
{
"epoch": 6.175356830760268,
"grad_norm": 0.31598350405693054,
"learning_rate": 0.0005262057709122704,
"loss": 3.4564,
"step": 21200
},
{
"epoch": 6.189921351587532,
"grad_norm": 0.3193134069442749,
"learning_rate": 0.0005260308947828621,
"loss": 3.4537,
"step": 21250
},
{
"epoch": 6.204485872414797,
"grad_norm": 0.3082662522792816,
"learning_rate": 0.0005258560186534538,
"loss": 3.4516,
"step": 21300
},
{
"epoch": 6.219050393242062,
"grad_norm": 0.3147899806499481,
"learning_rate": 0.0005256811425240455,
"loss": 3.4563,
"step": 21350
},
{
"epoch": 6.233614914069327,
"grad_norm": 0.3170771300792694,
"learning_rate": 0.0005255062663946371,
"loss": 3.449,
"step": 21400
},
{
"epoch": 6.248179434896592,
"grad_norm": 0.7409493923187256,
"learning_rate": 0.0005253313902652287,
"loss": 3.4565,
"step": 21450
},
{
"epoch": 6.262743955723857,
"grad_norm": 0.3383581340312958,
"learning_rate": 0.0005251565141358204,
"loss": 3.4602,
"step": 21500
},
{
"epoch": 6.277308476551122,
"grad_norm": 0.32042765617370605,
"learning_rate": 0.000524981638006412,
"loss": 3.4521,
"step": 21550
},
{
"epoch": 6.291872997378386,
"grad_norm": 0.32029616832733154,
"learning_rate": 0.0005248067618770038,
"loss": 3.4656,
"step": 21600
},
{
"epoch": 6.306437518205651,
"grad_norm": 0.3129471242427826,
"learning_rate": 0.0005246318857475954,
"loss": 3.4757,
"step": 21650
},
{
"epoch": 6.321002039032916,
"grad_norm": 0.325960636138916,
"learning_rate": 0.0005244570096181871,
"loss": 3.4633,
"step": 21700
},
{
"epoch": 6.3355665598601805,
"grad_norm": 0.3158135712146759,
"learning_rate": 0.0005242821334887788,
"loss": 3.4638,
"step": 21750
},
{
"epoch": 6.350131080687445,
"grad_norm": 0.3112996220588684,
"learning_rate": 0.0005241072573593704,
"loss": 3.4754,
"step": 21800
},
{
"epoch": 6.36469560151471,
"grad_norm": 0.32430002093315125,
"learning_rate": 0.000523932381229962,
"loss": 3.4659,
"step": 21850
},
{
"epoch": 6.379260122341975,
"grad_norm": 0.34102514386177063,
"learning_rate": 0.0005237575051005537,
"loss": 3.4633,
"step": 21900
},
{
"epoch": 6.39382464316924,
"grad_norm": 0.3184153437614441,
"learning_rate": 0.0005235826289711454,
"loss": 3.4729,
"step": 21950
},
{
"epoch": 6.408389163996505,
"grad_norm": 0.31477609276771545,
"learning_rate": 0.000523407752841737,
"loss": 3.478,
"step": 22000
},
{
"epoch": 6.408389163996505,
"eval_accuracy": 0.3631589908733422,
"eval_loss": 3.611259937286377,
"eval_runtime": 177.9074,
"eval_samples_per_second": 93.537,
"eval_steps_per_second": 5.851,
"step": 22000
},
{
"epoch": 6.422953684823769,
"grad_norm": 0.30401411652565,
"learning_rate": 0.0005232328767123287,
"loss": 3.4673,
"step": 22050
},
{
"epoch": 6.437518205651034,
"grad_norm": 0.32664841413497925,
"learning_rate": 0.0005230580005829204,
"loss": 3.4699,
"step": 22100
},
{
"epoch": 6.452082726478299,
"grad_norm": 0.30075621604919434,
"learning_rate": 0.0005228831244535121,
"loss": 3.4883,
"step": 22150
},
{
"epoch": 6.466647247305564,
"grad_norm": 0.3116818070411682,
"learning_rate": 0.0005227082483241038,
"loss": 3.4699,
"step": 22200
},
{
"epoch": 6.4812117681328285,
"grad_norm": 0.33584606647491455,
"learning_rate": 0.0005225333721946954,
"loss": 3.4799,
"step": 22250
},
{
"epoch": 6.495776288960093,
"grad_norm": 0.3176494240760803,
"learning_rate": 0.000522358496065287,
"loss": 3.4829,
"step": 22300
},
{
"epoch": 6.510340809787358,
"grad_norm": 0.3094027042388916,
"learning_rate": 0.0005221836199358787,
"loss": 3.4793,
"step": 22350
},
{
"epoch": 6.524905330614622,
"grad_norm": 0.29500812292099,
"learning_rate": 0.0005220087438064703,
"loss": 3.4881,
"step": 22400
},
{
"epoch": 6.539469851441887,
"grad_norm": 0.32507067918777466,
"learning_rate": 0.000521833867677062,
"loss": 3.4813,
"step": 22450
},
{
"epoch": 6.554034372269152,
"grad_norm": 0.300907164812088,
"learning_rate": 0.0005216589915476537,
"loss": 3.4818,
"step": 22500
},
{
"epoch": 6.568598893096417,
"grad_norm": 0.3277975618839264,
"learning_rate": 0.0005214841154182454,
"loss": 3.4774,
"step": 22550
},
{
"epoch": 6.583163413923682,
"grad_norm": 0.32927611470222473,
"learning_rate": 0.0005213092392888371,
"loss": 3.4826,
"step": 22600
},
{
"epoch": 6.597727934750947,
"grad_norm": 0.3341529369354248,
"learning_rate": 0.0005211343631594287,
"loss": 3.4821,
"step": 22650
},
{
"epoch": 6.612292455578212,
"grad_norm": 0.323186457157135,
"learning_rate": 0.0005209594870300204,
"loss": 3.4759,
"step": 22700
},
{
"epoch": 6.6268569764054766,
"grad_norm": 0.3139779269695282,
"learning_rate": 0.000520784610900612,
"loss": 3.484,
"step": 22750
},
{
"epoch": 6.6414214972327414,
"grad_norm": 0.31565406918525696,
"learning_rate": 0.0005206097347712037,
"loss": 3.4833,
"step": 22800
},
{
"epoch": 6.6559860180600054,
"grad_norm": 0.31253114342689514,
"learning_rate": 0.0005204348586417953,
"loss": 3.4874,
"step": 22850
},
{
"epoch": 6.67055053888727,
"grad_norm": 0.3426834046840668,
"learning_rate": 0.000520259982512387,
"loss": 3.484,
"step": 22900
},
{
"epoch": 6.685115059714535,
"grad_norm": 0.3148155212402344,
"learning_rate": 0.0005200851063829787,
"loss": 3.4855,
"step": 22950
},
{
"epoch": 6.6996795805418,
"grad_norm": 0.33372002840042114,
"learning_rate": 0.0005199102302535703,
"loss": 3.4924,
"step": 23000
},
{
"epoch": 6.6996795805418,
"eval_accuracy": 0.36406990444344955,
"eval_loss": 3.59806489944458,
"eval_runtime": 177.7268,
"eval_samples_per_second": 93.632,
"eval_steps_per_second": 5.857,
"step": 23000
},
{
"epoch": 6.714244101369065,
"grad_norm": 0.3200724720954895,
"learning_rate": 0.0005197353541241621,
"loss": 3.4794,
"step": 23050
},
{
"epoch": 6.72880862219633,
"grad_norm": 0.3293537199497223,
"learning_rate": 0.0005195604779947537,
"loss": 3.4753,
"step": 23100
},
{
"epoch": 6.743373143023595,
"grad_norm": 0.32551905512809753,
"learning_rate": 0.0005193856018653454,
"loss": 3.4896,
"step": 23150
},
{
"epoch": 6.757937663850859,
"grad_norm": 0.3181687593460083,
"learning_rate": 0.000519210725735937,
"loss": 3.4845,
"step": 23200
},
{
"epoch": 6.772502184678124,
"grad_norm": 0.3090181350708008,
"learning_rate": 0.0005190358496065286,
"loss": 3.4876,
"step": 23250
},
{
"epoch": 6.787066705505389,
"grad_norm": 0.32364892959594727,
"learning_rate": 0.0005188609734771203,
"loss": 3.4979,
"step": 23300
},
{
"epoch": 6.8016312263326535,
"grad_norm": 0.3206116855144501,
"learning_rate": 0.000518686097347712,
"loss": 3.4882,
"step": 23350
},
{
"epoch": 6.816195747159918,
"grad_norm": 0.3306562006473541,
"learning_rate": 0.0005185112212183037,
"loss": 3.4947,
"step": 23400
},
{
"epoch": 6.830760267987183,
"grad_norm": 0.3269500136375427,
"learning_rate": 0.0005183363450888953,
"loss": 3.4892,
"step": 23450
},
{
"epoch": 6.845324788814448,
"grad_norm": 0.3314773142337799,
"learning_rate": 0.000518161468959487,
"loss": 3.4941,
"step": 23500
},
{
"epoch": 6.859889309641713,
"grad_norm": 0.3544946014881134,
"learning_rate": 0.0005179865928300787,
"loss": 3.4873,
"step": 23550
},
{
"epoch": 6.874453830468978,
"grad_norm": 0.34047919511795044,
"learning_rate": 0.0005178117167006703,
"loss": 3.4965,
"step": 23600
},
{
"epoch": 6.889018351296242,
"grad_norm": 0.3339918553829193,
"learning_rate": 0.000517636840571262,
"loss": 3.498,
"step": 23650
},
{
"epoch": 6.903582872123507,
"grad_norm": 0.33018672466278076,
"learning_rate": 0.0005174619644418536,
"loss": 3.4936,
"step": 23700
},
{
"epoch": 6.918147392950772,
"grad_norm": 0.3264847993850708,
"learning_rate": 0.0005172870883124453,
"loss": 3.507,
"step": 23750
},
{
"epoch": 6.932711913778037,
"grad_norm": 0.3399565815925598,
"learning_rate": 0.000517112212183037,
"loss": 3.4907,
"step": 23800
},
{
"epoch": 6.9472764346053015,
"grad_norm": 0.30828312039375305,
"learning_rate": 0.0005169373360536286,
"loss": 3.4914,
"step": 23850
},
{
"epoch": 6.961840955432566,
"grad_norm": 0.3247275948524475,
"learning_rate": 0.0005167624599242203,
"loss": 3.493,
"step": 23900
},
{
"epoch": 6.976405476259831,
"grad_norm": 0.3123205304145813,
"learning_rate": 0.000516587583794812,
"loss": 3.4967,
"step": 23950
},
{
"epoch": 6.990969997087096,
"grad_norm": 0.32256489992141724,
"learning_rate": 0.0005164127076654037,
"loss": 3.4865,
"step": 24000
},
{
"epoch": 6.990969997087096,
"eval_accuracy": 0.3646406072004507,
"eval_loss": 3.5887346267700195,
"eval_runtime": 177.4869,
"eval_samples_per_second": 93.759,
"eval_steps_per_second": 5.865,
"step": 24000
},
{
"epoch": 7.00553451791436,
"grad_norm": 0.3418632447719574,
"learning_rate": 0.0005162378315359953,
"loss": 3.458,
"step": 24050
},
{
"epoch": 7.020099038741625,
"grad_norm": 0.30545857548713684,
"learning_rate": 0.0005160629554065869,
"loss": 3.3774,
"step": 24100
},
{
"epoch": 7.03466355956889,
"grad_norm": 0.30984604358673096,
"learning_rate": 0.0005158880792771786,
"loss": 3.3816,
"step": 24150
},
{
"epoch": 7.049228080396155,
"grad_norm": 0.31869596242904663,
"learning_rate": 0.0005157132031477703,
"loss": 3.3935,
"step": 24200
},
{
"epoch": 7.06379260122342,
"grad_norm": 0.34166282415390015,
"learning_rate": 0.000515538327018362,
"loss": 3.3896,
"step": 24250
},
{
"epoch": 7.078357122050685,
"grad_norm": 0.3672468364238739,
"learning_rate": 0.0005153634508889536,
"loss": 3.3978,
"step": 24300
},
{
"epoch": 7.09292164287795,
"grad_norm": 0.3174588978290558,
"learning_rate": 0.0005151885747595453,
"loss": 3.4059,
"step": 24350
},
{
"epoch": 7.1074861637052145,
"grad_norm": 0.30413857102394104,
"learning_rate": 0.000515013698630137,
"loss": 3.4103,
"step": 24400
},
{
"epoch": 7.1220506845324785,
"grad_norm": 0.31710121035575867,
"learning_rate": 0.0005148388225007285,
"loss": 3.4017,
"step": 24450
},
{
"epoch": 7.136615205359743,
"grad_norm": 0.33834975957870483,
"learning_rate": 0.0005146639463713203,
"loss": 3.4017,
"step": 24500
},
{
"epoch": 7.151179726187008,
"grad_norm": 0.307821661233902,
"learning_rate": 0.0005144890702419119,
"loss": 3.4139,
"step": 24550
},
{
"epoch": 7.165744247014273,
"grad_norm": 0.33020275831222534,
"learning_rate": 0.0005143141941125036,
"loss": 3.4018,
"step": 24600
},
{
"epoch": 7.180308767841538,
"grad_norm": 0.319395512342453,
"learning_rate": 0.0005141393179830953,
"loss": 3.4067,
"step": 24650
},
{
"epoch": 7.194873288668803,
"grad_norm": 0.3333781659603119,
"learning_rate": 0.0005139644418536869,
"loss": 3.4189,
"step": 24700
},
{
"epoch": 7.209437809496068,
"grad_norm": 0.34063929319381714,
"learning_rate": 0.0005137895657242786,
"loss": 3.4158,
"step": 24750
},
{
"epoch": 7.224002330323333,
"grad_norm": 0.3293705880641937,
"learning_rate": 0.0005136146895948703,
"loss": 3.4296,
"step": 24800
},
{
"epoch": 7.238566851150597,
"grad_norm": 0.3184684216976166,
"learning_rate": 0.000513439813465462,
"loss": 3.425,
"step": 24850
},
{
"epoch": 7.253131371977862,
"grad_norm": 0.33404359221458435,
"learning_rate": 0.0005132649373360535,
"loss": 3.4167,
"step": 24900
},
{
"epoch": 7.2676958928051265,
"grad_norm": 0.33032795786857605,
"learning_rate": 0.0005130900612066452,
"loss": 3.4225,
"step": 24950
},
{
"epoch": 7.282260413632391,
"grad_norm": 0.3188001215457916,
"learning_rate": 0.0005129151850772369,
"loss": 3.4371,
"step": 25000
},
{
"epoch": 7.282260413632391,
"eval_accuracy": 0.36465624772521504,
"eval_loss": 3.598834276199341,
"eval_runtime": 177.8103,
"eval_samples_per_second": 93.589,
"eval_steps_per_second": 5.855,
"step": 25000
},
{
"epoch": 7.296824934459656,
"grad_norm": 0.32110267877578735,
"learning_rate": 0.0005127403089478286,
"loss": 3.4326,
"step": 25050
},
{
"epoch": 7.311389455286921,
"grad_norm": 0.3441673517227173,
"learning_rate": 0.0005125654328184203,
"loss": 3.418,
"step": 25100
},
{
"epoch": 7.325953976114186,
"grad_norm": 0.3550114035606384,
"learning_rate": 0.0005123905566890119,
"loss": 3.4363,
"step": 25150
},
{
"epoch": 7.340518496941451,
"grad_norm": 0.31895849108695984,
"learning_rate": 0.0005122156805596036,
"loss": 3.4401,
"step": 25200
},
{
"epoch": 7.355083017768715,
"grad_norm": 0.3154217004776001,
"learning_rate": 0.0005120408044301953,
"loss": 3.419,
"step": 25250
},
{
"epoch": 7.36964753859598,
"grad_norm": 0.35524681210517883,
"learning_rate": 0.0005118659283007868,
"loss": 3.4247,
"step": 25300
},
{
"epoch": 7.384212059423245,
"grad_norm": 0.33160510659217834,
"learning_rate": 0.0005116910521713785,
"loss": 3.4234,
"step": 25350
},
{
"epoch": 7.39877658025051,
"grad_norm": 0.32226237654685974,
"learning_rate": 0.0005115161760419702,
"loss": 3.4328,
"step": 25400
},
{
"epoch": 7.4133411010777746,
"grad_norm": 0.3131123185157776,
"learning_rate": 0.0005113412999125619,
"loss": 3.4389,
"step": 25450
},
{
"epoch": 7.4279056219050394,
"grad_norm": 0.3247349262237549,
"learning_rate": 0.0005111664237831536,
"loss": 3.4296,
"step": 25500
},
{
"epoch": 7.442470142732304,
"grad_norm": 0.318561851978302,
"learning_rate": 0.0005109915476537452,
"loss": 3.4451,
"step": 25550
},
{
"epoch": 7.457034663559569,
"grad_norm": 0.31211626529693604,
"learning_rate": 0.0005108166715243369,
"loss": 3.4448,
"step": 25600
},
{
"epoch": 7.471599184386834,
"grad_norm": 0.32736170291900635,
"learning_rate": 0.0005106417953949286,
"loss": 3.4446,
"step": 25650
},
{
"epoch": 7.486163705214098,
"grad_norm": 0.32811227440834045,
"learning_rate": 0.0005104669192655203,
"loss": 3.4383,
"step": 25700
},
{
"epoch": 7.500728226041363,
"grad_norm": 0.3180325925350189,
"learning_rate": 0.0005102920431361118,
"loss": 3.4454,
"step": 25750
},
{
"epoch": 7.515292746868628,
"grad_norm": 0.32189640402793884,
"learning_rate": 0.0005101171670067035,
"loss": 3.4465,
"step": 25800
},
{
"epoch": 7.529857267695893,
"grad_norm": 0.32531917095184326,
"learning_rate": 0.0005099422908772952,
"loss": 3.4449,
"step": 25850
},
{
"epoch": 7.544421788523158,
"grad_norm": 0.33265554904937744,
"learning_rate": 0.0005097674147478868,
"loss": 3.4514,
"step": 25900
},
{
"epoch": 7.558986309350423,
"grad_norm": 0.32440370321273804,
"learning_rate": 0.0005095925386184786,
"loss": 3.459,
"step": 25950
},
{
"epoch": 7.5735508301776875,
"grad_norm": 0.32886600494384766,
"learning_rate": 0.0005094176624890702,
"loss": 3.4552,
"step": 26000
},
{
"epoch": 7.5735508301776875,
"eval_accuracy": 0.36528269190130097,
"eval_loss": 3.5886290073394775,
"eval_runtime": 177.9265,
"eval_samples_per_second": 93.527,
"eval_steps_per_second": 5.851,
"step": 26000
},
{
"epoch": 7.5881153510049515,
"grad_norm": 0.3494354486465454,
"learning_rate": 0.0005092427863596619,
"loss": 3.4429,
"step": 26050
},
{
"epoch": 7.602679871832216,
"grad_norm": 0.3267322778701782,
"learning_rate": 0.0005090679102302536,
"loss": 3.4374,
"step": 26100
},
{
"epoch": 7.617244392659481,
"grad_norm": 0.3428941071033478,
"learning_rate": 0.0005088930341008451,
"loss": 3.4462,
"step": 26150
},
{
"epoch": 7.631808913486746,
"grad_norm": 0.3226052522659302,
"learning_rate": 0.0005087181579714368,
"loss": 3.4491,
"step": 26200
},
{
"epoch": 7.646373434314011,
"grad_norm": 0.32438158988952637,
"learning_rate": 0.0005085432818420285,
"loss": 3.454,
"step": 26250
},
{
"epoch": 7.660937955141276,
"grad_norm": 0.2988736629486084,
"learning_rate": 0.0005083684057126202,
"loss": 3.4578,
"step": 26300
},
{
"epoch": 7.675502475968541,
"grad_norm": 0.31402266025543213,
"learning_rate": 0.0005081935295832118,
"loss": 3.4428,
"step": 26350
},
{
"epoch": 7.690066996795806,
"grad_norm": 0.34731653332710266,
"learning_rate": 0.0005080186534538035,
"loss": 3.4578,
"step": 26400
},
{
"epoch": 7.704631517623071,
"grad_norm": 0.33528274297714233,
"learning_rate": 0.0005078437773243952,
"loss": 3.458,
"step": 26450
},
{
"epoch": 7.719196038450335,
"grad_norm": 0.3237103223800659,
"learning_rate": 0.0005076689011949869,
"loss": 3.4564,
"step": 26500
},
{
"epoch": 7.7337605592775995,
"grad_norm": 0.3397464454174042,
"learning_rate": 0.0005074940250655786,
"loss": 3.4604,
"step": 26550
},
{
"epoch": 7.748325080104864,
"grad_norm": 0.3478679358959198,
"learning_rate": 0.0005073191489361701,
"loss": 3.4475,
"step": 26600
},
{
"epoch": 7.762889600932129,
"grad_norm": 0.33171746134757996,
"learning_rate": 0.0005071442728067618,
"loss": 3.4538,
"step": 26650
},
{
"epoch": 7.777454121759394,
"grad_norm": 0.309638112783432,
"learning_rate": 0.0005069693966773535,
"loss": 3.4554,
"step": 26700
},
{
"epoch": 7.792018642586659,
"grad_norm": 0.3162028193473816,
"learning_rate": 0.0005067945205479451,
"loss": 3.4465,
"step": 26750
},
{
"epoch": 7.806583163413924,
"grad_norm": 0.3107338845729828,
"learning_rate": 0.0005066196444185368,
"loss": 3.454,
"step": 26800
},
{
"epoch": 7.821147684241188,
"grad_norm": 0.33051323890686035,
"learning_rate": 0.0005064447682891285,
"loss": 3.4553,
"step": 26850
},
{
"epoch": 7.835712205068453,
"grad_norm": 0.33980193734169006,
"learning_rate": 0.0005062698921597202,
"loss": 3.4523,
"step": 26900
},
{
"epoch": 7.850276725895718,
"grad_norm": 0.34411004185676575,
"learning_rate": 0.0005060950160303119,
"loss": 3.4671,
"step": 26950
},
{
"epoch": 7.864841246722983,
"grad_norm": 0.3099103271961212,
"learning_rate": 0.0005059201399009035,
"loss": 3.4686,
"step": 27000
},
{
"epoch": 7.864841246722983,
"eval_accuracy": 0.36589326035676156,
"eval_loss": 3.5795626640319824,
"eval_runtime": 177.1061,
"eval_samples_per_second": 93.961,
"eval_steps_per_second": 5.878,
"step": 27000
},
{
"epoch": 7.879405767550248,
"grad_norm": 0.32725054025650024,
"learning_rate": 0.0005057452637714951,
"loss": 3.4652,
"step": 27050
},
{
"epoch": 7.8939702883775125,
"grad_norm": 0.31909966468811035,
"learning_rate": 0.0005055703876420868,
"loss": 3.4615,
"step": 27100
},
{
"epoch": 7.908534809204777,
"grad_norm": 0.33096399903297424,
"learning_rate": 0.0005053955115126785,
"loss": 3.4602,
"step": 27150
},
{
"epoch": 7.923099330032042,
"grad_norm": 0.3407413959503174,
"learning_rate": 0.0005052206353832701,
"loss": 3.4593,
"step": 27200
},
{
"epoch": 7.937663850859307,
"grad_norm": 0.33889928460121155,
"learning_rate": 0.0005050457592538618,
"loss": 3.4601,
"step": 27250
},
{
"epoch": 7.952228371686571,
"grad_norm": 0.32174986600875854,
"learning_rate": 0.0005048708831244535,
"loss": 3.4621,
"step": 27300
},
{
"epoch": 7.966792892513836,
"grad_norm": 0.31754982471466064,
"learning_rate": 0.0005046960069950451,
"loss": 3.4708,
"step": 27350
},
{
"epoch": 7.981357413341101,
"grad_norm": 0.3301248252391815,
"learning_rate": 0.0005045211308656369,
"loss": 3.4638,
"step": 27400
},
{
"epoch": 7.995921934168366,
"grad_norm": 0.3506939113140106,
"learning_rate": 0.0005043462547362284,
"loss": 3.46,
"step": 27450
},
{
"epoch": 8.01048645499563,
"grad_norm": 0.32264554500579834,
"learning_rate": 0.0005041713786068201,
"loss": 3.3792,
"step": 27500
},
{
"epoch": 8.025050975822895,
"grad_norm": 0.3109362721443176,
"learning_rate": 0.0005039965024774118,
"loss": 3.3501,
"step": 27550
},
{
"epoch": 8.03961549665016,
"grad_norm": 0.3230056166648865,
"learning_rate": 0.0005038216263480034,
"loss": 3.3474,
"step": 27600
},
{
"epoch": 8.054180017477425,
"grad_norm": 0.3607815206050873,
"learning_rate": 0.0005036467502185951,
"loss": 3.355,
"step": 27650
},
{
"epoch": 8.06874453830469,
"grad_norm": 0.3454449772834778,
"learning_rate": 0.0005034718740891868,
"loss": 3.3632,
"step": 27700
},
{
"epoch": 8.083309059131954,
"grad_norm": 0.3308209478855133,
"learning_rate": 0.0005032969979597785,
"loss": 3.3578,
"step": 27750
},
{
"epoch": 8.09787357995922,
"grad_norm": 0.3335447907447815,
"learning_rate": 0.0005031221218303701,
"loss": 3.372,
"step": 27800
},
{
"epoch": 8.112438100786484,
"grad_norm": 0.35093986988067627,
"learning_rate": 0.0005029472457009618,
"loss": 3.3682,
"step": 27850
},
{
"epoch": 8.127002621613748,
"grad_norm": 0.3294939696788788,
"learning_rate": 0.0005027723695715534,
"loss": 3.3767,
"step": 27900
},
{
"epoch": 8.141567142441014,
"grad_norm": 0.3499290645122528,
"learning_rate": 0.0005025974934421451,
"loss": 3.3834,
"step": 27950
},
{
"epoch": 8.156131663268278,
"grad_norm": 0.3340826630592346,
"learning_rate": 0.0005024226173127368,
"loss": 3.3643,
"step": 28000
},
{
"epoch": 8.156131663268278,
"eval_accuracy": 0.36620242531620023,
"eval_loss": 3.586412191390991,
"eval_runtime": 177.6212,
"eval_samples_per_second": 93.688,
"eval_steps_per_second": 5.861,
"step": 28000
},
{
"epoch": 8.170696184095544,
"grad_norm": 0.3302430212497711,
"learning_rate": 0.0005022477411833284,
"loss": 3.3864,
"step": 28050
},
{
"epoch": 8.185260704922808,
"grad_norm": 0.35755255818367004,
"learning_rate": 0.0005020728650539201,
"loss": 3.3841,
"step": 28100
},
{
"epoch": 8.199825225750073,
"grad_norm": 0.3413600027561188,
"learning_rate": 0.0005018979889245118,
"loss": 3.3885,
"step": 28150
},
{
"epoch": 8.214389746577337,
"grad_norm": 0.31416332721710205,
"learning_rate": 0.0005017231127951034,
"loss": 3.394,
"step": 28200
},
{
"epoch": 8.228954267404603,
"grad_norm": 0.3258178234100342,
"learning_rate": 0.0005015482366656951,
"loss": 3.3826,
"step": 28250
},
{
"epoch": 8.243518788231867,
"grad_norm": 0.3640033006668091,
"learning_rate": 0.0005013733605362868,
"loss": 3.4016,
"step": 28300
},
{
"epoch": 8.258083309059131,
"grad_norm": 0.3285066485404968,
"learning_rate": 0.0005011984844068784,
"loss": 3.4005,
"step": 28350
},
{
"epoch": 8.272647829886397,
"grad_norm": 0.357901394367218,
"learning_rate": 0.0005010236082774701,
"loss": 3.3969,
"step": 28400
},
{
"epoch": 8.287212350713661,
"grad_norm": 0.34709328413009644,
"learning_rate": 0.0005008487321480617,
"loss": 3.3909,
"step": 28450
},
{
"epoch": 8.301776871540927,
"grad_norm": 0.31995636224746704,
"learning_rate": 0.0005006738560186534,
"loss": 3.4128,
"step": 28500
},
{
"epoch": 8.31634139236819,
"grad_norm": 0.315400630235672,
"learning_rate": 0.0005004989798892451,
"loss": 3.3999,
"step": 28550
},
{
"epoch": 8.330905913195457,
"grad_norm": 0.3495756685733795,
"learning_rate": 0.0005003241037598368,
"loss": 3.3972,
"step": 28600
},
{
"epoch": 8.34547043402272,
"grad_norm": 0.3411330282688141,
"learning_rate": 0.0005001492276304284,
"loss": 3.4064,
"step": 28650
},
{
"epoch": 8.360034954849985,
"grad_norm": 0.3218335211277008,
"learning_rate": 0.0004999743515010201,
"loss": 3.3977,
"step": 28700
},
{
"epoch": 8.37459947567725,
"grad_norm": 0.33580780029296875,
"learning_rate": 0.0004997994753716117,
"loss": 3.4069,
"step": 28750
},
{
"epoch": 8.389163996504514,
"grad_norm": 0.3284832537174225,
"learning_rate": 0.0004996245992422033,
"loss": 3.4001,
"step": 28800
},
{
"epoch": 8.40372851733178,
"grad_norm": 0.32526466250419617,
"learning_rate": 0.0004994497231127951,
"loss": 3.4016,
"step": 28850
},
{
"epoch": 8.418293038159044,
"grad_norm": 0.3263266086578369,
"learning_rate": 0.0004992748469833867,
"loss": 3.4154,
"step": 28900
},
{
"epoch": 8.43285755898631,
"grad_norm": 0.3042384386062622,
"learning_rate": 0.0004990999708539784,
"loss": 3.4174,
"step": 28950
},
{
"epoch": 8.447422079813574,
"grad_norm": 0.3383604884147644,
"learning_rate": 0.0004989250947245701,
"loss": 3.4177,
"step": 29000
},
{
"epoch": 8.447422079813574,
"eval_accuracy": 0.36640586973606676,
"eval_loss": 3.581047534942627,
"eval_runtime": 177.2332,
"eval_samples_per_second": 93.893,
"eval_steps_per_second": 5.874,
"step": 29000
},
{
"epoch": 8.46198660064084,
"grad_norm": 0.3453238606452942,
"learning_rate": 0.0004987502185951617,
"loss": 3.4169,
"step": 29050
},
{
"epoch": 8.476551121468104,
"grad_norm": 0.32911792397499084,
"learning_rate": 0.0004985753424657534,
"loss": 3.3998,
"step": 29100
},
{
"epoch": 8.491115642295368,
"grad_norm": 0.34755417704582214,
"learning_rate": 0.000498400466336345,
"loss": 3.4153,
"step": 29150
},
{
"epoch": 8.505680163122634,
"grad_norm": 0.3180427849292755,
"learning_rate": 0.0004982255902069367,
"loss": 3.406,
"step": 29200
},
{
"epoch": 8.520244683949898,
"grad_norm": 0.34140604734420776,
"learning_rate": 0.0004980507140775283,
"loss": 3.4127,
"step": 29250
},
{
"epoch": 8.534809204777163,
"grad_norm": 0.329262375831604,
"learning_rate": 0.0004978758379481201,
"loss": 3.4351,
"step": 29300
},
{
"epoch": 8.549373725604427,
"grad_norm": 0.3097538948059082,
"learning_rate": 0.0004977009618187117,
"loss": 3.4126,
"step": 29350
},
{
"epoch": 8.563938246431693,
"grad_norm": 0.32443442940711975,
"learning_rate": 0.0004975260856893034,
"loss": 3.4117,
"step": 29400
},
{
"epoch": 8.578502767258957,
"grad_norm": 0.3432025909423828,
"learning_rate": 0.0004973512095598951,
"loss": 3.4247,
"step": 29450
},
{
"epoch": 8.593067288086221,
"grad_norm": 0.3406945765018463,
"learning_rate": 0.0004971763334304867,
"loss": 3.4125,
"step": 29500
},
{
"epoch": 8.607631808913487,
"grad_norm": 0.32931724190711975,
"learning_rate": 0.0004970014573010784,
"loss": 3.4215,
"step": 29550
},
{
"epoch": 8.622196329740751,
"grad_norm": 0.33969646692276,
"learning_rate": 0.00049682658117167,
"loss": 3.4158,
"step": 29600
},
{
"epoch": 8.636760850568017,
"grad_norm": 0.3158787190914154,
"learning_rate": 0.0004966517050422616,
"loss": 3.4155,
"step": 29650
},
{
"epoch": 8.65132537139528,
"grad_norm": 0.32108256220817566,
"learning_rate": 0.0004964768289128533,
"loss": 3.4218,
"step": 29700
},
{
"epoch": 8.665889892222546,
"grad_norm": 0.3323673903942108,
"learning_rate": 0.000496301952783445,
"loss": 3.4267,
"step": 29750
},
{
"epoch": 8.68045441304981,
"grad_norm": 0.3276951014995575,
"learning_rate": 0.0004961270766540367,
"loss": 3.4214,
"step": 29800
},
{
"epoch": 8.695018933877076,
"grad_norm": 0.33299991488456726,
"learning_rate": 0.0004959522005246284,
"loss": 3.4335,
"step": 29850
},
{
"epoch": 8.70958345470434,
"grad_norm": 0.33059605956077576,
"learning_rate": 0.00049577732439522,
"loss": 3.4223,
"step": 29900
},
{
"epoch": 8.724147975531604,
"grad_norm": 0.32752349972724915,
"learning_rate": 0.0004956024482658117,
"loss": 3.4305,
"step": 29950
},
{
"epoch": 8.73871249635887,
"grad_norm": 0.31223592162132263,
"learning_rate": 0.0004954275721364034,
"loss": 3.4214,
"step": 30000
},
{
"epoch": 8.73871249635887,
"eval_accuracy": 0.36692435901189985,
"eval_loss": 3.5738139152526855,
"eval_runtime": 177.4984,
"eval_samples_per_second": 93.753,
"eval_steps_per_second": 5.865,
"step": 30000
},
{
"epoch": 8.753277017186134,
"grad_norm": 0.3213596045970917,
"learning_rate": 0.000495252696006995,
"loss": 3.4262,
"step": 30050
},
{
"epoch": 8.7678415380134,
"grad_norm": 0.3408668041229248,
"learning_rate": 0.0004950778198775866,
"loss": 3.4239,
"step": 30100
},
{
"epoch": 8.782406058840664,
"grad_norm": 0.32721853256225586,
"learning_rate": 0.0004949029437481783,
"loss": 3.4222,
"step": 30150
},
{
"epoch": 8.79697057966793,
"grad_norm": 0.31464850902557373,
"learning_rate": 0.00049472806761877,
"loss": 3.4332,
"step": 30200
},
{
"epoch": 8.811535100495194,
"grad_norm": 0.3212704658508301,
"learning_rate": 0.0004945531914893616,
"loss": 3.4302,
"step": 30250
},
{
"epoch": 8.82609962132246,
"grad_norm": 0.3242170810699463,
"learning_rate": 0.0004943783153599534,
"loss": 3.446,
"step": 30300
},
{
"epoch": 8.840664142149723,
"grad_norm": 0.3245343863964081,
"learning_rate": 0.000494203439230545,
"loss": 3.4213,
"step": 30350
},
{
"epoch": 8.855228662976987,
"grad_norm": 0.32496681809425354,
"learning_rate": 0.0004940285631011367,
"loss": 3.4294,
"step": 30400
},
{
"epoch": 8.869793183804253,
"grad_norm": 0.36191901564598083,
"learning_rate": 0.0004938536869717284,
"loss": 3.4359,
"step": 30450
},
{
"epoch": 8.884357704631517,
"grad_norm": 0.3322804272174835,
"learning_rate": 0.0004936788108423199,
"loss": 3.4327,
"step": 30500
},
{
"epoch": 8.898922225458783,
"grad_norm": 0.3395395576953888,
"learning_rate": 0.0004935039347129116,
"loss": 3.423,
"step": 30550
},
{
"epoch": 8.913486746286047,
"grad_norm": 0.3019578754901886,
"learning_rate": 0.0004933290585835033,
"loss": 3.4376,
"step": 30600
},
{
"epoch": 8.928051267113313,
"grad_norm": 0.32183343172073364,
"learning_rate": 0.000493154182454095,
"loss": 3.4407,
"step": 30650
},
{
"epoch": 8.942615787940577,
"grad_norm": 0.3220561742782593,
"learning_rate": 0.0004929793063246866,
"loss": 3.4417,
"step": 30700
},
{
"epoch": 8.95718030876784,
"grad_norm": 0.36304983496665955,
"learning_rate": 0.0004928044301952783,
"loss": 3.4312,
"step": 30750
},
{
"epoch": 8.971744829595107,
"grad_norm": 0.33577391505241394,
"learning_rate": 0.00049262955406587,
"loss": 3.4396,
"step": 30800
},
{
"epoch": 8.98630935042237,
"grad_norm": 0.3271602392196655,
"learning_rate": 0.0004924546779364617,
"loss": 3.426,
"step": 30850
},
{
"epoch": 9.000873871249636,
"grad_norm": 0.33349940180778503,
"learning_rate": 0.0004922798018070533,
"loss": 3.4341,
"step": 30900
},
{
"epoch": 9.0154383920769,
"grad_norm": 0.32157132029533386,
"learning_rate": 0.0004921049256776449,
"loss": 3.3276,
"step": 30950
},
{
"epoch": 9.030002912904166,
"grad_norm": 0.3370971381664276,
"learning_rate": 0.0004919300495482366,
"loss": 3.3159,
"step": 31000
},
{
"epoch": 9.030002912904166,
"eval_accuracy": 0.3673821677555647,
"eval_loss": 3.579301595687866,
"eval_runtime": 177.4898,
"eval_samples_per_second": 93.757,
"eval_steps_per_second": 5.865,
"step": 31000
},
{
"epoch": 9.04456743373143,
"grad_norm": 0.33405694365501404,
"learning_rate": 0.0004917551734188283,
"loss": 3.3151,
"step": 31050
},
{
"epoch": 9.059131954558694,
"grad_norm": 0.3237508237361908,
"learning_rate": 0.0004915802972894199,
"loss": 3.3404,
"step": 31100
},
{
"epoch": 9.07369647538596,
"grad_norm": 0.35597681999206543,
"learning_rate": 0.0004914054211600116,
"loss": 3.3346,
"step": 31150
},
{
"epoch": 9.088260996213224,
"grad_norm": 0.32660970091819763,
"learning_rate": 0.0004912305450306033,
"loss": 3.3228,
"step": 31200
},
{
"epoch": 9.10282551704049,
"grad_norm": 0.3498513996601105,
"learning_rate": 0.000491055668901195,
"loss": 3.3412,
"step": 31250
},
{
"epoch": 9.117390037867754,
"grad_norm": 0.3514196574687958,
"learning_rate": 0.0004908807927717865,
"loss": 3.3467,
"step": 31300
},
{
"epoch": 9.13195455869502,
"grad_norm": 0.33097749948501587,
"learning_rate": 0.0004907059166423783,
"loss": 3.3572,
"step": 31350
},
{
"epoch": 9.146519079522283,
"grad_norm": 0.3433399498462677,
"learning_rate": 0.0004905310405129699,
"loss": 3.339,
"step": 31400
},
{
"epoch": 9.16108360034955,
"grad_norm": 0.327028751373291,
"learning_rate": 0.0004903561643835616,
"loss": 3.3498,
"step": 31450
},
{
"epoch": 9.175648121176813,
"grad_norm": 0.340170681476593,
"learning_rate": 0.0004901812882541533,
"loss": 3.3463,
"step": 31500
},
{
"epoch": 9.190212642004077,
"grad_norm": 0.3196989893913269,
"learning_rate": 0.0004900064121247449,
"loss": 3.3639,
"step": 31550
},
{
"epoch": 9.204777162831343,
"grad_norm": 0.3304920196533203,
"learning_rate": 0.0004898315359953366,
"loss": 3.3434,
"step": 31600
},
{
"epoch": 9.219341683658607,
"grad_norm": 0.3360269367694855,
"learning_rate": 0.0004896566598659283,
"loss": 3.3688,
"step": 31650
},
{
"epoch": 9.233906204485873,
"grad_norm": 0.3214164674282074,
"learning_rate": 0.0004894817837365199,
"loss": 3.3743,
"step": 31700
},
{
"epoch": 9.248470725313137,
"grad_norm": 0.3371962904930115,
"learning_rate": 0.0004893069076071115,
"loss": 3.369,
"step": 31750
},
{
"epoch": 9.263035246140403,
"grad_norm": 0.33907076716423035,
"learning_rate": 0.0004891320314777032,
"loss": 3.3607,
"step": 31800
},
{
"epoch": 9.277599766967667,
"grad_norm": 0.3246038854122162,
"learning_rate": 0.0004889571553482949,
"loss": 3.375,
"step": 31850
},
{
"epoch": 9.292164287794932,
"grad_norm": 0.3564911186695099,
"learning_rate": 0.0004887822792188866,
"loss": 3.3753,
"step": 31900
},
{
"epoch": 9.306728808622196,
"grad_norm": 0.34122955799102783,
"learning_rate": 0.0004886074030894782,
"loss": 3.3754,
"step": 31950
},
{
"epoch": 9.32129332944946,
"grad_norm": 0.3213154375553131,
"learning_rate": 0.0004884325269600699,
"loss": 3.3668,
"step": 32000
},
{
"epoch": 9.32129332944946,
"eval_accuracy": 0.3676478214806967,
"eval_loss": 3.577354907989502,
"eval_runtime": 177.8216,
"eval_samples_per_second": 93.583,
"eval_steps_per_second": 5.854,
"step": 32000
},
{
"epoch": 9.335857850276726,
"grad_norm": 0.3175533711910248,
"learning_rate": 0.0004882576508306615,
"loss": 3.3725,
"step": 32050
},
{
"epoch": 9.35042237110399,
"grad_norm": 0.32937026023864746,
"learning_rate": 0.00048808277470125327,
"loss": 3.365,
"step": 32100
},
{
"epoch": 9.364986891931256,
"grad_norm": 0.34098199009895325,
"learning_rate": 0.0004879078985718449,
"loss": 3.3741,
"step": 32150
},
{
"epoch": 9.37955141275852,
"grad_norm": 0.3347371816635132,
"learning_rate": 0.0004877330224424366,
"loss": 3.3783,
"step": 32200
},
{
"epoch": 9.394115933585786,
"grad_norm": 0.3389059603214264,
"learning_rate": 0.00048755814631302823,
"loss": 3.3772,
"step": 32250
},
{
"epoch": 9.40868045441305,
"grad_norm": 0.34368738532066345,
"learning_rate": 0.00048738327018361987,
"loss": 3.3875,
"step": 32300
},
{
"epoch": 9.423244975240314,
"grad_norm": 0.34114500880241394,
"learning_rate": 0.00048720839405421156,
"loss": 3.3882,
"step": 32350
},
{
"epoch": 9.43780949606758,
"grad_norm": 0.38217416405677795,
"learning_rate": 0.0004870335179248032,
"loss": 3.3811,
"step": 32400
},
{
"epoch": 9.452374016894844,
"grad_norm": 0.3433772921562195,
"learning_rate": 0.0004868586417953949,
"loss": 3.3803,
"step": 32450
},
{
"epoch": 9.46693853772211,
"grad_norm": 0.31574320793151855,
"learning_rate": 0.0004866837656659865,
"loss": 3.3872,
"step": 32500
},
{
"epoch": 9.481503058549373,
"grad_norm": 0.36580872535705566,
"learning_rate": 0.00048650888953657816,
"loss": 3.3849,
"step": 32550
},
{
"epoch": 9.49606757937664,
"grad_norm": 0.3435962200164795,
"learning_rate": 0.0004863340134071699,
"loss": 3.3961,
"step": 32600
},
{
"epoch": 9.510632100203903,
"grad_norm": 0.3202696740627289,
"learning_rate": 0.00048615913727776154,
"loss": 3.399,
"step": 32650
},
{
"epoch": 9.525196621031167,
"grad_norm": 0.3501302897930145,
"learning_rate": 0.00048598426114835323,
"loss": 3.393,
"step": 32700
},
{
"epoch": 9.539761141858433,
"grad_norm": 0.36124247312545776,
"learning_rate": 0.00048580938501894486,
"loss": 3.3921,
"step": 32750
},
{
"epoch": 9.554325662685697,
"grad_norm": 0.34877392649650574,
"learning_rate": 0.00048563450888953655,
"loss": 3.3981,
"step": 32800
},
{
"epoch": 9.568890183512963,
"grad_norm": 0.340874582529068,
"learning_rate": 0.0004854596327601282,
"loss": 3.411,
"step": 32850
},
{
"epoch": 9.583454704340227,
"grad_norm": 0.3228774666786194,
"learning_rate": 0.0004852847566307198,
"loss": 3.3866,
"step": 32900
},
{
"epoch": 9.598019225167493,
"grad_norm": 0.33764421939849854,
"learning_rate": 0.0004851098805013115,
"loss": 3.3922,
"step": 32950
},
{
"epoch": 9.612583745994757,
"grad_norm": 0.3215000331401825,
"learning_rate": 0.00048493500437190315,
"loss": 3.3972,
"step": 33000
},
{
"epoch": 9.612583745994757,
"eval_accuracy": 0.36817630658062733,
"eval_loss": 3.5689446926116943,
"eval_runtime": 177.5448,
"eval_samples_per_second": 93.728,
"eval_steps_per_second": 5.863,
"step": 33000
},
{
"epoch": 9.627148266822022,
"grad_norm": 0.3449784517288208,
"learning_rate": 0.0004847601282424949,
"loss": 3.3968,
"step": 33050
},
{
"epoch": 9.641712787649286,
"grad_norm": 0.343000590801239,
"learning_rate": 0.00048458525211308653,
"loss": 3.3888,
"step": 33100
},
{
"epoch": 9.65627730847655,
"grad_norm": 0.35578057169914246,
"learning_rate": 0.00048441037598367817,
"loss": 3.3971,
"step": 33150
},
{
"epoch": 9.670841829303816,
"grad_norm": 0.3461526334285736,
"learning_rate": 0.00048423549985426986,
"loss": 3.4172,
"step": 33200
},
{
"epoch": 9.68540635013108,
"grad_norm": 0.3729863166809082,
"learning_rate": 0.0004840606237248615,
"loss": 3.3911,
"step": 33250
},
{
"epoch": 9.699970870958346,
"grad_norm": 0.3255765736103058,
"learning_rate": 0.0004838857475954532,
"loss": 3.3958,
"step": 33300
},
{
"epoch": 9.71453539178561,
"grad_norm": 0.32752174139022827,
"learning_rate": 0.0004837108714660448,
"loss": 3.397,
"step": 33350
},
{
"epoch": 9.729099912612876,
"grad_norm": 0.35780587792396545,
"learning_rate": 0.0004835359953366365,
"loss": 3.403,
"step": 33400
},
{
"epoch": 9.74366443344014,
"grad_norm": 0.3370216190814972,
"learning_rate": 0.00048336111920722815,
"loss": 3.4004,
"step": 33450
},
{
"epoch": 9.758228954267405,
"grad_norm": 0.3510589897632599,
"learning_rate": 0.0004831862430778198,
"loss": 3.4153,
"step": 33500
},
{
"epoch": 9.77279347509467,
"grad_norm": 0.3223910629749298,
"learning_rate": 0.00048301136694841153,
"loss": 3.407,
"step": 33550
},
{
"epoch": 9.787357995921933,
"grad_norm": 0.34690526127815247,
"learning_rate": 0.00048283649081900317,
"loss": 3.3977,
"step": 33600
},
{
"epoch": 9.8019225167492,
"grad_norm": 0.33463695645332336,
"learning_rate": 0.00048266161468959486,
"loss": 3.4125,
"step": 33650
},
{
"epoch": 9.816487037576463,
"grad_norm": 0.35382333397865295,
"learning_rate": 0.0004824867385601865,
"loss": 3.4114,
"step": 33700
},
{
"epoch": 9.831051558403729,
"grad_norm": 0.3453107476234436,
"learning_rate": 0.00048231186243077813,
"loss": 3.4108,
"step": 33750
},
{
"epoch": 9.845616079230993,
"grad_norm": 0.33159446716308594,
"learning_rate": 0.0004821369863013698,
"loss": 3.4144,
"step": 33800
},
{
"epoch": 9.860180600058259,
"grad_norm": 0.35295242071151733,
"learning_rate": 0.00048196211017196146,
"loss": 3.4014,
"step": 33850
},
{
"epoch": 9.874745120885523,
"grad_norm": 0.3284096419811249,
"learning_rate": 0.00048178723404255315,
"loss": 3.4039,
"step": 33900
},
{
"epoch": 9.889309641712789,
"grad_norm": 0.33951765298843384,
"learning_rate": 0.0004816123579131448,
"loss": 3.4153,
"step": 33950
},
{
"epoch": 9.903874162540053,
"grad_norm": 0.3084050416946411,
"learning_rate": 0.0004814374817837364,
"loss": 3.416,
"step": 34000
},
{
"epoch": 9.903874162540053,
"eval_accuracy": 0.36868256567168234,
"eval_loss": 3.560567617416382,
"eval_runtime": 177.1807,
"eval_samples_per_second": 93.921,
"eval_steps_per_second": 5.875,
"step": 34000
},
{
"epoch": 9.918438683367317,
"grad_norm": 0.347179114818573,
"learning_rate": 0.00048126260565432816,
"loss": 3.4238,
"step": 34050
},
{
"epoch": 9.933003204194582,
"grad_norm": 0.32636216282844543,
"learning_rate": 0.0004810877295249198,
"loss": 3.42,
"step": 34100
},
{
"epoch": 9.947567725021846,
"grad_norm": 0.33995139598846436,
"learning_rate": 0.0004809128533955115,
"loss": 3.408,
"step": 34150
},
{
"epoch": 9.962132245849112,
"grad_norm": 0.3723163902759552,
"learning_rate": 0.0004807379772661031,
"loss": 3.4068,
"step": 34200
},
{
"epoch": 9.976696766676376,
"grad_norm": 0.33295756578445435,
"learning_rate": 0.0004805631011366948,
"loss": 3.4085,
"step": 34250
},
{
"epoch": 9.991261287503642,
"grad_norm": 0.34439772367477417,
"learning_rate": 0.00048038822500728645,
"loss": 3.4136,
"step": 34300
},
{
"epoch": 10.005825808330906,
"grad_norm": 0.33056867122650146,
"learning_rate": 0.0004802133488778781,
"loss": 3.3779,
"step": 34350
},
{
"epoch": 10.02039032915817,
"grad_norm": 0.3284781575202942,
"learning_rate": 0.0004800384727484698,
"loss": 3.2906,
"step": 34400
},
{
"epoch": 10.034954849985436,
"grad_norm": 0.35688483715057373,
"learning_rate": 0.0004798635966190614,
"loss": 3.3066,
"step": 34450
},
{
"epoch": 10.0495193708127,
"grad_norm": 0.3438079059123993,
"learning_rate": 0.00047968872048965316,
"loss": 3.3037,
"step": 34500
},
{
"epoch": 10.064083891639966,
"grad_norm": 0.33732229471206665,
"learning_rate": 0.0004795138443602448,
"loss": 3.3133,
"step": 34550
},
{
"epoch": 10.07864841246723,
"grad_norm": 0.35935407876968384,
"learning_rate": 0.00047933896823083643,
"loss": 3.3062,
"step": 34600
},
{
"epoch": 10.093212933294495,
"grad_norm": 0.3332599103450775,
"learning_rate": 0.0004791640921014281,
"loss": 3.3235,
"step": 34650
},
{
"epoch": 10.10777745412176,
"grad_norm": 0.3485143184661865,
"learning_rate": 0.00047898921597201976,
"loss": 3.3178,
"step": 34700
},
{
"epoch": 10.122341974949023,
"grad_norm": 0.32437801361083984,
"learning_rate": 0.00047881433984261145,
"loss": 3.326,
"step": 34750
},
{
"epoch": 10.136906495776289,
"grad_norm": 0.3398897647857666,
"learning_rate": 0.0004786394637132031,
"loss": 3.3193,
"step": 34800
},
{
"epoch": 10.151471016603553,
"grad_norm": 0.35131925344467163,
"learning_rate": 0.0004784645875837948,
"loss": 3.3303,
"step": 34850
},
{
"epoch": 10.166035537430819,
"grad_norm": 0.341104120016098,
"learning_rate": 0.0004782897114543864,
"loss": 3.3299,
"step": 34900
},
{
"epoch": 10.180600058258083,
"grad_norm": 0.3273942768573761,
"learning_rate": 0.00047811483532497805,
"loss": 3.3342,
"step": 34950
},
{
"epoch": 10.195164579085349,
"grad_norm": 0.35876280069351196,
"learning_rate": 0.0004779399591955698,
"loss": 3.3313,
"step": 35000
},
{
"epoch": 10.195164579085349,
"eval_accuracy": 0.36803965778531816,
"eval_loss": 3.5746335983276367,
"eval_runtime": 177.7657,
"eval_samples_per_second": 93.612,
"eval_steps_per_second": 5.856,
"step": 35000
},
{
"epoch": 10.209729099912613,
"grad_norm": 0.33633482456207275,
"learning_rate": 0.00047776508306616143,
"loss": 3.3403,
"step": 35050
},
{
"epoch": 10.224293620739878,
"grad_norm": 0.35305869579315186,
"learning_rate": 0.0004775902069367531,
"loss": 3.3432,
"step": 35100
},
{
"epoch": 10.238858141567142,
"grad_norm": 0.3344394564628601,
"learning_rate": 0.00047741533080734476,
"loss": 3.3395,
"step": 35150
},
{
"epoch": 10.253422662394406,
"grad_norm": 0.3661476969718933,
"learning_rate": 0.0004772404546779364,
"loss": 3.344,
"step": 35200
},
{
"epoch": 10.267987183221672,
"grad_norm": 0.3455207049846649,
"learning_rate": 0.0004770655785485281,
"loss": 3.3494,
"step": 35250
},
{
"epoch": 10.282551704048936,
"grad_norm": 0.33272865414619446,
"learning_rate": 0.0004768907024191197,
"loss": 3.3453,
"step": 35300
},
{
"epoch": 10.297116224876202,
"grad_norm": 0.3241981565952301,
"learning_rate": 0.0004767158262897114,
"loss": 3.348,
"step": 35350
},
{
"epoch": 10.311680745703466,
"grad_norm": 0.35547518730163574,
"learning_rate": 0.00047654095016030305,
"loss": 3.3454,
"step": 35400
},
{
"epoch": 10.326245266530732,
"grad_norm": 0.3643966019153595,
"learning_rate": 0.0004763660740308948,
"loss": 3.3509,
"step": 35450
},
{
"epoch": 10.340809787357996,
"grad_norm": 0.3433777987957001,
"learning_rate": 0.0004761911979014864,
"loss": 3.337,
"step": 35500
},
{
"epoch": 10.355374308185262,
"grad_norm": 0.33827000856399536,
"learning_rate": 0.00047601632177207806,
"loss": 3.3625,
"step": 35550
},
{
"epoch": 10.369938829012526,
"grad_norm": 0.34222304821014404,
"learning_rate": 0.00047584144564266975,
"loss": 3.3584,
"step": 35600
},
{
"epoch": 10.38450334983979,
"grad_norm": 0.3403022885322571,
"learning_rate": 0.0004756665695132614,
"loss": 3.3523,
"step": 35650
},
{
"epoch": 10.399067870667055,
"grad_norm": 0.3352959156036377,
"learning_rate": 0.0004754916933838531,
"loss": 3.3585,
"step": 35700
},
{
"epoch": 10.41363239149432,
"grad_norm": 0.3228458762168884,
"learning_rate": 0.0004753168172544447,
"loss": 3.3669,
"step": 35750
},
{
"epoch": 10.428196912321585,
"grad_norm": 0.3480757474899292,
"learning_rate": 0.00047514194112503635,
"loss": 3.3639,
"step": 35800
},
{
"epoch": 10.44276143314885,
"grad_norm": 0.3398342728614807,
"learning_rate": 0.00047496706499562804,
"loss": 3.3651,
"step": 35850
},
{
"epoch": 10.457325953976115,
"grad_norm": 0.3550876975059509,
"learning_rate": 0.0004747921888662197,
"loss": 3.3634,
"step": 35900
},
{
"epoch": 10.471890474803379,
"grad_norm": 0.35050809383392334,
"learning_rate": 0.0004746173127368114,
"loss": 3.3584,
"step": 35950
},
{
"epoch": 10.486454995630643,
"grad_norm": 0.3375207781791687,
"learning_rate": 0.00047444243660740306,
"loss": 3.3625,
"step": 36000
},
{
"epoch": 10.486454995630643,
"eval_accuracy": 0.3688550818358119,
"eval_loss": 3.565290927886963,
"eval_runtime": 177.5982,
"eval_samples_per_second": 93.7,
"eval_steps_per_second": 5.862,
"step": 36000
},
{
"epoch": 10.501019516457909,
"grad_norm": 0.34734171628952026,
"learning_rate": 0.0004742675604779947,
"loss": 3.3803,
"step": 36050
},
{
"epoch": 10.515584037285173,
"grad_norm": 0.34996819496154785,
"learning_rate": 0.0004740926843485864,
"loss": 3.3676,
"step": 36100
},
{
"epoch": 10.530148558112439,
"grad_norm": 0.3279918432235718,
"learning_rate": 0.000473917808219178,
"loss": 3.3782,
"step": 36150
},
{
"epoch": 10.544713078939703,
"grad_norm": 0.354153573513031,
"learning_rate": 0.0004737429320897697,
"loss": 3.3713,
"step": 36200
},
{
"epoch": 10.559277599766968,
"grad_norm": 0.3350876569747925,
"learning_rate": 0.00047356805596036135,
"loss": 3.3706,
"step": 36250
},
{
"epoch": 10.573842120594232,
"grad_norm": 0.3531542122364044,
"learning_rate": 0.00047339317983095304,
"loss": 3.3671,
"step": 36300
},
{
"epoch": 10.588406641421496,
"grad_norm": 0.31363147497177124,
"learning_rate": 0.0004732183037015447,
"loss": 3.3689,
"step": 36350
},
{
"epoch": 10.602971162248762,
"grad_norm": 0.3484424650669098,
"learning_rate": 0.0004730434275721363,
"loss": 3.3738,
"step": 36400
},
{
"epoch": 10.617535683076026,
"grad_norm": 0.3363831341266632,
"learning_rate": 0.00047286855144272806,
"loss": 3.3781,
"step": 36450
},
{
"epoch": 10.632100203903292,
"grad_norm": 0.3282327353954315,
"learning_rate": 0.0004726936753133197,
"loss": 3.3775,
"step": 36500
},
{
"epoch": 10.646664724730556,
"grad_norm": 0.35747599601745605,
"learning_rate": 0.0004725187991839114,
"loss": 3.3739,
"step": 36550
},
{
"epoch": 10.661229245557822,
"grad_norm": 0.3524666428565979,
"learning_rate": 0.000472343923054503,
"loss": 3.3857,
"step": 36600
},
{
"epoch": 10.675793766385086,
"grad_norm": 0.34615790843963623,
"learning_rate": 0.00047216904692509465,
"loss": 3.374,
"step": 36650
},
{
"epoch": 10.690358287212351,
"grad_norm": 0.32927921414375305,
"learning_rate": 0.00047199417079568634,
"loss": 3.3821,
"step": 36700
},
{
"epoch": 10.704922808039615,
"grad_norm": 0.3339250981807709,
"learning_rate": 0.000471819294666278,
"loss": 3.3747,
"step": 36750
},
{
"epoch": 10.71948732886688,
"grad_norm": 0.3413323760032654,
"learning_rate": 0.00047164441853686967,
"loss": 3.3821,
"step": 36800
},
{
"epoch": 10.734051849694145,
"grad_norm": 0.31661462783813477,
"learning_rate": 0.0004714695424074613,
"loss": 3.3896,
"step": 36850
},
{
"epoch": 10.74861637052141,
"grad_norm": 0.35339003801345825,
"learning_rate": 0.00047129466627805305,
"loss": 3.3677,
"step": 36900
},
{
"epoch": 10.763180891348675,
"grad_norm": 0.3317316174507141,
"learning_rate": 0.0004711197901486447,
"loss": 3.3845,
"step": 36950
},
{
"epoch": 10.777745412175939,
"grad_norm": 0.3493446707725525,
"learning_rate": 0.0004709449140192363,
"loss": 3.3663,
"step": 37000
},
{
"epoch": 10.777745412175939,
"eval_accuracy": 0.36960982535413733,
"eval_loss": 3.55883526802063,
"eval_runtime": 177.6184,
"eval_samples_per_second": 93.69,
"eval_steps_per_second": 5.861,
"step": 37000
},
{
"epoch": 10.792309933003205,
"grad_norm": 0.3213796019554138,
"learning_rate": 0.000470770037889828,
"loss": 3.3895,
"step": 37050
},
{
"epoch": 10.806874453830469,
"grad_norm": 0.325885534286499,
"learning_rate": 0.00047059516176041965,
"loss": 3.3857,
"step": 37100
},
{
"epoch": 10.821438974657735,
"grad_norm": 0.3278997540473938,
"learning_rate": 0.00047042028563101134,
"loss": 3.3964,
"step": 37150
},
{
"epoch": 10.836003495484999,
"grad_norm": 0.31899142265319824,
"learning_rate": 0.000470245409501603,
"loss": 3.3808,
"step": 37200
},
{
"epoch": 10.850568016312263,
"grad_norm": 0.3405511677265167,
"learning_rate": 0.0004700705333721946,
"loss": 3.3904,
"step": 37250
},
{
"epoch": 10.865132537139528,
"grad_norm": 0.32967904210090637,
"learning_rate": 0.0004698956572427863,
"loss": 3.3937,
"step": 37300
},
{
"epoch": 10.879697057966792,
"grad_norm": 0.368004709482193,
"learning_rate": 0.00046972078111337794,
"loss": 3.3859,
"step": 37350
},
{
"epoch": 10.894261578794058,
"grad_norm": 0.34228798747062683,
"learning_rate": 0.0004695459049839697,
"loss": 3.3779,
"step": 37400
},
{
"epoch": 10.908826099621322,
"grad_norm": 0.31284409761428833,
"learning_rate": 0.0004693710288545613,
"loss": 3.3925,
"step": 37450
},
{
"epoch": 10.923390620448588,
"grad_norm": 0.3652680218219757,
"learning_rate": 0.000469196152725153,
"loss": 3.3866,
"step": 37500
},
{
"epoch": 10.937955141275852,
"grad_norm": 0.3488202393054962,
"learning_rate": 0.00046902127659574465,
"loss": 3.4059,
"step": 37550
},
{
"epoch": 10.952519662103116,
"grad_norm": 0.3468438982963562,
"learning_rate": 0.0004688464004663363,
"loss": 3.3932,
"step": 37600
},
{
"epoch": 10.967084182930382,
"grad_norm": 0.3332480788230896,
"learning_rate": 0.000468671524336928,
"loss": 3.3879,
"step": 37650
},
{
"epoch": 10.981648703757646,
"grad_norm": 0.3508252501487732,
"learning_rate": 0.0004684966482075196,
"loss": 3.3867,
"step": 37700
},
{
"epoch": 10.996213224584912,
"grad_norm": 0.318619966506958,
"learning_rate": 0.0004683217720781113,
"loss": 3.3918,
"step": 37750
},
{
"epoch": 11.010777745412176,
"grad_norm": 0.346383273601532,
"learning_rate": 0.00046814689594870294,
"loss": 3.3044,
"step": 37800
},
{
"epoch": 11.025342266239441,
"grad_norm": 0.361944317817688,
"learning_rate": 0.0004679720198192946,
"loss": 3.2746,
"step": 37850
},
{
"epoch": 11.039906787066705,
"grad_norm": 0.3629739284515381,
"learning_rate": 0.0004677971436898863,
"loss": 3.2872,
"step": 37900
},
{
"epoch": 11.054471307893971,
"grad_norm": 0.343257874250412,
"learning_rate": 0.00046762226756047795,
"loss": 3.2868,
"step": 37950
},
{
"epoch": 11.069035828721235,
"grad_norm": 0.3298133611679077,
"learning_rate": 0.00046744739143106964,
"loss": 3.2875,
"step": 38000
},
{
"epoch": 11.069035828721235,
"eval_accuracy": 0.3689343428410084,
"eval_loss": 3.569741725921631,
"eval_runtime": 177.41,
"eval_samples_per_second": 93.8,
"eval_steps_per_second": 5.868,
"step": 38000
},
{
"epoch": 11.0836003495485,
"grad_norm": 0.34320956468582153,
"learning_rate": 0.0004672725153016613,
"loss": 3.304,
"step": 38050
},
{
"epoch": 11.098164870375765,
"grad_norm": 0.34359210729599,
"learning_rate": 0.00046709763917225297,
"loss": 3.2989,
"step": 38100
},
{
"epoch": 11.112729391203029,
"grad_norm": 0.34714171290397644,
"learning_rate": 0.0004669227630428446,
"loss": 3.3076,
"step": 38150
},
{
"epoch": 11.127293912030295,
"grad_norm": 0.36771103739738464,
"learning_rate": 0.00046674788691343624,
"loss": 3.3131,
"step": 38200
},
{
"epoch": 11.141858432857559,
"grad_norm": 0.3316338360309601,
"learning_rate": 0.00046657301078402793,
"loss": 3.305,
"step": 38250
},
{
"epoch": 11.156422953684825,
"grad_norm": 0.324266254901886,
"learning_rate": 0.00046639813465461957,
"loss": 3.3045,
"step": 38300
},
{
"epoch": 11.170987474512089,
"grad_norm": 0.3661448359489441,
"learning_rate": 0.0004662232585252113,
"loss": 3.3088,
"step": 38350
},
{
"epoch": 11.185551995339353,
"grad_norm": 0.33063098788261414,
"learning_rate": 0.00046604838239580295,
"loss": 3.319,
"step": 38400
},
{
"epoch": 11.200116516166618,
"grad_norm": 0.347318172454834,
"learning_rate": 0.0004658735062663946,
"loss": 3.3146,
"step": 38450
},
{
"epoch": 11.214681036993882,
"grad_norm": 0.34231555461883545,
"learning_rate": 0.0004656986301369863,
"loss": 3.3067,
"step": 38500
},
{
"epoch": 11.229245557821148,
"grad_norm": 0.3556668162345886,
"learning_rate": 0.0004655237540075779,
"loss": 3.318,
"step": 38550
},
{
"epoch": 11.243810078648412,
"grad_norm": 0.34402233362197876,
"learning_rate": 0.0004653488778781696,
"loss": 3.3317,
"step": 38600
},
{
"epoch": 11.258374599475678,
"grad_norm": 0.36647072434425354,
"learning_rate": 0.00046517400174876124,
"loss": 3.3338,
"step": 38650
},
{
"epoch": 11.272939120302942,
"grad_norm": 0.34458011388778687,
"learning_rate": 0.0004649991256193529,
"loss": 3.34,
"step": 38700
},
{
"epoch": 11.287503641130208,
"grad_norm": 0.35463765263557434,
"learning_rate": 0.00046482424948994457,
"loss": 3.3393,
"step": 38750
},
{
"epoch": 11.302068161957472,
"grad_norm": 0.36296704411506653,
"learning_rate": 0.0004646493733605362,
"loss": 3.3322,
"step": 38800
},
{
"epoch": 11.316632682784736,
"grad_norm": 0.334695041179657,
"learning_rate": 0.00046447449723112795,
"loss": 3.3302,
"step": 38850
},
{
"epoch": 11.331197203612001,
"grad_norm": 0.3402169346809387,
"learning_rate": 0.0004642996211017196,
"loss": 3.3366,
"step": 38900
},
{
"epoch": 11.345761724439265,
"grad_norm": 0.35098734498023987,
"learning_rate": 0.0004641247449723113,
"loss": 3.3427,
"step": 38950
},
{
"epoch": 11.360326245266531,
"grad_norm": 0.37707361578941345,
"learning_rate": 0.0004639498688429029,
"loss": 3.3281,
"step": 39000
},
{
"epoch": 11.360326245266531,
"eval_accuracy": 0.3694583592195778,
"eval_loss": 3.564202308654785,
"eval_runtime": 177.5,
"eval_samples_per_second": 93.752,
"eval_steps_per_second": 5.865,
"step": 39000
},
{
"epoch": 11.374890766093795,
"grad_norm": 0.33432015776634216,
"learning_rate": 0.00046377499271349455,
"loss": 3.3417,
"step": 39050
},
{
"epoch": 11.389455286921061,
"grad_norm": 0.34372082352638245,
"learning_rate": 0.00046360011658408624,
"loss": 3.3352,
"step": 39100
},
{
"epoch": 11.404019807748325,
"grad_norm": 0.3262201249599457,
"learning_rate": 0.00046342524045467787,
"loss": 3.3347,
"step": 39150
},
{
"epoch": 11.418584328575589,
"grad_norm": 0.3505764901638031,
"learning_rate": 0.00046325036432526956,
"loss": 3.3344,
"step": 39200
},
{
"epoch": 11.433148849402855,
"grad_norm": 0.34340110421180725,
"learning_rate": 0.0004630754881958612,
"loss": 3.3473,
"step": 39250
},
{
"epoch": 11.447713370230119,
"grad_norm": 0.33771929144859314,
"learning_rate": 0.00046290061206645284,
"loss": 3.3516,
"step": 39300
},
{
"epoch": 11.462277891057385,
"grad_norm": 0.3311103880405426,
"learning_rate": 0.0004627257359370446,
"loss": 3.3527,
"step": 39350
},
{
"epoch": 11.476842411884649,
"grad_norm": 0.36507630348205566,
"learning_rate": 0.0004625508598076362,
"loss": 3.3381,
"step": 39400
},
{
"epoch": 11.491406932711914,
"grad_norm": 0.36566632986068726,
"learning_rate": 0.0004623759836782279,
"loss": 3.3481,
"step": 39450
},
{
"epoch": 11.505971453539178,
"grad_norm": 0.3532668352127075,
"learning_rate": 0.00046220110754881954,
"loss": 3.3388,
"step": 39500
},
{
"epoch": 11.520535974366444,
"grad_norm": 0.3642367720603943,
"learning_rate": 0.00046202623141941123,
"loss": 3.3464,
"step": 39550
},
{
"epoch": 11.535100495193708,
"grad_norm": 0.3446687161922455,
"learning_rate": 0.00046185135529000287,
"loss": 3.3419,
"step": 39600
},
{
"epoch": 11.549665016020972,
"grad_norm": 0.32603853940963745,
"learning_rate": 0.0004616764791605945,
"loss": 3.345,
"step": 39650
},
{
"epoch": 11.564229536848238,
"grad_norm": 0.3597732186317444,
"learning_rate": 0.0004615016030311862,
"loss": 3.3517,
"step": 39700
},
{
"epoch": 11.578794057675502,
"grad_norm": 0.3671717643737793,
"learning_rate": 0.00046132672690177783,
"loss": 3.3627,
"step": 39750
},
{
"epoch": 11.593358578502768,
"grad_norm": 0.326857328414917,
"learning_rate": 0.0004611518507723696,
"loss": 3.356,
"step": 39800
},
{
"epoch": 11.607923099330032,
"grad_norm": 0.355274498462677,
"learning_rate": 0.0004609769746429612,
"loss": 3.352,
"step": 39850
},
{
"epoch": 11.622487620157298,
"grad_norm": 0.35776060819625854,
"learning_rate": 0.00046080209851355285,
"loss": 3.3488,
"step": 39900
},
{
"epoch": 11.637052140984562,
"grad_norm": 0.3599199950695038,
"learning_rate": 0.00046062722238414454,
"loss": 3.3537,
"step": 39950
},
{
"epoch": 11.651616661811826,
"grad_norm": 0.38023436069488525,
"learning_rate": 0.0004604523462547362,
"loss": 3.3652,
"step": 40000
},
{
"epoch": 11.651616661811826,
"eval_accuracy": 0.36993510123006257,
"eval_loss": 3.5553505420684814,
"eval_runtime": 177.5604,
"eval_samples_per_second": 93.72,
"eval_steps_per_second": 5.863,
"step": 40000
},
{
"epoch": 11.666181182639091,
"grad_norm": 0.33803287148475647,
"learning_rate": 0.00046027747012532787,
"loss": 3.3708,
"step": 40050
},
{
"epoch": 11.680745703466355,
"grad_norm": 0.3471790850162506,
"learning_rate": 0.0004601025939959195,
"loss": 3.3482,
"step": 40100
},
{
"epoch": 11.695310224293621,
"grad_norm": 0.3702428340911865,
"learning_rate": 0.0004599277178665112,
"loss": 3.3483,
"step": 40150
},
{
"epoch": 11.709874745120885,
"grad_norm": 0.341546893119812,
"learning_rate": 0.00045975284173710283,
"loss": 3.3534,
"step": 40200
},
{
"epoch": 11.724439265948151,
"grad_norm": 0.34679174423217773,
"learning_rate": 0.00045957796560769446,
"loss": 3.3671,
"step": 40250
},
{
"epoch": 11.739003786775415,
"grad_norm": 0.3513627350330353,
"learning_rate": 0.0004594030894782862,
"loss": 3.361,
"step": 40300
},
{
"epoch": 11.75356830760268,
"grad_norm": 0.34741392731666565,
"learning_rate": 0.00045922821334887785,
"loss": 3.3687,
"step": 40350
},
{
"epoch": 11.768132828429945,
"grad_norm": 0.36889341473579407,
"learning_rate": 0.00045905333721946954,
"loss": 3.3538,
"step": 40400
},
{
"epoch": 11.782697349257209,
"grad_norm": 0.3231208622455597,
"learning_rate": 0.00045887846109006117,
"loss": 3.3654,
"step": 40450
},
{
"epoch": 11.797261870084474,
"grad_norm": 0.350035160779953,
"learning_rate": 0.0004587035849606528,
"loss": 3.3555,
"step": 40500
},
{
"epoch": 11.811826390911738,
"grad_norm": 0.33473822474479675,
"learning_rate": 0.0004585287088312445,
"loss": 3.3757,
"step": 40550
},
{
"epoch": 11.826390911739004,
"grad_norm": 0.3523299992084503,
"learning_rate": 0.00045835383270183613,
"loss": 3.3593,
"step": 40600
},
{
"epoch": 11.840955432566268,
"grad_norm": 0.336702823638916,
"learning_rate": 0.0004581789565724278,
"loss": 3.3607,
"step": 40650
},
{
"epoch": 11.855519953393534,
"grad_norm": 0.353645384311676,
"learning_rate": 0.00045800408044301946,
"loss": 3.3705,
"step": 40700
},
{
"epoch": 11.870084474220798,
"grad_norm": 0.35596734285354614,
"learning_rate": 0.0004578292043136111,
"loss": 3.3642,
"step": 40750
},
{
"epoch": 11.884648995048064,
"grad_norm": 0.35289227962493896,
"learning_rate": 0.00045765432818420284,
"loss": 3.3708,
"step": 40800
},
{
"epoch": 11.899213515875328,
"grad_norm": 0.3323861062526703,
"learning_rate": 0.0004574794520547945,
"loss": 3.3633,
"step": 40850
},
{
"epoch": 11.913778036702592,
"grad_norm": 0.3809449374675751,
"learning_rate": 0.00045730457592538617,
"loss": 3.3729,
"step": 40900
},
{
"epoch": 11.928342557529858,
"grad_norm": 0.361569344997406,
"learning_rate": 0.0004571296997959778,
"loss": 3.3697,
"step": 40950
},
{
"epoch": 11.942907078357122,
"grad_norm": 0.3860667645931244,
"learning_rate": 0.0004569548236665695,
"loss": 3.3736,
"step": 41000
},
{
"epoch": 11.942907078357122,
"eval_accuracy": 0.3703710367586435,
"eval_loss": 3.5485339164733887,
"eval_runtime": 177.6718,
"eval_samples_per_second": 93.661,
"eval_steps_per_second": 5.859,
"step": 41000
},
{
"epoch": 11.957471599184387,
"grad_norm": 0.3505721390247345,
"learning_rate": 0.00045677994753716113,
"loss": 3.3755,
"step": 41050
},
{
"epoch": 11.972036120011651,
"grad_norm": 0.35810521245002747,
"learning_rate": 0.00045660507140775277,
"loss": 3.3569,
"step": 41100
},
{
"epoch": 11.986600640838917,
"grad_norm": 0.3344600796699524,
"learning_rate": 0.00045643019527834446,
"loss": 3.3613,
"step": 41150
},
{
"epoch": 12.001165161666181,
"grad_norm": 0.34730231761932373,
"learning_rate": 0.0004562553191489361,
"loss": 3.36,
"step": 41200
},
{
"epoch": 12.015729682493445,
"grad_norm": 0.3313257694244385,
"learning_rate": 0.00045608044301952784,
"loss": 3.269,
"step": 41250
},
{
"epoch": 12.030294203320711,
"grad_norm": 0.3686826825141907,
"learning_rate": 0.0004559055668901195,
"loss": 3.2643,
"step": 41300
},
{
"epoch": 12.044858724147975,
"grad_norm": 0.36742156744003296,
"learning_rate": 0.0004557306907607111,
"loss": 3.2595,
"step": 41350
},
{
"epoch": 12.05942324497524,
"grad_norm": 0.34834322333335876,
"learning_rate": 0.0004555558146313028,
"loss": 3.2717,
"step": 41400
},
{
"epoch": 12.073987765802505,
"grad_norm": 0.3443371057510376,
"learning_rate": 0.00045538093850189444,
"loss": 3.2769,
"step": 41450
},
{
"epoch": 12.08855228662977,
"grad_norm": 0.352764755487442,
"learning_rate": 0.00045520606237248613,
"loss": 3.2705,
"step": 41500
},
{
"epoch": 12.103116807457035,
"grad_norm": 0.3674822151660919,
"learning_rate": 0.00045503118624307776,
"loss": 3.2857,
"step": 41550
},
{
"epoch": 12.117681328284299,
"grad_norm": 0.3471083343029022,
"learning_rate": 0.00045485631011366945,
"loss": 3.2775,
"step": 41600
},
{
"epoch": 12.132245849111564,
"grad_norm": 0.36739566922187805,
"learning_rate": 0.0004546814339842611,
"loss": 3.2886,
"step": 41650
},
{
"epoch": 12.146810369938828,
"grad_norm": 0.3347199261188507,
"learning_rate": 0.0004545065578548527,
"loss": 3.2946,
"step": 41700
},
{
"epoch": 12.161374890766094,
"grad_norm": 0.42980143427848816,
"learning_rate": 0.00045433168172544447,
"loss": 3.2904,
"step": 41750
},
{
"epoch": 12.175939411593358,
"grad_norm": 0.3469353914260864,
"learning_rate": 0.0004541568055960361,
"loss": 3.2946,
"step": 41800
},
{
"epoch": 12.190503932420624,
"grad_norm": 0.3609008491039276,
"learning_rate": 0.0004539819294666278,
"loss": 3.3039,
"step": 41850
},
{
"epoch": 12.205068453247888,
"grad_norm": 0.3403509855270386,
"learning_rate": 0.00045380705333721943,
"loss": 3.2941,
"step": 41900
},
{
"epoch": 12.219632974075154,
"grad_norm": 0.35095566511154175,
"learning_rate": 0.00045363217720781107,
"loss": 3.2979,
"step": 41950
},
{
"epoch": 12.234197494902418,
"grad_norm": 0.3400515019893646,
"learning_rate": 0.00045345730107840276,
"loss": 3.2965,
"step": 42000
},
{
"epoch": 12.234197494902418,
"eval_accuracy": 0.37021039798550043,
"eval_loss": 3.562443256378174,
"eval_runtime": 177.9087,
"eval_samples_per_second": 93.537,
"eval_steps_per_second": 5.851,
"step": 42000
},
{
"epoch": 12.248762015729682,
"grad_norm": 0.32279014587402344,
"learning_rate": 0.0004532824249489944,
"loss": 3.3031,
"step": 42050
},
{
"epoch": 12.263326536556947,
"grad_norm": 0.32717186212539673,
"learning_rate": 0.0004531075488195861,
"loss": 3.3086,
"step": 42100
},
{
"epoch": 12.277891057384211,
"grad_norm": 0.3563939034938812,
"learning_rate": 0.0004529326726901777,
"loss": 3.3055,
"step": 42150
},
{
"epoch": 12.292455578211477,
"grad_norm": 0.3626421391963959,
"learning_rate": 0.00045275779656076947,
"loss": 3.3105,
"step": 42200
},
{
"epoch": 12.307020099038741,
"grad_norm": 0.36385101079940796,
"learning_rate": 0.0004525829204313611,
"loss": 3.3156,
"step": 42250
},
{
"epoch": 12.321584619866007,
"grad_norm": 0.3256656229496002,
"learning_rate": 0.00045240804430195274,
"loss": 3.3008,
"step": 42300
},
{
"epoch": 12.336149140693271,
"grad_norm": 0.34348002076148987,
"learning_rate": 0.00045223316817254443,
"loss": 3.3234,
"step": 42350
},
{
"epoch": 12.350713661520537,
"grad_norm": 0.37969276309013367,
"learning_rate": 0.00045205829204313607,
"loss": 3.3084,
"step": 42400
},
{
"epoch": 12.3652781823478,
"grad_norm": 0.3616692125797272,
"learning_rate": 0.00045188341591372776,
"loss": 3.3194,
"step": 42450
},
{
"epoch": 12.379842703175065,
"grad_norm": 0.3250804543495178,
"learning_rate": 0.0004517085397843194,
"loss": 3.3203,
"step": 42500
},
{
"epoch": 12.39440722400233,
"grad_norm": 0.3668137490749359,
"learning_rate": 0.00045153366365491103,
"loss": 3.3219,
"step": 42550
},
{
"epoch": 12.408971744829595,
"grad_norm": 0.3868430256843567,
"learning_rate": 0.0004513587875255027,
"loss": 3.3128,
"step": 42600
},
{
"epoch": 12.42353626565686,
"grad_norm": 0.3570868968963623,
"learning_rate": 0.00045118391139609436,
"loss": 3.3157,
"step": 42650
},
{
"epoch": 12.438100786484124,
"grad_norm": 0.35844898223876953,
"learning_rate": 0.0004510090352666861,
"loss": 3.3106,
"step": 42700
},
{
"epoch": 12.45266530731139,
"grad_norm": 0.3538632094860077,
"learning_rate": 0.00045083415913727774,
"loss": 3.324,
"step": 42750
},
{
"epoch": 12.467229828138654,
"grad_norm": 0.3571759760379791,
"learning_rate": 0.0004506592830078694,
"loss": 3.3308,
"step": 42800
},
{
"epoch": 12.481794348965918,
"grad_norm": 0.3505135178565979,
"learning_rate": 0.00045048440687846106,
"loss": 3.3307,
"step": 42850
},
{
"epoch": 12.496358869793184,
"grad_norm": 0.36675187945365906,
"learning_rate": 0.0004503095307490527,
"loss": 3.3453,
"step": 42900
},
{
"epoch": 12.510923390620448,
"grad_norm": 0.348091185092926,
"learning_rate": 0.0004501346546196444,
"loss": 3.3215,
"step": 42950
},
{
"epoch": 12.525487911447714,
"grad_norm": 0.3269301950931549,
"learning_rate": 0.000449959778490236,
"loss": 3.3183,
"step": 43000
},
{
"epoch": 12.525487911447714,
"eval_accuracy": 0.3701031486728309,
"eval_loss": 3.5565950870513916,
"eval_runtime": 177.6773,
"eval_samples_per_second": 93.659,
"eval_steps_per_second": 5.859,
"step": 43000
},
{
"epoch": 12.540052432274978,
"grad_norm": 0.35199809074401855,
"learning_rate": 0.0004497849023608277,
"loss": 3.3305,
"step": 43050
},
{
"epoch": 12.554616953102244,
"grad_norm": "Infinity",
"learning_rate": 0.00044961002623141935,
"loss": 3.3298,
"step": 43100
},
{
"epoch": 12.569181473929508,
"grad_norm": 0.3471096158027649,
"learning_rate": 0.000449435150102011,
"loss": 3.3326,
"step": 43150
},
{
"epoch": 12.583745994756772,
"grad_norm": 0.3414683938026428,
"learning_rate": 0.00044926027397260273,
"loss": 3.3381,
"step": 43200
},
{
"epoch": 12.598310515584037,
"grad_norm": 0.3478773832321167,
"learning_rate": 0.00044908539784319437,
"loss": 3.3287,
"step": 43250
},
{
"epoch": 12.612875036411301,
"grad_norm": 0.33130770921707153,
"learning_rate": 0.00044891052171378606,
"loss": 3.3472,
"step": 43300
},
{
"epoch": 12.627439557238567,
"grad_norm": 0.37261322140693665,
"learning_rate": 0.0004487356455843777,
"loss": 3.338,
"step": 43350
},
{
"epoch": 12.642004078065831,
"grad_norm": 0.3253554701805115,
"learning_rate": 0.00044856076945496933,
"loss": 3.3443,
"step": 43400
},
{
"epoch": 12.656568598893097,
"grad_norm": 0.3404645323753357,
"learning_rate": 0.000448385893325561,
"loss": 3.3503,
"step": 43450
},
{
"epoch": 12.671133119720361,
"grad_norm": 0.3743685483932495,
"learning_rate": 0.00044821101719615266,
"loss": 3.3357,
"step": 43500
},
{
"epoch": 12.685697640547627,
"grad_norm": 0.3546348214149475,
"learning_rate": 0.00044803614106674435,
"loss": 3.3363,
"step": 43550
},
{
"epoch": 12.70026216137489,
"grad_norm": 0.3247695565223694,
"learning_rate": 0.000447861264937336,
"loss": 3.3544,
"step": 43600
},
{
"epoch": 12.714826682202155,
"grad_norm": 0.366551011800766,
"learning_rate": 0.00044768638880792773,
"loss": 3.3392,
"step": 43650
},
{
"epoch": 12.72939120302942,
"grad_norm": 0.3813282549381256,
"learning_rate": 0.00044751151267851937,
"loss": 3.3486,
"step": 43700
},
{
"epoch": 12.743955723856685,
"grad_norm": 0.3566057085990906,
"learning_rate": 0.000447336636549111,
"loss": 3.3455,
"step": 43750
},
{
"epoch": 12.75852024468395,
"grad_norm": 0.36512717604637146,
"learning_rate": 0.0004471617604197027,
"loss": 3.3554,
"step": 43800
},
{
"epoch": 12.773084765511214,
"grad_norm": 0.3692280352115631,
"learning_rate": 0.00044698688429029433,
"loss": 3.3474,
"step": 43850
},
{
"epoch": 12.78764928633848,
"grad_norm": 0.35620564222335815,
"learning_rate": 0.000446812008160886,
"loss": 3.3539,
"step": 43900
},
{
"epoch": 12.802213807165744,
"grad_norm": 0.3521715998649597,
"learning_rate": 0.00044663713203147766,
"loss": 3.3583,
"step": 43950
},
{
"epoch": 12.81677832799301,
"grad_norm": 0.36695396900177,
"learning_rate": 0.0004464622559020693,
"loss": 3.3537,
"step": 44000
},
{
"epoch": 12.81677832799301,
"eval_accuracy": 0.3705989415480662,
"eval_loss": 3.5467417240142822,
"eval_runtime": 185.5243,
"eval_samples_per_second": 89.697,
"eval_steps_per_second": 5.611,
"step": 44000
},
{
"epoch": 12.831342848820274,
"grad_norm": 0.3474844694137573,
"learning_rate": 0.000446287379772661,
"loss": 3.3493,
"step": 44050
},
{
"epoch": 12.845907369647538,
"grad_norm": 0.3925248980522156,
"learning_rate": 0.0004461125036432526,
"loss": 3.3387,
"step": 44100
},
{
"epoch": 12.860471890474804,
"grad_norm": 0.35316935181617737,
"learning_rate": 0.00044593762751384436,
"loss": 3.3445,
"step": 44150
},
{
"epoch": 12.875036411302068,
"grad_norm": 0.3575705885887146,
"learning_rate": 0.000445762751384436,
"loss": 3.3477,
"step": 44200
},
{
"epoch": 12.889600932129333,
"grad_norm": 0.3460574150085449,
"learning_rate": 0.0004455878752550277,
"loss": 3.3678,
"step": 44250
},
{
"epoch": 12.904165452956597,
"grad_norm": 0.35342928767204285,
"learning_rate": 0.0004454129991256193,
"loss": 3.34,
"step": 44300
},
{
"epoch": 12.918729973783863,
"grad_norm": 0.3239762485027313,
"learning_rate": 0.00044523812299621096,
"loss": 3.3549,
"step": 44350
},
{
"epoch": 12.933294494611127,
"grad_norm": 0.3479582369327545,
"learning_rate": 0.00044506324686680265,
"loss": 3.3561,
"step": 44400
},
{
"epoch": 12.947859015438393,
"grad_norm": 0.348714143037796,
"learning_rate": 0.0004448883707373943,
"loss": 3.3436,
"step": 44450
},
{
"epoch": 12.962423536265657,
"grad_norm": 0.34078627824783325,
"learning_rate": 0.000444713494607986,
"loss": 3.3457,
"step": 44500
},
{
"epoch": 12.976988057092921,
"grad_norm": 0.3270307183265686,
"learning_rate": 0.0004445386184785776,
"loss": 3.3583,
"step": 44550
},
{
"epoch": 12.991552577920187,
"grad_norm": 0.3412953019142151,
"learning_rate": 0.00044436374234916925,
"loss": 3.3598,
"step": 44600
},
{
"epoch": 13.00611709874745,
"grad_norm": 0.3603624701499939,
"learning_rate": 0.000444188866219761,
"loss": 3.3055,
"step": 44650
},
{
"epoch": 13.020681619574717,
"grad_norm": 0.34541741013526917,
"learning_rate": 0.00044401399009035263,
"loss": 3.2502,
"step": 44700
},
{
"epoch": 13.03524614040198,
"grad_norm": 0.34776028990745544,
"learning_rate": 0.0004438391139609443,
"loss": 3.2458,
"step": 44750
},
{
"epoch": 13.049810661229246,
"grad_norm": 0.3470947742462158,
"learning_rate": 0.00044366423783153596,
"loss": 3.2418,
"step": 44800
},
{
"epoch": 13.06437518205651,
"grad_norm": 0.37817707657814026,
"learning_rate": 0.0004434893617021276,
"loss": 3.2555,
"step": 44850
},
{
"epoch": 13.078939702883774,
"grad_norm": 0.37848788499832153,
"learning_rate": 0.0004433144855727193,
"loss": 3.2488,
"step": 44900
},
{
"epoch": 13.09350422371104,
"grad_norm": 0.3598591089248657,
"learning_rate": 0.0004431396094433109,
"loss": 3.274,
"step": 44950
},
{
"epoch": 13.108068744538304,
"grad_norm": 0.3582230508327484,
"learning_rate": 0.0004429647333139026,
"loss": 3.2595,
"step": 45000
},
{
"epoch": 13.108068744538304,
"eval_accuracy": 0.3703395205132538,
"eval_loss": 3.558483839035034,
"eval_runtime": 177.819,
"eval_samples_per_second": 93.584,
"eval_steps_per_second": 5.854,
"step": 45000
},
{
"epoch": 13.12263326536557,
"grad_norm": 0.3447740972042084,
"learning_rate": 0.00044278985718449425,
"loss": 3.2738,
"step": 45050
},
{
"epoch": 13.137197786192834,
"grad_norm": 0.36638307571411133,
"learning_rate": 0.000442614981055086,
"loss": 3.2711,
"step": 45100
},
{
"epoch": 13.1517623070201,
"grad_norm": 0.33102279901504517,
"learning_rate": 0.00044244010492567763,
"loss": 3.2644,
"step": 45150
},
{
"epoch": 13.166326827847364,
"grad_norm": 0.3540899455547333,
"learning_rate": 0.00044226522879626927,
"loss": 3.2714,
"step": 45200
},
{
"epoch": 13.180891348674628,
"grad_norm": 0.3566311299800873,
"learning_rate": 0.00044209035266686096,
"loss": 3.2841,
"step": 45250
},
{
"epoch": 13.195455869501894,
"grad_norm": 0.373551607131958,
"learning_rate": 0.0004419154765374526,
"loss": 3.2718,
"step": 45300
},
{
"epoch": 13.210020390329158,
"grad_norm": 0.41288265585899353,
"learning_rate": 0.0004417406004080443,
"loss": 3.2849,
"step": 45350
},
{
"epoch": 13.224584911156423,
"grad_norm": 0.35161110758781433,
"learning_rate": 0.0004415657242786359,
"loss": 3.2853,
"step": 45400
},
{
"epoch": 13.239149431983687,
"grad_norm": 0.3606819808483124,
"learning_rate": 0.00044139084814922755,
"loss": 3.286,
"step": 45450
},
{
"epoch": 13.253713952810953,
"grad_norm": 0.3907514810562134,
"learning_rate": 0.00044121597201981924,
"loss": 3.2811,
"step": 45500
},
{
"epoch": 13.268278473638217,
"grad_norm": 0.355471134185791,
"learning_rate": 0.0004410410958904109,
"loss": 3.2962,
"step": 45550
},
{
"epoch": 13.282842994465483,
"grad_norm": 0.36252570152282715,
"learning_rate": 0.0004408662197610026,
"loss": 3.2966,
"step": 45600
},
{
"epoch": 13.297407515292747,
"grad_norm": 0.336823433637619,
"learning_rate": 0.00044069134363159426,
"loss": 3.3074,
"step": 45650
},
{
"epoch": 13.311972036120011,
"grad_norm": 0.36092522740364075,
"learning_rate": 0.00044051646750218595,
"loss": 3.2994,
"step": 45700
},
{
"epoch": 13.326536556947277,
"grad_norm": 0.36176231503486633,
"learning_rate": 0.0004403415913727776,
"loss": 3.3003,
"step": 45750
},
{
"epoch": 13.34110107777454,
"grad_norm": 0.36278125643730164,
"learning_rate": 0.0004401667152433692,
"loss": 3.2931,
"step": 45800
},
{
"epoch": 13.355665598601806,
"grad_norm": 0.34519457817077637,
"learning_rate": 0.0004399918391139609,
"loss": 3.3055,
"step": 45850
},
{
"epoch": 13.37023011942907,
"grad_norm": 0.36939191818237305,
"learning_rate": 0.00043981696298455255,
"loss": 3.3019,
"step": 45900
},
{
"epoch": 13.384794640256336,
"grad_norm": 0.3272865116596222,
"learning_rate": 0.00043964208685514424,
"loss": 3.2993,
"step": 45950
},
{
"epoch": 13.3993591610836,
"grad_norm": 0.3592440187931061,
"learning_rate": 0.0004394672107257359,
"loss": 3.3001,
"step": 46000
},
{
"epoch": 13.3993591610836,
"eval_accuracy": 0.3707306512302919,
"eval_loss": 3.5545082092285156,
"eval_runtime": 178.5043,
"eval_samples_per_second": 93.225,
"eval_steps_per_second": 5.832,
"step": 46000
},
{
"epoch": 13.413923681910866,
"grad_norm": 0.33466318249702454,
"learning_rate": 0.0004392923345963275,
"loss": 3.301,
"step": 46050
},
{
"epoch": 13.42848820273813,
"grad_norm": 0.3258253335952759,
"learning_rate": 0.00043911745846691926,
"loss": 3.3132,
"step": 46100
},
{
"epoch": 13.443052723565394,
"grad_norm": 0.35109880566596985,
"learning_rate": 0.0004389425823375109,
"loss": 3.312,
"step": 46150
},
{
"epoch": 13.45761724439266,
"grad_norm": 0.3569508492946625,
"learning_rate": 0.0004387677062081026,
"loss": 3.3208,
"step": 46200
},
{
"epoch": 13.472181765219924,
"grad_norm": 0.3373895585536957,
"learning_rate": 0.0004385928300786942,
"loss": 3.306,
"step": 46250
},
{
"epoch": 13.48674628604719,
"grad_norm": 0.3376927375793457,
"learning_rate": 0.0004384179539492859,
"loss": 3.3033,
"step": 46300
},
{
"epoch": 13.501310806874454,
"grad_norm": 0.4034666419029236,
"learning_rate": 0.00043824307781987755,
"loss": 3.3082,
"step": 46350
},
{
"epoch": 13.51587532770172,
"grad_norm": 0.3468514680862427,
"learning_rate": 0.0004380682016904692,
"loss": 3.3109,
"step": 46400
},
{
"epoch": 13.530439848528983,
"grad_norm": 0.35658130049705505,
"learning_rate": 0.0004378933255610609,
"loss": 3.323,
"step": 46450
},
{
"epoch": 13.545004369356247,
"grad_norm": 0.3441319763660431,
"learning_rate": 0.0004377184494316525,
"loss": 3.3248,
"step": 46500
},
{
"epoch": 13.559568890183513,
"grad_norm": 0.3547191619873047,
"learning_rate": 0.00043754357330224426,
"loss": 3.3243,
"step": 46550
},
{
"epoch": 13.574133411010777,
"grad_norm": 0.3560062050819397,
"learning_rate": 0.0004373686971728359,
"loss": 3.3055,
"step": 46600
},
{
"epoch": 13.588697931838043,
"grad_norm": 0.35935595631599426,
"learning_rate": 0.00043719382104342753,
"loss": 3.318,
"step": 46650
},
{
"epoch": 13.603262452665307,
"grad_norm": 0.3890141546726227,
"learning_rate": 0.0004370189449140192,
"loss": 3.3161,
"step": 46700
},
{
"epoch": 13.617826973492573,
"grad_norm": 0.35833755135536194,
"learning_rate": 0.00043684406878461085,
"loss": 3.3174,
"step": 46750
},
{
"epoch": 13.632391494319837,
"grad_norm": 0.35992640256881714,
"learning_rate": 0.00043666919265520254,
"loss": 3.3209,
"step": 46800
},
{
"epoch": 13.6469560151471,
"grad_norm": 0.36018410325050354,
"learning_rate": 0.0004364943165257942,
"loss": 3.3206,
"step": 46850
},
{
"epoch": 13.661520535974367,
"grad_norm": 0.37604689598083496,
"learning_rate": 0.0004363194403963858,
"loss": 3.327,
"step": 46900
},
{
"epoch": 13.67608505680163,
"grad_norm": 0.3590412735939026,
"learning_rate": 0.0004361445642669775,
"loss": 3.3319,
"step": 46950
},
{
"epoch": 13.690649577628896,
"grad_norm": 0.3439190089702606,
"learning_rate": 0.00043596968813756914,
"loss": 3.3331,
"step": 47000
},
{
"epoch": 13.690649577628896,
"eval_accuracy": 0.37110143750534336,
"eval_loss": 3.5459725856781006,
"eval_runtime": 178.5404,
"eval_samples_per_second": 93.206,
"eval_steps_per_second": 5.831,
"step": 47000
},
{
"epoch": 13.70521409845616,
"grad_norm": 0.3402903378009796,
"learning_rate": 0.0004357948120081609,
"loss": 3.3174,
"step": 47050
},
{
"epoch": 13.719778619283426,
"grad_norm": 0.3717620372772217,
"learning_rate": 0.0004356199358787525,
"loss": 3.3351,
"step": 47100
},
{
"epoch": 13.73434314011069,
"grad_norm": 0.3349936604499817,
"learning_rate": 0.0004354450597493442,
"loss": 3.3225,
"step": 47150
},
{
"epoch": 13.748907660937956,
"grad_norm": 0.35813501477241516,
"learning_rate": 0.00043527018361993585,
"loss": 3.3373,
"step": 47200
},
{
"epoch": 13.76347218176522,
"grad_norm": 0.32840263843536377,
"learning_rate": 0.0004350953074905275,
"loss": 3.3305,
"step": 47250
},
{
"epoch": 13.778036702592484,
"grad_norm": 0.35672280192375183,
"learning_rate": 0.0004349204313611192,
"loss": 3.3327,
"step": 47300
},
{
"epoch": 13.79260122341975,
"grad_norm": 0.3735807240009308,
"learning_rate": 0.0004347455552317108,
"loss": 3.3331,
"step": 47350
},
{
"epoch": 13.807165744247014,
"grad_norm": 0.34489327669143677,
"learning_rate": 0.0004345706791023025,
"loss": 3.3362,
"step": 47400
},
{
"epoch": 13.82173026507428,
"grad_norm": 0.36615490913391113,
"learning_rate": 0.00043439580297289414,
"loss": 3.3339,
"step": 47450
},
{
"epoch": 13.836294785901543,
"grad_norm": 0.3411681056022644,
"learning_rate": 0.0004342209268434858,
"loss": 3.3363,
"step": 47500
},
{
"epoch": 13.85085930672881,
"grad_norm": 0.3618829846382141,
"learning_rate": 0.0004340460507140775,
"loss": 3.3285,
"step": 47550
},
{
"epoch": 13.865423827556073,
"grad_norm": 0.4026733338832855,
"learning_rate": 0.00043387117458466916,
"loss": 3.3342,
"step": 47600
},
{
"epoch": 13.879988348383339,
"grad_norm": 0.35623764991760254,
"learning_rate": 0.00043369629845526085,
"loss": 3.3368,
"step": 47650
},
{
"epoch": 13.894552869210603,
"grad_norm": 0.33052340149879456,
"learning_rate": 0.0004335214223258525,
"loss": 3.3291,
"step": 47700
},
{
"epoch": 13.909117390037867,
"grad_norm": 0.3380168378353119,
"learning_rate": 0.0004333465461964442,
"loss": 3.3356,
"step": 47750
},
{
"epoch": 13.923681910865133,
"grad_norm": 0.401959091424942,
"learning_rate": 0.0004331716700670358,
"loss": 3.3324,
"step": 47800
},
{
"epoch": 13.938246431692397,
"grad_norm": 0.3894999623298645,
"learning_rate": 0.00043299679393762745,
"loss": 3.3359,
"step": 47850
},
{
"epoch": 13.952810952519663,
"grad_norm": 0.35449472069740295,
"learning_rate": 0.00043282191780821914,
"loss": 3.3198,
"step": 47900
},
{
"epoch": 13.967375473346927,
"grad_norm": 0.3355594575405121,
"learning_rate": 0.00043264704167881077,
"loss": 3.3327,
"step": 47950
},
{
"epoch": 13.981939994174192,
"grad_norm": 0.35608407855033875,
"learning_rate": 0.0004324721655494025,
"loss": 3.3397,
"step": 48000
},
{
"epoch": 13.981939994174192,
"eval_accuracy": 0.37175398842201335,
"eval_loss": 3.540625810623169,
"eval_runtime": 179.0517,
"eval_samples_per_second": 92.94,
"eval_steps_per_second": 5.814,
"step": 48000
},
{
"epoch": 13.996504515001456,
"grad_norm": 0.36551040410995483,
"learning_rate": 0.00043229728941999415,
"loss": 3.3388,
"step": 48050
},
{
"epoch": 14.01106903582872,
"grad_norm": 0.34817105531692505,
"learning_rate": 0.0004321224132905858,
"loss": 3.2595,
"step": 48100
},
{
"epoch": 14.025633556655986,
"grad_norm": 0.3400195837020874,
"learning_rate": 0.0004319475371611775,
"loss": 3.2207,
"step": 48150
},
{
"epoch": 14.04019807748325,
"grad_norm": 0.3744097352027893,
"learning_rate": 0.0004317726610317691,
"loss": 3.2301,
"step": 48200
},
{
"epoch": 14.054762598310516,
"grad_norm": 0.3383084535598755,
"learning_rate": 0.0004315977849023608,
"loss": 3.2422,
"step": 48250
},
{
"epoch": 14.06932711913778,
"grad_norm": 0.35336896777153015,
"learning_rate": 0.00043142290877295244,
"loss": 3.2273,
"step": 48300
},
{
"epoch": 14.083891639965046,
"grad_norm": 0.3526099920272827,
"learning_rate": 0.00043124803264354413,
"loss": 3.2395,
"step": 48350
},
{
"epoch": 14.09845616079231,
"grad_norm": 0.3588810861110687,
"learning_rate": 0.00043107315651413577,
"loss": 3.2406,
"step": 48400
},
{
"epoch": 14.113020681619576,
"grad_norm": 0.36780351400375366,
"learning_rate": 0.0004308982803847274,
"loss": 3.2556,
"step": 48450
},
{
"epoch": 14.12758520244684,
"grad_norm": 0.3457690477371216,
"learning_rate": 0.00043072340425531915,
"loss": 3.2578,
"step": 48500
},
{
"epoch": 14.142149723274104,
"grad_norm": 0.3812199831008911,
"learning_rate": 0.0004305485281259108,
"loss": 3.2557,
"step": 48550
},
{
"epoch": 14.15671424410137,
"grad_norm": 0.37096258997917175,
"learning_rate": 0.0004303736519965025,
"loss": 3.261,
"step": 48600
},
{
"epoch": 14.171278764928633,
"grad_norm": 0.3497552275657654,
"learning_rate": 0.0004301987758670941,
"loss": 3.269,
"step": 48650
},
{
"epoch": 14.1858432857559,
"grad_norm": 0.3859023153781891,
"learning_rate": 0.00043002389973768575,
"loss": 3.2629,
"step": 48700
},
{
"epoch": 14.200407806583163,
"grad_norm": 0.35794612765312195,
"learning_rate": 0.00042984902360827744,
"loss": 3.2692,
"step": 48750
},
{
"epoch": 14.214972327410429,
"grad_norm": 0.37541714310646057,
"learning_rate": 0.0004296741474788691,
"loss": 3.2796,
"step": 48800
},
{
"epoch": 14.229536848237693,
"grad_norm": 0.3757866322994232,
"learning_rate": 0.00042949927134946077,
"loss": 3.2682,
"step": 48850
},
{
"epoch": 14.244101369064957,
"grad_norm": 0.3648219406604767,
"learning_rate": 0.0004293243952200524,
"loss": 3.2742,
"step": 48900
},
{
"epoch": 14.258665889892223,
"grad_norm": 0.36085084080696106,
"learning_rate": 0.00042914951909064415,
"loss": 3.2738,
"step": 48950
},
{
"epoch": 14.273230410719487,
"grad_norm": 0.3444569706916809,
"learning_rate": 0.0004289746429612358,
"loss": 3.277,
"step": 49000
},
{
"epoch": 14.273230410719487,
"eval_accuracy": 0.37122232817795764,
"eval_loss": 3.556274652481079,
"eval_runtime": 179.9174,
"eval_samples_per_second": 92.492,
"eval_steps_per_second": 5.786,
"step": 49000
},
{
"epoch": 14.287794931546753,
"grad_norm": 0.34347397089004517,
"learning_rate": 0.0004287997668318274,
"loss": 3.2827,
"step": 49050
},
{
"epoch": 14.302359452374017,
"grad_norm": 0.361592173576355,
"learning_rate": 0.0004286248907024191,
"loss": 3.2691,
"step": 49100
},
{
"epoch": 14.316923973201282,
"grad_norm": 0.36173015832901,
"learning_rate": 0.00042845001457301075,
"loss": 3.2783,
"step": 49150
},
{
"epoch": 14.331488494028546,
"grad_norm": 0.3683636784553528,
"learning_rate": 0.00042827513844360244,
"loss": 3.2796,
"step": 49200
},
{
"epoch": 14.346053014855812,
"grad_norm": 0.36255374550819397,
"learning_rate": 0.00042810026231419407,
"loss": 3.2864,
"step": 49250
},
{
"epoch": 14.360617535683076,
"grad_norm": 0.3432329595088959,
"learning_rate": 0.0004279253861847857,
"loss": 3.282,
"step": 49300
},
{
"epoch": 14.37518205651034,
"grad_norm": 0.35291415452957153,
"learning_rate": 0.0004277505100553774,
"loss": 3.2799,
"step": 49350
},
{
"epoch": 14.389746577337606,
"grad_norm": 0.3346937298774719,
"learning_rate": 0.00042757563392596904,
"loss": 3.2942,
"step": 49400
},
{
"epoch": 14.40431109816487,
"grad_norm": 0.3730728328227997,
"learning_rate": 0.0004274007577965608,
"loss": 3.3006,
"step": 49450
},
{
"epoch": 14.418875618992136,
"grad_norm": 0.36969706416130066,
"learning_rate": 0.0004272258816671524,
"loss": 3.2807,
"step": 49500
},
{
"epoch": 14.4334401398194,
"grad_norm": 0.3247165083885193,
"learning_rate": 0.00042705100553774405,
"loss": 3.2888,
"step": 49550
},
{
"epoch": 14.448004660646665,
"grad_norm": 0.3538820147514343,
"learning_rate": 0.00042687612940833574,
"loss": 3.3002,
"step": 49600
},
{
"epoch": 14.46256918147393,
"grad_norm": 0.38062334060668945,
"learning_rate": 0.0004267012532789274,
"loss": 3.2894,
"step": 49650
},
{
"epoch": 14.477133702301193,
"grad_norm": 0.35229548811912537,
"learning_rate": 0.00042652637714951907,
"loss": 3.2941,
"step": 49700
},
{
"epoch": 14.49169822312846,
"grad_norm": 0.37731724977493286,
"learning_rate": 0.0004263515010201107,
"loss": 3.3014,
"step": 49750
},
{
"epoch": 14.506262743955723,
"grad_norm": 0.3561221659183502,
"learning_rate": 0.0004261766248907024,
"loss": 3.2885,
"step": 49800
},
{
"epoch": 14.520827264782989,
"grad_norm": 0.3692299723625183,
"learning_rate": 0.00042600174876129403,
"loss": 3.301,
"step": 49850
},
{
"epoch": 14.535391785610253,
"grad_norm": 0.3614572584629059,
"learning_rate": 0.00042582687263188567,
"loss": 3.3106,
"step": 49900
},
{
"epoch": 14.549956306437519,
"grad_norm": 0.3334461748600006,
"learning_rate": 0.0004256519965024774,
"loss": 3.3026,
"step": 49950
},
{
"epoch": 14.564520827264783,
"grad_norm": 0.34117934107780457,
"learning_rate": 0.00042547712037306905,
"loss": 3.3188,
"step": 50000
},
{
"epoch": 14.564520827264783,
"eval_accuracy": 0.3717561051847634,
"eval_loss": 3.54461407661438,
"eval_runtime": 178.8677,
"eval_samples_per_second": 93.035,
"eval_steps_per_second": 5.82,
"step": 50000
},
{
"epoch": 14.579085348092049,
"grad_norm": 0.3184276521205902,
"learning_rate": 0.00042530224424366074,
"loss": 3.2982,
"step": 50050
},
{
"epoch": 14.593649868919313,
"grad_norm": 0.33841148018836975,
"learning_rate": 0.0004251273681142524,
"loss": 3.3045,
"step": 50100
},
{
"epoch": 14.608214389746577,
"grad_norm": 0.3844097852706909,
"learning_rate": 0.000424952491984844,
"loss": 3.3094,
"step": 50150
},
{
"epoch": 14.622778910573842,
"grad_norm": 0.39223310351371765,
"learning_rate": 0.0004247776158554357,
"loss": 3.3074,
"step": 50200
},
{
"epoch": 14.637343431401106,
"grad_norm": 0.3735564947128296,
"learning_rate": 0.00042460273972602734,
"loss": 3.3114,
"step": 50250
},
{
"epoch": 14.651907952228372,
"grad_norm": 0.3553323447704315,
"learning_rate": 0.00042442786359661903,
"loss": 3.3107,
"step": 50300
},
{
"epoch": 14.666472473055636,
"grad_norm": 0.33939993381500244,
"learning_rate": 0.00042425298746721066,
"loss": 3.3038,
"step": 50350
},
{
"epoch": 14.681036993882902,
"grad_norm": 0.34699302911758423,
"learning_rate": 0.0004240781113378024,
"loss": 3.313,
"step": 50400
},
{
"epoch": 14.695601514710166,
"grad_norm": 0.34914663434028625,
"learning_rate": 0.00042390323520839405,
"loss": 3.3057,
"step": 50450
},
{
"epoch": 14.71016603553743,
"grad_norm": 0.35503461956977844,
"learning_rate": 0.0004237283590789857,
"loss": 3.3047,
"step": 50500
},
{
"epoch": 14.724730556364696,
"grad_norm": 0.3494664132595062,
"learning_rate": 0.00042355348294957737,
"loss": 3.322,
"step": 50550
},
{
"epoch": 14.73929507719196,
"grad_norm": 0.3523366451263428,
"learning_rate": 0.000423378606820169,
"loss": 3.3063,
"step": 50600
},
{
"epoch": 14.753859598019226,
"grad_norm": 0.344511479139328,
"learning_rate": 0.0004232037306907607,
"loss": 3.3114,
"step": 50650
},
{
"epoch": 14.76842411884649,
"grad_norm": 0.372232049703598,
"learning_rate": 0.00042302885456135233,
"loss": 3.3257,
"step": 50700
},
{
"epoch": 14.782988639673755,
"grad_norm": 0.3332023024559021,
"learning_rate": 0.00042285397843194397,
"loss": 3.3077,
"step": 50750
},
{
"epoch": 14.79755316050102,
"grad_norm": 0.3506964445114136,
"learning_rate": 0.00042267910230253566,
"loss": 3.3047,
"step": 50800
},
{
"epoch": 14.812117681328285,
"grad_norm": 0.36583006381988525,
"learning_rate": 0.0004225042261731273,
"loss": 3.3078,
"step": 50850
},
{
"epoch": 14.826682202155549,
"grad_norm": 0.36147186160087585,
"learning_rate": 0.00042232935004371904,
"loss": 3.3099,
"step": 50900
},
{
"epoch": 14.841246722982813,
"grad_norm": 0.3693869113922119,
"learning_rate": 0.0004221544739143107,
"loss": 3.3175,
"step": 50950
},
{
"epoch": 14.855811243810079,
"grad_norm": 0.33687424659729004,
"learning_rate": 0.00042197959778490237,
"loss": 3.3368,
"step": 51000
},
{
"epoch": 14.855811243810079,
"eval_accuracy": 0.3719887138914084,
"eval_loss": 3.539015054702759,
"eval_runtime": 220.2245,
"eval_samples_per_second": 75.564,
"eval_steps_per_second": 4.727,
"step": 51000
},
{
"epoch": 14.870375764637343,
"grad_norm": 0.36331212520599365,
"learning_rate": 0.000421804721655494,
"loss": 3.3188,
"step": 51050
},
{
"epoch": 14.884940285464609,
"grad_norm": 0.3679030239582062,
"learning_rate": 0.00042162984552608564,
"loss": 3.3207,
"step": 51100
},
{
"epoch": 14.899504806291873,
"grad_norm": 0.35701867938041687,
"learning_rate": 0.00042145496939667733,
"loss": 3.3098,
"step": 51150
},
{
"epoch": 14.914069327119138,
"grad_norm": 0.3420349359512329,
"learning_rate": 0.00042128009326726897,
"loss": 3.3243,
"step": 51200
},
{
"epoch": 14.928633847946402,
"grad_norm": 0.34434568881988525,
"learning_rate": 0.00042110521713786066,
"loss": 3.3099,
"step": 51250
},
{
"epoch": 14.943198368773668,
"grad_norm": 0.34737786650657654,
"learning_rate": 0.0004209303410084523,
"loss": 3.3297,
"step": 51300
},
{
"epoch": 14.957762889600932,
"grad_norm": 0.34293320775032043,
"learning_rate": 0.00042075546487904393,
"loss": 3.3189,
"step": 51350
},
{
"epoch": 14.972327410428196,
"grad_norm": 0.35380107164382935,
"learning_rate": 0.0004205805887496357,
"loss": 3.3312,
"step": 51400
},
{
"epoch": 14.986891931255462,
"grad_norm": 0.3664044141769409,
"learning_rate": 0.0004204057126202273,
"loss": 3.3259,
"step": 51450
},
{
"epoch": 15.001456452082726,
"grad_norm": 0.3789098262786865,
"learning_rate": 0.000420230836490819,
"loss": 3.3188,
"step": 51500
},
{
"epoch": 15.016020972909992,
"grad_norm": 0.3919133245944977,
"learning_rate": 0.00042005596036141064,
"loss": 3.211,
"step": 51550
},
{
"epoch": 15.030585493737256,
"grad_norm": 0.3933902978897095,
"learning_rate": 0.0004198810842320023,
"loss": 3.2154,
"step": 51600
},
{
"epoch": 15.045150014564522,
"grad_norm": 0.34845665097236633,
"learning_rate": 0.00041970620810259396,
"loss": 3.2258,
"step": 51650
},
{
"epoch": 15.059714535391786,
"grad_norm": 0.3623676002025604,
"learning_rate": 0.0004195313319731856,
"loss": 3.2207,
"step": 51700
},
{
"epoch": 15.07427905621905,
"grad_norm": 0.3734416365623474,
"learning_rate": 0.0004193564558437773,
"loss": 3.2309,
"step": 51750
},
{
"epoch": 15.088843577046315,
"grad_norm": 0.365877240896225,
"learning_rate": 0.0004191815797143689,
"loss": 3.2182,
"step": 51800
},
{
"epoch": 15.10340809787358,
"grad_norm": 0.3879176676273346,
"learning_rate": 0.00041900670358496067,
"loss": 3.2278,
"step": 51850
},
{
"epoch": 15.117972618700845,
"grad_norm": 0.3594760000705719,
"learning_rate": 0.0004188318274555523,
"loss": 3.2356,
"step": 51900
},
{
"epoch": 15.13253713952811,
"grad_norm": 0.3522253632545471,
"learning_rate": 0.00041865695132614394,
"loss": 3.2536,
"step": 51950
},
{
"epoch": 15.147101660355375,
"grad_norm": 0.40796470642089844,
"learning_rate": 0.00041848207519673563,
"loss": 3.2346,
"step": 52000
},
{
"epoch": 15.147101660355375,
"eval_accuracy": 0.3715526607648969,
"eval_loss": 3.556135892868042,
"eval_runtime": 178.7061,
"eval_samples_per_second": 93.119,
"eval_steps_per_second": 5.825,
"step": 52000
},
{
"epoch": 15.161666181182639,
"grad_norm": 0.3638458251953125,
"learning_rate": 0.00041830719906732727,
"loss": 3.2442,
"step": 52050
},
{
"epoch": 15.176230702009903,
"grad_norm": 0.36014366149902344,
"learning_rate": 0.00041813232293791896,
"loss": 3.2479,
"step": 52100
},
{
"epoch": 15.190795222837169,
"grad_norm": 0.35548239946365356,
"learning_rate": 0.0004179574468085106,
"loss": 3.2596,
"step": 52150
},
{
"epoch": 15.205359743664433,
"grad_norm": 0.3747190833091736,
"learning_rate": 0.00041778257067910223,
"loss": 3.2431,
"step": 52200
},
{
"epoch": 15.219924264491699,
"grad_norm": 0.3518429100513458,
"learning_rate": 0.0004176076945496939,
"loss": 3.254,
"step": 52250
},
{
"epoch": 15.234488785318963,
"grad_norm": 0.39838236570358276,
"learning_rate": 0.00041743281842028556,
"loss": 3.2552,
"step": 52300
},
{
"epoch": 15.249053306146228,
"grad_norm": 0.3936876654624939,
"learning_rate": 0.0004172579422908773,
"loss": 3.2615,
"step": 52350
},
{
"epoch": 15.263617826973492,
"grad_norm": 0.3447533845901489,
"learning_rate": 0.00041708306616146894,
"loss": 3.2685,
"step": 52400
},
{
"epoch": 15.278182347800758,
"grad_norm": 0.3832126259803772,
"learning_rate": 0.00041690819003206063,
"loss": 3.2743,
"step": 52450
},
{
"epoch": 15.292746868628022,
"grad_norm": 0.3800361156463623,
"learning_rate": 0.00041673331390265227,
"loss": 3.2568,
"step": 52500
},
{
"epoch": 15.307311389455286,
"grad_norm": 0.3728283643722534,
"learning_rate": 0.0004165584377732439,
"loss": 3.2628,
"step": 52550
},
{
"epoch": 15.321875910282552,
"grad_norm": 0.3551943898200989,
"learning_rate": 0.0004163835616438356,
"loss": 3.2639,
"step": 52600
},
{
"epoch": 15.336440431109816,
"grad_norm": 0.37213918566703796,
"learning_rate": 0.00041620868551442723,
"loss": 3.2621,
"step": 52650
},
{
"epoch": 15.351004951937082,
"grad_norm": 0.38535019755363464,
"learning_rate": 0.0004160338093850189,
"loss": 3.2665,
"step": 52700
},
{
"epoch": 15.365569472764346,
"grad_norm": 0.38206279277801514,
"learning_rate": 0.00041585893325561056,
"loss": 3.2783,
"step": 52750
},
{
"epoch": 15.380133993591611,
"grad_norm": 0.34792542457580566,
"learning_rate": 0.0004156840571262022,
"loss": 3.282,
"step": 52800
},
{
"epoch": 15.394698514418875,
"grad_norm": 0.3482363820075989,
"learning_rate": 0.00041550918099679394,
"loss": 3.2774,
"step": 52850
},
{
"epoch": 15.409263035246141,
"grad_norm": 0.40007925033569336,
"learning_rate": 0.0004153343048673856,
"loss": 3.2814,
"step": 52900
},
{
"epoch": 15.423827556073405,
"grad_norm": 0.37153443694114685,
"learning_rate": 0.00041515942873797726,
"loss": 3.2838,
"step": 52950
},
{
"epoch": 15.43839207690067,
"grad_norm": 0.35616305470466614,
"learning_rate": 0.0004149845526085689,
"loss": 3.2882,
"step": 53000
},
{
"epoch": 15.43839207690067,
"eval_accuracy": 0.3718679408167247,
"eval_loss": 3.5503857135772705,
"eval_runtime": 178.8769,
"eval_samples_per_second": 93.03,
"eval_steps_per_second": 5.82,
"step": 53000
},
{
"epoch": 15.452956597727935,
"grad_norm": 0.38153186440467834,
"learning_rate": 0.0004148096764791606,
"loss": 3.2855,
"step": 53050
},
{
"epoch": 15.467521118555199,
"grad_norm": 0.364692747592926,
"learning_rate": 0.0004146348003497522,
"loss": 3.2756,
"step": 53100
},
{
"epoch": 15.482085639382465,
"grad_norm": 0.3847792148590088,
"learning_rate": 0.00041445992422034386,
"loss": 3.2871,
"step": 53150
},
{
"epoch": 15.496650160209729,
"grad_norm": 0.37956368923187256,
"learning_rate": 0.00041428504809093555,
"loss": 3.2836,
"step": 53200
},
{
"epoch": 15.511214681036995,
"grad_norm": 0.36245131492614746,
"learning_rate": 0.0004141101719615272,
"loss": 3.2935,
"step": 53250
},
{
"epoch": 15.525779201864259,
"grad_norm": 0.37407535314559937,
"learning_rate": 0.00041393529583211893,
"loss": 3.2729,
"step": 53300
},
{
"epoch": 15.540343722691523,
"grad_norm": 0.34457436203956604,
"learning_rate": 0.00041376041970271057,
"loss": 3.3004,
"step": 53350
},
{
"epoch": 15.554908243518788,
"grad_norm": 0.35518190264701843,
"learning_rate": 0.0004135855435733022,
"loss": 3.2959,
"step": 53400
},
{
"epoch": 15.569472764346052,
"grad_norm": 0.36230894923210144,
"learning_rate": 0.0004134106674438939,
"loss": 3.291,
"step": 53450
},
{
"epoch": 15.584037285173318,
"grad_norm": 0.37828779220581055,
"learning_rate": 0.00041323579131448553,
"loss": 3.2795,
"step": 53500
},
{
"epoch": 15.598601806000582,
"grad_norm": 0.3493488132953644,
"learning_rate": 0.0004130609151850772,
"loss": 3.2829,
"step": 53550
},
{
"epoch": 15.613166326827848,
"grad_norm": 0.40960273146629333,
"learning_rate": 0.00041288603905566886,
"loss": 3.2866,
"step": 53600
},
{
"epoch": 15.627730847655112,
"grad_norm": 0.36528661847114563,
"learning_rate": 0.0004127111629262605,
"loss": 3.2866,
"step": 53650
},
{
"epoch": 15.642295368482376,
"grad_norm": 0.3558300733566284,
"learning_rate": 0.0004125362867968522,
"loss": 3.3008,
"step": 53700
},
{
"epoch": 15.656859889309642,
"grad_norm": 0.3836390972137451,
"learning_rate": 0.0004123614106674438,
"loss": 3.2955,
"step": 53750
},
{
"epoch": 15.671424410136906,
"grad_norm": 0.3821122646331787,
"learning_rate": 0.00041218653453803557,
"loss": 3.2986,
"step": 53800
},
{
"epoch": 15.685988930964172,
"grad_norm": 0.3575958013534546,
"learning_rate": 0.0004120116584086272,
"loss": 3.2853,
"step": 53850
},
{
"epoch": 15.700553451791436,
"grad_norm": 0.4025585353374481,
"learning_rate": 0.0004118367822792189,
"loss": 3.2862,
"step": 53900
},
{
"epoch": 15.715117972618701,
"grad_norm": 0.35863131284713745,
"learning_rate": 0.00041166190614981053,
"loss": 3.306,
"step": 53950
},
{
"epoch": 15.729682493445965,
"grad_norm": 0.3342381715774536,
"learning_rate": 0.00041148703002040217,
"loss": 3.2962,
"step": 54000
},
{
"epoch": 15.729682493445965,
"eval_accuracy": 0.37225413242067934,
"eval_loss": 3.5404388904571533,
"eval_runtime": 181.2437,
"eval_samples_per_second": 91.816,
"eval_steps_per_second": 5.744,
"step": 54000
},
{
"epoch": 15.744247014273231,
"grad_norm": 0.3472493290901184,
"learning_rate": 0.00041131215389099386,
"loss": 3.2979,
"step": 54050
},
{
"epoch": 15.758811535100495,
"grad_norm": 0.3713492751121521,
"learning_rate": 0.0004111372777615855,
"loss": 3.2897,
"step": 54100
},
{
"epoch": 15.77337605592776,
"grad_norm": 0.41221651434898376,
"learning_rate": 0.0004109624016321772,
"loss": 3.2933,
"step": 54150
},
{
"epoch": 15.787940576755025,
"grad_norm": 0.3364807665348053,
"learning_rate": 0.0004107875255027688,
"loss": 3.2879,
"step": 54200
},
{
"epoch": 15.802505097582289,
"grad_norm": 0.34417688846588135,
"learning_rate": 0.00041061264937336045,
"loss": 3.2878,
"step": 54250
},
{
"epoch": 15.817069618409555,
"grad_norm": 0.3484468460083008,
"learning_rate": 0.0004104377732439522,
"loss": 3.3104,
"step": 54300
},
{
"epoch": 15.831634139236819,
"grad_norm": 0.37589672207832336,
"learning_rate": 0.00041026289711454384,
"loss": 3.2954,
"step": 54350
},
{
"epoch": 15.846198660064085,
"grad_norm": 0.3464411795139313,
"learning_rate": 0.0004100880209851355,
"loss": 3.3115,
"step": 54400
},
{
"epoch": 15.860763180891349,
"grad_norm": 0.3678479790687561,
"learning_rate": 0.00040991314485572716,
"loss": 3.3094,
"step": 54450
},
{
"epoch": 15.875327701718614,
"grad_norm": 0.36492377519607544,
"learning_rate": 0.00040973826872631885,
"loss": 3.3113,
"step": 54500
},
{
"epoch": 15.889892222545878,
"grad_norm": 0.35983067750930786,
"learning_rate": 0.0004095633925969105,
"loss": 3.3149,
"step": 54550
},
{
"epoch": 15.904456743373142,
"grad_norm": 0.3600602149963379,
"learning_rate": 0.0004093885164675021,
"loss": 3.3083,
"step": 54600
},
{
"epoch": 15.919021264200408,
"grad_norm": 0.36820098757743835,
"learning_rate": 0.0004092136403380938,
"loss": 3.3118,
"step": 54650
},
{
"epoch": 15.933585785027672,
"grad_norm": 0.38710981607437134,
"learning_rate": 0.00040903876420868545,
"loss": 3.3086,
"step": 54700
},
{
"epoch": 15.948150305854938,
"grad_norm": 0.360516756772995,
"learning_rate": 0.00040886388807927714,
"loss": 3.302,
"step": 54750
},
{
"epoch": 15.962714826682202,
"grad_norm": 0.343983918428421,
"learning_rate": 0.00040868901194986883,
"loss": 3.3074,
"step": 54800
},
{
"epoch": 15.977279347509468,
"grad_norm": 0.36255642771720886,
"learning_rate": 0.00040851413582046047,
"loss": 3.3113,
"step": 54850
},
{
"epoch": 15.991843868336732,
"grad_norm": 0.366834819316864,
"learning_rate": 0.00040833925969105216,
"loss": 3.3066,
"step": 54900
},
{
"epoch": 16.006408389163997,
"grad_norm": 0.38302844762802124,
"learning_rate": 0.0004081643835616438,
"loss": 3.2623,
"step": 54950
},
{
"epoch": 16.02097290999126,
"grad_norm": 0.3558831214904785,
"learning_rate": 0.0004079895074322355,
"loss": 3.1914,
"step": 55000
},
{
"epoch": 16.02097290999126,
"eval_accuracy": 0.3723332758279453,
"eval_loss": 3.5491139888763428,
"eval_runtime": 221.7065,
"eval_samples_per_second": 75.059,
"eval_steps_per_second": 4.695,
"step": 55000
},
{
"epoch": 16.035537430818525,
"grad_norm": 0.3447836935520172,
"learning_rate": 0.0004078146313028271,
"loss": 3.2054,
"step": 55050
},
{
"epoch": 16.05010195164579,
"grad_norm": 0.3696219325065613,
"learning_rate": 0.0004076397551734188,
"loss": 3.2151,
"step": 55100
},
{
"epoch": 16.064666472473057,
"grad_norm": 0.35809266567230225,
"learning_rate": 0.00040746487904401045,
"loss": 3.2091,
"step": 55150
},
{
"epoch": 16.07923099330032,
"grad_norm": 0.37019026279449463,
"learning_rate": 0.0004072900029146021,
"loss": 3.2244,
"step": 55200
},
{
"epoch": 16.093795514127585,
"grad_norm": 0.35395923256874084,
"learning_rate": 0.0004071151267851938,
"loss": 3.2123,
"step": 55250
},
{
"epoch": 16.10836003495485,
"grad_norm": 0.3639342784881592,
"learning_rate": 0.00040694025065578546,
"loss": 3.2293,
"step": 55300
},
{
"epoch": 16.122924555782113,
"grad_norm": 0.3842836320400238,
"learning_rate": 0.00040676537452637716,
"loss": 3.2171,
"step": 55350
},
{
"epoch": 16.13748907660938,
"grad_norm": 0.40237775444984436,
"learning_rate": 0.0004065904983969688,
"loss": 3.2236,
"step": 55400
},
{
"epoch": 16.152053597436645,
"grad_norm": 0.35297468304634094,
"learning_rate": 0.00040641562226756043,
"loss": 3.2304,
"step": 55450
},
{
"epoch": 16.16661811826391,
"grad_norm": 0.3450460731983185,
"learning_rate": 0.0004062407461381521,
"loss": 3.2163,
"step": 55500
},
{
"epoch": 16.181182639091173,
"grad_norm": 0.3639377951622009,
"learning_rate": 0.00040606587000874375,
"loss": 3.2317,
"step": 55550
},
{
"epoch": 16.19574715991844,
"grad_norm": 0.3449926972389221,
"learning_rate": 0.00040589099387933544,
"loss": 3.242,
"step": 55600
},
{
"epoch": 16.210311680745704,
"grad_norm": 0.39611726999282837,
"learning_rate": 0.0004057161177499271,
"loss": 3.2357,
"step": 55650
},
{
"epoch": 16.224876201572968,
"grad_norm": 0.3632814884185791,
"learning_rate": 0.0004055412416205187,
"loss": 3.2357,
"step": 55700
},
{
"epoch": 16.239440722400232,
"grad_norm": 0.38807687163352966,
"learning_rate": 0.0004053663654911104,
"loss": 3.2555,
"step": 55750
},
{
"epoch": 16.254005243227496,
"grad_norm": 0.41072791814804077,
"learning_rate": 0.0004051914893617021,
"loss": 3.2496,
"step": 55800
},
{
"epoch": 16.268569764054764,
"grad_norm": 0.36269184947013855,
"learning_rate": 0.0004050166132322938,
"loss": 3.2409,
"step": 55850
},
{
"epoch": 16.283134284882028,
"grad_norm": 0.37166592478752136,
"learning_rate": 0.0004048417371028854,
"loss": 3.2567,
"step": 55900
},
{
"epoch": 16.29769880570929,
"grad_norm": 0.38969337940216064,
"learning_rate": 0.0004046668609734771,
"loss": 3.2598,
"step": 55950
},
{
"epoch": 16.312263326536556,
"grad_norm": 0.3745807111263275,
"learning_rate": 0.00040449198484406875,
"loss": 3.2381,
"step": 56000
},
{
"epoch": 16.312263326536556,
"eval_accuracy": 0.37207597155588296,
"eval_loss": 3.551668882369995,
"eval_runtime": 178.6199,
"eval_samples_per_second": 93.164,
"eval_steps_per_second": 5.828,
"step": 56000
},
{
"epoch": 16.326827847363823,
"grad_norm": 0.3594777584075928,
"learning_rate": 0.0004043171087146604,
"loss": 3.2672,
"step": 56050
},
{
"epoch": 16.341392368191087,
"grad_norm": 0.37412890791893005,
"learning_rate": 0.0004041422325852521,
"loss": 3.2596,
"step": 56100
},
{
"epoch": 16.35595688901835,
"grad_norm": 0.3892935514450073,
"learning_rate": 0.0004039673564558437,
"loss": 3.2504,
"step": 56150
},
{
"epoch": 16.370521409845615,
"grad_norm": 0.3575972020626068,
"learning_rate": 0.0004037924803264354,
"loss": 3.2628,
"step": 56200
},
{
"epoch": 16.38508593067288,
"grad_norm": 0.33797597885131836,
"learning_rate": 0.00040361760419702704,
"loss": 3.2633,
"step": 56250
},
{
"epoch": 16.399650451500147,
"grad_norm": 0.37844982743263245,
"learning_rate": 0.00040344272806761873,
"loss": 3.2632,
"step": 56300
},
{
"epoch": 16.41421497232741,
"grad_norm": 0.379015177488327,
"learning_rate": 0.0004032678519382104,
"loss": 3.2548,
"step": 56350
},
{
"epoch": 16.428779493154675,
"grad_norm": 0.42286205291748047,
"learning_rate": 0.00040309297580880206,
"loss": 3.2691,
"step": 56400
},
{
"epoch": 16.44334401398194,
"grad_norm": 0.3584016263484955,
"learning_rate": 0.00040291809967939375,
"loss": 3.2678,
"step": 56450
},
{
"epoch": 16.457908534809206,
"grad_norm": 0.4013825058937073,
"learning_rate": 0.0004027432235499854,
"loss": 3.2697,
"step": 56500
},
{
"epoch": 16.47247305563647,
"grad_norm": 0.3746756911277771,
"learning_rate": 0.0004025683474205771,
"loss": 3.2663,
"step": 56550
},
{
"epoch": 16.487037576463734,
"grad_norm": 0.4031076431274414,
"learning_rate": 0.0004023934712911687,
"loss": 3.2805,
"step": 56600
},
{
"epoch": 16.501602097291,
"grad_norm": 0.384000688791275,
"learning_rate": 0.00040221859516176035,
"loss": 3.2609,
"step": 56650
},
{
"epoch": 16.516166618118262,
"grad_norm": 0.3829132914543152,
"learning_rate": 0.00040204371903235204,
"loss": 3.2768,
"step": 56700
},
{
"epoch": 16.53073113894553,
"grad_norm": 0.39709749817848206,
"learning_rate": 0.0004018688429029437,
"loss": 3.2693,
"step": 56750
},
{
"epoch": 16.545295659772794,
"grad_norm": 0.3538981080055237,
"learning_rate": 0.0004016939667735354,
"loss": 3.2675,
"step": 56800
},
{
"epoch": 16.559860180600058,
"grad_norm": 0.3409939706325531,
"learning_rate": 0.00040151909064412705,
"loss": 3.281,
"step": 56850
},
{
"epoch": 16.574424701427322,
"grad_norm": 0.38922184705734253,
"learning_rate": 0.0004013442145147187,
"loss": 3.2702,
"step": 56900
},
{
"epoch": 16.58898922225459,
"grad_norm": 0.35033589601516724,
"learning_rate": 0.0004011693383853104,
"loss": 3.2753,
"step": 56950
},
{
"epoch": 16.603553743081854,
"grad_norm": 0.3954455256462097,
"learning_rate": 0.000400994462255902,
"loss": 3.2719,
"step": 57000
},
{
"epoch": 16.603553743081854,
"eval_accuracy": 0.3724214742758643,
"eval_loss": 3.545311689376831,
"eval_runtime": 178.6369,
"eval_samples_per_second": 93.155,
"eval_steps_per_second": 5.827,
"step": 57000
},
{
"epoch": 16.618118263909118,
"grad_norm": 0.36573052406311035,
"learning_rate": 0.0004008195861264937,
"loss": 3.2737,
"step": 57050
},
{
"epoch": 16.63268278473638,
"grad_norm": 0.35185372829437256,
"learning_rate": 0.00040064470999708534,
"loss": 3.2861,
"step": 57100
},
{
"epoch": 16.647247305563646,
"grad_norm": 0.37779009342193604,
"learning_rate": 0.00040046983386767703,
"loss": 3.2798,
"step": 57150
},
{
"epoch": 16.661811826390913,
"grad_norm": 0.36057236790657043,
"learning_rate": 0.00040029495773826867,
"loss": 3.2909,
"step": 57200
},
{
"epoch": 16.676376347218177,
"grad_norm": 0.39807477593421936,
"learning_rate": 0.0004001200816088603,
"loss": 3.2773,
"step": 57250
},
{
"epoch": 16.69094086804544,
"grad_norm": 0.3643984794616699,
"learning_rate": 0.00039994520547945205,
"loss": 3.2895,
"step": 57300
},
{
"epoch": 16.705505388872705,
"grad_norm": 0.3720785975456238,
"learning_rate": 0.0003997703293500437,
"loss": 3.2855,
"step": 57350
},
{
"epoch": 16.72006990969997,
"grad_norm": 0.37378811836242676,
"learning_rate": 0.0003995954532206354,
"loss": 3.2924,
"step": 57400
},
{
"epoch": 16.734634430527237,
"grad_norm": 0.3754887878894806,
"learning_rate": 0.000399420577091227,
"loss": 3.2776,
"step": 57450
},
{
"epoch": 16.7491989513545,
"grad_norm": 0.3625001311302185,
"learning_rate": 0.00039924570096181865,
"loss": 3.2747,
"step": 57500
},
{
"epoch": 16.763763472181765,
"grad_norm": 0.359059602022171,
"learning_rate": 0.00039907082483241034,
"loss": 3.2948,
"step": 57550
},
{
"epoch": 16.77832799300903,
"grad_norm": 0.3593480885028839,
"learning_rate": 0.000398895948703002,
"loss": 3.2865,
"step": 57600
},
{
"epoch": 16.792892513836296,
"grad_norm": 0.35039687156677246,
"learning_rate": 0.00039872107257359367,
"loss": 3.2893,
"step": 57650
},
{
"epoch": 16.80745703466356,
"grad_norm": 0.38142552971839905,
"learning_rate": 0.0003985461964441853,
"loss": 3.2811,
"step": 57700
},
{
"epoch": 16.822021555490824,
"grad_norm": 0.36417636275291443,
"learning_rate": 0.00039837132031477694,
"loss": 3.2869,
"step": 57750
},
{
"epoch": 16.83658607631809,
"grad_norm": 0.3743918836116791,
"learning_rate": 0.0003981964441853687,
"loss": 3.2941,
"step": 57800
},
{
"epoch": 16.851150597145352,
"grad_norm": 0.3657926619052887,
"learning_rate": 0.0003980215680559603,
"loss": 3.2944,
"step": 57850
},
{
"epoch": 16.86571511797262,
"grad_norm": 0.34246590733528137,
"learning_rate": 0.000397846691926552,
"loss": 3.2907,
"step": 57900
},
{
"epoch": 16.880279638799884,
"grad_norm": 0.35775476694107056,
"learning_rate": 0.00039767181579714365,
"loss": 3.2888,
"step": 57950
},
{
"epoch": 16.894844159627148,
"grad_norm": 0.3570455312728882,
"learning_rate": 0.00039749693966773534,
"loss": 3.3051,
"step": 58000
},
{
"epoch": 16.894844159627148,
"eval_accuracy": 0.3731435255694944,
"eval_loss": 3.533418893814087,
"eval_runtime": 178.6001,
"eval_samples_per_second": 93.175,
"eval_steps_per_second": 5.829,
"step": 58000
},
{
"epoch": 16.909408680454412,
"grad_norm": 0.38562971353530884,
"learning_rate": 0.00039732206353832697,
"loss": 3.292,
"step": 58050
},
{
"epoch": 16.92397320128168,
"grad_norm": 0.39436957240104675,
"learning_rate": 0.0003971471874089186,
"loss": 3.2875,
"step": 58100
},
{
"epoch": 16.938537722108943,
"grad_norm": 0.4385693371295929,
"learning_rate": 0.0003969723112795103,
"loss": 3.2928,
"step": 58150
},
{
"epoch": 16.953102242936207,
"grad_norm": 0.37202712893486023,
"learning_rate": 0.00039679743515010194,
"loss": 3.3134,
"step": 58200
},
{
"epoch": 16.96766676376347,
"grad_norm": 0.3811296224594116,
"learning_rate": 0.0003966225590206937,
"loss": 3.2922,
"step": 58250
},
{
"epoch": 16.982231284590735,
"grad_norm": 0.3608035743236542,
"learning_rate": 0.0003964476828912853,
"loss": 3.3055,
"step": 58300
},
{
"epoch": 16.996795805418003,
"grad_norm": 0.3930389881134033,
"learning_rate": 0.00039627280676187695,
"loss": 3.3021,
"step": 58350
},
{
"epoch": 17.011360326245267,
"grad_norm": 0.38095763325691223,
"learning_rate": 0.00039609793063246864,
"loss": 3.2166,
"step": 58400
},
{
"epoch": 17.02592484707253,
"grad_norm": 0.40292036533355713,
"learning_rate": 0.0003959230545030603,
"loss": 3.1866,
"step": 58450
},
{
"epoch": 17.040489367899795,
"grad_norm": 0.34886404871940613,
"learning_rate": 0.00039574817837365197,
"loss": 3.1835,
"step": 58500
},
{
"epoch": 17.055053888727063,
"grad_norm": 0.35758545994758606,
"learning_rate": 0.0003955733022442436,
"loss": 3.2015,
"step": 58550
},
{
"epoch": 17.069618409554327,
"grad_norm": 0.39563000202178955,
"learning_rate": 0.0003953984261148353,
"loss": 3.2003,
"step": 58600
},
{
"epoch": 17.08418293038159,
"grad_norm": 0.36030465364456177,
"learning_rate": 0.00039522354998542693,
"loss": 3.2011,
"step": 58650
},
{
"epoch": 17.098747451208855,
"grad_norm": 0.3758701980113983,
"learning_rate": 0.00039504867385601857,
"loss": 3.2118,
"step": 58700
},
{
"epoch": 17.11331197203612,
"grad_norm": 0.3549492657184601,
"learning_rate": 0.0003948737977266103,
"loss": 3.2018,
"step": 58750
},
{
"epoch": 17.127876492863386,
"grad_norm": 0.3890039920806885,
"learning_rate": 0.00039469892159720195,
"loss": 3.2095,
"step": 58800
},
{
"epoch": 17.14244101369065,
"grad_norm": 0.3619527220726013,
"learning_rate": 0.00039452404546779364,
"loss": 3.2128,
"step": 58850
},
{
"epoch": 17.157005534517914,
"grad_norm": 0.3767114579677582,
"learning_rate": 0.0003943491693383853,
"loss": 3.2153,
"step": 58900
},
{
"epoch": 17.171570055345178,
"grad_norm": 0.3961483836174011,
"learning_rate": 0.0003941742932089769,
"loss": 3.2248,
"step": 58950
},
{
"epoch": 17.186134576172442,
"grad_norm": 0.3526688516139984,
"learning_rate": 0.0003939994170795686,
"loss": 3.2186,
"step": 59000
},
{
"epoch": 17.186134576172442,
"eval_accuracy": 0.3721778113637467,
"eval_loss": 3.5520782470703125,
"eval_runtime": 178.9303,
"eval_samples_per_second": 93.003,
"eval_steps_per_second": 5.818,
"step": 59000
},
{
"epoch": 17.20069909699971,
"grad_norm": 0.36052054166793823,
"learning_rate": 0.00039382454095016024,
"loss": 3.2326,
"step": 59050
},
{
"epoch": 17.215263617826974,
"grad_norm": 0.3980617821216583,
"learning_rate": 0.00039364966482075193,
"loss": 3.2385,
"step": 59100
},
{
"epoch": 17.229828138654238,
"grad_norm": 0.37004363536834717,
"learning_rate": 0.00039347478869134356,
"loss": 3.2343,
"step": 59150
},
{
"epoch": 17.244392659481502,
"grad_norm": 0.37232598662376404,
"learning_rate": 0.0003932999125619353,
"loss": 3.2274,
"step": 59200
},
{
"epoch": 17.25895718030877,
"grad_norm": 0.3730037212371826,
"learning_rate": 0.00039312503643252695,
"loss": 3.2355,
"step": 59250
},
{
"epoch": 17.273521701136033,
"grad_norm": 0.36988314986228943,
"learning_rate": 0.0003929501603031186,
"loss": 3.2329,
"step": 59300
},
{
"epoch": 17.288086221963297,
"grad_norm": 0.4261464774608612,
"learning_rate": 0.00039277528417371027,
"loss": 3.225,
"step": 59350
},
{
"epoch": 17.30265074279056,
"grad_norm": 0.3772455155849457,
"learning_rate": 0.0003926004080443019,
"loss": 3.2486,
"step": 59400
},
{
"epoch": 17.317215263617825,
"grad_norm": 0.37627851963043213,
"learning_rate": 0.0003924255319148936,
"loss": 3.2467,
"step": 59450
},
{
"epoch": 17.331779784445093,
"grad_norm": 0.3609490990638733,
"learning_rate": 0.00039225065578548523,
"loss": 3.2466,
"step": 59500
},
{
"epoch": 17.346344305272357,
"grad_norm": 0.4445193409919739,
"learning_rate": 0.00039207577965607687,
"loss": 3.2415,
"step": 59550
},
{
"epoch": 17.36090882609962,
"grad_norm": 0.36540132761001587,
"learning_rate": 0.00039190090352666856,
"loss": 3.2564,
"step": 59600
},
{
"epoch": 17.375473346926885,
"grad_norm": 0.3811579644680023,
"learning_rate": 0.0003917260273972602,
"loss": 3.2256,
"step": 59650
},
{
"epoch": 17.390037867754153,
"grad_norm": 0.3652814030647278,
"learning_rate": 0.00039155115126785194,
"loss": 3.2522,
"step": 59700
},
{
"epoch": 17.404602388581417,
"grad_norm": 0.3721769154071808,
"learning_rate": 0.0003913762751384436,
"loss": 3.2463,
"step": 59750
},
{
"epoch": 17.41916690940868,
"grad_norm": 0.36768290400505066,
"learning_rate": 0.00039120139900903527,
"loss": 3.2339,
"step": 59800
},
{
"epoch": 17.433731430235945,
"grad_norm": 0.3575945496559143,
"learning_rate": 0.0003910265228796269,
"loss": 3.2611,
"step": 59850
},
{
"epoch": 17.44829595106321,
"grad_norm": 0.36524489521980286,
"learning_rate": 0.00039085164675021854,
"loss": 3.2509,
"step": 59900
},
{
"epoch": 17.462860471890476,
"grad_norm": 0.35425522923469543,
"learning_rate": 0.00039067677062081023,
"loss": 3.2521,
"step": 59950
},
{
"epoch": 17.47742499271774,
"grad_norm": 0.3456664979457855,
"learning_rate": 0.00039050189449140187,
"loss": 3.2536,
"step": 60000
},
{
"epoch": 17.47742499271774,
"eval_accuracy": 0.37240218821525267,
"eval_loss": 3.5458858013153076,
"eval_runtime": 178.3378,
"eval_samples_per_second": 93.312,
"eval_steps_per_second": 5.837,
"step": 60000
},
{
"epoch": 17.491989513545004,
"grad_norm": 0.3824387788772583,
"learning_rate": 0.00039032701836199356,
"loss": 3.2533,
"step": 60050
},
{
"epoch": 17.506554034372268,
"grad_norm": 0.3639180362224579,
"learning_rate": 0.0003901521422325852,
"loss": 3.2558,
"step": 60100
},
{
"epoch": 17.521118555199536,
"grad_norm": 0.387299120426178,
"learning_rate": 0.00038997726610317683,
"loss": 3.2536,
"step": 60150
},
{
"epoch": 17.5356830760268,
"grad_norm": 0.3662397563457489,
"learning_rate": 0.0003898023899737686,
"loss": 3.259,
"step": 60200
},
{
"epoch": 17.550247596854064,
"grad_norm": 0.37291350960731506,
"learning_rate": 0.0003896275138443602,
"loss": 3.2633,
"step": 60250
},
{
"epoch": 17.564812117681328,
"grad_norm": 0.40766435861587524,
"learning_rate": 0.0003894526377149519,
"loss": 3.2652,
"step": 60300
},
{
"epoch": 17.57937663850859,
"grad_norm": 0.3494330048561096,
"learning_rate": 0.00038927776158554354,
"loss": 3.2648,
"step": 60350
},
{
"epoch": 17.59394115933586,
"grad_norm": 0.3630947768688202,
"learning_rate": 0.0003891028854561352,
"loss": 3.254,
"step": 60400
},
{
"epoch": 17.608505680163123,
"grad_norm": 0.358445942401886,
"learning_rate": 0.00038892800932672686,
"loss": 3.2676,
"step": 60450
},
{
"epoch": 17.623070200990387,
"grad_norm": 0.35599270462989807,
"learning_rate": 0.0003887531331973185,
"loss": 3.2678,
"step": 60500
},
{
"epoch": 17.63763472181765,
"grad_norm": 0.3681202828884125,
"learning_rate": 0.0003885782570679102,
"loss": 3.2697,
"step": 60550
},
{
"epoch": 17.65219924264492,
"grad_norm": 0.34716203808784485,
"learning_rate": 0.0003884033809385018,
"loss": 3.2677,
"step": 60600
},
{
"epoch": 17.666763763472183,
"grad_norm": 0.3918900787830353,
"learning_rate": 0.00038822850480909357,
"loss": 3.263,
"step": 60650
},
{
"epoch": 17.681328284299447,
"grad_norm": 0.3595735430717468,
"learning_rate": 0.0003880536286796852,
"loss": 3.278,
"step": 60700
},
{
"epoch": 17.69589280512671,
"grad_norm": 0.3838953673839569,
"learning_rate": 0.00038787875255027684,
"loss": 3.268,
"step": 60750
},
{
"epoch": 17.710457325953975,
"grad_norm": 0.3666781187057495,
"learning_rate": 0.00038770387642086853,
"loss": 3.2855,
"step": 60800
},
{
"epoch": 17.725021846781242,
"grad_norm": 0.3640023171901703,
"learning_rate": 0.00038752900029146017,
"loss": 3.2769,
"step": 60850
},
{
"epoch": 17.739586367608506,
"grad_norm": 0.3869996964931488,
"learning_rate": 0.00038735412416205186,
"loss": 3.2637,
"step": 60900
},
{
"epoch": 17.75415088843577,
"grad_norm": 0.3861514925956726,
"learning_rate": 0.0003871792480326435,
"loss": 3.2769,
"step": 60950
},
{
"epoch": 17.768715409263034,
"grad_norm": 0.39806482195854187,
"learning_rate": 0.00038700437190323513,
"loss": 3.2864,
"step": 61000
},
{
"epoch": 17.768715409263034,
"eval_accuracy": 0.37317821695900927,
"eval_loss": 3.53695011138916,
"eval_runtime": 178.9207,
"eval_samples_per_second": 93.008,
"eval_steps_per_second": 5.818,
"step": 61000
},
{
"epoch": 17.7832799300903,
"grad_norm": 0.3672393262386322,
"learning_rate": 0.0003868294957738268,
"loss": 3.2849,
"step": 61050
},
{
"epoch": 17.797844450917566,
"grad_norm": 0.3629036247730255,
"learning_rate": 0.00038665461964441846,
"loss": 3.2865,
"step": 61100
},
{
"epoch": 17.81240897174483,
"grad_norm": 0.37374356389045715,
"learning_rate": 0.0003864797435150102,
"loss": 3.2839,
"step": 61150
},
{
"epoch": 17.826973492572094,
"grad_norm": 0.3959326148033142,
"learning_rate": 0.00038630486738560184,
"loss": 3.2759,
"step": 61200
},
{
"epoch": 17.841538013399358,
"grad_norm": 0.407258540391922,
"learning_rate": 0.00038612999125619353,
"loss": 3.294,
"step": 61250
},
{
"epoch": 17.856102534226626,
"grad_norm": 0.37516331672668457,
"learning_rate": 0.00038595511512678517,
"loss": 3.2826,
"step": 61300
},
{
"epoch": 17.87066705505389,
"grad_norm": 0.3762381970882416,
"learning_rate": 0.0003857802389973768,
"loss": 3.2829,
"step": 61350
},
{
"epoch": 17.885231575881154,
"grad_norm": 0.3950086534023285,
"learning_rate": 0.0003856053628679685,
"loss": 3.2828,
"step": 61400
},
{
"epoch": 17.899796096708418,
"grad_norm": 0.35681092739105225,
"learning_rate": 0.00038543048673856013,
"loss": 3.2767,
"step": 61450
},
{
"epoch": 17.91436061753568,
"grad_norm": 0.3618324398994446,
"learning_rate": 0.0003852556106091518,
"loss": 3.2892,
"step": 61500
},
{
"epoch": 17.92892513836295,
"grad_norm": 0.3859667479991913,
"learning_rate": 0.00038508073447974346,
"loss": 3.2809,
"step": 61550
},
{
"epoch": 17.943489659190213,
"grad_norm": 0.37979528307914734,
"learning_rate": 0.0003849058583503351,
"loss": 3.2884,
"step": 61600
},
{
"epoch": 17.958054180017477,
"grad_norm": 0.3932032287120819,
"learning_rate": 0.00038473098222092684,
"loss": 3.2719,
"step": 61650
},
{
"epoch": 17.97261870084474,
"grad_norm": 0.383821040391922,
"learning_rate": 0.0003845561060915185,
"loss": 3.2818,
"step": 61700
},
{
"epoch": 17.98718322167201,
"grad_norm": 0.3670404553413391,
"learning_rate": 0.00038438122996211016,
"loss": 3.2897,
"step": 61750
},
{
"epoch": 18.001747742499273,
"grad_norm": 0.38702115416526794,
"learning_rate": 0.0003842063538327018,
"loss": 3.2594,
"step": 61800
},
{
"epoch": 18.016312263326537,
"grad_norm": 0.3678998351097107,
"learning_rate": 0.0003840314777032935,
"loss": 3.182,
"step": 61850
},
{
"epoch": 18.0308767841538,
"grad_norm": 0.4147825241088867,
"learning_rate": 0.0003838566015738851,
"loss": 3.1858,
"step": 61900
},
{
"epoch": 18.045441304981065,
"grad_norm": 0.3900906443595886,
"learning_rate": 0.00038368172544447676,
"loss": 3.1817,
"step": 61950
},
{
"epoch": 18.060005825808332,
"grad_norm": 0.3811590373516083,
"learning_rate": 0.00038350684931506845,
"loss": 3.1842,
"step": 62000
},
{
"epoch": 18.060005825808332,
"eval_accuracy": 0.3725568294939373,
"eval_loss": 3.5498850345611572,
"eval_runtime": 178.3022,
"eval_samples_per_second": 93.33,
"eval_steps_per_second": 5.838,
"step": 62000
},
{
"epoch": 18.074570346635596,
"grad_norm": 0.3556840121746063,
"learning_rate": 0.0003833319731856601,
"loss": 3.1909,
"step": 62050
},
{
"epoch": 18.08913486746286,
"grad_norm": 0.37406861782073975,
"learning_rate": 0.00038315709705625183,
"loss": 3.2023,
"step": 62100
},
{
"epoch": 18.103699388290124,
"grad_norm": 0.38093480467796326,
"learning_rate": 0.00038298222092684347,
"loss": 3.2011,
"step": 62150
},
{
"epoch": 18.11826390911739,
"grad_norm": 0.36246082186698914,
"learning_rate": 0.0003828073447974351,
"loss": 3.1999,
"step": 62200
},
{
"epoch": 18.132828429944656,
"grad_norm": 0.38521018624305725,
"learning_rate": 0.0003826324686680268,
"loss": 3.2039,
"step": 62250
},
{
"epoch": 18.14739295077192,
"grad_norm": 0.387359619140625,
"learning_rate": 0.00038245759253861843,
"loss": 3.2073,
"step": 62300
},
{
"epoch": 18.161957471599184,
"grad_norm": 0.3872774839401245,
"learning_rate": 0.0003822827164092101,
"loss": 3.2114,
"step": 62350
},
{
"epoch": 18.176521992426448,
"grad_norm": 0.3680144250392914,
"learning_rate": 0.00038210784027980176,
"loss": 3.2058,
"step": 62400
},
{
"epoch": 18.191086513253715,
"grad_norm": 0.37997880578041077,
"learning_rate": 0.0003819329641503934,
"loss": 3.2089,
"step": 62450
},
{
"epoch": 18.20565103408098,
"grad_norm": 0.4042533338069916,
"learning_rate": 0.0003817580880209851,
"loss": 3.2144,
"step": 62500
},
{
"epoch": 18.220215554908243,
"grad_norm": 0.3523563742637634,
"learning_rate": 0.0003815832118915767,
"loss": 3.2195,
"step": 62550
},
{
"epoch": 18.234780075735507,
"grad_norm": 0.36140862107276917,
"learning_rate": 0.00038140833576216847,
"loss": 3.2074,
"step": 62600
},
{
"epoch": 18.24934459656277,
"grad_norm": 0.36438101530075073,
"learning_rate": 0.0003812334596327601,
"loss": 3.215,
"step": 62650
},
{
"epoch": 18.26390911739004,
"grad_norm": 0.3548491299152374,
"learning_rate": 0.0003810585835033518,
"loss": 3.2305,
"step": 62700
},
{
"epoch": 18.278473638217303,
"grad_norm": 0.36885324120521545,
"learning_rate": 0.00038088370737394343,
"loss": 3.2123,
"step": 62750
},
{
"epoch": 18.293038159044567,
"grad_norm": 0.3839961588382721,
"learning_rate": 0.00038070883124453507,
"loss": 3.2353,
"step": 62800
},
{
"epoch": 18.30760267987183,
"grad_norm": 0.380561888217926,
"learning_rate": 0.00038053395511512676,
"loss": 3.2258,
"step": 62850
},
{
"epoch": 18.3221672006991,
"grad_norm": 0.3721316456794739,
"learning_rate": 0.0003803590789857184,
"loss": 3.2348,
"step": 62900
},
{
"epoch": 18.336731721526363,
"grad_norm": 0.38659441471099854,
"learning_rate": 0.0003801842028563101,
"loss": 3.2352,
"step": 62950
},
{
"epoch": 18.351296242353627,
"grad_norm": 0.38408663868904114,
"learning_rate": 0.0003800093267269017,
"loss": 3.2292,
"step": 63000
},
{
"epoch": 18.351296242353627,
"eval_accuracy": 0.3726717226720931,
"eval_loss": 3.544954299926758,
"eval_runtime": 178.702,
"eval_samples_per_second": 93.122,
"eval_steps_per_second": 5.825,
"step": 63000
},
{
"epoch": 18.36586076318089,
"grad_norm": 0.38154080510139465,
"learning_rate": 0.00037983445059749335,
"loss": 3.2345,
"step": 63050
},
{
"epoch": 18.380425284008155,
"grad_norm": 0.38055160641670227,
"learning_rate": 0.0003796595744680851,
"loss": 3.2304,
"step": 63100
},
{
"epoch": 18.394989804835422,
"grad_norm": 0.3660554885864258,
"learning_rate": 0.00037948469833867674,
"loss": 3.2421,
"step": 63150
},
{
"epoch": 18.409554325662686,
"grad_norm": 0.39061740040779114,
"learning_rate": 0.0003793098222092684,
"loss": 3.2387,
"step": 63200
},
{
"epoch": 18.42411884648995,
"grad_norm": 0.3903264105319977,
"learning_rate": 0.00037913494607986006,
"loss": 3.2289,
"step": 63250
},
{
"epoch": 18.438683367317214,
"grad_norm": 0.3617711365222931,
"learning_rate": 0.00037896006995045175,
"loss": 3.2362,
"step": 63300
},
{
"epoch": 18.45324788814448,
"grad_norm": 0.3767446279525757,
"learning_rate": 0.0003787851938210434,
"loss": 3.2389,
"step": 63350
},
{
"epoch": 18.467812408971746,
"grad_norm": 0.3788781762123108,
"learning_rate": 0.000378610317691635,
"loss": 3.2386,
"step": 63400
},
{
"epoch": 18.48237692979901,
"grad_norm": 0.3798232674598694,
"learning_rate": 0.0003784354415622267,
"loss": 3.2459,
"step": 63450
},
{
"epoch": 18.496941450626274,
"grad_norm": 0.3854493200778961,
"learning_rate": 0.00037826056543281835,
"loss": 3.2479,
"step": 63500
},
{
"epoch": 18.511505971453538,
"grad_norm": 0.37762451171875,
"learning_rate": 0.0003780856893034101,
"loss": 3.2443,
"step": 63550
},
{
"epoch": 18.526070492280805,
"grad_norm": 0.42830953001976013,
"learning_rate": 0.00037791081317400173,
"loss": 3.2505,
"step": 63600
},
{
"epoch": 18.54063501310807,
"grad_norm": 0.3428388833999634,
"learning_rate": 0.00037773593704459337,
"loss": 3.2526,
"step": 63650
},
{
"epoch": 18.555199533935333,
"grad_norm": 0.3522073030471802,
"learning_rate": 0.00037756106091518506,
"loss": 3.2507,
"step": 63700
},
{
"epoch": 18.569764054762597,
"grad_norm": 0.39283233880996704,
"learning_rate": 0.0003773861847857767,
"loss": 3.2488,
"step": 63750
},
{
"epoch": 18.584328575589865,
"grad_norm": 0.3995060622692108,
"learning_rate": 0.0003772113086563684,
"loss": 3.2485,
"step": 63800
},
{
"epoch": 18.59889309641713,
"grad_norm": 0.3654598295688629,
"learning_rate": 0.00037703643252696,
"loss": 3.2569,
"step": 63850
},
{
"epoch": 18.613457617244393,
"grad_norm": 0.4324260354042053,
"learning_rate": 0.0003768615563975517,
"loss": 3.2575,
"step": 63900
},
{
"epoch": 18.628022138071657,
"grad_norm": 0.35259896516799927,
"learning_rate": 0.00037668668026814335,
"loss": 3.2635,
"step": 63950
},
{
"epoch": 18.64258665889892,
"grad_norm": 0.35881707072257996,
"learning_rate": 0.000376511804138735,
"loss": 3.2734,
"step": 64000
},
{
"epoch": 18.64258665889892,
"eval_accuracy": 0.37319409267963466,
"eval_loss": 3.538569927215576,
"eval_runtime": 178.7072,
"eval_samples_per_second": 93.119,
"eval_steps_per_second": 5.825,
"step": 64000
},
{
"epoch": 18.65715117972619,
"grad_norm": 0.39576172828674316,
"learning_rate": 0.00037633692800932673,
"loss": 3.2583,
"step": 64050
},
{
"epoch": 18.671715700553452,
"grad_norm": 0.36197108030319214,
"learning_rate": 0.00037616205187991837,
"loss": 3.2633,
"step": 64100
},
{
"epoch": 18.686280221380716,
"grad_norm": 0.35696882009506226,
"learning_rate": 0.00037598717575051006,
"loss": 3.2624,
"step": 64150
},
{
"epoch": 18.70084474220798,
"grad_norm": 0.36636996269226074,
"learning_rate": 0.0003758122996211017,
"loss": 3.2643,
"step": 64200
},
{
"epoch": 18.715409263035244,
"grad_norm": 0.39471927285194397,
"learning_rate": 0.00037563742349169333,
"loss": 3.2639,
"step": 64250
},
{
"epoch": 18.729973783862512,
"grad_norm": 0.36265599727630615,
"learning_rate": 0.000375462547362285,
"loss": 3.2608,
"step": 64300
},
{
"epoch": 18.744538304689776,
"grad_norm": 0.38616278767585754,
"learning_rate": 0.00037528767123287665,
"loss": 3.2625,
"step": 64350
},
{
"epoch": 18.75910282551704,
"grad_norm": 0.3862897753715515,
"learning_rate": 0.00037511279510346834,
"loss": 3.2614,
"step": 64400
},
{
"epoch": 18.773667346344304,
"grad_norm": 0.377992182970047,
"learning_rate": 0.00037493791897406,
"loss": 3.2648,
"step": 64450
},
{
"epoch": 18.78823186717157,
"grad_norm": 0.4095982015132904,
"learning_rate": 0.0003747630428446516,
"loss": 3.2688,
"step": 64500
},
{
"epoch": 18.802796387998836,
"grad_norm": 0.4064045548439026,
"learning_rate": 0.00037458816671524336,
"loss": 3.2684,
"step": 64550
},
{
"epoch": 18.8173609088261,
"grad_norm": 0.377204954624176,
"learning_rate": 0.000374413290585835,
"loss": 3.2699,
"step": 64600
},
{
"epoch": 18.831925429653364,
"grad_norm": 0.38369378447532654,
"learning_rate": 0.0003742384144564267,
"loss": 3.2662,
"step": 64650
},
{
"epoch": 18.846489950480628,
"grad_norm": 0.3455315828323364,
"learning_rate": 0.0003740635383270183,
"loss": 3.2626,
"step": 64700
},
{
"epoch": 18.861054471307895,
"grad_norm": 0.3835621774196625,
"learning_rate": 0.00037388866219761,
"loss": 3.2531,
"step": 64750
},
{
"epoch": 18.87561899213516,
"grad_norm": 0.37047526240348816,
"learning_rate": 0.00037371378606820165,
"loss": 3.2693,
"step": 64800
},
{
"epoch": 18.890183512962423,
"grad_norm": 0.3814605474472046,
"learning_rate": 0.0003735389099387933,
"loss": 3.2607,
"step": 64850
},
{
"epoch": 18.904748033789687,
"grad_norm": 0.36107689142227173,
"learning_rate": 0.000373364033809385,
"loss": 3.2606,
"step": 64900
},
{
"epoch": 18.919312554616955,
"grad_norm": 0.37017542123794556,
"learning_rate": 0.0003731891576799766,
"loss": 3.2702,
"step": 64950
},
{
"epoch": 18.93387707544422,
"grad_norm": 0.35455116629600525,
"learning_rate": 0.00037301428155056836,
"loss": 3.2742,
"step": 65000
},
{
"epoch": 18.93387707544422,
"eval_accuracy": 0.37389015483061133,
"eval_loss": 3.5325088500976562,
"eval_runtime": 178.611,
"eval_samples_per_second": 93.169,
"eval_steps_per_second": 5.828,
"step": 65000
},
{
"epoch": 18.948441596271483,
"grad_norm": 0.3578329384326935,
"learning_rate": 0.00037283940542116,
"loss": 3.2631,
"step": 65050
},
{
"epoch": 18.963006117098747,
"grad_norm": 0.3659336268901825,
"learning_rate": 0.00037266452929175163,
"loss": 3.2701,
"step": 65100
},
{
"epoch": 18.97757063792601,
"grad_norm": 0.3941463232040405,
"learning_rate": 0.0003724896531623433,
"loss": 3.2729,
"step": 65150
},
{
"epoch": 18.99213515875328,
"grad_norm": 0.35798850655555725,
"learning_rate": 0.00037231477703293496,
"loss": 3.2721,
"step": 65200
},
{
"epoch": 19.006699679580542,
"grad_norm": 0.3821282684803009,
"learning_rate": 0.00037213990090352665,
"loss": 3.2245,
"step": 65250
},
{
"epoch": 19.021264200407806,
"grad_norm": 0.3936713933944702,
"learning_rate": 0.0003719650247741183,
"loss": 3.1673,
"step": 65300
},
{
"epoch": 19.03582872123507,
"grad_norm": 0.3834037780761719,
"learning_rate": 0.00037179014864471,
"loss": 3.1683,
"step": 65350
},
{
"epoch": 19.050393242062338,
"grad_norm": 0.41335737705230713,
"learning_rate": 0.0003716152725153016,
"loss": 3.1729,
"step": 65400
},
{
"epoch": 19.064957762889602,
"grad_norm": 0.3922509551048279,
"learning_rate": 0.00037144039638589325,
"loss": 3.1782,
"step": 65450
},
{
"epoch": 19.079522283716866,
"grad_norm": 0.4212184548377991,
"learning_rate": 0.000371265520256485,
"loss": 3.1944,
"step": 65500
},
{
"epoch": 19.09408680454413,
"grad_norm": 0.38961464166641235,
"learning_rate": 0.00037109064412707663,
"loss": 3.1903,
"step": 65550
},
{
"epoch": 19.108651325371394,
"grad_norm": 0.38067349791526794,
"learning_rate": 0.0003709157679976683,
"loss": 3.1827,
"step": 65600
},
{
"epoch": 19.12321584619866,
"grad_norm": 0.3817111849784851,
"learning_rate": 0.00037074089186825995,
"loss": 3.1853,
"step": 65650
},
{
"epoch": 19.137780367025925,
"grad_norm": 0.3943799138069153,
"learning_rate": 0.0003705660157388516,
"loss": 3.1897,
"step": 65700
},
{
"epoch": 19.15234488785319,
"grad_norm": 0.34692874550819397,
"learning_rate": 0.0003703911396094433,
"loss": 3.1987,
"step": 65750
},
{
"epoch": 19.166909408680453,
"grad_norm": 0.3758201003074646,
"learning_rate": 0.0003702162634800349,
"loss": 3.1943,
"step": 65800
},
{
"epoch": 19.181473929507717,
"grad_norm": 0.3872537314891815,
"learning_rate": 0.0003700413873506266,
"loss": 3.2068,
"step": 65850
},
{
"epoch": 19.196038450334985,
"grad_norm": 0.3869698941707611,
"learning_rate": 0.00036986651122121824,
"loss": 3.2054,
"step": 65900
},
{
"epoch": 19.21060297116225,
"grad_norm": 0.374907910823822,
"learning_rate": 0.00036969163509181,
"loss": 3.2016,
"step": 65950
},
{
"epoch": 19.225167491989513,
"grad_norm": 0.41257426142692566,
"learning_rate": 0.0003695167589624016,
"loss": 3.2181,
"step": 66000
},
{
"epoch": 19.225167491989513,
"eval_accuracy": 0.37258681696622975,
"eval_loss": 3.549304246902466,
"eval_runtime": 178.7203,
"eval_samples_per_second": 93.112,
"eval_steps_per_second": 5.825,
"step": 66000
},
{
"epoch": 19.239732012816777,
"grad_norm": 0.3578282594680786,
"learning_rate": 0.00036934188283299326,
"loss": 3.2035,
"step": 66050
},
{
"epoch": 19.254296533644045,
"grad_norm": 0.3656151294708252,
"learning_rate": 0.00036916700670358495,
"loss": 3.2043,
"step": 66100
},
{
"epoch": 19.26886105447131,
"grad_norm": 0.38482174277305603,
"learning_rate": 0.0003689921305741766,
"loss": 3.2088,
"step": 66150
},
{
"epoch": 19.283425575298573,
"grad_norm": 0.3795786201953888,
"learning_rate": 0.0003688172544447683,
"loss": 3.2117,
"step": 66200
},
{
"epoch": 19.297990096125837,
"grad_norm": 0.397713840007782,
"learning_rate": 0.0003686423783153599,
"loss": 3.2176,
"step": 66250
},
{
"epoch": 19.3125546169531,
"grad_norm": 0.3665197491645813,
"learning_rate": 0.00036846750218595155,
"loss": 3.2059,
"step": 66300
},
{
"epoch": 19.327119137780368,
"grad_norm": 0.39614352583885193,
"learning_rate": 0.00036829262605654324,
"loss": 3.227,
"step": 66350
},
{
"epoch": 19.341683658607632,
"grad_norm": 0.3724098801612854,
"learning_rate": 0.0003681177499271349,
"loss": 3.2127,
"step": 66400
},
{
"epoch": 19.356248179434896,
"grad_norm": 0.38602399826049805,
"learning_rate": 0.0003679428737977266,
"loss": 3.2227,
"step": 66450
},
{
"epoch": 19.37081270026216,
"grad_norm": 0.36235538125038147,
"learning_rate": 0.00036776799766831826,
"loss": 3.2195,
"step": 66500
},
{
"epoch": 19.385377221089428,
"grad_norm": 0.36831751465797424,
"learning_rate": 0.0003675931215389099,
"loss": 3.2191,
"step": 66550
},
{
"epoch": 19.39994174191669,
"grad_norm": 0.3906136453151703,
"learning_rate": 0.0003674182454095016,
"loss": 3.217,
"step": 66600
},
{
"epoch": 19.414506262743956,
"grad_norm": 0.40357115864753723,
"learning_rate": 0.0003672433692800932,
"loss": 3.2266,
"step": 66650
},
{
"epoch": 19.42907078357122,
"grad_norm": 0.37824922800064087,
"learning_rate": 0.0003670684931506849,
"loss": 3.2299,
"step": 66700
},
{
"epoch": 19.443635304398484,
"grad_norm": 0.3971153795719147,
"learning_rate": 0.00036689361702127655,
"loss": 3.2347,
"step": 66750
},
{
"epoch": 19.45819982522575,
"grad_norm": 0.391778826713562,
"learning_rate": 0.00036671874089186824,
"loss": 3.2212,
"step": 66800
},
{
"epoch": 19.472764346053015,
"grad_norm": 0.41135820746421814,
"learning_rate": 0.00036654386476245987,
"loss": 3.2307,
"step": 66850
},
{
"epoch": 19.48732886688028,
"grad_norm": 0.38100093603134155,
"learning_rate": 0.0003663689886330515,
"loss": 3.2395,
"step": 66900
},
{
"epoch": 19.501893387707543,
"grad_norm": 0.4028393626213074,
"learning_rate": 0.00036619411250364325,
"loss": 3.2361,
"step": 66950
},
{
"epoch": 19.51645790853481,
"grad_norm": 0.40780872106552124,
"learning_rate": 0.0003660192363742349,
"loss": 3.2331,
"step": 67000
},
{
"epoch": 19.51645790853481,
"eval_accuracy": 0.37296371833367026,
"eval_loss": 3.543234348297119,
"eval_runtime": 178.2342,
"eval_samples_per_second": 93.366,
"eval_steps_per_second": 5.841,
"step": 67000
},
{
"epoch": 19.531022429362075,
"grad_norm": 0.36076855659484863,
"learning_rate": 0.0003658443602448266,
"loss": 3.2384,
"step": 67050
},
{
"epoch": 19.54558695018934,
"grad_norm": 0.4246351420879364,
"learning_rate": 0.0003656694841154182,
"loss": 3.2335,
"step": 67100
},
{
"epoch": 19.560151471016603,
"grad_norm": 0.38414379954338074,
"learning_rate": 0.00036549460798600985,
"loss": 3.2434,
"step": 67150
},
{
"epoch": 19.574715991843867,
"grad_norm": 0.34863388538360596,
"learning_rate": 0.00036531973185660154,
"loss": 3.2359,
"step": 67200
},
{
"epoch": 19.589280512671134,
"grad_norm": 0.39756739139556885,
"learning_rate": 0.0003651448557271932,
"loss": 3.2489,
"step": 67250
},
{
"epoch": 19.6038450334984,
"grad_norm": 0.3935822546482086,
"learning_rate": 0.00036496997959778487,
"loss": 3.2575,
"step": 67300
},
{
"epoch": 19.618409554325662,
"grad_norm": 0.3777560591697693,
"learning_rate": 0.0003647951034683765,
"loss": 3.2377,
"step": 67350
},
{
"epoch": 19.632974075152926,
"grad_norm": 0.38349226117134094,
"learning_rate": 0.00036462022733896825,
"loss": 3.2485,
"step": 67400
},
{
"epoch": 19.647538595980194,
"grad_norm": 0.38052335381507874,
"learning_rate": 0.0003644453512095599,
"loss": 3.2459,
"step": 67450
},
{
"epoch": 19.662103116807458,
"grad_norm": 0.38704562187194824,
"learning_rate": 0.0003642704750801515,
"loss": 3.2401,
"step": 67500
},
{
"epoch": 19.676667637634722,
"grad_norm": 0.43219882249832153,
"learning_rate": 0.0003640955989507432,
"loss": 3.2525,
"step": 67550
},
{
"epoch": 19.691232158461986,
"grad_norm": 0.3853197991847992,
"learning_rate": 0.00036392072282133485,
"loss": 3.2386,
"step": 67600
},
{
"epoch": 19.70579667928925,
"grad_norm": 0.37223896384239197,
"learning_rate": 0.00036374584669192654,
"loss": 3.2412,
"step": 67650
},
{
"epoch": 19.720361200116518,
"grad_norm": 0.36086782813072205,
"learning_rate": 0.0003635709705625182,
"loss": 3.2501,
"step": 67700
},
{
"epoch": 19.73492572094378,
"grad_norm": 0.38250860571861267,
"learning_rate": 0.0003633960944331098,
"loss": 3.2624,
"step": 67750
},
{
"epoch": 19.749490241771046,
"grad_norm": 0.373307466506958,
"learning_rate": 0.0003632212183037015,
"loss": 3.2549,
"step": 67800
},
{
"epoch": 19.76405476259831,
"grad_norm": 0.3858838677406311,
"learning_rate": 0.00036304634217429314,
"loss": 3.2465,
"step": 67850
},
{
"epoch": 19.778619283425574,
"grad_norm": 0.36848515272140503,
"learning_rate": 0.0003628714660448849,
"loss": 3.2571,
"step": 67900
},
{
"epoch": 19.79318380425284,
"grad_norm": 0.3879449963569641,
"learning_rate": 0.0003626965899154765,
"loss": 3.2531,
"step": 67950
},
{
"epoch": 19.807748325080105,
"grad_norm": 0.3670133352279663,
"learning_rate": 0.0003625217137860682,
"loss": 3.2519,
"step": 68000
},
{
"epoch": 19.807748325080105,
"eval_accuracy": 0.37371728587269015,
"eval_loss": 3.535742998123169,
"eval_runtime": 178.7112,
"eval_samples_per_second": 93.117,
"eval_steps_per_second": 5.825,
"step": 68000
},
{
"epoch": 19.82231284590737,
"grad_norm": 0.4126584827899933,
"learning_rate": 0.00036234683765665985,
"loss": 3.2575,
"step": 68050
},
{
"epoch": 19.836877366734633,
"grad_norm": 0.4170861542224884,
"learning_rate": 0.0003621719615272515,
"loss": 3.251,
"step": 68100
},
{
"epoch": 19.8514418875619,
"grad_norm": 0.3947071135044098,
"learning_rate": 0.00036199708539784317,
"loss": 3.2655,
"step": 68150
},
{
"epoch": 19.866006408389165,
"grad_norm": 0.37961897253990173,
"learning_rate": 0.0003618222092684348,
"loss": 3.2535,
"step": 68200
},
{
"epoch": 19.88057092921643,
"grad_norm": 0.3961268961429596,
"learning_rate": 0.0003616473331390265,
"loss": 3.2499,
"step": 68250
},
{
"epoch": 19.895135450043693,
"grad_norm": 0.37590292096138,
"learning_rate": 0.00036147245700961813,
"loss": 3.252,
"step": 68300
},
{
"epoch": 19.909699970870957,
"grad_norm": 0.35986328125,
"learning_rate": 0.00036129758088020977,
"loss": 3.2624,
"step": 68350
},
{
"epoch": 19.924264491698224,
"grad_norm": 0.3746156692504883,
"learning_rate": 0.0003611227047508015,
"loss": 3.2635,
"step": 68400
},
{
"epoch": 19.93882901252549,
"grad_norm": 0.3684529662132263,
"learning_rate": 0.00036094782862139315,
"loss": 3.2655,
"step": 68450
},
{
"epoch": 19.953393533352752,
"grad_norm": 0.38268017768859863,
"learning_rate": 0.00036077295249198484,
"loss": 3.2666,
"step": 68500
},
{
"epoch": 19.967958054180016,
"grad_norm": 0.3657938838005066,
"learning_rate": 0.0003605980763625765,
"loss": 3.2621,
"step": 68550
},
{
"epoch": 19.982522575007284,
"grad_norm": 0.3825475871562958,
"learning_rate": 0.0003604232002331681,
"loss": 3.2576,
"step": 68600
},
{
"epoch": 19.997087095834548,
"grad_norm": 0.3889371156692505,
"learning_rate": 0.0003602483241037598,
"loss": 3.2623,
"step": 68650
},
{
"epoch": 20.011651616661812,
"grad_norm": 0.39976590871810913,
"learning_rate": 0.00036007344797435144,
"loss": 3.1758,
"step": 68700
},
{
"epoch": 20.026216137489076,
"grad_norm": 0.39569732546806335,
"learning_rate": 0.00035989857184494313,
"loss": 3.1563,
"step": 68750
},
{
"epoch": 20.04078065831634,
"grad_norm": 0.38413846492767334,
"learning_rate": 0.00035972369571553477,
"loss": 3.1617,
"step": 68800
},
{
"epoch": 20.055345179143607,
"grad_norm": 0.3950614333152771,
"learning_rate": 0.0003595488195861265,
"loss": 3.1581,
"step": 68850
},
{
"epoch": 20.06990969997087,
"grad_norm": 0.3673596978187561,
"learning_rate": 0.00035937394345671815,
"loss": 3.1545,
"step": 68900
},
{
"epoch": 20.084474220798135,
"grad_norm": 0.3909459412097931,
"learning_rate": 0.0003591990673273098,
"loss": 3.1761,
"step": 68950
},
{
"epoch": 20.0990387416254,
"grad_norm": 0.3746775984764099,
"learning_rate": 0.0003590241911979015,
"loss": 3.1681,
"step": 69000
},
{
"epoch": 20.0990387416254,
"eval_accuracy": 0.37325547879938625,
"eval_loss": 3.54992938041687,
"eval_runtime": 178.4066,
"eval_samples_per_second": 93.276,
"eval_steps_per_second": 5.835,
"step": 69000
},
{
"epoch": 20.113603262452667,
"grad_norm": 0.38828980922698975,
"learning_rate": 0.0003588493150684931,
"loss": 3.1727,
"step": 69050
},
{
"epoch": 20.12816778327993,
"grad_norm": 0.3792520761489868,
"learning_rate": 0.0003586744389390848,
"loss": 3.1815,
"step": 69100
},
{
"epoch": 20.142732304107195,
"grad_norm": 0.41158244013786316,
"learning_rate": 0.00035849956280967644,
"loss": 3.1746,
"step": 69150
},
{
"epoch": 20.15729682493446,
"grad_norm": 0.38685640692710876,
"learning_rate": 0.0003583246866802681,
"loss": 3.1866,
"step": 69200
},
{
"epoch": 20.171861345761723,
"grad_norm": 0.392080157995224,
"learning_rate": 0.00035814981055085976,
"loss": 3.1875,
"step": 69250
},
{
"epoch": 20.18642586658899,
"grad_norm": 0.3973603844642639,
"learning_rate": 0.0003579749344214514,
"loss": 3.1907,
"step": 69300
},
{
"epoch": 20.200990387416255,
"grad_norm": 0.39429065585136414,
"learning_rate": 0.00035780005829204315,
"loss": 3.2021,
"step": 69350
},
{
"epoch": 20.21555490824352,
"grad_norm": 0.3873383104801178,
"learning_rate": 0.0003576251821626348,
"loss": 3.2113,
"step": 69400
},
{
"epoch": 20.230119429070783,
"grad_norm": 0.39196303486824036,
"learning_rate": 0.00035745030603322647,
"loss": 3.1806,
"step": 69450
},
{
"epoch": 20.244683949898047,
"grad_norm": 0.38600245118141174,
"learning_rate": 0.0003572754299038181,
"loss": 3.1949,
"step": 69500
},
{
"epoch": 20.259248470725314,
"grad_norm": 0.40740031003952026,
"learning_rate": 0.00035710055377440974,
"loss": 3.2046,
"step": 69550
},
{
"epoch": 20.273812991552578,
"grad_norm": 0.3854810297489166,
"learning_rate": 0.00035692567764500143,
"loss": 3.2124,
"step": 69600
},
{
"epoch": 20.288377512379842,
"grad_norm": 0.4143630266189575,
"learning_rate": 0.00035675080151559307,
"loss": 3.1975,
"step": 69650
},
{
"epoch": 20.302942033207106,
"grad_norm": 0.39811912178993225,
"learning_rate": 0.00035657592538618476,
"loss": 3.2157,
"step": 69700
},
{
"epoch": 20.317506554034374,
"grad_norm": 0.37693876028060913,
"learning_rate": 0.0003564010492567764,
"loss": 3.2072,
"step": 69750
},
{
"epoch": 20.332071074861638,
"grad_norm": 0.36333632469177246,
"learning_rate": 0.00035622617312736803,
"loss": 3.2053,
"step": 69800
},
{
"epoch": 20.346635595688902,
"grad_norm": 0.402942419052124,
"learning_rate": 0.0003560512969979598,
"loss": 3.2061,
"step": 69850
},
{
"epoch": 20.361200116516166,
"grad_norm": 0.38568899035453796,
"learning_rate": 0.0003558764208685514,
"loss": 3.2177,
"step": 69900
},
{
"epoch": 20.37576463734343,
"grad_norm": 0.36854901909828186,
"learning_rate": 0.0003557015447391431,
"loss": 3.2177,
"step": 69950
},
{
"epoch": 20.390329158170697,
"grad_norm": 0.35696056485176086,
"learning_rate": 0.00035552666860973474,
"loss": 3.2174,
"step": 70000
},
{
"epoch": 20.390329158170697,
"eval_accuracy": 0.3733148657543184,
"eval_loss": 3.548022508621216,
"eval_runtime": 178.9547,
"eval_samples_per_second": 92.99,
"eval_steps_per_second": 5.817,
"step": 70000
},
{
"epoch": 20.40489367899796,
"grad_norm": 0.40440744161605835,
"learning_rate": 0.00035535179248032643,
"loss": 3.2185,
"step": 70050
},
{
"epoch": 20.419458199825225,
"grad_norm": 0.42058974504470825,
"learning_rate": 0.00035517691635091807,
"loss": 3.217,
"step": 70100
},
{
"epoch": 20.43402272065249,
"grad_norm": 0.3718623220920563,
"learning_rate": 0.0003550020402215097,
"loss": 3.2316,
"step": 70150
},
{
"epoch": 20.448587241479757,
"grad_norm": 0.3532809019088745,
"learning_rate": 0.0003548271640921014,
"loss": 3.2117,
"step": 70200
},
{
"epoch": 20.46315176230702,
"grad_norm": 0.38799864053726196,
"learning_rate": 0.00035465228796269303,
"loss": 3.2139,
"step": 70250
},
{
"epoch": 20.477716283134285,
"grad_norm": 0.42314207553863525,
"learning_rate": 0.0003544774118332848,
"loss": 3.2205,
"step": 70300
},
{
"epoch": 20.49228080396155,
"grad_norm": 0.40982669591903687,
"learning_rate": 0.0003543025357038764,
"loss": 3.2287,
"step": 70350
},
{
"epoch": 20.506845324788813,
"grad_norm": 0.37632712721824646,
"learning_rate": 0.00035412765957446805,
"loss": 3.2222,
"step": 70400
},
{
"epoch": 20.52140984561608,
"grad_norm": 0.38797253370285034,
"learning_rate": 0.00035395278344505974,
"loss": 3.228,
"step": 70450
},
{
"epoch": 20.535974366443345,
"grad_norm": 0.39570584893226624,
"learning_rate": 0.0003537779073156514,
"loss": 3.2251,
"step": 70500
},
{
"epoch": 20.55053888727061,
"grad_norm": 0.39038118720054626,
"learning_rate": 0.00035360303118624306,
"loss": 3.2203,
"step": 70550
},
{
"epoch": 20.565103408097873,
"grad_norm": 0.3837476968765259,
"learning_rate": 0.0003534281550568347,
"loss": 3.2286,
"step": 70600
},
{
"epoch": 20.57966792892514,
"grad_norm": 0.3945882022380829,
"learning_rate": 0.0003532532789274264,
"loss": 3.2226,
"step": 70650
},
{
"epoch": 20.594232449752404,
"grad_norm": 0.39531904458999634,
"learning_rate": 0.000353078402798018,
"loss": 3.2334,
"step": 70700
},
{
"epoch": 20.608796970579668,
"grad_norm": 0.38708940148353577,
"learning_rate": 0.00035290352666860966,
"loss": 3.2401,
"step": 70750
},
{
"epoch": 20.623361491406932,
"grad_norm": 0.3743561804294586,
"learning_rate": 0.0003527286505392014,
"loss": 3.2474,
"step": 70800
},
{
"epoch": 20.637926012234196,
"grad_norm": 0.3948131799697876,
"learning_rate": 0.00035255377440979304,
"loss": 3.2348,
"step": 70850
},
{
"epoch": 20.652490533061464,
"grad_norm": 0.36936649680137634,
"learning_rate": 0.00035237889828038473,
"loss": 3.2371,
"step": 70900
},
{
"epoch": 20.667055053888728,
"grad_norm": 0.4126057028770447,
"learning_rate": 0.00035220402215097637,
"loss": 3.233,
"step": 70950
},
{
"epoch": 20.68161957471599,
"grad_norm": 0.37366172671318054,
"learning_rate": 0.000352029146021568,
"loss": 3.2276,
"step": 71000
},
{
"epoch": 20.68161957471599,
"eval_accuracy": 0.37395671525930757,
"eval_loss": 3.5362870693206787,
"eval_runtime": 179.3905,
"eval_samples_per_second": 92.764,
"eval_steps_per_second": 5.803,
"step": 71000
},
{
"epoch": 20.696184095543256,
"grad_norm": 0.3675983250141144,
"learning_rate": 0.0003518542698921597,
"loss": 3.2517,
"step": 71050
},
{
"epoch": 20.710748616370523,
"grad_norm": 0.3695129156112671,
"learning_rate": 0.00035167939376275133,
"loss": 3.2383,
"step": 71100
},
{
"epoch": 20.725313137197787,
"grad_norm": 0.38997822999954224,
"learning_rate": 0.000351504517633343,
"loss": 3.2221,
"step": 71150
},
{
"epoch": 20.73987765802505,
"grad_norm": 0.3975091576576233,
"learning_rate": 0.00035132964150393466,
"loss": 3.2457,
"step": 71200
},
{
"epoch": 20.754442178852315,
"grad_norm": 0.41574394702911377,
"learning_rate": 0.0003511547653745263,
"loss": 3.2452,
"step": 71250
},
{
"epoch": 20.76900669967958,
"grad_norm": 0.39745384454727173,
"learning_rate": 0.00035097988924511804,
"loss": 3.2531,
"step": 71300
},
{
"epoch": 20.783571220506847,
"grad_norm": 0.3609507083892822,
"learning_rate": 0.0003508050131157097,
"loss": 3.2371,
"step": 71350
},
{
"epoch": 20.79813574133411,
"grad_norm": 0.3867054879665375,
"learning_rate": 0.00035063013698630137,
"loss": 3.2428,
"step": 71400
},
{
"epoch": 20.812700262161375,
"grad_norm": 0.40387028455734253,
"learning_rate": 0.000350455260856893,
"loss": 3.237,
"step": 71450
},
{
"epoch": 20.82726478298864,
"grad_norm": 0.3769632875919342,
"learning_rate": 0.0003502803847274847,
"loss": 3.2406,
"step": 71500
},
{
"epoch": 20.841829303815903,
"grad_norm": 0.36971697211265564,
"learning_rate": 0.00035010550859807633,
"loss": 3.2429,
"step": 71550
},
{
"epoch": 20.85639382464317,
"grad_norm": 0.38049620389938354,
"learning_rate": 0.00034993063246866797,
"loss": 3.2409,
"step": 71600
},
{
"epoch": 20.870958345470434,
"grad_norm": 0.37809380888938904,
"learning_rate": 0.00034975575633925966,
"loss": 3.2387,
"step": 71650
},
{
"epoch": 20.8855228662977,
"grad_norm": 0.4029906988143921,
"learning_rate": 0.0003495808802098513,
"loss": 3.2457,
"step": 71700
},
{
"epoch": 20.900087387124962,
"grad_norm": 0.44036680459976196,
"learning_rate": 0.00034940600408044304,
"loss": 3.2435,
"step": 71750
},
{
"epoch": 20.91465190795223,
"grad_norm": 0.40069833397865295,
"learning_rate": 0.0003492311279510347,
"loss": 3.2447,
"step": 71800
},
{
"epoch": 20.929216428779494,
"grad_norm": 0.4169695973396301,
"learning_rate": 0.0003490562518216263,
"loss": 3.2302,
"step": 71850
},
{
"epoch": 20.943780949606758,
"grad_norm": 0.3733760118484497,
"learning_rate": 0.000348881375692218,
"loss": 3.254,
"step": 71900
},
{
"epoch": 20.958345470434022,
"grad_norm": 0.3739670515060425,
"learning_rate": 0.00034870649956280964,
"loss": 3.2525,
"step": 71950
},
{
"epoch": 20.972909991261286,
"grad_norm": 0.36286690831184387,
"learning_rate": 0.0003485316234334013,
"loss": 3.2571,
"step": 72000
},
{
"epoch": 20.972909991261286,
"eval_accuracy": 0.37421237316034206,
"eval_loss": 3.5296003818511963,
"eval_runtime": 179.1931,
"eval_samples_per_second": 92.866,
"eval_steps_per_second": 5.809,
"step": 72000
},
{
"epoch": 20.987474512088554,
"grad_norm": 0.36834150552749634,
"learning_rate": 0.00034835674730399296,
"loss": 3.2607,
"step": 72050
},
{
"epoch": 21.002039032915818,
"grad_norm": 0.40818142890930176,
"learning_rate": 0.00034818187117458465,
"loss": 3.2429,
"step": 72100
},
{
"epoch": 21.01660355374308,
"grad_norm": 0.3603350818157196,
"learning_rate": 0.0003480069950451763,
"loss": 3.152,
"step": 72150
},
{
"epoch": 21.031168074570346,
"grad_norm": 0.36682602763175964,
"learning_rate": 0.0003478321189157679,
"loss": 3.1391,
"step": 72200
},
{
"epoch": 21.045732595397613,
"grad_norm": 0.3988490402698517,
"learning_rate": 0.00034765724278635967,
"loss": 3.1514,
"step": 72250
},
{
"epoch": 21.060297116224877,
"grad_norm": 0.38823550939559937,
"learning_rate": 0.0003474823666569513,
"loss": 3.1507,
"step": 72300
},
{
"epoch": 21.07486163705214,
"grad_norm": 0.4026493430137634,
"learning_rate": 0.000347307490527543,
"loss": 3.1611,
"step": 72350
},
{
"epoch": 21.089426157879405,
"grad_norm": 0.3982233703136444,
"learning_rate": 0.00034713261439813463,
"loss": 3.1523,
"step": 72400
},
{
"epoch": 21.10399067870667,
"grad_norm": 0.39218148589134216,
"learning_rate": 0.00034695773826872627,
"loss": 3.1553,
"step": 72450
},
{
"epoch": 21.118555199533937,
"grad_norm": 0.4108741879463196,
"learning_rate": 0.00034678286213931796,
"loss": 3.1623,
"step": 72500
},
{
"epoch": 21.1331197203612,
"grad_norm": 0.38565295934677124,
"learning_rate": 0.0003466079860099096,
"loss": 3.1706,
"step": 72550
},
{
"epoch": 21.147684241188465,
"grad_norm": 0.4134363532066345,
"learning_rate": 0.0003464331098805013,
"loss": 3.1678,
"step": 72600
},
{
"epoch": 21.16224876201573,
"grad_norm": 0.3783572018146515,
"learning_rate": 0.0003462582337510929,
"loss": 3.1789,
"step": 72650
},
{
"epoch": 21.176813282842993,
"grad_norm": 0.35808396339416504,
"learning_rate": 0.00034608335762168467,
"loss": 3.1715,
"step": 72700
},
{
"epoch": 21.19137780367026,
"grad_norm": 0.3888563811779022,
"learning_rate": 0.0003459084814922763,
"loss": 3.1744,
"step": 72750
},
{
"epoch": 21.205942324497524,
"grad_norm": 0.3799726068973541,
"learning_rate": 0.00034573360536286794,
"loss": 3.1799,
"step": 72800
},
{
"epoch": 21.22050684532479,
"grad_norm": 0.3735267221927643,
"learning_rate": 0.00034555872923345963,
"loss": 3.1833,
"step": 72850
},
{
"epoch": 21.235071366152052,
"grad_norm": 0.41824302077293396,
"learning_rate": 0.00034538385310405127,
"loss": 3.193,
"step": 72900
},
{
"epoch": 21.24963588697932,
"grad_norm": 0.3764243423938751,
"learning_rate": 0.00034520897697464296,
"loss": 3.1962,
"step": 72950
},
{
"epoch": 21.264200407806584,
"grad_norm": 0.3964778482913971,
"learning_rate": 0.0003450341008452346,
"loss": 3.1985,
"step": 73000
},
{
"epoch": 21.264200407806584,
"eval_accuracy": 0.37337625187407003,
"eval_loss": 3.5458767414093018,
"eval_runtime": 179.8315,
"eval_samples_per_second": 92.537,
"eval_steps_per_second": 5.789,
"step": 73000
},
{
"epoch": 21.278764928633848,
"grad_norm": 0.38459548354148865,
"learning_rate": 0.00034485922471582623,
"loss": 3.1902,
"step": 73050
},
{
"epoch": 21.293329449461112,
"grad_norm": 0.3928430378437042,
"learning_rate": 0.0003446843485864179,
"loss": 3.1909,
"step": 73100
},
{
"epoch": 21.307893970288376,
"grad_norm": 0.39092567563056946,
"learning_rate": 0.00034450947245700955,
"loss": 3.186,
"step": 73150
},
{
"epoch": 21.322458491115643,
"grad_norm": 0.37497106194496155,
"learning_rate": 0.0003443345963276013,
"loss": 3.1996,
"step": 73200
},
{
"epoch": 21.337023011942907,
"grad_norm": 0.39056339859962463,
"learning_rate": 0.00034415972019819294,
"loss": 3.1937,
"step": 73250
},
{
"epoch": 21.35158753277017,
"grad_norm": 0.3654477894306183,
"learning_rate": 0.00034398484406878457,
"loss": 3.1956,
"step": 73300
},
{
"epoch": 21.366152053597435,
"grad_norm": 0.39564356207847595,
"learning_rate": 0.00034380996793937626,
"loss": 3.2004,
"step": 73350
},
{
"epoch": 21.380716574424703,
"grad_norm": 0.39092445373535156,
"learning_rate": 0.0003436350918099679,
"loss": 3.2117,
"step": 73400
},
{
"epoch": 21.395281095251967,
"grad_norm": 0.38866856694221497,
"learning_rate": 0.0003434602156805596,
"loss": 3.1905,
"step": 73450
},
{
"epoch": 21.40984561607923,
"grad_norm": 0.3942776918411255,
"learning_rate": 0.0003432853395511512,
"loss": 3.2098,
"step": 73500
},
{
"epoch": 21.424410136906495,
"grad_norm": 0.3794627785682678,
"learning_rate": 0.0003431104634217429,
"loss": 3.2067,
"step": 73550
},
{
"epoch": 21.43897465773376,
"grad_norm": 0.38869741559028625,
"learning_rate": 0.00034293558729233455,
"loss": 3.197,
"step": 73600
},
{
"epoch": 21.453539178561027,
"grad_norm": 0.3953700661659241,
"learning_rate": 0.0003427607111629262,
"loss": 3.2201,
"step": 73650
},
{
"epoch": 21.46810369938829,
"grad_norm": 0.4092461168766022,
"learning_rate": 0.00034258583503351793,
"loss": 3.2107,
"step": 73700
},
{
"epoch": 21.482668220215555,
"grad_norm": 0.37649816274642944,
"learning_rate": 0.00034241095890410957,
"loss": 3.2197,
"step": 73750
},
{
"epoch": 21.49723274104282,
"grad_norm": 0.38819989562034607,
"learning_rate": 0.00034223608277470126,
"loss": 3.2178,
"step": 73800
},
{
"epoch": 21.511797261870086,
"grad_norm": 0.3887033760547638,
"learning_rate": 0.0003420612066452929,
"loss": 3.2111,
"step": 73850
},
{
"epoch": 21.52636178269735,
"grad_norm": 0.36329957842826843,
"learning_rate": 0.00034188633051588453,
"loss": 3.207,
"step": 73900
},
{
"epoch": 21.540926303524614,
"grad_norm": 0.39315828680992126,
"learning_rate": 0.0003417114543864762,
"loss": 3.22,
"step": 73950
},
{
"epoch": 21.555490824351878,
"grad_norm": 0.3785039782524109,
"learning_rate": 0.00034153657825706786,
"loss": 3.2095,
"step": 74000
},
{
"epoch": 21.555490824351878,
"eval_accuracy": 0.3738740439141248,
"eval_loss": 3.539914608001709,
"eval_runtime": 179.8789,
"eval_samples_per_second": 92.512,
"eval_steps_per_second": 5.787,
"step": 74000
},
{
"epoch": 21.570055345179142,
"grad_norm": 0.3970089256763458,
"learning_rate": 0.00034136170212765955,
"loss": 3.22,
"step": 74050
},
{
"epoch": 21.58461986600641,
"grad_norm": 0.3790573477745056,
"learning_rate": 0.0003411868259982512,
"loss": 3.2221,
"step": 74100
},
{
"epoch": 21.599184386833674,
"grad_norm": 0.3819750249385834,
"learning_rate": 0.00034101194986884293,
"loss": 3.2227,
"step": 74150
},
{
"epoch": 21.613748907660938,
"grad_norm": 0.39000168442726135,
"learning_rate": 0.00034083707373943456,
"loss": 3.2129,
"step": 74200
},
{
"epoch": 21.6283134284882,
"grad_norm": 0.3961773216724396,
"learning_rate": 0.0003406621976100262,
"loss": 3.2271,
"step": 74250
},
{
"epoch": 21.64287794931547,
"grad_norm": 0.4604828953742981,
"learning_rate": 0.0003404873214806179,
"loss": 3.2178,
"step": 74300
},
{
"epoch": 21.657442470142733,
"grad_norm": 0.4082699120044708,
"learning_rate": 0.00034031244535120953,
"loss": 3.2296,
"step": 74350
},
{
"epoch": 21.672006990969997,
"grad_norm": 0.36979642510414124,
"learning_rate": 0.0003401375692218012,
"loss": 3.2126,
"step": 74400
},
{
"epoch": 21.68657151179726,
"grad_norm": 0.41598883271217346,
"learning_rate": 0.00033996269309239285,
"loss": 3.2172,
"step": 74450
},
{
"epoch": 21.701136032624525,
"grad_norm": 0.4014417231082916,
"learning_rate": 0.0003397878169629845,
"loss": 3.2295,
"step": 74500
},
{
"epoch": 21.715700553451793,
"grad_norm": 0.39168688654899597,
"learning_rate": 0.0003396129408335762,
"loss": 3.2404,
"step": 74550
},
{
"epoch": 21.730265074279057,
"grad_norm": 0.4000439941883087,
"learning_rate": 0.0003394380647041678,
"loss": 3.2364,
"step": 74600
},
{
"epoch": 21.74482959510632,
"grad_norm": 0.3941012918949127,
"learning_rate": 0.00033926318857475956,
"loss": 3.2289,
"step": 74650
},
{
"epoch": 21.759394115933585,
"grad_norm": 0.37271010875701904,
"learning_rate": 0.0003390883124453512,
"loss": 3.2265,
"step": 74700
},
{
"epoch": 21.77395863676085,
"grad_norm": 0.4070070683956146,
"learning_rate": 0.0003389134363159429,
"loss": 3.2338,
"step": 74750
},
{
"epoch": 21.788523157588116,
"grad_norm": 0.4027371108531952,
"learning_rate": 0.0003387385601865345,
"loss": 3.227,
"step": 74800
},
{
"epoch": 21.80308767841538,
"grad_norm": 0.38957029581069946,
"learning_rate": 0.00033856368405712616,
"loss": 3.2263,
"step": 74850
},
{
"epoch": 21.817652199242644,
"grad_norm": 0.4069308936595917,
"learning_rate": 0.00033838880792771785,
"loss": 3.2442,
"step": 74900
},
{
"epoch": 21.83221672006991,
"grad_norm": 0.4276354908943176,
"learning_rate": 0.0003382139317983095,
"loss": 3.2518,
"step": 74950
},
{
"epoch": 21.846781240897176,
"grad_norm": 0.3802242577075958,
"learning_rate": 0.0003380390556689012,
"loss": 3.2244,
"step": 75000
},
{
"epoch": 21.846781240897176,
"eval_accuracy": 0.3742635282601351,
"eval_loss": 3.531430721282959,
"eval_runtime": 179.9299,
"eval_samples_per_second": 92.486,
"eval_steps_per_second": 5.786,
"step": 75000
},
{
"epoch": 21.86134576172444,
"grad_norm": 0.38759249448776245,
"learning_rate": 0.0003378641795394928,
"loss": 3.2361,
"step": 75050
},
{
"epoch": 21.875910282551704,
"grad_norm": 0.387056827545166,
"learning_rate": 0.00033768930341008445,
"loss": 3.2324,
"step": 75100
},
{
"epoch": 21.890474803378968,
"grad_norm": 0.37463316321372986,
"learning_rate": 0.0003375144272806762,
"loss": 3.2475,
"step": 75150
},
{
"epoch": 21.905039324206232,
"grad_norm": 0.39495569467544556,
"learning_rate": 0.00033733955115126783,
"loss": 3.2459,
"step": 75200
},
{
"epoch": 21.9196038450335,
"grad_norm": 0.3841392993927002,
"learning_rate": 0.0003371646750218595,
"loss": 3.2408,
"step": 75250
},
{
"epoch": 21.934168365860764,
"grad_norm": 0.40644872188568115,
"learning_rate": 0.00033698979889245116,
"loss": 3.237,
"step": 75300
},
{
"epoch": 21.948732886688028,
"grad_norm": 0.3798723816871643,
"learning_rate": 0.0003368149227630428,
"loss": 3.237,
"step": 75350
},
{
"epoch": 21.96329740751529,
"grad_norm": 0.38789594173431396,
"learning_rate": 0.0003366400466336345,
"loss": 3.26,
"step": 75400
},
{
"epoch": 21.97786192834256,
"grad_norm": 0.3960188329219818,
"learning_rate": 0.0003364651705042261,
"loss": 3.2589,
"step": 75450
},
{
"epoch": 21.992426449169823,
"grad_norm": 0.39203280210494995,
"learning_rate": 0.0003362902943748178,
"loss": 3.2426,
"step": 75500
},
{
"epoch": 22.006990969997087,
"grad_norm": 0.38960763812065125,
"learning_rate": 0.00033611541824540945,
"loss": 3.1892,
"step": 75550
},
{
"epoch": 22.02155549082435,
"grad_norm": 0.395844429731369,
"learning_rate": 0.0003359405421160012,
"loss": 3.1304,
"step": 75600
},
{
"epoch": 22.036120011651615,
"grad_norm": 0.4585193991661072,
"learning_rate": 0.0003357656659865928,
"loss": 3.1417,
"step": 75650
},
{
"epoch": 22.050684532478883,
"grad_norm": 0.39369523525238037,
"learning_rate": 0.00033559078985718446,
"loss": 3.1339,
"step": 75700
},
{
"epoch": 22.065249053306147,
"grad_norm": 0.3878481090068817,
"learning_rate": 0.00033541591372777615,
"loss": 3.1498,
"step": 75750
},
{
"epoch": 22.07981357413341,
"grad_norm": 0.40822944045066833,
"learning_rate": 0.0003352410375983678,
"loss": 3.1596,
"step": 75800
},
{
"epoch": 22.094378094960675,
"grad_norm": 0.39915433526039124,
"learning_rate": 0.0003350661614689595,
"loss": 3.149,
"step": 75850
},
{
"epoch": 22.108942615787942,
"grad_norm": 0.4160260558128357,
"learning_rate": 0.0003348912853395511,
"loss": 3.1563,
"step": 75900
},
{
"epoch": 22.123507136615206,
"grad_norm": 0.44865837693214417,
"learning_rate": 0.00033471640921014275,
"loss": 3.1637,
"step": 75950
},
{
"epoch": 22.13807165744247,
"grad_norm": 0.38839584589004517,
"learning_rate": 0.00033454153308073444,
"loss": 3.1653,
"step": 76000
},
{
"epoch": 22.13807165744247,
"eval_accuracy": 0.37341882232493223,
"eval_loss": 3.5493438243865967,
"eval_runtime": 180.0454,
"eval_samples_per_second": 92.427,
"eval_steps_per_second": 5.782,
"step": 76000
},
{
"epoch": 22.152636178269734,
"grad_norm": 0.40942075848579407,
"learning_rate": 0.0003343666569513261,
"loss": 3.1537,
"step": 76050
},
{
"epoch": 22.167200699097,
"grad_norm": 0.37509503960609436,
"learning_rate": 0.0003341917808219178,
"loss": 3.1677,
"step": 76100
},
{
"epoch": 22.181765219924266,
"grad_norm": 0.4073181450366974,
"learning_rate": 0.00033401690469250946,
"loss": 3.1711,
"step": 76150
},
{
"epoch": 22.19632974075153,
"grad_norm": 0.3973526954650879,
"learning_rate": 0.00033384202856310115,
"loss": 3.163,
"step": 76200
},
{
"epoch": 22.210894261578794,
"grad_norm": 0.38937580585479736,
"learning_rate": 0.0003336671524336928,
"loss": 3.1599,
"step": 76250
},
{
"epoch": 22.225458782406058,
"grad_norm": 0.4311583936214447,
"learning_rate": 0.0003334922763042844,
"loss": 3.1733,
"step": 76300
},
{
"epoch": 22.240023303233322,
"grad_norm": 0.3999621570110321,
"learning_rate": 0.0003333174001748761,
"loss": 3.1737,
"step": 76350
},
{
"epoch": 22.25458782406059,
"grad_norm": 0.4125518500804901,
"learning_rate": 0.00033314252404546775,
"loss": 3.1615,
"step": 76400
},
{
"epoch": 22.269152344887853,
"grad_norm": 0.3764672875404358,
"learning_rate": 0.00033296764791605944,
"loss": 3.1752,
"step": 76450
},
{
"epoch": 22.283716865715117,
"grad_norm": 0.3720638155937195,
"learning_rate": 0.0003327927717866511,
"loss": 3.1851,
"step": 76500
},
{
"epoch": 22.29828138654238,
"grad_norm": 0.42662569880485535,
"learning_rate": 0.0003326178956572427,
"loss": 3.1922,
"step": 76550
},
{
"epoch": 22.31284590736965,
"grad_norm": 0.41372689604759216,
"learning_rate": 0.00033244301952783446,
"loss": 3.183,
"step": 76600
},
{
"epoch": 22.327410428196913,
"grad_norm": 0.3920452892780304,
"learning_rate": 0.0003322681433984261,
"loss": 3.1926,
"step": 76650
},
{
"epoch": 22.341974949024177,
"grad_norm": 0.4548211693763733,
"learning_rate": 0.0003320932672690178,
"loss": 3.1928,
"step": 76700
},
{
"epoch": 22.35653946985144,
"grad_norm": 0.377517968416214,
"learning_rate": 0.0003319183911396094,
"loss": 3.1863,
"step": 76750
},
{
"epoch": 22.371103990678705,
"grad_norm": 0.44053518772125244,
"learning_rate": 0.0003317435150102011,
"loss": 3.1934,
"step": 76800
},
{
"epoch": 22.385668511505973,
"grad_norm": 0.3874462842941284,
"learning_rate": 0.00033156863888079275,
"loss": 3.1817,
"step": 76850
},
{
"epoch": 22.400233032333237,
"grad_norm": 0.3760935962200165,
"learning_rate": 0.0003313937627513844,
"loss": 3.1908,
"step": 76900
},
{
"epoch": 22.4147975531605,
"grad_norm": 0.40698912739753723,
"learning_rate": 0.00033121888662197607,
"loss": 3.1974,
"step": 76950
},
{
"epoch": 22.429362073987765,
"grad_norm": 0.38324347138404846,
"learning_rate": 0.0003310440104925677,
"loss": 3.1924,
"step": 77000
},
{
"epoch": 22.429362073987765,
"eval_accuracy": 0.37394048674489044,
"eval_loss": 3.5421411991119385,
"eval_runtime": 178.6723,
"eval_samples_per_second": 93.137,
"eval_steps_per_second": 5.826,
"step": 77000
},
{
"epoch": 22.443926594815032,
"grad_norm": 0.4046727120876312,
"learning_rate": 0.00033086913436315945,
"loss": 3.1997,
"step": 77050
},
{
"epoch": 22.458491115642296,
"grad_norm": 0.36527130007743835,
"learning_rate": 0.0003306942582337511,
"loss": 3.2036,
"step": 77100
},
{
"epoch": 22.47305563646956,
"grad_norm": 0.39856159687042236,
"learning_rate": 0.0003305193821043427,
"loss": 3.2087,
"step": 77150
},
{
"epoch": 22.487620157296824,
"grad_norm": 0.40473848581314087,
"learning_rate": 0.0003303445059749344,
"loss": 3.214,
"step": 77200
},
{
"epoch": 22.502184678124088,
"grad_norm": 0.3842359185218811,
"learning_rate": 0.00033016962984552605,
"loss": 3.2017,
"step": 77250
},
{
"epoch": 22.516749198951356,
"grad_norm": 0.37999603152275085,
"learning_rate": 0.00032999475371611774,
"loss": 3.1993,
"step": 77300
},
{
"epoch": 22.53131371977862,
"grad_norm": 0.4159919023513794,
"learning_rate": 0.0003298198775867094,
"loss": 3.2092,
"step": 77350
},
{
"epoch": 22.545878240605884,
"grad_norm": 0.38509103655815125,
"learning_rate": 0.000329645001457301,
"loss": 3.2052,
"step": 77400
},
{
"epoch": 22.560442761433148,
"grad_norm": 0.3961235284805298,
"learning_rate": 0.0003294701253278927,
"loss": 3.2127,
"step": 77450
},
{
"epoch": 22.575007282260415,
"grad_norm": 0.42305219173431396,
"learning_rate": 0.00032929524919848434,
"loss": 3.2055,
"step": 77500
},
{
"epoch": 22.58957180308768,
"grad_norm": 0.3830503523349762,
"learning_rate": 0.0003291203730690761,
"loss": 3.2134,
"step": 77550
},
{
"epoch": 22.604136323914943,
"grad_norm": 0.3828100562095642,
"learning_rate": 0.0003289454969396677,
"loss": 3.2212,
"step": 77600
},
{
"epoch": 22.618700844742207,
"grad_norm": 0.4239085912704468,
"learning_rate": 0.0003287706208102594,
"loss": 3.2097,
"step": 77650
},
{
"epoch": 22.63326536556947,
"grad_norm": 0.3623301386833191,
"learning_rate": 0.00032859574468085105,
"loss": 3.2136,
"step": 77700
},
{
"epoch": 22.64782988639674,
"grad_norm": 0.39616623520851135,
"learning_rate": 0.0003284208685514427,
"loss": 3.2143,
"step": 77750
},
{
"epoch": 22.662394407224003,
"grad_norm": 0.3920958936214447,
"learning_rate": 0.0003282459924220344,
"loss": 3.2097,
"step": 77800
},
{
"epoch": 22.676958928051267,
"grad_norm": 0.3976283669471741,
"learning_rate": 0.000328071116292626,
"loss": 3.2166,
"step": 77850
},
{
"epoch": 22.69152344887853,
"grad_norm": 0.39355382323265076,
"learning_rate": 0.0003278962401632177,
"loss": 3.2129,
"step": 77900
},
{
"epoch": 22.7060879697058,
"grad_norm": 0.38081255555152893,
"learning_rate": 0.00032772136403380934,
"loss": 3.2249,
"step": 77950
},
{
"epoch": 22.720652490533062,
"grad_norm": 0.397743284702301,
"learning_rate": 0.000327546487904401,
"loss": 3.2211,
"step": 78000
},
{
"epoch": 22.720652490533062,
"eval_accuracy": 0.37458386502297686,
"eval_loss": 3.534937620162964,
"eval_runtime": 178.7352,
"eval_samples_per_second": 93.104,
"eval_steps_per_second": 5.824,
"step": 78000
},
{
"epoch": 22.735217011360326,
"grad_norm": 0.36255332827568054,
"learning_rate": 0.0003273716117749927,
"loss": 3.2126,
"step": 78050
},
{
"epoch": 22.74978153218759,
"grad_norm": 0.41010552644729614,
"learning_rate": 0.00032719673564558435,
"loss": 3.2159,
"step": 78100
},
{
"epoch": 22.764346053014854,
"grad_norm": 0.371345192193985,
"learning_rate": 0.00032702185951617605,
"loss": 3.2146,
"step": 78150
},
{
"epoch": 22.778910573842122,
"grad_norm": 0.38916873931884766,
"learning_rate": 0.0003268469833867677,
"loss": 3.2231,
"step": 78200
},
{
"epoch": 22.793475094669386,
"grad_norm": 0.37377578020095825,
"learning_rate": 0.00032667210725735937,
"loss": 3.223,
"step": 78250
},
{
"epoch": 22.80803961549665,
"grad_norm": 0.41026216745376587,
"learning_rate": 0.000326497231127951,
"loss": 3.2325,
"step": 78300
},
{
"epoch": 22.822604136323914,
"grad_norm": 0.3754938542842865,
"learning_rate": 0.00032632235499854264,
"loss": 3.2346,
"step": 78350
},
{
"epoch": 22.837168657151178,
"grad_norm": 0.38638442754745483,
"learning_rate": 0.00032614747886913433,
"loss": 3.2235,
"step": 78400
},
{
"epoch": 22.851733177978446,
"grad_norm": 0.3845442533493042,
"learning_rate": 0.00032597260273972597,
"loss": 3.2299,
"step": 78450
},
{
"epoch": 22.86629769880571,
"grad_norm": 0.3820870518684387,
"learning_rate": 0.0003257977266103177,
"loss": 3.2265,
"step": 78500
},
{
"epoch": 22.880862219632974,
"grad_norm": 0.39607709646224976,
"learning_rate": 0.00032562285048090935,
"loss": 3.2228,
"step": 78550
},
{
"epoch": 22.895426740460238,
"grad_norm": 0.3708309531211853,
"learning_rate": 0.000325447974351501,
"loss": 3.2371,
"step": 78600
},
{
"epoch": 22.909991261287505,
"grad_norm": 0.37607479095458984,
"learning_rate": 0.0003252730982220927,
"loss": 3.2399,
"step": 78650
},
{
"epoch": 22.92455578211477,
"grad_norm": 0.40992024540901184,
"learning_rate": 0.0003250982220926843,
"loss": 3.2251,
"step": 78700
},
{
"epoch": 22.939120302942033,
"grad_norm": 0.41538867354393005,
"learning_rate": 0.000324923345963276,
"loss": 3.2247,
"step": 78750
},
{
"epoch": 22.953684823769297,
"grad_norm": 0.3930741846561432,
"learning_rate": 0.00032474846983386764,
"loss": 3.2153,
"step": 78800
},
{
"epoch": 22.96824934459656,
"grad_norm": 0.3772921562194824,
"learning_rate": 0.00032457359370445933,
"loss": 3.2273,
"step": 78850
},
{
"epoch": 22.98281386542383,
"grad_norm": 0.41579151153564453,
"learning_rate": 0.00032439871757505097,
"loss": 3.2488,
"step": 78900
},
{
"epoch": 22.997378386251093,
"grad_norm": 0.37834632396698,
"learning_rate": 0.0003242238414456426,
"loss": 3.2379,
"step": 78950
},
{
"epoch": 23.011942907078357,
"grad_norm": 0.37508898973464966,
"learning_rate": 0.00032404896531623435,
"loss": 3.148,
"step": 79000
},
{
"epoch": 23.011942907078357,
"eval_accuracy": 0.3738874500782085,
"eval_loss": 3.542969226837158,
"eval_runtime": 178.5346,
"eval_samples_per_second": 93.209,
"eval_steps_per_second": 5.831,
"step": 79000
},
{
"epoch": 23.02650742790562,
"grad_norm": 0.39909064769744873,
"learning_rate": 0.000323874089186826,
"loss": 3.1349,
"step": 79050
},
{
"epoch": 23.04107194873289,
"grad_norm": 0.39390829205513,
"learning_rate": 0.0003236992130574177,
"loss": 3.1389,
"step": 79100
},
{
"epoch": 23.055636469560152,
"grad_norm": 0.4094650149345398,
"learning_rate": 0.0003235243369280093,
"loss": 3.133,
"step": 79150
},
{
"epoch": 23.070200990387416,
"grad_norm": 0.41038405895233154,
"learning_rate": 0.00032334946079860095,
"loss": 3.1384,
"step": 79200
},
{
"epoch": 23.08476551121468,
"grad_norm": 0.4140039384365082,
"learning_rate": 0.00032317458466919264,
"loss": 3.1473,
"step": 79250
},
{
"epoch": 23.099330032041944,
"grad_norm": 0.4052468240261078,
"learning_rate": 0.0003229997085397843,
"loss": 3.1486,
"step": 79300
},
{
"epoch": 23.113894552869212,
"grad_norm": 0.4668196141719818,
"learning_rate": 0.00032282483241037596,
"loss": 3.15,
"step": 79350
},
{
"epoch": 23.128459073696476,
"grad_norm": 0.4077889919281006,
"learning_rate": 0.0003226499562809676,
"loss": 3.1468,
"step": 79400
},
{
"epoch": 23.14302359452374,
"grad_norm": 0.4005262553691864,
"learning_rate": 0.00032247508015155924,
"loss": 3.1539,
"step": 79450
},
{
"epoch": 23.157588115351004,
"grad_norm": 0.4289659261703491,
"learning_rate": 0.000322300204022151,
"loss": 3.158,
"step": 79500
},
{
"epoch": 23.17215263617827,
"grad_norm": 0.4152050316333771,
"learning_rate": 0.0003221253278927426,
"loss": 3.1563,
"step": 79550
},
{
"epoch": 23.186717157005535,
"grad_norm": 0.3984936773777008,
"learning_rate": 0.0003219504517633343,
"loss": 3.1531,
"step": 79600
},
{
"epoch": 23.2012816778328,
"grad_norm": 0.3805445730686188,
"learning_rate": 0.00032177557563392594,
"loss": 3.1668,
"step": 79650
},
{
"epoch": 23.215846198660063,
"grad_norm": 0.4050346314907074,
"learning_rate": 0.00032160069950451763,
"loss": 3.1607,
"step": 79700
},
{
"epoch": 23.230410719487327,
"grad_norm": 0.3911287188529968,
"learning_rate": 0.00032142582337510927,
"loss": 3.1687,
"step": 79750
},
{
"epoch": 23.244975240314595,
"grad_norm": 0.4178531765937805,
"learning_rate": 0.0003212509472457009,
"loss": 3.1675,
"step": 79800
},
{
"epoch": 23.25953976114186,
"grad_norm": 0.38798150420188904,
"learning_rate": 0.0003210760711162926,
"loss": 3.1744,
"step": 79850
},
{
"epoch": 23.274104281969123,
"grad_norm": 0.43264156579971313,
"learning_rate": 0.00032090119498688423,
"loss": 3.1695,
"step": 79900
},
{
"epoch": 23.288668802796387,
"grad_norm": 0.41300347447395325,
"learning_rate": 0.0003207263188574759,
"loss": 3.1785,
"step": 79950
},
{
"epoch": 23.30323332362365,
"grad_norm": 0.4206102192401886,
"learning_rate": 0.0003205514427280676,
"loss": 3.1766,
"step": 80000
},
{
"epoch": 23.30323332362365,
"eval_accuracy": 0.3740295083783234,
"eval_loss": 3.546813488006592,
"eval_runtime": 178.6519,
"eval_samples_per_second": 93.148,
"eval_steps_per_second": 5.827,
"step": 80000
},
{
"epoch": 23.31779784445092,
"grad_norm": 0.40217486023902893,
"learning_rate": 0.00032037656659865925,
"loss": 3.1198,
"step": 80050
},
{
"epoch": 23.332362365278183,
"grad_norm": 0.3986579477787018,
"learning_rate": 0.00032020169046925094,
"loss": 3.131,
"step": 80100
},
{
"epoch": 23.346926886105447,
"grad_norm": 0.3992542326450348,
"learning_rate": 0.0003200268143398426,
"loss": 3.1436,
"step": 80150
},
{
"epoch": 23.36149140693271,
"grad_norm": 0.41430288553237915,
"learning_rate": 0.00031985193821043427,
"loss": 3.1413,
"step": 80200
},
{
"epoch": 23.376055927759978,
"grad_norm": 0.39568737149238586,
"learning_rate": 0.0003196770620810259,
"loss": 3.1504,
"step": 80250
},
{
"epoch": 23.390620448587242,
"grad_norm": 0.38190150260925293,
"learning_rate": 0.0003195021859516176,
"loss": 3.1547,
"step": 80300
},
{
"epoch": 23.405184969414506,
"grad_norm": 0.4019649922847748,
"learning_rate": 0.00031932730982220923,
"loss": 3.158,
"step": 80350
},
{
"epoch": 23.41974949024177,
"grad_norm": 0.3808055818080902,
"learning_rate": 0.00031915243369280087,
"loss": 3.1502,
"step": 80400
},
{
"epoch": 23.434314011069034,
"grad_norm": 0.42824339866638184,
"learning_rate": 0.00031897755756339256,
"loss": 3.1643,
"step": 80450
},
{
"epoch": 23.448878531896302,
"grad_norm": 0.3883609473705292,
"learning_rate": 0.0003188026814339842,
"loss": 3.1631,
"step": 80500
},
{
"epoch": 23.463443052723566,
"grad_norm": 0.3974464237689972,
"learning_rate": 0.00031862780530457594,
"loss": 3.1658,
"step": 80550
},
{
"epoch": 23.47800757355083,
"grad_norm": 0.39729073643684387,
"learning_rate": 0.0003184529291751676,
"loss": 3.1728,
"step": 80600
},
{
"epoch": 23.492572094378094,
"grad_norm": 0.4245384633541107,
"learning_rate": 0.0003182780530457592,
"loss": 3.1712,
"step": 80650
},
{
"epoch": 23.50713661520536,
"grad_norm": 0.3828080892562866,
"learning_rate": 0.0003181031769163509,
"loss": 3.1603,
"step": 80700
},
{
"epoch": 23.521701136032625,
"grad_norm": 0.4366213083267212,
"learning_rate": 0.00031792830078694254,
"loss": 3.1628,
"step": 80750
},
{
"epoch": 23.53626565685989,
"grad_norm": 0.39621713757514954,
"learning_rate": 0.0003177534246575342,
"loss": 3.1789,
"step": 80800
},
{
"epoch": 23.550830177687153,
"grad_norm": 0.3925442397594452,
"learning_rate": 0.00031757854852812586,
"loss": 3.17,
"step": 80850
},
{
"epoch": 23.565394698514417,
"grad_norm": 0.4557037055492401,
"learning_rate": 0.00031740367239871755,
"loss": 3.1698,
"step": 80900
},
{
"epoch": 23.579959219341685,
"grad_norm": 0.41575995087623596,
"learning_rate": 0.0003172287962693092,
"loss": 3.1684,
"step": 80950
},
{
"epoch": 23.59452374016895,
"grad_norm": 0.39075565338134766,
"learning_rate": 0.0003170539201399008,
"loss": 3.1807,
"step": 81000
},
{
"epoch": 23.59452374016895,
"eval_accuracy": 0.37377808400278895,
"eval_loss": 3.5497612953186035,
"eval_runtime": 179.6798,
"eval_samples_per_second": 92.615,
"eval_steps_per_second": 5.794,
"step": 81000
},
{
"epoch": 23.609088260996213,
"grad_norm": 0.4015454053878784,
"learning_rate": 0.00031687904401049257,
"loss": 3.182,
"step": 81050
},
{
"epoch": 23.623652781823477,
"grad_norm": 0.4120422899723053,
"learning_rate": 0.0003167041678810842,
"loss": 3.1766,
"step": 81100
},
{
"epoch": 23.638217302650745,
"grad_norm": 0.39650651812553406,
"learning_rate": 0.0003165292917516759,
"loss": 3.1766,
"step": 81150
},
{
"epoch": 23.65278182347801,
"grad_norm": 0.42096227407455444,
"learning_rate": 0.00031635441562226753,
"loss": 3.1859,
"step": 81200
},
{
"epoch": 23.667346344305273,
"grad_norm": 0.49677303433418274,
"learning_rate": 0.00031617953949285917,
"loss": 3.1784,
"step": 81250
},
{
"epoch": 23.681910865132537,
"grad_norm": 0.4429994225502014,
"learning_rate": 0.00031600466336345086,
"loss": 3.2021,
"step": 81300
},
{
"epoch": 23.6964753859598,
"grad_norm": 0.398057222366333,
"learning_rate": 0.0003158297872340425,
"loss": 3.1849,
"step": 81350
},
{
"epoch": 23.711039906787068,
"grad_norm": 0.43820124864578247,
"learning_rate": 0.0003156549111046342,
"loss": 3.1758,
"step": 81400
},
{
"epoch": 23.725604427614332,
"grad_norm": 0.4369910955429077,
"learning_rate": 0.0003154800349752258,
"loss": 3.1826,
"step": 81450
},
{
"epoch": 23.740168948441596,
"grad_norm": 0.39258837699890137,
"learning_rate": 0.00031530515884581757,
"loss": 3.186,
"step": 81500
},
{
"epoch": 23.75473346926886,
"grad_norm": 0.4479694962501526,
"learning_rate": 0.0003151302827164092,
"loss": 3.1904,
"step": 81550
},
{
"epoch": 23.769297990096128,
"grad_norm": 0.39652347564697266,
"learning_rate": 0.00031495540658700084,
"loss": 3.1899,
"step": 81600
},
{
"epoch": 23.78386251092339,
"grad_norm": 0.4215203523635864,
"learning_rate": 0.00031478053045759253,
"loss": 3.1985,
"step": 81650
},
{
"epoch": 23.798427031750656,
"grad_norm": 0.39612090587615967,
"learning_rate": 0.00031460565432818417,
"loss": 3.1824,
"step": 81700
},
{
"epoch": 23.81299155257792,
"grad_norm": 0.4307905435562134,
"learning_rate": 0.00031443077819877586,
"loss": 3.1988,
"step": 81750
},
{
"epoch": 23.827556073405184,
"grad_norm": 0.4627580940723419,
"learning_rate": 0.0003142559020693675,
"loss": 3.2016,
"step": 81800
},
{
"epoch": 23.84212059423245,
"grad_norm": 0.39896416664123535,
"learning_rate": 0.00031408102593995913,
"loss": 3.1818,
"step": 81850
},
{
"epoch": 23.856685115059715,
"grad_norm": 0.416999489068985,
"learning_rate": 0.0003139061498105508,
"loss": 3.1896,
"step": 81900
},
{
"epoch": 23.87124963588698,
"grad_norm": 0.390245646238327,
"learning_rate": 0.00031373127368114245,
"loss": 3.2024,
"step": 81950
},
{
"epoch": 23.885814156714243,
"grad_norm": 0.43539178371429443,
"learning_rate": 0.0003135563975517342,
"loss": 3.2014,
"step": 82000
},
{
"epoch": 23.885814156714243,
"eval_accuracy": 0.3740790171070886,
"eval_loss": 3.5404930114746094,
"eval_runtime": 180.5158,
"eval_samples_per_second": 92.186,
"eval_steps_per_second": 5.767,
"step": 82000
},
{
"epoch": 23.900378677541507,
"grad_norm": 0.4117051661014557,
"learning_rate": 0.00031338152142232584,
"loss": 3.1863,
"step": 82050
},
{
"epoch": 23.914943198368775,
"grad_norm": 0.3843604624271393,
"learning_rate": 0.00031320664529291747,
"loss": 3.2044,
"step": 82100
},
{
"epoch": 23.92950771919604,
"grad_norm": 0.39629417657852173,
"learning_rate": 0.00031303176916350916,
"loss": 3.2066,
"step": 82150
},
{
"epoch": 23.944072240023303,
"grad_norm": 0.40928950905799866,
"learning_rate": 0.0003128568930341008,
"loss": 3.2048,
"step": 82200
},
{
"epoch": 23.958636760850567,
"grad_norm": 0.40854644775390625,
"learning_rate": 0.0003126820169046925,
"loss": 3.2035,
"step": 82250
},
{
"epoch": 23.973201281677834,
"grad_norm": 0.41340750455856323,
"learning_rate": 0.0003125071407752841,
"loss": 3.1906,
"step": 82300
},
{
"epoch": 23.9877658025051,
"grad_norm": 0.4238535761833191,
"learning_rate": 0.0003123322646458758,
"loss": 3.2094,
"step": 82350
},
{
"epoch": 24.002330323332362,
"grad_norm": 0.4978727698326111,
"learning_rate": 0.00031215738851646745,
"loss": 3.1901,
"step": 82400
},
{
"epoch": 24.016894844159626,
"grad_norm": 0.43836721777915955,
"learning_rate": 0.0003119825123870591,
"loss": 3.1183,
"step": 82450
},
{
"epoch": 24.03145936498689,
"grad_norm": 0.4122955799102783,
"learning_rate": 0.00031180763625765083,
"loss": 3.1271,
"step": 82500
},
{
"epoch": 24.046023885814158,
"grad_norm": 0.43053796887397766,
"learning_rate": 0.00031163276012824247,
"loss": 3.1254,
"step": 82550
},
{
"epoch": 24.060588406641422,
"grad_norm": 0.4107215702533722,
"learning_rate": 0.00031145788399883416,
"loss": 3.146,
"step": 82600
},
{
"epoch": 24.075152927468686,
"grad_norm": 0.3937588334083557,
"learning_rate": 0.0003112830078694258,
"loss": 3.1431,
"step": 82650
},
{
"epoch": 24.08971744829595,
"grad_norm": 0.40734755992889404,
"learning_rate": 0.00031110813174001743,
"loss": 3.1434,
"step": 82700
},
{
"epoch": 24.104281969123218,
"grad_norm": 0.3780282139778137,
"learning_rate": 0.0003109332556106091,
"loss": 3.1484,
"step": 82750
},
{
"epoch": 24.11884648995048,
"grad_norm": 0.41484034061431885,
"learning_rate": 0.00031075837948120076,
"loss": 3.1533,
"step": 82800
},
{
"epoch": 24.133411010777746,
"grad_norm": 0.3923600912094116,
"learning_rate": 0.00031058350335179245,
"loss": 3.1487,
"step": 82850
},
{
"epoch": 24.14797553160501,
"grad_norm": 0.43652015924453735,
"learning_rate": 0.0003104086272223841,
"loss": 3.1473,
"step": 82900
},
{
"epoch": 24.162540052432274,
"grad_norm": 0.4114309549331665,
"learning_rate": 0.00031023375109297583,
"loss": 3.1559,
"step": 82950
},
{
"epoch": 24.17710457325954,
"grad_norm": 0.4024522006511688,
"learning_rate": 0.00031005887496356746,
"loss": 3.159,
"step": 83000
},
{
"epoch": 24.17710457325954,
"eval_accuracy": 0.37380078040338677,
"eval_loss": 3.5526421070098877,
"eval_runtime": 180.4121,
"eval_samples_per_second": 92.239,
"eval_steps_per_second": 5.77,
"step": 83000
},
{
"epoch": 24.191669094086805,
"grad_norm": 0.42163506150245667,
"learning_rate": 0.0003098839988341591,
"loss": 3.1565,
"step": 83050
},
{
"epoch": 24.20623361491407,
"grad_norm": 0.4099547266960144,
"learning_rate": 0.0003097091227047508,
"loss": 3.1583,
"step": 83100
},
{
"epoch": 24.220798135741333,
"grad_norm": 0.40957939624786377,
"learning_rate": 0.00030953424657534243,
"loss": 3.1522,
"step": 83150
},
{
"epoch": 24.235362656568597,
"grad_norm": 0.4154437482357025,
"learning_rate": 0.0003093593704459341,
"loss": 3.1502,
"step": 83200
},
{
"epoch": 24.249927177395865,
"grad_norm": 0.3992171585559845,
"learning_rate": 0.00030918449431652575,
"loss": 3.1741,
"step": 83250
},
{
"epoch": 24.26449169822313,
"grad_norm": 0.3820195198059082,
"learning_rate": 0.0003090096181871174,
"loss": 3.1665,
"step": 83300
},
{
"epoch": 24.279056219050393,
"grad_norm": 0.4085935652256012,
"learning_rate": 0.0003088347420577091,
"loss": 3.1679,
"step": 83350
},
{
"epoch": 24.293620739877657,
"grad_norm": 0.4251910448074341,
"learning_rate": 0.0003086598659283007,
"loss": 3.1759,
"step": 83400
},
{
"epoch": 24.308185260704924,
"grad_norm": 0.4546334445476532,
"learning_rate": 0.00030848498979889246,
"loss": 3.1591,
"step": 83450
},
{
"epoch": 24.32274978153219,
"grad_norm": 0.4355502426624298,
"learning_rate": 0.0003083101136694841,
"loss": 3.171,
"step": 83500
},
{
"epoch": 24.337314302359452,
"grad_norm": 0.38806089758872986,
"learning_rate": 0.0003081352375400758,
"loss": 3.1722,
"step": 83550
},
{
"epoch": 24.351878823186716,
"grad_norm": 0.39891743659973145,
"learning_rate": 0.0003079603614106674,
"loss": 3.1802,
"step": 83600
},
{
"epoch": 24.36644334401398,
"grad_norm": 0.3836468458175659,
"learning_rate": 0.00030778548528125906,
"loss": 3.1727,
"step": 83650
},
{
"epoch": 24.381007864841248,
"grad_norm": 0.3941711187362671,
"learning_rate": 0.00030761060915185075,
"loss": 3.1839,
"step": 83700
},
{
"epoch": 24.395572385668512,
"grad_norm": 0.4084647297859192,
"learning_rate": 0.0003074357330224424,
"loss": 3.1842,
"step": 83750
},
{
"epoch": 24.410136906495776,
"grad_norm": 0.44058483839035034,
"learning_rate": 0.0003072608568930341,
"loss": 3.1778,
"step": 83800
},
{
"epoch": 24.42470142732304,
"grad_norm": 0.371565580368042,
"learning_rate": 0.0003070859807636257,
"loss": 3.1763,
"step": 83850
},
{
"epoch": 24.439265948150307,
"grad_norm": 0.410878449678421,
"learning_rate": 0.00030691110463421735,
"loss": 3.1825,
"step": 83900
},
{
"epoch": 24.45383046897757,
"grad_norm": 0.4056607484817505,
"learning_rate": 0.0003067362285048091,
"loss": 3.1818,
"step": 83950
},
{
"epoch": 24.468394989804835,
"grad_norm": 0.3960700035095215,
"learning_rate": 0.00030656135237540073,
"loss": 3.1787,
"step": 84000
},
{
"epoch": 24.468394989804835,
"eval_accuracy": 0.3739261397973623,
"eval_loss": 3.5454070568084717,
"eval_runtime": 180.4732,
"eval_samples_per_second": 92.208,
"eval_steps_per_second": 5.768,
"step": 84000
},
{
"epoch": 24.4829595106321,
"grad_norm": 0.40793466567993164,
"learning_rate": 0.0003063864762459924,
"loss": 3.1835,
"step": 84050
},
{
"epoch": 24.497524031459363,
"grad_norm": 0.4240623116493225,
"learning_rate": 0.00030621160011658406,
"loss": 3.1869,
"step": 84100
},
{
"epoch": 24.51208855228663,
"grad_norm": 0.38526567816734314,
"learning_rate": 0.0003060367239871757,
"loss": 3.1853,
"step": 84150
},
{
"epoch": 24.526653073113895,
"grad_norm": 0.4033205807209015,
"learning_rate": 0.0003058618478577674,
"loss": 3.2023,
"step": 84200
},
{
"epoch": 24.54121759394116,
"grad_norm": 0.39986664056777954,
"learning_rate": 0.000305686971728359,
"loss": 3.1882,
"step": 84250
},
{
"epoch": 24.555782114768423,
"grad_norm": 0.42409780621528625,
"learning_rate": 0.0003055120955989507,
"loss": 3.1849,
"step": 84300
},
{
"epoch": 24.57034663559569,
"grad_norm": 0.40158599615097046,
"learning_rate": 0.00030533721946954235,
"loss": 3.1887,
"step": 84350
},
{
"epoch": 24.584911156422955,
"grad_norm": 0.3864559531211853,
"learning_rate": 0.0003051623433401341,
"loss": 3.195,
"step": 84400
},
{
"epoch": 24.59947567725022,
"grad_norm": 0.39239874482154846,
"learning_rate": 0.00030498746721072573,
"loss": 3.1957,
"step": 84450
},
{
"epoch": 24.614040198077483,
"grad_norm": 0.41269341111183167,
"learning_rate": 0.00030481259108131736,
"loss": 3.2007,
"step": 84500
},
{
"epoch": 24.628604718904747,
"grad_norm": 0.430519163608551,
"learning_rate": 0.00030463771495190905,
"loss": 3.1983,
"step": 84550
},
{
"epoch": 24.643169239732014,
"grad_norm": 0.4243789613246918,
"learning_rate": 0.0003044628388225007,
"loss": 3.2019,
"step": 84600
},
{
"epoch": 24.657733760559278,
"grad_norm": 0.4113166630268097,
"learning_rate": 0.0003042879626930924,
"loss": 3.2001,
"step": 84650
},
{
"epoch": 24.672298281386542,
"grad_norm": 0.3855622112751007,
"learning_rate": 0.000304113086563684,
"loss": 3.196,
"step": 84700
},
{
"epoch": 24.686862802213806,
"grad_norm": 0.4105675220489502,
"learning_rate": 0.00030393821043427565,
"loss": 3.2068,
"step": 84750
},
{
"epoch": 24.701427323041074,
"grad_norm": 0.42943140864372253,
"learning_rate": 0.00030376333430486734,
"loss": 3.1916,
"step": 84800
},
{
"epoch": 24.715991843868338,
"grad_norm": 0.39312615990638733,
"learning_rate": 0.000303588458175459,
"loss": 3.1973,
"step": 84850
},
{
"epoch": 24.7305563646956,
"grad_norm": 0.4099068343639374,
"learning_rate": 0.0003034135820460507,
"loss": 3.201,
"step": 84900
},
{
"epoch": 24.745120885522866,
"grad_norm": 0.4250319302082062,
"learning_rate": 0.00030323870591664236,
"loss": 3.2122,
"step": 84950
},
{
"epoch": 24.75968540635013,
"grad_norm": 0.4025106132030487,
"learning_rate": 0.00030306382978723405,
"loss": 3.2001,
"step": 85000
},
{
"epoch": 24.75968540635013,
"eval_accuracy": 0.37493806998981954,
"eval_loss": 3.5354461669921875,
"eval_runtime": 180.1336,
"eval_samples_per_second": 92.381,
"eval_steps_per_second": 5.779,
"step": 85000
},
{
"epoch": 24.774249927177397,
"grad_norm": 0.4010268449783325,
"learning_rate": 0.0003028889536578257,
"loss": 3.2078,
"step": 85050
},
{
"epoch": 24.78881444800466,
"grad_norm": 0.3985311686992645,
"learning_rate": 0.0003027140775284173,
"loss": 3.1985,
"step": 85100
},
{
"epoch": 24.803378968831925,
"grad_norm": 0.392164945602417,
"learning_rate": 0.000302539201399009,
"loss": 3.1969,
"step": 85150
},
{
"epoch": 24.81794348965919,
"grad_norm": 0.39922991394996643,
"learning_rate": 0.00030236432526960065,
"loss": 3.2053,
"step": 85200
},
{
"epoch": 24.832508010486453,
"grad_norm": 0.41001641750335693,
"learning_rate": 0.00030218944914019234,
"loss": 3.214,
"step": 85250
},
{
"epoch": 24.84707253131372,
"grad_norm": 0.42852386832237244,
"learning_rate": 0.000302014573010784,
"loss": 3.2108,
"step": 85300
},
{
"epoch": 24.861637052140985,
"grad_norm": 0.40917831659317017,
"learning_rate": 0.0003018396968813756,
"loss": 3.2083,
"step": 85350
},
{
"epoch": 24.87620157296825,
"grad_norm": 0.3986416161060333,
"learning_rate": 0.00030166482075196736,
"loss": 3.2032,
"step": 85400
},
{
"epoch": 24.890766093795513,
"grad_norm": 0.3986012041568756,
"learning_rate": 0.000301489944622559,
"loss": 3.2009,
"step": 85450
},
{
"epoch": 24.90533061462278,
"grad_norm": 0.40824413299560547,
"learning_rate": 0.0003013150684931507,
"loss": 3.2016,
"step": 85500
},
{
"epoch": 24.919895135450044,
"grad_norm": 0.41101622581481934,
"learning_rate": 0.0003011401923637423,
"loss": 3.2052,
"step": 85550
},
{
"epoch": 24.93445965627731,
"grad_norm": 0.3962431848049164,
"learning_rate": 0.000300965316234334,
"loss": 3.2238,
"step": 85600
},
{
"epoch": 24.949024177104572,
"grad_norm": 0.4147668480873108,
"learning_rate": 0.00030079044010492565,
"loss": 3.2103,
"step": 85650
},
{
"epoch": 24.963588697931836,
"grad_norm": 0.3646432161331177,
"learning_rate": 0.0003006155639755173,
"loss": 3.2083,
"step": 85700
},
{
"epoch": 24.978153218759104,
"grad_norm": 0.40107569098472595,
"learning_rate": 0.00030044068784610897,
"loss": 3.2216,
"step": 85750
},
{
"epoch": 24.992717739586368,
"grad_norm": 0.428681343793869,
"learning_rate": 0.0003002658117167006,
"loss": 3.2139,
"step": 85800
},
{
"epoch": 25.007282260413632,
"grad_norm": 0.40074416995048523,
"learning_rate": 0.00030009093558729235,
"loss": 3.1639,
"step": 85850
},
{
"epoch": 25.021846781240896,
"grad_norm": 0.3931594789028168,
"learning_rate": 0.000299916059457884,
"loss": 3.1114,
"step": 85900
},
{
"epoch": 25.036411302068164,
"grad_norm": 0.4038519263267517,
"learning_rate": 0.0002997411833284756,
"loss": 3.1225,
"step": 85950
},
{
"epoch": 25.050975822895428,
"grad_norm": 0.39107102155685425,
"learning_rate": 0.0002995663071990673,
"loss": 3.1291,
"step": 86000
},
{
"epoch": 25.050975822895428,
"eval_accuracy": 0.3742310712313009,
"eval_loss": 3.5472183227539062,
"eval_runtime": 180.135,
"eval_samples_per_second": 92.381,
"eval_steps_per_second": 5.779,
"step": 86000
},
{
"epoch": 25.06554034372269,
"grad_norm": 0.4421219527721405,
"learning_rate": 0.00029939143106965895,
"loss": 3.1221,
"step": 86050
},
{
"epoch": 25.080104864549956,
"grad_norm": 0.41089800000190735,
"learning_rate": 0.00029921655494025064,
"loss": 3.1253,
"step": 86100
},
{
"epoch": 25.09466938537722,
"grad_norm": 0.40636569261550903,
"learning_rate": 0.0002990416788108423,
"loss": 3.1289,
"step": 86150
},
{
"epoch": 25.109233906204487,
"grad_norm": 0.4226664900779724,
"learning_rate": 0.00029886680268143397,
"loss": 3.1304,
"step": 86200
},
{
"epoch": 25.12379842703175,
"grad_norm": 0.4057096540927887,
"learning_rate": 0.0002986919265520256,
"loss": 3.1321,
"step": 86250
},
{
"epoch": 25.138362947859015,
"grad_norm": 0.40901872515678406,
"learning_rate": 0.0002985170504226173,
"loss": 3.1259,
"step": 86300
},
{
"epoch": 25.15292746868628,
"grad_norm": 0.4164910912513733,
"learning_rate": 0.00029834217429320893,
"loss": 3.1403,
"step": 86350
},
{
"epoch": 25.167491989513547,
"grad_norm": 0.40740352869033813,
"learning_rate": 0.0002981672981638006,
"loss": 3.1476,
"step": 86400
},
{
"epoch": 25.18205651034081,
"grad_norm": 0.4132038652896881,
"learning_rate": 0.00029799242203439226,
"loss": 3.1554,
"step": 86450
},
{
"epoch": 25.196621031168075,
"grad_norm": 0.4355293810367584,
"learning_rate": 0.00029781754590498395,
"loss": 3.1588,
"step": 86500
},
{
"epoch": 25.21118555199534,
"grad_norm": 0.4148186147212982,
"learning_rate": 0.00029764266977557564,
"loss": 3.1372,
"step": 86550
},
{
"epoch": 25.225750072822603,
"grad_norm": 0.43595439195632935,
"learning_rate": 0.0002974677936461673,
"loss": 3.1552,
"step": 86600
},
{
"epoch": 25.24031459364987,
"grad_norm": 0.45525822043418884,
"learning_rate": 0.0002972929175167589,
"loss": 3.1512,
"step": 86650
},
{
"epoch": 25.254879114477134,
"grad_norm": 0.4085412323474884,
"learning_rate": 0.0002971180413873506,
"loss": 3.1533,
"step": 86700
},
{
"epoch": 25.2694436353044,
"grad_norm": 0.41310837864875793,
"learning_rate": 0.00029694316525794224,
"loss": 3.1533,
"step": 86750
},
{
"epoch": 25.284008156131662,
"grad_norm": 0.3980827331542969,
"learning_rate": 0.00029676828912853393,
"loss": 3.1537,
"step": 86800
},
{
"epoch": 25.298572676958926,
"grad_norm": 0.42113929986953735,
"learning_rate": 0.0002965934129991256,
"loss": 3.1535,
"step": 86850
},
{
"epoch": 25.313137197786194,
"grad_norm": 0.4222269058227539,
"learning_rate": 0.00029641853686971726,
"loss": 3.1623,
"step": 86900
},
{
"epoch": 25.327701718613458,
"grad_norm": 0.4171670079231262,
"learning_rate": 0.0002962436607403089,
"loss": 3.153,
"step": 86950
},
{
"epoch": 25.342266239440722,
"grad_norm": 0.4400247633457184,
"learning_rate": 0.0002960687846109006,
"loss": 3.1614,
"step": 87000
},
{
"epoch": 25.342266239440722,
"eval_accuracy": 0.37419508626454995,
"eval_loss": 3.54506778717041,
"eval_runtime": 179.9944,
"eval_samples_per_second": 92.453,
"eval_steps_per_second": 5.784,
"step": 87000
},
{
"epoch": 25.356830760267986,
"grad_norm": 0.4598556160926819,
"learning_rate": 0.00029589390848149227,
"loss": 3.1667,
"step": 87050
},
{
"epoch": 25.371395281095253,
"grad_norm": 0.41732457280158997,
"learning_rate": 0.0002957190323520839,
"loss": 3.1687,
"step": 87100
},
{
"epoch": 25.385959801922517,
"grad_norm": 0.3933047950267792,
"learning_rate": 0.0002955441562226756,
"loss": 3.1493,
"step": 87150
},
{
"epoch": 25.40052432274978,
"grad_norm": 0.3980228900909424,
"learning_rate": 0.00029536928009326723,
"loss": 3.1678,
"step": 87200
},
{
"epoch": 25.415088843577045,
"grad_norm": 0.42772722244262695,
"learning_rate": 0.00029519440396385887,
"loss": 3.1628,
"step": 87250
},
{
"epoch": 25.42965336440431,
"grad_norm": 0.41940030455589294,
"learning_rate": 0.00029501952783445056,
"loss": 3.1814,
"step": 87300
},
{
"epoch": 25.444217885231577,
"grad_norm": 0.42683565616607666,
"learning_rate": 0.00029484465170504225,
"loss": 3.1786,
"step": 87350
},
{
"epoch": 25.45878240605884,
"grad_norm": 0.40677493810653687,
"learning_rate": 0.0002946697755756339,
"loss": 3.1701,
"step": 87400
},
{
"epoch": 25.473346926886105,
"grad_norm": 0.4015192687511444,
"learning_rate": 0.0002944948994462256,
"loss": 3.1792,
"step": 87450
},
{
"epoch": 25.48791144771337,
"grad_norm": 0.3968818783760071,
"learning_rate": 0.00029432002331681727,
"loss": 3.1833,
"step": 87500
},
{
"epoch": 25.502475968540637,
"grad_norm": 0.41406768560409546,
"learning_rate": 0.0002941451471874089,
"loss": 3.1799,
"step": 87550
},
{
"epoch": 25.5170404893679,
"grad_norm": 0.4274561405181885,
"learning_rate": 0.00029397027105800054,
"loss": 3.1923,
"step": 87600
},
{
"epoch": 25.531605010195165,
"grad_norm": 0.38853442668914795,
"learning_rate": 0.00029379539492859223,
"loss": 3.1808,
"step": 87650
},
{
"epoch": 25.54616953102243,
"grad_norm": 0.3968064486980438,
"learning_rate": 0.00029362051879918387,
"loss": 3.1786,
"step": 87700
},
{
"epoch": 25.560734051849693,
"grad_norm": 0.44367527961730957,
"learning_rate": 0.00029344564266977556,
"loss": 3.185,
"step": 87750
},
{
"epoch": 25.57529857267696,
"grad_norm": 0.4026988744735718,
"learning_rate": 0.00029327076654036725,
"loss": 3.1778,
"step": 87800
},
{
"epoch": 25.589863093504224,
"grad_norm": 0.39344534277915955,
"learning_rate": 0.0002930958904109589,
"loss": 3.1752,
"step": 87850
},
{
"epoch": 25.604427614331488,
"grad_norm": 0.39949455857276917,
"learning_rate": 0.0002929210142815505,
"loss": 3.1927,
"step": 87900
},
{
"epoch": 25.618992135158752,
"grad_norm": 0.4150342643260956,
"learning_rate": 0.0002927461381521422,
"loss": 3.1774,
"step": 87950
},
{
"epoch": 25.63355665598602,
"grad_norm": 0.3983399271965027,
"learning_rate": 0.0002925712620227339,
"loss": 3.1755,
"step": 88000
},
{
"epoch": 25.63355665598602,
"eval_accuracy": 0.3747817823401071,
"eval_loss": 3.536299228668213,
"eval_runtime": 179.9386,
"eval_samples_per_second": 92.482,
"eval_steps_per_second": 5.785,
"step": 88000
},
{
"epoch": 25.648121176813284,
"grad_norm": 0.4258618950843811,
"learning_rate": 0.00029239638589332554,
"loss": 3.1961,
"step": 88050
},
{
"epoch": 25.662685697640548,
"grad_norm": 0.41384416818618774,
"learning_rate": 0.0002922215097639172,
"loss": 3.1849,
"step": 88100
},
{
"epoch": 25.67725021846781,
"grad_norm": 0.4001385569572449,
"learning_rate": 0.00029204663363450886,
"loss": 3.1824,
"step": 88150
},
{
"epoch": 25.691814739295076,
"grad_norm": 0.39791566133499146,
"learning_rate": 0.0002918717575051005,
"loss": 3.199,
"step": 88200
},
{
"epoch": 25.706379260122343,
"grad_norm": 0.4087889492511749,
"learning_rate": 0.0002916968813756922,
"loss": 3.1977,
"step": 88250
},
{
"epoch": 25.720943780949607,
"grad_norm": 0.40469250082969666,
"learning_rate": 0.0002915220052462839,
"loss": 3.1841,
"step": 88300
},
{
"epoch": 25.73550830177687,
"grad_norm": 0.40084928274154663,
"learning_rate": 0.0002913471291168755,
"loss": 3.1995,
"step": 88350
},
{
"epoch": 25.750072822604135,
"grad_norm": 0.4317362904548645,
"learning_rate": 0.00029117225298746715,
"loss": 3.1922,
"step": 88400
},
{
"epoch": 25.764637343431403,
"grad_norm": 0.3823014497756958,
"learning_rate": 0.00029099737685805884,
"loss": 3.1865,
"step": 88450
},
{
"epoch": 25.779201864258667,
"grad_norm": 0.37905919551849365,
"learning_rate": 0.00029082250072865053,
"loss": 3.1759,
"step": 88500
},
{
"epoch": 25.79376638508593,
"grad_norm": 0.41488322615623474,
"learning_rate": 0.00029064762459924217,
"loss": 3.1959,
"step": 88550
},
{
"epoch": 25.808330905913195,
"grad_norm": 0.4414539039134979,
"learning_rate": 0.00029047274846983386,
"loss": 3.1907,
"step": 88600
},
{
"epoch": 25.82289542674046,
"grad_norm": 0.4049603044986725,
"learning_rate": 0.0002902978723404255,
"loss": 3.185,
"step": 88650
},
{
"epoch": 25.837459947567726,
"grad_norm": 0.41079235076904297,
"learning_rate": 0.00029012299621101713,
"loss": 3.1952,
"step": 88700
},
{
"epoch": 25.85202446839499,
"grad_norm": 0.4270045757293701,
"learning_rate": 0.0002899481200816088,
"loss": 3.2003,
"step": 88750
},
{
"epoch": 25.866588989222254,
"grad_norm": 0.3901161551475525,
"learning_rate": 0.0002897732439522005,
"loss": 3.1942,
"step": 88800
},
{
"epoch": 25.88115351004952,
"grad_norm": 0.4142347276210785,
"learning_rate": 0.00028959836782279215,
"loss": 3.2045,
"step": 88850
},
{
"epoch": 25.895718030876782,
"grad_norm": 0.40068092942237854,
"learning_rate": 0.00028942349169338384,
"loss": 3.1998,
"step": 88900
},
{
"epoch": 25.91028255170405,
"grad_norm": 0.3928789794445038,
"learning_rate": 0.00028924861556397553,
"loss": 3.2075,
"step": 88950
},
{
"epoch": 25.924847072531314,
"grad_norm": 0.41609933972358704,
"learning_rate": 0.00028907373943456717,
"loss": 3.2004,
"step": 89000
},
{
"epoch": 25.924847072531314,
"eval_accuracy": 0.3750450841066279,
"eval_loss": 3.532590866088867,
"eval_runtime": 180.0164,
"eval_samples_per_second": 92.442,
"eval_steps_per_second": 5.783,
"step": 89000
},
{
"epoch": 25.939411593358578,
"grad_norm": 0.4000127613544464,
"learning_rate": 0.0002888988633051588,
"loss": 3.2011,
"step": 89050
},
{
"epoch": 25.953976114185842,
"grad_norm": 0.41976398229599,
"learning_rate": 0.0002887239871757505,
"loss": 3.2051,
"step": 89100
},
{
"epoch": 25.96854063501311,
"grad_norm": 0.4085160791873932,
"learning_rate": 0.00028854911104634213,
"loss": 3.2028,
"step": 89150
},
{
"epoch": 25.983105155840374,
"grad_norm": 0.4441681206226349,
"learning_rate": 0.0002883742349169338,
"loss": 3.2012,
"step": 89200
},
{
"epoch": 25.997669676667638,
"grad_norm": 0.40044912695884705,
"learning_rate": 0.0002881993587875255,
"loss": 3.2076,
"step": 89250
},
{
"epoch": 26.0122341974949,
"grad_norm": 0.39951109886169434,
"learning_rate": 0.00028802448265811715,
"loss": 3.1123,
"step": 89300
},
{
"epoch": 26.026798718322166,
"grad_norm": 0.43108323216438293,
"learning_rate": 0.0002878496065287088,
"loss": 3.0933,
"step": 89350
},
{
"epoch": 26.041363239149433,
"grad_norm": 0.4239446818828583,
"learning_rate": 0.0002876747303993005,
"loss": 3.1119,
"step": 89400
},
{
"epoch": 26.055927759976697,
"grad_norm": 0.4508779048919678,
"learning_rate": 0.00028749985426989216,
"loss": 3.1111,
"step": 89450
},
{
"epoch": 26.07049228080396,
"grad_norm": 0.4007803201675415,
"learning_rate": 0.0002873249781404838,
"loss": 3.119,
"step": 89500
},
{
"epoch": 26.085056801631225,
"grad_norm": 0.4217778444290161,
"learning_rate": 0.0002871501020110755,
"loss": 3.1271,
"step": 89550
},
{
"epoch": 26.099621322458493,
"grad_norm": 0.426954448223114,
"learning_rate": 0.0002869752258816671,
"loss": 3.1251,
"step": 89600
},
{
"epoch": 26.114185843285757,
"grad_norm": 0.40633267164230347,
"learning_rate": 0.00028680034975225876,
"loss": 3.1286,
"step": 89650
},
{
"epoch": 26.12875036411302,
"grad_norm": 0.4173829257488251,
"learning_rate": 0.00028662547362285045,
"loss": 3.1363,
"step": 89700
},
{
"epoch": 26.143314884940285,
"grad_norm": 0.39154112339019775,
"learning_rate": 0.00028645059749344214,
"loss": 3.1298,
"step": 89750
},
{
"epoch": 26.15787940576755,
"grad_norm": 0.4263724088668823,
"learning_rate": 0.0002862757213640338,
"loss": 3.1379,
"step": 89800
},
{
"epoch": 26.172443926594816,
"grad_norm": 0.4325745105743408,
"learning_rate": 0.00028610084523462547,
"loss": 3.1325,
"step": 89850
},
{
"epoch": 26.18700844742208,
"grad_norm": 0.4075644612312317,
"learning_rate": 0.0002859259691052171,
"loss": 3.1412,
"step": 89900
},
{
"epoch": 26.201572968249344,
"grad_norm": 0.4166417717933655,
"learning_rate": 0.0002857510929758088,
"loss": 3.1338,
"step": 89950
},
{
"epoch": 26.21613748907661,
"grad_norm": 0.42380568385124207,
"learning_rate": 0.00028557621684640043,
"loss": 3.1347,
"step": 90000
},
{
"epoch": 26.21613748907661,
"eval_accuracy": 0.37397917646404427,
"eval_loss": 3.5486440658569336,
"eval_runtime": 180.117,
"eval_samples_per_second": 92.39,
"eval_steps_per_second": 5.78,
"step": 90000
},
{
"epoch": 26.230702009903876,
"grad_norm": 0.38530007004737854,
"learning_rate": 0.0002854013407169921,
"loss": 3.1367,
"step": 90050
},
{
"epoch": 26.24526653073114,
"grad_norm": 0.41927337646484375,
"learning_rate": 0.00028522646458758376,
"loss": 3.1331,
"step": 90100
},
{
"epoch": 26.259831051558404,
"grad_norm": 0.41560065746307373,
"learning_rate": 0.00028505158845817545,
"loss": 3.1429,
"step": 90150
},
{
"epoch": 26.274395572385668,
"grad_norm": 0.41110533475875854,
"learning_rate": 0.0002848767123287671,
"loss": 3.1418,
"step": 90200
},
{
"epoch": 26.288960093212932,
"grad_norm": 0.42608729004859924,
"learning_rate": 0.0002847018361993588,
"loss": 3.1524,
"step": 90250
},
{
"epoch": 26.3035246140402,
"grad_norm": 0.46872904896736145,
"learning_rate": 0.0002845269600699504,
"loss": 3.1366,
"step": 90300
},
{
"epoch": 26.318089134867463,
"grad_norm": 0.42742592096328735,
"learning_rate": 0.0002843520839405421,
"loss": 3.149,
"step": 90350
},
{
"epoch": 26.332653655694727,
"grad_norm": 0.42787429690361023,
"learning_rate": 0.0002841772078111338,
"loss": 3.1612,
"step": 90400
},
{
"epoch": 26.34721817652199,
"grad_norm": 0.4321795701980591,
"learning_rate": 0.00028400233168172543,
"loss": 3.1546,
"step": 90450
},
{
"epoch": 26.361782697349255,
"grad_norm": 0.4483564496040344,
"learning_rate": 0.00028382745555231707,
"loss": 3.1611,
"step": 90500
},
{
"epoch": 26.376347218176523,
"grad_norm": 0.41695746779441833,
"learning_rate": 0.00028365257942290876,
"loss": 3.162,
"step": 90550
},
{
"epoch": 26.390911739003787,
"grad_norm": 0.4270467162132263,
"learning_rate": 0.0002834777032935004,
"loss": 3.1594,
"step": 90600
},
{
"epoch": 26.40547625983105,
"grad_norm": 0.4106120467185974,
"learning_rate": 0.0002833028271640921,
"loss": 3.1573,
"step": 90650
},
{
"epoch": 26.420040780658315,
"grad_norm": 0.40699175000190735,
"learning_rate": 0.00028312795103468377,
"loss": 3.1656,
"step": 90700
},
{
"epoch": 26.434605301485583,
"grad_norm": 0.4184217154979706,
"learning_rate": 0.0002829530749052754,
"loss": 3.1588,
"step": 90750
},
{
"epoch": 26.449169822312847,
"grad_norm": 0.4090009927749634,
"learning_rate": 0.00028277819877586705,
"loss": 3.1679,
"step": 90800
},
{
"epoch": 26.46373434314011,
"grad_norm": 0.4072172939777374,
"learning_rate": 0.00028260332264645874,
"loss": 3.1526,
"step": 90850
},
{
"epoch": 26.478298863967375,
"grad_norm": 0.4247322380542755,
"learning_rate": 0.0002824284465170504,
"loss": 3.1634,
"step": 90900
},
{
"epoch": 26.49286338479464,
"grad_norm": 0.40537166595458984,
"learning_rate": 0.00028225357038764206,
"loss": 3.1682,
"step": 90950
},
{
"epoch": 26.507427905621906,
"grad_norm": 0.3876939117908478,
"learning_rate": 0.00028207869425823375,
"loss": 3.1675,
"step": 91000
},
{
"epoch": 26.507427905621906,
"eval_accuracy": 0.3747385063016615,
"eval_loss": 3.5393879413604736,
"eval_runtime": 180.0198,
"eval_samples_per_second": 92.44,
"eval_steps_per_second": 5.783,
"step": 91000
},
{
"epoch": 26.52199242644917,
"grad_norm": 0.4608912765979767,
"learning_rate": 0.0002819038181288254,
"loss": 3.1717,
"step": 91050
},
{
"epoch": 26.536556947276434,
"grad_norm": 0.3875090777873993,
"learning_rate": 0.000281728941999417,
"loss": 3.1683,
"step": 91100
},
{
"epoch": 26.551121468103698,
"grad_norm": 0.3892468512058258,
"learning_rate": 0.0002815540658700087,
"loss": 3.1792,
"step": 91150
},
{
"epoch": 26.565685988930966,
"grad_norm": 0.43506941199302673,
"learning_rate": 0.0002813791897406004,
"loss": 3.1765,
"step": 91200
},
{
"epoch": 26.58025050975823,
"grad_norm": 0.44329991936683655,
"learning_rate": 0.00028120431361119204,
"loss": 3.1647,
"step": 91250
},
{
"epoch": 26.594815030585494,
"grad_norm": 0.4021358788013458,
"learning_rate": 0.00028102943748178373,
"loss": 3.1723,
"step": 91300
},
{
"epoch": 26.609379551412758,
"grad_norm": 0.4087882936000824,
"learning_rate": 0.00028085456135237537,
"loss": 3.1593,
"step": 91350
},
{
"epoch": 26.623944072240022,
"grad_norm": 0.40286004543304443,
"learning_rate": 0.00028067968522296706,
"loss": 3.173,
"step": 91400
},
{
"epoch": 26.63850859306729,
"grad_norm": 0.39587125182151794,
"learning_rate": 0.0002805048090935587,
"loss": 3.1768,
"step": 91450
},
{
"epoch": 26.653073113894553,
"grad_norm": 0.43658262491226196,
"learning_rate": 0.0002803299329641504,
"loss": 3.1648,
"step": 91500
},
{
"epoch": 26.667637634721817,
"grad_norm": 0.4274442493915558,
"learning_rate": 0.000280155056834742,
"loss": 3.1822,
"step": 91550
},
{
"epoch": 26.68220215554908,
"grad_norm": 0.42451024055480957,
"learning_rate": 0.0002799801807053337,
"loss": 3.1862,
"step": 91600
},
{
"epoch": 26.69676667637635,
"grad_norm": 0.44854724407196045,
"learning_rate": 0.00027980530457592535,
"loss": 3.1736,
"step": 91650
},
{
"epoch": 26.711331197203613,
"grad_norm": 0.4080348312854767,
"learning_rate": 0.00027963042844651704,
"loss": 3.1805,
"step": 91700
},
{
"epoch": 26.725895718030877,
"grad_norm": 0.43170300126075745,
"learning_rate": 0.0002794555523171087,
"loss": 3.1766,
"step": 91750
},
{
"epoch": 26.74046023885814,
"grad_norm": 0.4210398197174072,
"learning_rate": 0.00027928067618770037,
"loss": 3.1802,
"step": 91800
},
{
"epoch": 26.755024759685405,
"grad_norm": 0.4239577651023865,
"learning_rate": 0.00027910580005829206,
"loss": 3.1812,
"step": 91850
},
{
"epoch": 26.769589280512673,
"grad_norm": 0.4054132103919983,
"learning_rate": 0.0002789309239288837,
"loss": 3.1792,
"step": 91900
},
{
"epoch": 26.784153801339937,
"grad_norm": 0.4064629375934601,
"learning_rate": 0.00027875604779947533,
"loss": 3.1861,
"step": 91950
},
{
"epoch": 26.7987183221672,
"grad_norm": 0.40748798847198486,
"learning_rate": 0.000278581171670067,
"loss": 3.192,
"step": 92000
},
{
"epoch": 26.7987183221672,
"eval_accuracy": 0.3752517036706195,
"eval_loss": 3.530374050140381,
"eval_runtime": 180.0451,
"eval_samples_per_second": 92.427,
"eval_steps_per_second": 5.782,
"step": 92000
},
{
"epoch": 26.813282842994465,
"grad_norm": 0.4335620403289795,
"learning_rate": 0.00027840629554065865,
"loss": 3.1725,
"step": 92050
},
{
"epoch": 26.827847363821732,
"grad_norm": 0.41802123188972473,
"learning_rate": 0.00027823141941125034,
"loss": 3.1948,
"step": 92100
},
{
"epoch": 26.842411884648996,
"grad_norm": 0.41359513998031616,
"learning_rate": 0.00027805654328184204,
"loss": 3.1822,
"step": 92150
},
{
"epoch": 26.85697640547626,
"grad_norm": 0.3970206081867218,
"learning_rate": 0.00027788166715243367,
"loss": 3.2007,
"step": 92200
},
{
"epoch": 26.871540926303524,
"grad_norm": 0.4091810882091522,
"learning_rate": 0.0002777067910230253,
"loss": 3.192,
"step": 92250
},
{
"epoch": 26.886105447130788,
"grad_norm": 0.4705309271812439,
"learning_rate": 0.000277531914893617,
"loss": 3.2025,
"step": 92300
},
{
"epoch": 26.900669967958056,
"grad_norm": 0.447348952293396,
"learning_rate": 0.00027735703876420863,
"loss": 3.1939,
"step": 92350
},
{
"epoch": 26.91523448878532,
"grad_norm": 0.43095237016677856,
"learning_rate": 0.0002771821626348003,
"loss": 3.1773,
"step": 92400
},
{
"epoch": 26.929799009612584,
"grad_norm": 0.4291156232357025,
"learning_rate": 0.000277007286505392,
"loss": 3.1828,
"step": 92450
},
{
"epoch": 26.944363530439848,
"grad_norm": 0.4368513524532318,
"learning_rate": 0.00027683241037598365,
"loss": 3.2057,
"step": 92500
},
{
"epoch": 26.95892805126711,
"grad_norm": 0.3827671408653259,
"learning_rate": 0.0002766575342465753,
"loss": 3.2017,
"step": 92550
},
{
"epoch": 26.97349257209438,
"grad_norm": 0.4416309893131256,
"learning_rate": 0.000276482658117167,
"loss": 3.1994,
"step": 92600
},
{
"epoch": 26.988057092921643,
"grad_norm": 0.41274315118789673,
"learning_rate": 0.00027630778198775867,
"loss": 3.1861,
"step": 92650
},
{
"epoch": 27.002621613748907,
"grad_norm": 0.4293462336063385,
"learning_rate": 0.0002761329058583503,
"loss": 3.1783,
"step": 92700
},
{
"epoch": 27.01718613457617,
"grad_norm": 0.4199707806110382,
"learning_rate": 0.000275958029728942,
"loss": 3.0981,
"step": 92750
},
{
"epoch": 27.03175065540344,
"grad_norm": 0.4206013083457947,
"learning_rate": 0.00027578315359953363,
"loss": 3.0893,
"step": 92800
},
{
"epoch": 27.046315176230703,
"grad_norm": 0.38021305203437805,
"learning_rate": 0.00027560827747012527,
"loss": 3.1103,
"step": 92850
},
{
"epoch": 27.060879697057967,
"grad_norm": 0.38645049929618835,
"learning_rate": 0.00027543340134071696,
"loss": 3.107,
"step": 92900
},
{
"epoch": 27.07544421788523,
"grad_norm": 0.40561676025390625,
"learning_rate": 0.00027525852521130865,
"loss": 3.1084,
"step": 92950
},
{
"epoch": 27.090008738712495,
"grad_norm": 0.398436576128006,
"learning_rate": 0.0002750836490819003,
"loss": 3.1155,
"step": 93000
},
{
"epoch": 27.090008738712495,
"eval_accuracy": 0.3744006474471665,
"eval_loss": 3.5486044883728027,
"eval_runtime": 180.0468,
"eval_samples_per_second": 92.426,
"eval_steps_per_second": 5.782,
"step": 93000
},
{
"epoch": 27.104573259539762,
"grad_norm": 0.43171611428260803,
"learning_rate": 0.000274908772952492,
"loss": 3.1036,
"step": 93050
},
{
"epoch": 27.119137780367026,
"grad_norm": 0.42062097787857056,
"learning_rate": 0.0002747338968230836,
"loss": 3.1142,
"step": 93100
},
{
"epoch": 27.13370230119429,
"grad_norm": 0.4473419189453125,
"learning_rate": 0.0002745590206936753,
"loss": 3.1121,
"step": 93150
},
{
"epoch": 27.148266822021554,
"grad_norm": 0.401583194732666,
"learning_rate": 0.00027438414456426694,
"loss": 3.1183,
"step": 93200
},
{
"epoch": 27.162831342848822,
"grad_norm": 0.43579012155532837,
"learning_rate": 0.00027420926843485863,
"loss": 3.1282,
"step": 93250
},
{
"epoch": 27.177395863676086,
"grad_norm": 0.4297228753566742,
"learning_rate": 0.00027403439230545026,
"loss": 3.1189,
"step": 93300
},
{
"epoch": 27.19196038450335,
"grad_norm": 0.4076518416404724,
"learning_rate": 0.00027385951617604195,
"loss": 3.1232,
"step": 93350
},
{
"epoch": 27.206524905330614,
"grad_norm": 0.4205459952354431,
"learning_rate": 0.0002736846400466336,
"loss": 3.1296,
"step": 93400
},
{
"epoch": 27.221089426157878,
"grad_norm": 0.39670172333717346,
"learning_rate": 0.0002735097639172253,
"loss": 3.1249,
"step": 93450
},
{
"epoch": 27.235653946985146,
"grad_norm": 0.4148896336555481,
"learning_rate": 0.0002733348877878169,
"loss": 3.1185,
"step": 93500
},
{
"epoch": 27.25021846781241,
"grad_norm": 0.4132000803947449,
"learning_rate": 0.0002731600116584086,
"loss": 3.1365,
"step": 93550
},
{
"epoch": 27.264782988639674,
"grad_norm": 0.4182775020599365,
"learning_rate": 0.0002729851355290003,
"loss": 3.1306,
"step": 93600
},
{
"epoch": 27.279347509466938,
"grad_norm": 0.43432140350341797,
"learning_rate": 0.00027281025939959193,
"loss": 3.1452,
"step": 93650
},
{
"epoch": 27.2939120302942,
"grad_norm": 0.4565012454986572,
"learning_rate": 0.00027263538327018357,
"loss": 3.1386,
"step": 93700
},
{
"epoch": 27.30847655112147,
"grad_norm": 0.445963978767395,
"learning_rate": 0.00027246050714077526,
"loss": 3.1269,
"step": 93750
},
{
"epoch": 27.323041071948733,
"grad_norm": 0.4095284044742584,
"learning_rate": 0.0002722856310113669,
"loss": 3.1549,
"step": 93800
},
{
"epoch": 27.337605592775997,
"grad_norm": 0.4368303716182709,
"learning_rate": 0.0002721107548819586,
"loss": 3.151,
"step": 93850
},
{
"epoch": 27.35217011360326,
"grad_norm": 0.46256592869758606,
"learning_rate": 0.0002719358787525503,
"loss": 3.1448,
"step": 93900
},
{
"epoch": 27.36673463443053,
"grad_norm": 0.412517249584198,
"learning_rate": 0.0002717610026231419,
"loss": 3.1458,
"step": 93950
},
{
"epoch": 27.381299155257793,
"grad_norm": 0.417863667011261,
"learning_rate": 0.00027158612649373355,
"loss": 3.1508,
"step": 94000
},
{
"epoch": 27.381299155257793,
"eval_accuracy": 0.37466524279092345,
"eval_loss": 3.546901226043701,
"eval_runtime": 180.1597,
"eval_samples_per_second": 92.368,
"eval_steps_per_second": 5.778,
"step": 94000
},
{
"epoch": 27.395863676085057,
"grad_norm": 0.49049368500709534,
"learning_rate": 0.00027141125036432524,
"loss": 3.1541,
"step": 94050
},
{
"epoch": 27.41042819691232,
"grad_norm": 0.4306406080722809,
"learning_rate": 0.00027123637423491693,
"loss": 3.1577,
"step": 94100
},
{
"epoch": 27.424992717739585,
"grad_norm": 0.40683841705322266,
"learning_rate": 0.00027106149810550857,
"loss": 3.1543,
"step": 94150
},
{
"epoch": 27.439557238566852,
"grad_norm": 0.4452398717403412,
"learning_rate": 0.00027088662197610026,
"loss": 3.1457,
"step": 94200
},
{
"epoch": 27.454121759394116,
"grad_norm": 0.4268343448638916,
"learning_rate": 0.0002707117458466919,
"loss": 3.1603,
"step": 94250
},
{
"epoch": 27.46868628022138,
"grad_norm": 0.45762568712234497,
"learning_rate": 0.00027053686971728353,
"loss": 3.1528,
"step": 94300
},
{
"epoch": 27.483250801048644,
"grad_norm": 0.41289466619491577,
"learning_rate": 0.0002703619935878752,
"loss": 3.1503,
"step": 94350
},
{
"epoch": 27.497815321875912,
"grad_norm": 0.43442097306251526,
"learning_rate": 0.0002701871174584669,
"loss": 3.155,
"step": 94400
},
{
"epoch": 27.512379842703176,
"grad_norm": 0.40458592772483826,
"learning_rate": 0.00027001224132905855,
"loss": 3.1625,
"step": 94450
},
{
"epoch": 27.52694436353044,
"grad_norm": 0.3933391869068146,
"learning_rate": 0.00026983736519965024,
"loss": 3.1555,
"step": 94500
},
{
"epoch": 27.541508884357704,
"grad_norm": 0.4415944814682007,
"learning_rate": 0.0002696624890702419,
"loss": 3.1709,
"step": 94550
},
{
"epoch": 27.556073405184968,
"grad_norm": 0.43649542331695557,
"learning_rate": 0.00026948761294083356,
"loss": 3.1747,
"step": 94600
},
{
"epoch": 27.570637926012235,
"grad_norm": 0.4254384934902191,
"learning_rate": 0.0002693127368114252,
"loss": 3.1716,
"step": 94650
},
{
"epoch": 27.5852024468395,
"grad_norm": 0.4311431646347046,
"learning_rate": 0.0002691378606820169,
"loss": 3.1693,
"step": 94700
},
{
"epoch": 27.599766967666763,
"grad_norm": 0.42994821071624756,
"learning_rate": 0.0002689629845526085,
"loss": 3.1747,
"step": 94750
},
{
"epoch": 27.614331488494027,
"grad_norm": 0.44075125455856323,
"learning_rate": 0.0002687881084232002,
"loss": 3.1774,
"step": 94800
},
{
"epoch": 27.628896009321295,
"grad_norm": 0.4176417589187622,
"learning_rate": 0.00026861323229379185,
"loss": 3.1704,
"step": 94850
},
{
"epoch": 27.64346053014856,
"grad_norm": 0.4279754161834717,
"learning_rate": 0.00026843835616438354,
"loss": 3.1623,
"step": 94900
},
{
"epoch": 27.658025050975823,
"grad_norm": 0.42134353518486023,
"learning_rate": 0.0002682634800349752,
"loss": 3.1738,
"step": 94950
},
{
"epoch": 27.672589571803087,
"grad_norm": 0.40898674726486206,
"learning_rate": 0.00026808860390556687,
"loss": 3.1786,
"step": 95000
},
{
"epoch": 27.672589571803087,
"eval_accuracy": 0.3749798172551679,
"eval_loss": 3.540032386779785,
"eval_runtime": 179.8274,
"eval_samples_per_second": 92.539,
"eval_steps_per_second": 5.789,
"step": 95000
},
{
"epoch": 27.68715409263035,
"grad_norm": 0.43924784660339355,
"learning_rate": 0.00026791372777615856,
"loss": 3.1834,
"step": 95050
},
{
"epoch": 27.70171861345762,
"grad_norm": 0.44329679012298584,
"learning_rate": 0.0002677388516467502,
"loss": 3.1736,
"step": 95100
},
{
"epoch": 27.716283134284883,
"grad_norm": 0.4295293688774109,
"learning_rate": 0.00026756397551734183,
"loss": 3.1759,
"step": 95150
},
{
"epoch": 27.730847655112147,
"grad_norm": 0.42074301838874817,
"learning_rate": 0.0002673890993879335,
"loss": 3.1631,
"step": 95200
},
{
"epoch": 27.74541217593941,
"grad_norm": 0.4105515480041504,
"learning_rate": 0.00026721422325852516,
"loss": 3.16,
"step": 95250
},
{
"epoch": 27.759976696766678,
"grad_norm": 0.42497944831848145,
"learning_rate": 0.00026703934712911685,
"loss": 3.1814,
"step": 95300
},
{
"epoch": 27.774541217593942,
"grad_norm": 0.4068467915058136,
"learning_rate": 0.00026686447099970854,
"loss": 3.1802,
"step": 95350
},
{
"epoch": 27.789105738421206,
"grad_norm": 0.41870132088661194,
"learning_rate": 0.0002666895948703002,
"loss": 3.1699,
"step": 95400
},
{
"epoch": 27.80367025924847,
"grad_norm": 0.39957690238952637,
"learning_rate": 0.0002665147187408918,
"loss": 3.1722,
"step": 95450
},
{
"epoch": 27.818234780075734,
"grad_norm": 0.42152419686317444,
"learning_rate": 0.0002663398426114835,
"loss": 3.1755,
"step": 95500
},
{
"epoch": 27.832799300903,
"grad_norm": 0.46125051379203796,
"learning_rate": 0.0002661649664820752,
"loss": 3.1714,
"step": 95550
},
{
"epoch": 27.847363821730266,
"grad_norm": 0.43029168248176575,
"learning_rate": 0.00026599009035266683,
"loss": 3.175,
"step": 95600
},
{
"epoch": 27.86192834255753,
"grad_norm": 0.4232831597328186,
"learning_rate": 0.0002658152142232585,
"loss": 3.164,
"step": 95650
},
{
"epoch": 27.876492863384794,
"grad_norm": 0.41490527987480164,
"learning_rate": 0.00026564033809385016,
"loss": 3.1884,
"step": 95700
},
{
"epoch": 27.891057384212058,
"grad_norm": 0.4738544523715973,
"learning_rate": 0.0002654654619644418,
"loss": 3.1874,
"step": 95750
},
{
"epoch": 27.905621905039325,
"grad_norm": 0.4612700343132019,
"learning_rate": 0.0002652905858350335,
"loss": 3.1621,
"step": 95800
},
{
"epoch": 27.92018642586659,
"grad_norm": 0.4170343577861786,
"learning_rate": 0.00026511570970562517,
"loss": 3.1649,
"step": 95850
},
{
"epoch": 27.934750946693853,
"grad_norm": 0.4307102560997009,
"learning_rate": 0.0002649408335762168,
"loss": 3.1862,
"step": 95900
},
{
"epoch": 27.949315467521117,
"grad_norm": 0.3994083106517792,
"learning_rate": 0.0002647659574468085,
"loss": 3.1892,
"step": 95950
},
{
"epoch": 27.963879988348385,
"grad_norm": 0.4175775647163391,
"learning_rate": 0.0002645910813174002,
"loss": 3.1759,
"step": 96000
},
{
"epoch": 27.963879988348385,
"eval_accuracy": 0.37527063693743945,
"eval_loss": 3.531499147415161,
"eval_runtime": 180.0444,
"eval_samples_per_second": 92.427,
"eval_steps_per_second": 5.782,
"step": 96000
},
{
"epoch": 27.97844450917565,
"grad_norm": 0.4074898660182953,
"learning_rate": 0.0002644162051879918,
"loss": 3.1731,
"step": 96050
},
{
"epoch": 27.993009030002913,
"grad_norm": 0.4351227581501007,
"learning_rate": 0.00026424132905858346,
"loss": 3.2046,
"step": 96100
},
{
"epoch": 28.007573550830177,
"grad_norm": 0.42597511410713196,
"learning_rate": 0.00026406645292917515,
"loss": 3.1421,
"step": 96150
},
{
"epoch": 28.02213807165744,
"grad_norm": 0.4269953966140747,
"learning_rate": 0.0002638915767997668,
"loss": 3.0972,
"step": 96200
},
{
"epoch": 28.03670259248471,
"grad_norm": 0.46322062611579895,
"learning_rate": 0.0002637167006703585,
"loss": 3.083,
"step": 96250
},
{
"epoch": 28.051267113311972,
"grad_norm": 0.41330623626708984,
"learning_rate": 0.00026354182454095017,
"loss": 3.0867,
"step": 96300
},
{
"epoch": 28.065831634139236,
"grad_norm": 0.4628863036632538,
"learning_rate": 0.0002633669484115418,
"loss": 3.1134,
"step": 96350
},
{
"epoch": 28.0803961549665,
"grad_norm": 0.4199768304824829,
"learning_rate": 0.00026319207228213344,
"loss": 3.0953,
"step": 96400
},
{
"epoch": 28.094960675793768,
"grad_norm": 0.43701887130737305,
"learning_rate": 0.00026301719615272513,
"loss": 3.1007,
"step": 96450
},
{
"epoch": 28.109525196621032,
"grad_norm": 0.427827924489975,
"learning_rate": 0.0002628423200233168,
"loss": 3.1013,
"step": 96500
},
{
"epoch": 28.124089717448296,
"grad_norm": 0.4370115101337433,
"learning_rate": 0.00026266744389390846,
"loss": 3.1081,
"step": 96550
},
{
"epoch": 28.13865423827556,
"grad_norm": 0.41825947165489197,
"learning_rate": 0.00026249256776450015,
"loss": 3.1187,
"step": 96600
},
{
"epoch": 28.153218759102824,
"grad_norm": 0.4340101182460785,
"learning_rate": 0.0002623176916350918,
"loss": 3.108,
"step": 96650
},
{
"epoch": 28.16778327993009,
"grad_norm": 0.43236619234085083,
"learning_rate": 0.0002621428155056834,
"loss": 3.1045,
"step": 96700
},
{
"epoch": 28.182347800757356,
"grad_norm": 0.42964163422584534,
"learning_rate": 0.0002619679393762751,
"loss": 3.1131,
"step": 96750
},
{
"epoch": 28.19691232158462,
"grad_norm": 0.42945995926856995,
"learning_rate": 0.0002617930632468668,
"loss": 3.1188,
"step": 96800
},
{
"epoch": 28.211476842411884,
"grad_norm": 0.4098386764526367,
"learning_rate": 0.00026161818711745844,
"loss": 3.1192,
"step": 96850
},
{
"epoch": 28.22604136323915,
"grad_norm": 0.4059397578239441,
"learning_rate": 0.0002614433109880501,
"loss": 3.1344,
"step": 96900
},
{
"epoch": 28.240605884066415,
"grad_norm": 0.4252930283546448,
"learning_rate": 0.00026126843485864176,
"loss": 3.1394,
"step": 96950
},
{
"epoch": 28.25517040489368,
"grad_norm": 0.4543604254722595,
"learning_rate": 0.00026109355872923345,
"loss": 3.1214,
"step": 97000
},
{
"epoch": 28.25517040489368,
"eval_accuracy": 0.374433810063584,
"eval_loss": 3.551743507385254,
"eval_runtime": 179.8752,
"eval_samples_per_second": 92.514,
"eval_steps_per_second": 5.787,
"step": 97000
},
{
"epoch": 28.269734925720943,
"grad_norm": 0.4379526674747467,
"learning_rate": 0.0002609186825998251,
"loss": 3.1253,
"step": 97050
},
{
"epoch": 28.284299446548207,
"grad_norm": 0.44791752099990845,
"learning_rate": 0.0002607438064704168,
"loss": 3.1335,
"step": 97100
},
{
"epoch": 28.298863967375475,
"grad_norm": 0.4074171781539917,
"learning_rate": 0.0002605689303410084,
"loss": 3.1384,
"step": 97150
},
{
"epoch": 28.31342848820274,
"grad_norm": 0.4251929819583893,
"learning_rate": 0.00026039405421160005,
"loss": 3.1338,
"step": 97200
},
{
"epoch": 28.327993009030003,
"grad_norm": 0.4182596802711487,
"learning_rate": 0.00026021917808219174,
"loss": 3.1361,
"step": 97250
},
{
"epoch": 28.342557529857267,
"grad_norm": 0.418155699968338,
"learning_rate": 0.00026004430195278343,
"loss": 3.1416,
"step": 97300
},
{
"epoch": 28.35712205068453,
"grad_norm": 0.3963245153427124,
"learning_rate": 0.00025986942582337507,
"loss": 3.1434,
"step": 97350
},
{
"epoch": 28.3716865715118,
"grad_norm": 0.4373180568218231,
"learning_rate": 0.00025969454969396676,
"loss": 3.1385,
"step": 97400
},
{
"epoch": 28.386251092339062,
"grad_norm": 0.46445873379707336,
"learning_rate": 0.00025951967356455845,
"loss": 3.1333,
"step": 97450
},
{
"epoch": 28.400815613166326,
"grad_norm": 0.43002384901046753,
"learning_rate": 0.0002593447974351501,
"loss": 3.1397,
"step": 97500
},
{
"epoch": 28.41538013399359,
"grad_norm": 0.4203263521194458,
"learning_rate": 0.0002591699213057417,
"loss": 3.1423,
"step": 97550
},
{
"epoch": 28.429944654820858,
"grad_norm": 0.422507107257843,
"learning_rate": 0.0002589950451763334,
"loss": 3.1467,
"step": 97600
},
{
"epoch": 28.444509175648122,
"grad_norm": 0.4163612425327301,
"learning_rate": 0.00025882016904692505,
"loss": 3.1577,
"step": 97650
},
{
"epoch": 28.459073696475386,
"grad_norm": 0.42211979627609253,
"learning_rate": 0.00025864529291751674,
"loss": 3.1456,
"step": 97700
},
{
"epoch": 28.47363821730265,
"grad_norm": 0.4921233057975769,
"learning_rate": 0.00025847041678810843,
"loss": 3.1399,
"step": 97750
},
{
"epoch": 28.488202738129914,
"grad_norm": 0.4806009829044342,
"learning_rate": 0.00025829554065870007,
"loss": 3.1502,
"step": 97800
},
{
"epoch": 28.50276725895718,
"grad_norm": 0.44099995493888855,
"learning_rate": 0.0002581206645292917,
"loss": 3.1552,
"step": 97850
},
{
"epoch": 28.517331779784445,
"grad_norm": 0.4322744607925415,
"learning_rate": 0.0002579457883998834,
"loss": 3.1645,
"step": 97900
},
{
"epoch": 28.53189630061171,
"grad_norm": 0.4310096800327301,
"learning_rate": 0.0002577709122704751,
"loss": 3.1428,
"step": 97950
},
{
"epoch": 28.546460821438973,
"grad_norm": 0.43033161759376526,
"learning_rate": 0.0002575960361410667,
"loss": 3.1545,
"step": 98000
},
{
"epoch": 28.546460821438973,
"eval_accuracy": 0.37523300559966066,
"eval_loss": 3.5400564670562744,
"eval_runtime": 180.0919,
"eval_samples_per_second": 92.403,
"eval_steps_per_second": 5.78,
"step": 98000
},
{
"epoch": 28.56102534226624,
"grad_norm": 0.45368677377700806,
"learning_rate": 0.0002574211600116584,
"loss": 3.1597,
"step": 98050
},
{
"epoch": 28.575589863093505,
"grad_norm": 0.40790659189224243,
"learning_rate": 0.00025724628388225005,
"loss": 3.1517,
"step": 98100
},
{
"epoch": 28.59015438392077,
"grad_norm": 0.4356526732444763,
"learning_rate": 0.0002570714077528417,
"loss": 3.1551,
"step": 98150
},
{
"epoch": 28.604718904748033,
"grad_norm": 0.4357260465621948,
"learning_rate": 0.0002568965316234334,
"loss": 3.1605,
"step": 98200
},
{
"epoch": 28.619283425575297,
"grad_norm": 0.40190744400024414,
"learning_rate": 0.00025672165549402506,
"loss": 3.1605,
"step": 98250
},
{
"epoch": 28.633847946402565,
"grad_norm": 0.43007245659828186,
"learning_rate": 0.0002565467793646167,
"loss": 3.1633,
"step": 98300
},
{
"epoch": 28.64841246722983,
"grad_norm": 0.4541544020175934,
"learning_rate": 0.0002563719032352084,
"loss": 3.1565,
"step": 98350
},
{
"epoch": 28.662976988057093,
"grad_norm": 0.4124305248260498,
"learning_rate": 0.0002561970271058,
"loss": 3.1633,
"step": 98400
},
{
"epoch": 28.677541508884357,
"grad_norm": 0.4222710132598877,
"learning_rate": 0.0002560221509763917,
"loss": 3.1576,
"step": 98450
},
{
"epoch": 28.692106029711624,
"grad_norm": 0.4161086976528168,
"learning_rate": 0.00025584727484698335,
"loss": 3.1599,
"step": 98500
},
{
"epoch": 28.706670550538888,
"grad_norm": 0.42461931705474854,
"learning_rate": 0.00025567239871757504,
"loss": 3.1562,
"step": 98550
},
{
"epoch": 28.721235071366152,
"grad_norm": 0.43226149678230286,
"learning_rate": 0.0002554975225881667,
"loss": 3.1601,
"step": 98600
},
{
"epoch": 28.735799592193416,
"grad_norm": 0.45831042528152466,
"learning_rate": 0.00025532264645875837,
"loss": 3.1543,
"step": 98650
},
{
"epoch": 28.75036411302068,
"grad_norm": 0.4084044098854065,
"learning_rate": 0.00025514777032935,
"loss": 3.1819,
"step": 98700
},
{
"epoch": 28.764928633847948,
"grad_norm": 0.4427086114883423,
"learning_rate": 0.0002549728941999417,
"loss": 3.1693,
"step": 98750
},
{
"epoch": 28.77949315467521,
"grad_norm": 0.4414818584918976,
"learning_rate": 0.00025479801807053333,
"loss": 3.1639,
"step": 98800
},
{
"epoch": 28.794057675502476,
"grad_norm": 0.4295366704463959,
"learning_rate": 0.000254623141941125,
"loss": 3.1766,
"step": 98850
},
{
"epoch": 28.80862219632974,
"grad_norm": 0.4654344320297241,
"learning_rate": 0.0002544482658117167,
"loss": 3.1661,
"step": 98900
},
{
"epoch": 28.823186717157007,
"grad_norm": 0.4341161549091339,
"learning_rate": 0.00025427338968230835,
"loss": 3.1745,
"step": 98950
},
{
"epoch": 28.83775123798427,
"grad_norm": 0.42561206221580505,
"learning_rate": 0.0002540985135529,
"loss": 3.1694,
"step": 99000
},
{
"epoch": 28.83775123798427,
"eval_accuracy": 0.37556016304247486,
"eval_loss": 3.533388376235962,
"eval_runtime": 179.9588,
"eval_samples_per_second": 92.471,
"eval_steps_per_second": 5.785,
"step": 99000
},
{
"epoch": 28.852315758811535,
"grad_norm": 0.4475663900375366,
"learning_rate": 0.0002539236374234917,
"loss": 3.167,
"step": 99050
},
{
"epoch": 28.8668802796388,
"grad_norm": 0.42597696185112,
"learning_rate": 0.0002537487612940833,
"loss": 3.177,
"step": 99100
},
{
"epoch": 28.881444800466063,
"grad_norm": 0.4047093093395233,
"learning_rate": 0.000253573885164675,
"loss": 3.1726,
"step": 99150
},
{
"epoch": 28.89600932129333,
"grad_norm": 0.41124317049980164,
"learning_rate": 0.0002533990090352667,
"loss": 3.1627,
"step": 99200
},
{
"epoch": 28.910573842120595,
"grad_norm": 0.448076993227005,
"learning_rate": 0.00025322413290585833,
"loss": 3.1722,
"step": 99250
},
{
"epoch": 28.92513836294786,
"grad_norm": 0.472428560256958,
"learning_rate": 0.00025304925677644997,
"loss": 3.1716,
"step": 99300
},
{
"epoch": 28.939702883775123,
"grad_norm": 0.4386545419692993,
"learning_rate": 0.00025287438064704166,
"loss": 3.1641,
"step": 99350
},
{
"epoch": 28.954267404602387,
"grad_norm": 0.43108075857162476,
"learning_rate": 0.00025269950451763335,
"loss": 3.1787,
"step": 99400
},
{
"epoch": 28.968831925429654,
"grad_norm": 0.4541033208370209,
"learning_rate": 0.000252524628388225,
"loss": 3.1614,
"step": 99450
},
{
"epoch": 28.98339644625692,
"grad_norm": 0.44250738620758057,
"learning_rate": 0.0002523497522588167,
"loss": 3.1782,
"step": 99500
},
{
"epoch": 28.997960967084182,
"grad_norm": 0.43782711029052734,
"learning_rate": 0.0002521748761294083,
"loss": 3.1698,
"step": 99550
},
{
"epoch": 29.012525487911446,
"grad_norm": 0.4725230932235718,
"learning_rate": 0.00025199999999999995,
"loss": 3.0891,
"step": 99600
},
{
"epoch": 29.027090008738714,
"grad_norm": 0.4644363522529602,
"learning_rate": 0.00025182512387059164,
"loss": 3.0664,
"step": 99650
},
{
"epoch": 29.041654529565978,
"grad_norm": 0.43191707134246826,
"learning_rate": 0.0002516502477411833,
"loss": 3.0881,
"step": 99700
},
{
"epoch": 29.056219050393242,
"grad_norm": 0.45804300904273987,
"learning_rate": 0.00025147537161177496,
"loss": 3.0822,
"step": 99750
},
{
"epoch": 29.070783571220506,
"grad_norm": 0.4388163685798645,
"learning_rate": 0.00025130049548236665,
"loss": 3.0982,
"step": 99800
},
{
"epoch": 29.08534809204777,
"grad_norm": 0.4603483974933624,
"learning_rate": 0.0002511256193529583,
"loss": 3.0999,
"step": 99850
},
{
"epoch": 29.099912612875038,
"grad_norm": 0.4348946809768677,
"learning_rate": 0.00025095074322355,
"loss": 3.0925,
"step": 99900
},
{
"epoch": 29.1144771337023,
"grad_norm": 0.4159044027328491,
"learning_rate": 0.0002507758670941416,
"loss": 3.0993,
"step": 99950
},
{
"epoch": 29.129041654529566,
"grad_norm": 0.4123495817184448,
"learning_rate": 0.0002506009909647333,
"loss": 3.095,
"step": 100000
},
{
"epoch": 29.129041654529566,
"eval_accuracy": 0.37440417538508325,
"eval_loss": 3.551318645477295,
"eval_runtime": 179.942,
"eval_samples_per_second": 92.48,
"eval_steps_per_second": 5.785,
"step": 100000
},
{
"epoch": 29.129041654529566,
"step": 100000,
"total_flos": 2.090252903841792e+18,
"train_loss": 0.6325678546142578,
"train_runtime": 39813.0313,
"train_samples_per_second": 344.898,
"train_steps_per_second": 4.311
}
],
"logging_steps": 50,
"max_steps": 171650,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 10000,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 20,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 20
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.090252903841792e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}