{
"best_global_step": 72000,
"best_metric": 3.5322375297546387,
"best_model_checkpoint": "/scratch/cl5625/exceptions/models/last_to_hit_frequency_2128/checkpoint-30000",
"epoch": 29.129340480074575,
"eval_steps": 1000,
"global_step": 100000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01456536937776742,
"grad_norm": 0.8308652639389038,
"learning_rate": 0.000294,
"loss": 8.4387,
"step": 50
},
{
"epoch": 0.02913073875553484,
"grad_norm": 0.6382876038551331,
"learning_rate": 0.0005939999999999999,
"loss": 6.7184,
"step": 100
},
{
"epoch": 0.04369610813330226,
"grad_norm": 0.4452700912952423,
"learning_rate": 0.0005998286213931798,
"loss": 6.3602,
"step": 150
},
{
"epoch": 0.05826147751106968,
"grad_norm": 0.4627819061279297,
"learning_rate": 0.0005996537452637714,
"loss": 6.1529,
"step": 200
},
{
"epoch": 0.0728268468888371,
"grad_norm": 0.4908621311187744,
"learning_rate": 0.0005994788691343632,
"loss": 5.9753,
"step": 250
},
{
"epoch": 0.08739221626660452,
"grad_norm": 0.46045973896980286,
"learning_rate": 0.0005993039930049548,
"loss": 5.8546,
"step": 300
},
{
"epoch": 0.10195758564437195,
"grad_norm": 0.4518718123435974,
"learning_rate": 0.0005991291168755465,
"loss": 5.7234,
"step": 350
},
{
"epoch": 0.11652295502213936,
"grad_norm": 0.41379204392433167,
"learning_rate": 0.0005989542407461382,
"loss": 5.6087,
"step": 400
},
{
"epoch": 0.13108832439990678,
"grad_norm": 0.42392534017562866,
"learning_rate": 0.0005987793646167297,
"loss": 5.5138,
"step": 450
},
{
"epoch": 0.1456536937776742,
"grad_norm": 0.44991201162338257,
"learning_rate": 0.0005986044884873214,
"loss": 5.4074,
"step": 500
},
{
"epoch": 0.16021906315544163,
"grad_norm": 0.4449133574962616,
"learning_rate": 0.0005984296123579131,
"loss": 5.3554,
"step": 550
},
{
"epoch": 0.17478443253320905,
"grad_norm": 0.43911343812942505,
"learning_rate": 0.0005982547362285047,
"loss": 5.2571,
"step": 600
},
{
"epoch": 0.18934980191097647,
"grad_norm": 0.44633105397224426,
"learning_rate": 0.0005980798600990964,
"loss": 5.194,
"step": 650
},
{
"epoch": 0.2039151712887439,
"grad_norm": 0.5142127275466919,
"learning_rate": 0.0005979049839696881,
"loss": 5.1364,
"step": 700
},
{
"epoch": 0.2184805406665113,
"grad_norm": 0.42377275228500366,
"learning_rate": 0.0005977301078402798,
"loss": 5.0645,
"step": 750
},
{
"epoch": 0.23304591004427871,
"grad_norm": 0.49829915165901184,
"learning_rate": 0.0005975552317108715,
"loss": 5.0326,
"step": 800
},
{
"epoch": 0.24761127942204614,
"grad_norm": 0.4879177212715149,
"learning_rate": 0.0005973803555814631,
"loss": 4.9787,
"step": 850
},
{
"epoch": 0.26217664879981356,
"grad_norm": 0.40835854411125183,
"learning_rate": 0.0005972054794520547,
"loss": 4.9266,
"step": 900
},
{
"epoch": 0.276742018177581,
"grad_norm": 0.40832623839378357,
"learning_rate": 0.0005970306033226464,
"loss": 4.8749,
"step": 950
},
{
"epoch": 0.2913073875553484,
"grad_norm": 0.41372886300086975,
"learning_rate": 0.0005968557271932381,
"loss": 4.8229,
"step": 1000
},
{
"epoch": 0.2913073875553484,
"eval_accuracy": 0.2537010050443267,
"eval_loss": 4.7555251121521,
"eval_runtime": 182.9052,
"eval_samples_per_second": 91.003,
"eval_steps_per_second": 5.691,
"step": 1000
},
{
"epoch": 0.30587275693311583,
"grad_norm": 0.4627732038497925,
"learning_rate": 0.0005966808510638297,
"loss": 4.7732,
"step": 1050
},
{
"epoch": 0.32043812631088325,
"grad_norm": 0.5072389245033264,
"learning_rate": 0.0005965059749344214,
"loss": 4.7445,
"step": 1100
},
{
"epoch": 0.3350034956886507,
"grad_norm": 0.4509411156177521,
"learning_rate": 0.0005963310988050131,
"loss": 4.7006,
"step": 1150
},
{
"epoch": 0.3495688650664181,
"grad_norm": 0.4881671965122223,
"learning_rate": 0.0005961562226756047,
"loss": 4.6717,
"step": 1200
},
{
"epoch": 0.3641342344441855,
"grad_norm": 0.43225303292274475,
"learning_rate": 0.0005959813465461965,
"loss": 4.6353,
"step": 1250
},
{
"epoch": 0.37869960382195295,
"grad_norm": 0.40277963876724243,
"learning_rate": 0.000595806470416788,
"loss": 4.6007,
"step": 1300
},
{
"epoch": 0.39326497319972037,
"grad_norm": 0.5126720070838928,
"learning_rate": 0.0005956315942873797,
"loss": 4.5957,
"step": 1350
},
{
"epoch": 0.4078303425774878,
"grad_norm": 0.4341893196105957,
"learning_rate": 0.0005954567181579714,
"loss": 4.5667,
"step": 1400
},
{
"epoch": 0.42239571195525516,
"grad_norm": 0.466202974319458,
"learning_rate": 0.000595281842028563,
"loss": 4.5251,
"step": 1450
},
{
"epoch": 0.4369610813330226,
"grad_norm": 0.47109341621398926,
"learning_rate": 0.0005951069658991547,
"loss": 4.504,
"step": 1500
},
{
"epoch": 0.45152645071079,
"grad_norm": 0.39960241317749023,
"learning_rate": 0.0005949320897697464,
"loss": 4.4896,
"step": 1550
},
{
"epoch": 0.46609182008855743,
"grad_norm": 0.3836056590080261,
"learning_rate": 0.0005947572136403381,
"loss": 4.465,
"step": 1600
},
{
"epoch": 0.48065718946632485,
"grad_norm": 0.4436993896961212,
"learning_rate": 0.0005945823375109297,
"loss": 4.4483,
"step": 1650
},
{
"epoch": 0.4952225588440923,
"grad_norm": 0.49260610342025757,
"learning_rate": 0.0005944074613815215,
"loss": 4.4329,
"step": 1700
},
{
"epoch": 0.5097879282218597,
"grad_norm": 0.40900883078575134,
"learning_rate": 0.000594232585252113,
"loss": 4.4216,
"step": 1750
},
{
"epoch": 0.5243532975996271,
"grad_norm": 0.38395267724990845,
"learning_rate": 0.0005940577091227047,
"loss": 4.3901,
"step": 1800
},
{
"epoch": 0.5389186669773945,
"grad_norm": 0.4002784490585327,
"learning_rate": 0.0005938828329932964,
"loss": 4.3719,
"step": 1850
},
{
"epoch": 0.553484036355162,
"grad_norm": 0.39246320724487305,
"learning_rate": 0.000593707956863888,
"loss": 4.3556,
"step": 1900
},
{
"epoch": 0.5680494057329294,
"grad_norm": 0.429328054189682,
"learning_rate": 0.0005935330807344797,
"loss": 4.3495,
"step": 1950
},
{
"epoch": 0.5826147751106968,
"grad_norm": 0.37213221192359924,
"learning_rate": 0.0005933582046050714,
"loss": 4.3272,
"step": 2000
},
{
"epoch": 0.5826147751106968,
"eval_accuracy": 0.29963923746663224,
"eval_loss": 4.282708644866943,
"eval_runtime": 182.1675,
"eval_samples_per_second": 91.372,
"eval_steps_per_second": 5.715,
"step": 2000
},
{
"epoch": 0.5971801444884642,
"grad_norm": 0.4040358066558838,
"learning_rate": 0.000593183328475663,
"loss": 4.3313,
"step": 2050
},
{
"epoch": 0.6117455138662317,
"grad_norm": 0.4094908535480499,
"learning_rate": 0.0005930084523462546,
"loss": 4.3198,
"step": 2100
},
{
"epoch": 0.6263108832439991,
"grad_norm": 0.39454561471939087,
"learning_rate": 0.0005928335762168463,
"loss": 4.3,
"step": 2150
},
{
"epoch": 0.6408762526217665,
"grad_norm": 0.3934589624404907,
"learning_rate": 0.000592658700087438,
"loss": 4.3025,
"step": 2200
},
{
"epoch": 0.6554416219995339,
"grad_norm": 0.386088103055954,
"learning_rate": 0.0005924838239580297,
"loss": 4.2816,
"step": 2250
},
{
"epoch": 0.6700069913773014,
"grad_norm": 0.4144304096698761,
"learning_rate": 0.0005923089478286214,
"loss": 4.2738,
"step": 2300
},
{
"epoch": 0.6845723607550688,
"grad_norm": 0.4177938401699066,
"learning_rate": 0.000592134071699213,
"loss": 4.2481,
"step": 2350
},
{
"epoch": 0.6991377301328362,
"grad_norm": 0.36143994331359863,
"learning_rate": 0.0005919591955698047,
"loss": 4.2475,
"step": 2400
},
{
"epoch": 0.7137030995106036,
"grad_norm": 0.3758913278579712,
"learning_rate": 0.0005917843194403964,
"loss": 4.233,
"step": 2450
},
{
"epoch": 0.728268468888371,
"grad_norm": 0.3990520238876343,
"learning_rate": 0.000591609443310988,
"loss": 4.2258,
"step": 2500
},
{
"epoch": 0.7428338382661385,
"grad_norm": 0.341074138879776,
"learning_rate": 0.0005914345671815796,
"loss": 4.2438,
"step": 2550
},
{
"epoch": 0.7573992076439059,
"grad_norm": 0.3899039924144745,
"learning_rate": 0.0005912596910521713,
"loss": 4.2077,
"step": 2600
},
{
"epoch": 0.7719645770216733,
"grad_norm": 0.3767816424369812,
"learning_rate": 0.0005910848149227629,
"loss": 4.1915,
"step": 2650
},
{
"epoch": 0.7865299463994407,
"grad_norm": 0.3586917221546173,
"learning_rate": 0.0005909099387933547,
"loss": 4.192,
"step": 2700
},
{
"epoch": 0.8010953157772082,
"grad_norm": 0.3622867465019226,
"learning_rate": 0.0005907350626639463,
"loss": 4.1811,
"step": 2750
},
{
"epoch": 0.8156606851549756,
"grad_norm": 0.34509238600730896,
"learning_rate": 0.000590560186534538,
"loss": 4.1775,
"step": 2800
},
{
"epoch": 0.8302260545327429,
"grad_norm": 0.3676803708076477,
"learning_rate": 0.0005903853104051297,
"loss": 4.1742,
"step": 2850
},
{
"epoch": 0.8447914239105103,
"grad_norm": 0.34232082962989807,
"learning_rate": 0.0005902104342757214,
"loss": 4.1572,
"step": 2900
},
{
"epoch": 0.8593567932882777,
"grad_norm": 0.3571970760822296,
"learning_rate": 0.000590035558146313,
"loss": 4.1584,
"step": 2950
},
{
"epoch": 0.8739221626660452,
"grad_norm": 0.35053205490112305,
"learning_rate": 0.0005898606820169046,
"loss": 4.142,
"step": 3000
},
{
"epoch": 0.8739221626660452,
"eval_accuracy": 0.31524614092253395,
"eval_loss": 4.096119403839111,
"eval_runtime": 182.324,
"eval_samples_per_second": 91.294,
"eval_steps_per_second": 5.71,
"step": 3000
},
{
"epoch": 0.8884875320438126,
"grad_norm": 0.4020876884460449,
"learning_rate": 0.0005896858058874963,
"loss": 4.1395,
"step": 3050
},
{
"epoch": 0.90305290142158,
"grad_norm": 0.3728208839893341,
"learning_rate": 0.0005895109297580879,
"loss": 4.14,
"step": 3100
},
{
"epoch": 0.9176182707993474,
"grad_norm": 0.3634420335292816,
"learning_rate": 0.0005893360536286797,
"loss": 4.1265,
"step": 3150
},
{
"epoch": 0.9321836401771149,
"grad_norm": 0.3639736473560333,
"learning_rate": 0.0005891611774992713,
"loss": 4.1264,
"step": 3200
},
{
"epoch": 0.9467490095548823,
"grad_norm": 0.3608086407184601,
"learning_rate": 0.000588986301369863,
"loss": 4.1242,
"step": 3250
},
{
"epoch": 0.9613143789326497,
"grad_norm": 0.37832555174827576,
"learning_rate": 0.0005888114252404547,
"loss": 4.1139,
"step": 3300
},
{
"epoch": 0.9758797483104171,
"grad_norm": 0.3500097990036011,
"learning_rate": 0.0005886365491110463,
"loss": 4.0998,
"step": 3350
},
{
"epoch": 0.9904451176881846,
"grad_norm": 0.34508016705513,
"learning_rate": 0.000588461672981638,
"loss": 4.0876,
"step": 3400
},
{
"epoch": 1.0049522255884409,
"grad_norm": 0.346635103225708,
"learning_rate": 0.0005882867968522296,
"loss": 4.0698,
"step": 3450
},
{
"epoch": 1.0195175949662083,
"grad_norm": 0.351591020822525,
"learning_rate": 0.0005881119207228212,
"loss": 4.0152,
"step": 3500
},
{
"epoch": 1.0340829643439757,
"grad_norm": 0.342751145362854,
"learning_rate": 0.0005879370445934129,
"loss": 4.0189,
"step": 3550
},
{
"epoch": 1.0486483337217432,
"grad_norm": 0.34400904178619385,
"learning_rate": 0.0005877621684640046,
"loss": 4.0082,
"step": 3600
},
{
"epoch": 1.0632137030995106,
"grad_norm": 0.3556966483592987,
"learning_rate": 0.0005875872923345963,
"loss": 4.0164,
"step": 3650
},
{
"epoch": 1.077779072477278,
"grad_norm": 0.3546448349952698,
"learning_rate": 0.000587412416205188,
"loss": 4.0009,
"step": 3700
},
{
"epoch": 1.0923444418550454,
"grad_norm": 0.37118959426879883,
"learning_rate": 0.0005872375400757797,
"loss": 4.0167,
"step": 3750
},
{
"epoch": 1.1069098112328128,
"grad_norm": 0.3513905704021454,
"learning_rate": 0.0005870626639463713,
"loss": 4.0167,
"step": 3800
},
{
"epoch": 1.1214751806105803,
"grad_norm": 0.3534930348396301,
"learning_rate": 0.0005868877878169629,
"loss": 3.9929,
"step": 3850
},
{
"epoch": 1.1360405499883477,
"grad_norm": 0.3811454176902771,
"learning_rate": 0.0005867129116875546,
"loss": 3.9999,
"step": 3900
},
{
"epoch": 1.1506059193661151,
"grad_norm": 0.34738337993621826,
"learning_rate": 0.0005865380355581462,
"loss": 4.0029,
"step": 3950
},
{
"epoch": 1.1651712887438825,
"grad_norm": 0.3603108525276184,
"learning_rate": 0.0005863631594287379,
"loss": 3.9925,
"step": 4000
},
{
"epoch": 1.1651712887438825,
"eval_accuracy": 0.32528376909551887,
"eval_loss": 3.9905245304107666,
"eval_runtime": 182.0264,
"eval_samples_per_second": 91.443,
"eval_steps_per_second": 5.719,
"step": 4000
},
{
"epoch": 1.17973665812165,
"grad_norm": 0.3429189622402191,
"learning_rate": 0.0005861882832993296,
"loss": 3.9933,
"step": 4050
},
{
"epoch": 1.1943020274994174,
"grad_norm": 0.38075006008148193,
"learning_rate": 0.0005860134071699212,
"loss": 3.9855,
"step": 4100
},
{
"epoch": 1.2088673968771848,
"grad_norm": 0.3338114321231842,
"learning_rate": 0.000585838531040513,
"loss": 3.9794,
"step": 4150
},
{
"epoch": 1.2234327662549522,
"grad_norm": 0.36873266100883484,
"learning_rate": 0.0005856636549111046,
"loss": 3.9787,
"step": 4200
},
{
"epoch": 1.2379981356327197,
"grad_norm": 0.32824212312698364,
"learning_rate": 0.0005854887787816963,
"loss": 3.9766,
"step": 4250
},
{
"epoch": 1.252563505010487,
"grad_norm": 0.3516254723072052,
"learning_rate": 0.0005853139026522879,
"loss": 3.9651,
"step": 4300
},
{
"epoch": 1.2671288743882545,
"grad_norm": 0.32511937618255615,
"learning_rate": 0.0005851390265228796,
"loss": 3.9725,
"step": 4350
},
{
"epoch": 1.281694243766022,
"grad_norm": 0.32222479581832886,
"learning_rate": 0.0005849641503934712,
"loss": 3.9755,
"step": 4400
},
{
"epoch": 1.2962596131437893,
"grad_norm": 0.3308519423007965,
"learning_rate": 0.0005847892742640629,
"loss": 3.9635,
"step": 4450
},
{
"epoch": 1.3108249825215568,
"grad_norm": 0.32887038588523865,
"learning_rate": 0.0005846143981346546,
"loss": 3.9673,
"step": 4500
},
{
"epoch": 1.3253903518993242,
"grad_norm": 0.33978450298309326,
"learning_rate": 0.0005844395220052462,
"loss": 3.9637,
"step": 4550
},
{
"epoch": 1.3399557212770916,
"grad_norm": 0.3525462746620178,
"learning_rate": 0.000584264645875838,
"loss": 3.9552,
"step": 4600
},
{
"epoch": 1.354521090654859,
"grad_norm": 0.3444075882434845,
"learning_rate": 0.0005840897697464296,
"loss": 3.953,
"step": 4650
},
{
"epoch": 1.3690864600326265,
"grad_norm": 0.34169191122055054,
"learning_rate": 0.0005839148936170212,
"loss": 3.9486,
"step": 4700
},
{
"epoch": 1.3836518294103939,
"grad_norm": 0.3524395227432251,
"learning_rate": 0.0005837400174876129,
"loss": 3.941,
"step": 4750
},
{
"epoch": 1.3982171987881613,
"grad_norm": 0.3321269154548645,
"learning_rate": 0.0005835651413582045,
"loss": 3.9375,
"step": 4800
},
{
"epoch": 1.4127825681659287,
"grad_norm": 0.3419478237628937,
"learning_rate": 0.0005833902652287962,
"loss": 3.9489,
"step": 4850
},
{
"epoch": 1.4273479375436962,
"grad_norm": 0.33724281191825867,
"learning_rate": 0.0005832153890993879,
"loss": 3.939,
"step": 4900
},
{
"epoch": 1.4419133069214636,
"grad_norm": 0.32965749502182007,
"learning_rate": 0.0005830405129699796,
"loss": 3.9303,
"step": 4950
},
{
"epoch": 1.456478676299231,
"grad_norm": 0.32706567645072937,
"learning_rate": 0.0005828656368405712,
"loss": 3.9195,
"step": 5000
},
{
"epoch": 1.456478676299231,
"eval_accuracy": 0.33166874275109504,
"eval_loss": 3.9142563343048096,
"eval_runtime": 182.2224,
"eval_samples_per_second": 91.344,
"eval_steps_per_second": 5.713,
"step": 5000
},
{
"epoch": 1.4710440456769984,
"grad_norm": 0.33142364025115967,
"learning_rate": 0.0005826907607111629,
"loss": 3.9196,
"step": 5050
},
{
"epoch": 1.4856094150547658,
"grad_norm": 0.3274194896221161,
"learning_rate": 0.0005825158845817546,
"loss": 3.9284,
"step": 5100
},
{
"epoch": 1.500174784432533,
"grad_norm": 0.35101616382598877,
"learning_rate": 0.0005823410084523462,
"loss": 3.9313,
"step": 5150
},
{
"epoch": 1.5147401538103007,
"grad_norm": 0.3553934395313263,
"learning_rate": 0.0005821661323229379,
"loss": 3.9222,
"step": 5200
},
{
"epoch": 1.529305523188068,
"grad_norm": 0.32745224237442017,
"learning_rate": 0.0005819912561935295,
"loss": 3.9211,
"step": 5250
},
{
"epoch": 1.5438708925658355,
"grad_norm": 0.32173994183540344,
"learning_rate": 0.0005818163800641212,
"loss": 3.9237,
"step": 5300
},
{
"epoch": 1.5584362619436027,
"grad_norm": 0.3147367835044861,
"learning_rate": 0.0005816415039347129,
"loss": 3.9025,
"step": 5350
},
{
"epoch": 1.5730016313213704,
"grad_norm": 0.3226154148578644,
"learning_rate": 0.0005814666278053045,
"loss": 3.9197,
"step": 5400
},
{
"epoch": 1.5875670006991376,
"grad_norm": 0.3392418324947357,
"learning_rate": 0.0005812917516758962,
"loss": 3.9199,
"step": 5450
},
{
"epoch": 1.6021323700769052,
"grad_norm": 0.3240615427494049,
"learning_rate": 0.0005811168755464879,
"loss": 3.9066,
"step": 5500
},
{
"epoch": 1.6166977394546724,
"grad_norm": 0.3571517765522003,
"learning_rate": 0.0005809419994170794,
"loss": 3.9046,
"step": 5550
},
{
"epoch": 1.63126310883244,
"grad_norm": 0.3363195061683655,
"learning_rate": 0.0005807671232876712,
"loss": 3.895,
"step": 5600
},
{
"epoch": 1.6458284782102073,
"grad_norm": 0.35087713599205017,
"learning_rate": 0.0005805922471582628,
"loss": 3.895,
"step": 5650
},
{
"epoch": 1.660393847587975,
"grad_norm": 0.3502371907234192,
"learning_rate": 0.0005804173710288545,
"loss": 3.8907,
"step": 5700
},
{
"epoch": 1.6749592169657421,
"grad_norm": 0.37237074971199036,
"learning_rate": 0.0005802424948994462,
"loss": 3.8942,
"step": 5750
},
{
"epoch": 1.6895245863435098,
"grad_norm": 0.3460238575935364,
"learning_rate": 0.0005800676187700379,
"loss": 3.8973,
"step": 5800
},
{
"epoch": 1.704089955721277,
"grad_norm": 0.3247397243976593,
"learning_rate": 0.0005798927426406295,
"loss": 3.8874,
"step": 5850
},
{
"epoch": 1.7186553250990446,
"grad_norm": 0.37104010581970215,
"learning_rate": 0.0005797178665112212,
"loss": 3.8913,
"step": 5900
},
{
"epoch": 1.7332206944768118,
"grad_norm": 0.32479041814804077,
"learning_rate": 0.0005795429903818129,
"loss": 3.8934,
"step": 5950
},
{
"epoch": 1.7477860638545795,
"grad_norm": 0.34869810938835144,
"learning_rate": 0.0005793681142524044,
"loss": 3.8842,
"step": 6000
},
{
"epoch": 1.7477860638545795,
"eval_accuracy": 0.3369487966450319,
"eval_loss": 3.8599915504455566,
"eval_runtime": 182.1572,
"eval_samples_per_second": 91.377,
"eval_steps_per_second": 5.715,
"step": 6000
},
{
"epoch": 1.7623514332323467,
"grad_norm": 0.32061490416526794,
"learning_rate": 0.0005791932381229961,
"loss": 3.8863,
"step": 6050
},
{
"epoch": 1.7769168026101143,
"grad_norm": 0.3404031991958618,
"learning_rate": 0.0005790183619935878,
"loss": 3.882,
"step": 6100
},
{
"epoch": 1.7914821719878815,
"grad_norm": 0.30672210454940796,
"learning_rate": 0.0005788434858641795,
"loss": 3.8718,
"step": 6150
},
{
"epoch": 1.8060475413656492,
"grad_norm": 0.3539854884147644,
"learning_rate": 0.0005786686097347712,
"loss": 3.8862,
"step": 6200
},
{
"epoch": 1.8206129107434164,
"grad_norm": 0.33336907625198364,
"learning_rate": 0.0005784937336053628,
"loss": 3.8618,
"step": 6250
},
{
"epoch": 1.835178280121184,
"grad_norm": 0.3471635580062866,
"learning_rate": 0.0005783188574759545,
"loss": 3.8604,
"step": 6300
},
{
"epoch": 1.8497436494989512,
"grad_norm": 0.32666853070259094,
"learning_rate": 0.0005781439813465462,
"loss": 3.8604,
"step": 6350
},
{
"epoch": 1.8643090188767188,
"grad_norm": 0.3473672866821289,
"learning_rate": 0.0005779691052171379,
"loss": 3.8667,
"step": 6400
},
{
"epoch": 1.878874388254486,
"grad_norm": 0.3199038803577423,
"learning_rate": 0.0005777942290877294,
"loss": 3.8611,
"step": 6450
},
{
"epoch": 1.8934397576322537,
"grad_norm": 0.32697010040283203,
"learning_rate": 0.0005776193529583211,
"loss": 3.858,
"step": 6500
},
{
"epoch": 1.908005127010021,
"grad_norm": 0.37832486629486084,
"learning_rate": 0.0005774444768289128,
"loss": 3.8601,
"step": 6550
},
{
"epoch": 1.9225704963877885,
"grad_norm": 0.3379972279071808,
"learning_rate": 0.0005772696006995045,
"loss": 3.856,
"step": 6600
},
{
"epoch": 1.9371358657655557,
"grad_norm": 0.3384763300418854,
"learning_rate": 0.0005770947245700962,
"loss": 3.848,
"step": 6650
},
{
"epoch": 1.9517012351433234,
"grad_norm": 0.3093826472759247,
"learning_rate": 0.0005769198484406878,
"loss": 3.8563,
"step": 6700
},
{
"epoch": 1.9662666045210906,
"grad_norm": 0.32603582739830017,
"learning_rate": 0.0005767449723112795,
"loss": 3.8538,
"step": 6750
},
{
"epoch": 1.9808319738988582,
"grad_norm": 0.36787310242652893,
"learning_rate": 0.0005765700961818712,
"loss": 3.8543,
"step": 6800
},
{
"epoch": 1.9953973432766254,
"grad_norm": 0.32396772503852844,
"learning_rate": 0.0005763952200524627,
"loss": 3.8532,
"step": 6850
},
{
"epoch": 2.0099044511768818,
"grad_norm": 0.3264414072036743,
"learning_rate": 0.0005762203439230544,
"loss": 3.7844,
"step": 6900
},
{
"epoch": 2.0244698205546494,
"grad_norm": 0.3082588315010071,
"learning_rate": 0.0005760454677936461,
"loss": 3.722,
"step": 6950
},
{
"epoch": 2.0390351899324166,
"grad_norm": 0.3626100718975067,
"learning_rate": 0.0005758705916642378,
"loss": 3.7443,
"step": 7000
},
{
"epoch": 2.0390351899324166,
"eval_accuracy": 0.34116120036282,
"eval_loss": 3.816786766052246,
"eval_runtime": 182.2244,
"eval_samples_per_second": 91.343,
"eval_steps_per_second": 5.713,
"step": 7000
},
{
"epoch": 2.0536005593101843,
"grad_norm": 0.33928582072257996,
"learning_rate": 0.0005756957155348294,
"loss": 3.7456,
"step": 7050
},
{
"epoch": 2.0681659286879515,
"grad_norm": 0.33408093452453613,
"learning_rate": 0.0005755208394054211,
"loss": 3.7558,
"step": 7100
},
{
"epoch": 2.082731298065719,
"grad_norm": 0.35014262795448303,
"learning_rate": 0.0005753459632760128,
"loss": 3.7546,
"step": 7150
},
{
"epoch": 2.0972966674434863,
"grad_norm": 0.33521801233291626,
"learning_rate": 0.0005751710871466045,
"loss": 3.7488,
"step": 7200
},
{
"epoch": 2.111862036821254,
"grad_norm": 0.3408453166484833,
"learning_rate": 0.0005749962110171962,
"loss": 3.7638,
"step": 7250
},
{
"epoch": 2.126427406199021,
"grad_norm": 0.32431185245513916,
"learning_rate": 0.0005748213348877877,
"loss": 3.7519,
"step": 7300
},
{
"epoch": 2.140992775576789,
"grad_norm": 0.32259050011634827,
"learning_rate": 0.0005746464587583794,
"loss": 3.7608,
"step": 7350
},
{
"epoch": 2.155558144954556,
"grad_norm": 0.3296469748020172,
"learning_rate": 0.0005744715826289711,
"loss": 3.7617,
"step": 7400
},
{
"epoch": 2.1701235143323236,
"grad_norm": 0.3369705379009247,
"learning_rate": 0.0005742967064995627,
"loss": 3.747,
"step": 7450
},
{
"epoch": 2.184688883710091,
"grad_norm": 0.335363507270813,
"learning_rate": 0.0005741218303701544,
"loss": 3.7517,
"step": 7500
},
{
"epoch": 2.1992542530878585,
"grad_norm": 0.3429674208164215,
"learning_rate": 0.0005739469542407461,
"loss": 3.7613,
"step": 7550
},
{
"epoch": 2.2138196224656257,
"grad_norm": 0.3400017023086548,
"learning_rate": 0.0005737720781113378,
"loss": 3.7465,
"step": 7600
},
{
"epoch": 2.2283849918433933,
"grad_norm": 0.33040422201156616,
"learning_rate": 0.0005735972019819295,
"loss": 3.765,
"step": 7650
},
{
"epoch": 2.2429503612211605,
"grad_norm": 0.325589656829834,
"learning_rate": 0.000573422325852521,
"loss": 3.7555,
"step": 7700
},
{
"epoch": 2.257515730598928,
"grad_norm": 0.31000620126724243,
"learning_rate": 0.0005732474497231127,
"loss": 3.7614,
"step": 7750
},
{
"epoch": 2.2720810999766954,
"grad_norm": 0.3232748806476593,
"learning_rate": 0.0005730725735937044,
"loss": 3.7535,
"step": 7800
},
{
"epoch": 2.286646469354463,
"grad_norm": 0.3055737018585205,
"learning_rate": 0.0005728976974642961,
"loss": 3.7598,
"step": 7850
},
{
"epoch": 2.3012118387322302,
"grad_norm": 0.32002055644989014,
"learning_rate": 0.0005727228213348877,
"loss": 3.7501,
"step": 7900
},
{
"epoch": 2.3157772081099974,
"grad_norm": 0.3241938054561615,
"learning_rate": 0.0005725479452054794,
"loss": 3.7547,
"step": 7950
},
{
"epoch": 2.330342577487765,
"grad_norm": 0.3343994915485382,
"learning_rate": 0.0005723730690760711,
"loss": 3.7503,
"step": 8000
},
{
"epoch": 2.330342577487765,
"eval_accuracy": 0.3443282921418196,
"eval_loss": 3.7861814498901367,
"eval_runtime": 182.4203,
"eval_samples_per_second": 91.245,
"eval_steps_per_second": 5.707,
"step": 8000
},
{
"epoch": 2.3449079468655327,
"grad_norm": 0.32609114050865173,
"learning_rate": 0.0005721981929466627,
"loss": 3.7511,
"step": 8050
},
{
"epoch": 2.3594733162433,
"grad_norm": 0.3273298144340515,
"learning_rate": 0.0005720233168172545,
"loss": 3.7491,
"step": 8100
},
{
"epoch": 2.374038685621067,
"grad_norm": 0.31795287132263184,
"learning_rate": 0.000571848440687846,
"loss": 3.7475,
"step": 8150
},
{
"epoch": 2.3886040549988348,
"grad_norm": 0.3376888334751129,
"learning_rate": 0.0005716735645584377,
"loss": 3.7563,
"step": 8200
},
{
"epoch": 2.4031694243766024,
"grad_norm": 0.32242295145988464,
"learning_rate": 0.0005714986884290294,
"loss": 3.7462,
"step": 8250
},
{
"epoch": 2.4177347937543696,
"grad_norm": 0.31965371966362,
"learning_rate": 0.000571323812299621,
"loss": 3.7578,
"step": 8300
},
{
"epoch": 2.432300163132137,
"grad_norm": 0.3355007469654083,
"learning_rate": 0.0005711489361702127,
"loss": 3.7568,
"step": 8350
},
{
"epoch": 2.4468655325099045,
"grad_norm": 0.32753318548202515,
"learning_rate": 0.0005709740600408044,
"loss": 3.7353,
"step": 8400
},
{
"epoch": 2.461430901887672,
"grad_norm": 0.3319459855556488,
"learning_rate": 0.0005707991839113961,
"loss": 3.743,
"step": 8450
},
{
"epoch": 2.4759962712654393,
"grad_norm": 0.3193652927875519,
"learning_rate": 0.0005706243077819877,
"loss": 3.7468,
"step": 8500
},
{
"epoch": 2.4905616406432065,
"grad_norm": 0.32112497091293335,
"learning_rate": 0.0005704494316525793,
"loss": 3.7436,
"step": 8550
},
{
"epoch": 2.505127010020974,
"grad_norm": 0.3209002614021301,
"learning_rate": 0.000570274555523171,
"loss": 3.7432,
"step": 8600
},
{
"epoch": 2.519692379398742,
"grad_norm": 0.3239600956439972,
"learning_rate": 0.0005700996793937627,
"loss": 3.7495,
"step": 8650
},
{
"epoch": 2.534257748776509,
"grad_norm": 0.34214961528778076,
"learning_rate": 0.0005699248032643544,
"loss": 3.7486,
"step": 8700
},
{
"epoch": 2.548823118154276,
"grad_norm": 0.3178744316101074,
"learning_rate": 0.000569749927134946,
"loss": 3.7443,
"step": 8750
},
{
"epoch": 2.563388487532044,
"grad_norm": 0.3296307921409607,
"learning_rate": 0.0005695750510055377,
"loss": 3.7494,
"step": 8800
},
{
"epoch": 2.5779538569098115,
"grad_norm": 0.33302944898605347,
"learning_rate": 0.0005694001748761294,
"loss": 3.7445,
"step": 8850
},
{
"epoch": 2.5925192262875787,
"grad_norm": 0.33363667130470276,
"learning_rate": 0.000569225298746721,
"loss": 3.747,
"step": 8900
},
{
"epoch": 2.607084595665346,
"grad_norm": 0.33573073148727417,
"learning_rate": 0.0005690504226173127,
"loss": 3.731,
"step": 8950
},
{
"epoch": 2.6216499650431135,
"grad_norm": 0.3124948740005493,
"learning_rate": 0.0005688755464879043,
"loss": 3.7376,
"step": 9000
},
{
"epoch": 2.6216499650431135,
"eval_accuracy": 0.3468023107143004,
"eval_loss": 3.756934881210327,
"eval_runtime": 182.4097,
"eval_samples_per_second": 91.251,
"eval_steps_per_second": 5.707,
"step": 9000
},
{
"epoch": 2.636215334420881,
"grad_norm": 0.3117513954639435,
"learning_rate": 0.000568700670358496,
"loss": 3.7412,
"step": 9050
},
{
"epoch": 2.6507807037986484,
"grad_norm": 0.3153388798236847,
"learning_rate": 0.0005685257942290877,
"loss": 3.7424,
"step": 9100
},
{
"epoch": 2.6653460731764156,
"grad_norm": 0.31582581996917725,
"learning_rate": 0.0005683509180996793,
"loss": 3.7352,
"step": 9150
},
{
"epoch": 2.6799114425541832,
"grad_norm": 0.31198346614837646,
"learning_rate": 0.000568176041970271,
"loss": 3.7397,
"step": 9200
},
{
"epoch": 2.6944768119319504,
"grad_norm": 0.33701658248901367,
"learning_rate": 0.0005680011658408627,
"loss": 3.7386,
"step": 9250
},
{
"epoch": 2.709042181309718,
"grad_norm": 0.3240450918674469,
"learning_rate": 0.0005678262897114544,
"loss": 3.7343,
"step": 9300
},
{
"epoch": 2.7236075506874853,
"grad_norm": 0.31347861886024475,
"learning_rate": 0.000567651413582046,
"loss": 3.7271,
"step": 9350
},
{
"epoch": 2.738172920065253,
"grad_norm": 0.33607959747314453,
"learning_rate": 0.0005674765374526377,
"loss": 3.7378,
"step": 9400
},
{
"epoch": 2.75273828944302,
"grad_norm": 0.33370694518089294,
"learning_rate": 0.0005673016613232293,
"loss": 3.7344,
"step": 9450
},
{
"epoch": 2.7673036588207878,
"grad_norm": 0.29530900716781616,
"learning_rate": 0.0005671267851938209,
"loss": 3.7352,
"step": 9500
},
{
"epoch": 2.781869028198555,
"grad_norm": 0.3362729251384735,
"learning_rate": 0.0005669519090644127,
"loss": 3.7499,
"step": 9550
},
{
"epoch": 2.7964343975763226,
"grad_norm": 0.3185634911060333,
"learning_rate": 0.0005667770329350043,
"loss": 3.727,
"step": 9600
},
{
"epoch": 2.81099976695409,
"grad_norm": 0.3251460790634155,
"learning_rate": 0.000566602156805596,
"loss": 3.7348,
"step": 9650
},
{
"epoch": 2.8255651363318575,
"grad_norm": 0.32407787442207336,
"learning_rate": 0.0005664272806761877,
"loss": 3.7312,
"step": 9700
},
{
"epoch": 2.8401305057096247,
"grad_norm": 0.31047409772872925,
"learning_rate": 0.0005662524045467793,
"loss": 3.7414,
"step": 9750
},
{
"epoch": 2.8546958750873923,
"grad_norm": 0.3150789439678192,
"learning_rate": 0.000566077528417371,
"loss": 3.7292,
"step": 9800
},
{
"epoch": 2.8692612444651595,
"grad_norm": 0.32285672426223755,
"learning_rate": 0.0005659026522879626,
"loss": 3.7278,
"step": 9850
},
{
"epoch": 2.883826613842927,
"grad_norm": 0.3205214738845825,
"learning_rate": 0.0005657277761585543,
"loss": 3.7117,
"step": 9900
},
{
"epoch": 2.8983919832206944,
"grad_norm": 0.3268585503101349,
"learning_rate": 0.0005655529000291459,
"loss": 3.7298,
"step": 9950
},
{
"epoch": 2.912957352598462,
"grad_norm": 0.3318754732608795,
"learning_rate": 0.0005653780238997376,
"loss": 3.7244,
"step": 10000
},
{
"epoch": 2.912957352598462,
"eval_accuracy": 0.3494827816278579,
"eval_loss": 3.7275166511535645,
"eval_runtime": 182.3792,
"eval_samples_per_second": 91.266,
"eval_steps_per_second": 5.708,
"step": 10000
},
{
"epoch": 2.927522721976229,
"grad_norm": 0.32303711771965027,
"learning_rate": 0.0005652031477703293,
"loss": 3.7313,
"step": 10050
},
{
"epoch": 2.942088091353997,
"grad_norm": 0.3396250009536743,
"learning_rate": 0.000565028271640921,
"loss": 3.728,
"step": 10100
},
{
"epoch": 2.956653460731764,
"grad_norm": 0.31802433729171753,
"learning_rate": 0.0005648533955115127,
"loss": 3.7244,
"step": 10150
},
{
"epoch": 2.9712188301095317,
"grad_norm": 0.3270646333694458,
"learning_rate": 0.0005646785193821043,
"loss": 3.7336,
"step": 10200
},
{
"epoch": 2.985784199487299,
"grad_norm": 0.3221674859523773,
"learning_rate": 0.000564503643252696,
"loss": 3.7288,
"step": 10250
},
{
"epoch": 3.0002913073875552,
"grad_norm": 0.35366907715797424,
"learning_rate": 0.0005643287671232876,
"loss": 3.7151,
"step": 10300
},
{
"epoch": 3.014856676765323,
"grad_norm": 0.33569812774658203,
"learning_rate": 0.0005641538909938792,
"loss": 3.6103,
"step": 10350
},
{
"epoch": 3.02942204614309,
"grad_norm": 0.3463501036167145,
"learning_rate": 0.0005639790148644709,
"loss": 3.6142,
"step": 10400
},
{
"epoch": 3.0439874155208577,
"grad_norm": 0.3205231726169586,
"learning_rate": 0.0005638041387350626,
"loss": 3.6093,
"step": 10450
},
{
"epoch": 3.058552784898625,
"grad_norm": 0.31053611636161804,
"learning_rate": 0.0005636292626056543,
"loss": 3.6248,
"step": 10500
},
{
"epoch": 3.0731181542763926,
"grad_norm": 0.32655248045921326,
"learning_rate": 0.000563454386476246,
"loss": 3.6235,
"step": 10550
},
{
"epoch": 3.0876835236541598,
"grad_norm": 0.3263218104839325,
"learning_rate": 0.0005632795103468376,
"loss": 3.6202,
"step": 10600
},
{
"epoch": 3.1022488930319274,
"grad_norm": 0.32272443175315857,
"learning_rate": 0.0005631046342174293,
"loss": 3.6317,
"step": 10650
},
{
"epoch": 3.1168142624096946,
"grad_norm": 0.3152412474155426,
"learning_rate": 0.000562929758088021,
"loss": 3.6225,
"step": 10700
},
{
"epoch": 3.1313796317874623,
"grad_norm": 0.3140038251876831,
"learning_rate": 0.0005627548819586126,
"loss": 3.6281,
"step": 10750
},
{
"epoch": 3.1459450011652295,
"grad_norm": 0.3572128713130951,
"learning_rate": 0.0005625800058292042,
"loss": 3.629,
"step": 10800
},
{
"epoch": 3.160510370542997,
"grad_norm": 0.3352822959423065,
"learning_rate": 0.0005624051296997959,
"loss": 3.623,
"step": 10850
},
{
"epoch": 3.1750757399207643,
"grad_norm": 0.3205251097679138,
"learning_rate": 0.0005622302535703876,
"loss": 3.6395,
"step": 10900
},
{
"epoch": 3.189641109298532,
"grad_norm": 0.3109528720378876,
"learning_rate": 0.0005620553774409792,
"loss": 3.6292,
"step": 10950
},
{
"epoch": 3.204206478676299,
"grad_norm": 0.3360290825366974,
"learning_rate": 0.000561880501311571,
"loss": 3.6353,
"step": 11000
},
{
"epoch": 3.204206478676299,
"eval_accuracy": 0.35141351075380384,
"eval_loss": 3.7175660133361816,
"eval_runtime": 182.3999,
"eval_samples_per_second": 91.256,
"eval_steps_per_second": 5.707,
"step": 11000
},
{
"epoch": 3.218771848054067,
"grad_norm": 0.32028627395629883,
"learning_rate": 0.0005617056251821626,
"loss": 3.6444,
"step": 11050
},
{
"epoch": 3.233337217431834,
"grad_norm": 0.31713923811912537,
"learning_rate": 0.0005615307490527543,
"loss": 3.639,
"step": 11100
},
{
"epoch": 3.2479025868096016,
"grad_norm": 0.3299584686756134,
"learning_rate": 0.000561355872923346,
"loss": 3.6466,
"step": 11150
},
{
"epoch": 3.262467956187369,
"grad_norm": 0.3305450677871704,
"learning_rate": 0.0005611809967939375,
"loss": 3.6363,
"step": 11200
},
{
"epoch": 3.2770333255651365,
"grad_norm": 0.3444271385669708,
"learning_rate": 0.0005610061206645292,
"loss": 3.6381,
"step": 11250
},
{
"epoch": 3.2915986949429037,
"grad_norm": 0.31552445888519287,
"learning_rate": 0.0005608312445351209,
"loss": 3.6396,
"step": 11300
},
{
"epoch": 3.3061640643206713,
"grad_norm": 0.309539258480072,
"learning_rate": 0.0005606563684057126,
"loss": 3.6437,
"step": 11350
},
{
"epoch": 3.3207294336984385,
"grad_norm": 0.322343647480011,
"learning_rate": 0.0005604814922763042,
"loss": 3.6493,
"step": 11400
},
{
"epoch": 3.335294803076206,
"grad_norm": 0.3462202847003937,
"learning_rate": 0.0005603066161468959,
"loss": 3.6308,
"step": 11450
},
{
"epoch": 3.3498601724539734,
"grad_norm": 0.3419882357120514,
"learning_rate": 0.0005601317400174876,
"loss": 3.6382,
"step": 11500
},
{
"epoch": 3.364425541831741,
"grad_norm": 0.346147358417511,
"learning_rate": 0.0005599568638880793,
"loss": 3.632,
"step": 11550
},
{
"epoch": 3.3789909112095082,
"grad_norm": 0.32104918360710144,
"learning_rate": 0.0005597819877586709,
"loss": 3.6416,
"step": 11600
},
{
"epoch": 3.393556280587276,
"grad_norm": 0.3208399713039398,
"learning_rate": 0.0005596071116292625,
"loss": 3.6589,
"step": 11650
},
{
"epoch": 3.408121649965043,
"grad_norm": 0.3355486989021301,
"learning_rate": 0.0005594322354998542,
"loss": 3.6351,
"step": 11700
},
{
"epoch": 3.4226870193428107,
"grad_norm": 0.329441100358963,
"learning_rate": 0.0005592573593704459,
"loss": 3.6544,
"step": 11750
},
{
"epoch": 3.437252388720578,
"grad_norm": 0.331617534160614,
"learning_rate": 0.0005590824832410375,
"loss": 3.6444,
"step": 11800
},
{
"epoch": 3.4518177580983456,
"grad_norm": 0.35520729422569275,
"learning_rate": 0.0005589076071116292,
"loss": 3.6517,
"step": 11850
},
{
"epoch": 3.4663831274761128,
"grad_norm": 0.32801005244255066,
"learning_rate": 0.0005587327309822209,
"loss": 3.6411,
"step": 11900
},
{
"epoch": 3.4809484968538804,
"grad_norm": 0.3370635509490967,
"learning_rate": 0.0005585578548528126,
"loss": 3.6359,
"step": 11950
},
{
"epoch": 3.4955138662316476,
"grad_norm": 0.31257134675979614,
"learning_rate": 0.0005583829787234043,
"loss": 3.6428,
"step": 12000
},
{
"epoch": 3.4955138662316476,
"eval_accuracy": 0.3529537910046269,
"eval_loss": 3.6976659297943115,
"eval_runtime": 182.5172,
"eval_samples_per_second": 91.197,
"eval_steps_per_second": 5.704,
"step": 12000
},
{
"epoch": 3.510079235609415,
"grad_norm": 0.3163146674633026,
"learning_rate": 0.0005582081025939958,
"loss": 3.6404,
"step": 12050
},
{
"epoch": 3.5246446049871825,
"grad_norm": 0.31714287400245667,
"learning_rate": 0.0005580332264645875,
"loss": 3.6287,
"step": 12100
},
{
"epoch": 3.53920997436495,
"grad_norm": 0.31056082248687744,
"learning_rate": 0.0005578583503351792,
"loss": 3.6548,
"step": 12150
},
{
"epoch": 3.5537753437427173,
"grad_norm": 0.31519201397895813,
"learning_rate": 0.0005576834742057709,
"loss": 3.6466,
"step": 12200
},
{
"epoch": 3.5683407131204845,
"grad_norm": 0.3620156943798065,
"learning_rate": 0.0005575085980763625,
"loss": 3.6459,
"step": 12250
},
{
"epoch": 3.582906082498252,
"grad_norm": 0.3160246014595032,
"learning_rate": 0.0005573337219469542,
"loss": 3.6571,
"step": 12300
},
{
"epoch": 3.59747145187602,
"grad_norm": 0.3447693884372711,
"learning_rate": 0.0005571588458175459,
"loss": 3.639,
"step": 12350
},
{
"epoch": 3.612036821253787,
"grad_norm": 0.31839776039123535,
"learning_rate": 0.0005569839696881374,
"loss": 3.6516,
"step": 12400
},
{
"epoch": 3.626602190631554,
"grad_norm": 0.3184966742992401,
"learning_rate": 0.0005568090935587292,
"loss": 3.6529,
"step": 12450
},
{
"epoch": 3.641167560009322,
"grad_norm": 0.3189146816730499,
"learning_rate": 0.0005566342174293208,
"loss": 3.6469,
"step": 12500
},
{
"epoch": 3.6557329293870895,
"grad_norm": 0.34218892455101013,
"learning_rate": 0.0005564593412999125,
"loss": 3.6535,
"step": 12550
},
{
"epoch": 3.6702982987648567,
"grad_norm": 0.3211210370063782,
"learning_rate": 0.0005562844651705042,
"loss": 3.6398,
"step": 12600
},
{
"epoch": 3.684863668142624,
"grad_norm": 0.31546565890312195,
"learning_rate": 0.0005561095890410958,
"loss": 3.6409,
"step": 12650
},
{
"epoch": 3.6994290375203915,
"grad_norm": 0.32176557183265686,
"learning_rate": 0.0005559347129116875,
"loss": 3.6457,
"step": 12700
},
{
"epoch": 3.713994406898159,
"grad_norm": 0.323650598526001,
"learning_rate": 0.0005557598367822792,
"loss": 3.6463,
"step": 12750
},
{
"epoch": 3.7285597762759264,
"grad_norm": 0.31617245078086853,
"learning_rate": 0.0005555849606528709,
"loss": 3.6484,
"step": 12800
},
{
"epoch": 3.7431251456536936,
"grad_norm": 0.3181900084018707,
"learning_rate": 0.0005554100845234624,
"loss": 3.659,
"step": 12850
},
{
"epoch": 3.7576905150314612,
"grad_norm": 0.3386443257331848,
"learning_rate": 0.0005552352083940541,
"loss": 3.6516,
"step": 12900
},
{
"epoch": 3.772255884409229,
"grad_norm": 0.33526474237442017,
"learning_rate": 0.0005550603322646458,
"loss": 3.6433,
"step": 12950
},
{
"epoch": 3.786821253786996,
"grad_norm": 0.3211400806903839,
"learning_rate": 0.0005548854561352375,
"loss": 3.646,
"step": 13000
},
{
"epoch": 3.786821253786996,
"eval_accuracy": 0.3546315102000507,
"eval_loss": 3.6797258853912354,
"eval_runtime": 182.657,
"eval_samples_per_second": 91.127,
"eval_steps_per_second": 5.699,
"step": 13000
},
{
"epoch": 3.8013866231647633,
"grad_norm": 0.31162765622138977,
"learning_rate": 0.0005547105800058292,
"loss": 3.6343,
"step": 13050
},
{
"epoch": 3.815951992542531,
"grad_norm": 0.292121559381485,
"learning_rate": 0.0005545357038764208,
"loss": 3.6399,
"step": 13100
},
{
"epoch": 3.8305173619202986,
"grad_norm": 0.354305237531662,
"learning_rate": 0.0005543608277470125,
"loss": 3.6457,
"step": 13150
},
{
"epoch": 3.8450827312980658,
"grad_norm": 0.3242047131061554,
"learning_rate": 0.0005541859516176042,
"loss": 3.6389,
"step": 13200
},
{
"epoch": 3.859648100675833,
"grad_norm": 0.3040805757045746,
"learning_rate": 0.0005540110754881958,
"loss": 3.6281,
"step": 13250
},
{
"epoch": 3.8742134700536006,
"grad_norm": 0.33070269227027893,
"learning_rate": 0.0005538361993587874,
"loss": 3.6411,
"step": 13300
},
{
"epoch": 3.888778839431368,
"grad_norm": 0.3205200135707855,
"learning_rate": 0.0005536613232293791,
"loss": 3.6402,
"step": 13350
},
{
"epoch": 3.9033442088091355,
"grad_norm": 0.3389040231704712,
"learning_rate": 0.0005534864470999708,
"loss": 3.6405,
"step": 13400
},
{
"epoch": 3.9179095781869027,
"grad_norm": 0.34000879526138306,
"learning_rate": 0.0005533115709705625,
"loss": 3.6363,
"step": 13450
},
{
"epoch": 3.9324749475646703,
"grad_norm": 0.31868523359298706,
"learning_rate": 0.0005531366948411541,
"loss": 3.6466,
"step": 13500
},
{
"epoch": 3.9470403169424375,
"grad_norm": 0.31396111845970154,
"learning_rate": 0.0005529618187117458,
"loss": 3.6298,
"step": 13550
},
{
"epoch": 3.961605686320205,
"grad_norm": 0.31641459465026855,
"learning_rate": 0.0005527869425823375,
"loss": 3.6324,
"step": 13600
},
{
"epoch": 3.9761710556979724,
"grad_norm": 0.3213881254196167,
"learning_rate": 0.0005526120664529292,
"loss": 3.6464,
"step": 13650
},
{
"epoch": 3.99073642507574,
"grad_norm": 0.32177630066871643,
"learning_rate": 0.0005524371903235207,
"loss": 3.6374,
"step": 13700
},
{
"epoch": 4.005243532975996,
"grad_norm": 0.32364439964294434,
"learning_rate": 0.0005522623141941124,
"loss": 3.6032,
"step": 13750
},
{
"epoch": 4.0198089023537635,
"grad_norm": 0.3256928026676178,
"learning_rate": 0.0005520874380647041,
"loss": 3.5313,
"step": 13800
},
{
"epoch": 4.034374271731531,
"grad_norm": 0.332454651594162,
"learning_rate": 0.0005519125619352957,
"loss": 3.5456,
"step": 13850
},
{
"epoch": 4.048939641109299,
"grad_norm": 0.34020113945007324,
"learning_rate": 0.0005517376858058875,
"loss": 3.5473,
"step": 13900
},
{
"epoch": 4.063505010487066,
"grad_norm": 0.3192497491836548,
"learning_rate": 0.0005515628096764791,
"loss": 3.5316,
"step": 13950
},
{
"epoch": 4.078070379864833,
"grad_norm": 0.31511300802230835,
"learning_rate": 0.0005513879335470708,
"loss": 3.5479,
"step": 14000
},
{
"epoch": 4.078070379864833,
"eval_accuracy": 0.3563426191818444,
"eval_loss": 3.672767162322998,
"eval_runtime": 182.5621,
"eval_samples_per_second": 91.174,
"eval_steps_per_second": 5.702,
"step": 14000
},
{
"epoch": 4.092635749242601,
"grad_norm": 0.3340079188346863,
"learning_rate": 0.0005512130574176625,
"loss": 3.5413,
"step": 14050
},
{
"epoch": 4.1072011186203685,
"grad_norm": 0.3118899464607239,
"learning_rate": 0.000551038181288254,
"loss": 3.5422,
"step": 14100
},
{
"epoch": 4.121766487998135,
"grad_norm": 0.3274664580821991,
"learning_rate": 0.0005508633051588457,
"loss": 3.5457,
"step": 14150
},
{
"epoch": 4.136331857375903,
"grad_norm": 0.3153150677680969,
"learning_rate": 0.0005506884290294374,
"loss": 3.56,
"step": 14200
},
{
"epoch": 4.150897226753671,
"grad_norm": 0.3385670781135559,
"learning_rate": 0.0005505135529000291,
"loss": 3.5466,
"step": 14250
},
{
"epoch": 4.165462596131438,
"grad_norm": 0.3047159016132355,
"learning_rate": 0.0005503386767706207,
"loss": 3.5501,
"step": 14300
},
{
"epoch": 4.180027965509205,
"grad_norm": 0.3238605260848999,
"learning_rate": 0.0005501638006412124,
"loss": 3.5547,
"step": 14350
},
{
"epoch": 4.194593334886973,
"grad_norm": 0.3106607496738434,
"learning_rate": 0.0005499889245118041,
"loss": 3.5582,
"step": 14400
},
{
"epoch": 4.20915870426474,
"grad_norm": 0.31510302424430847,
"learning_rate": 0.0005498140483823958,
"loss": 3.554,
"step": 14450
},
{
"epoch": 4.223724073642508,
"grad_norm": 0.3922960162162781,
"learning_rate": 0.0005496391722529875,
"loss": 3.5483,
"step": 14500
},
{
"epoch": 4.238289443020275,
"grad_norm": 0.333943635225296,
"learning_rate": 0.000549464296123579,
"loss": 3.56,
"step": 14550
},
{
"epoch": 4.252854812398042,
"grad_norm": 0.3157419264316559,
"learning_rate": 0.0005492894199941707,
"loss": 3.5645,
"step": 14600
},
{
"epoch": 4.26742018177581,
"grad_norm": 0.32612183690071106,
"learning_rate": 0.0005491145438647624,
"loss": 3.5556,
"step": 14650
},
{
"epoch": 4.281985551153578,
"grad_norm": 0.33562448620796204,
"learning_rate": 0.000548939667735354,
"loss": 3.5716,
"step": 14700
},
{
"epoch": 4.296550920531344,
"grad_norm": 0.32943233847618103,
"learning_rate": 0.0005487647916059457,
"loss": 3.5683,
"step": 14750
},
{
"epoch": 4.311116289909112,
"grad_norm": 0.32873958349227905,
"learning_rate": 0.0005485899154765374,
"loss": 3.5584,
"step": 14800
},
{
"epoch": 4.32568165928688,
"grad_norm": 0.3185548782348633,
"learning_rate": 0.0005484150393471291,
"loss": 3.5823,
"step": 14850
},
{
"epoch": 4.340247028664647,
"grad_norm": 0.32610881328582764,
"learning_rate": 0.0005482401632177208,
"loss": 3.5761,
"step": 14900
},
{
"epoch": 4.354812398042414,
"grad_norm": 0.31527790427207947,
"learning_rate": 0.0005480652870883124,
"loss": 3.5687,
"step": 14950
},
{
"epoch": 4.369377767420182,
"grad_norm": 0.3269599378108978,
"learning_rate": 0.000547890410958904,
"loss": 3.5782,
"step": 15000
},
{
"epoch": 4.369377767420182,
"eval_accuracy": 0.3571946465826318,
"eval_loss": 3.6619949340820312,
"eval_runtime": 182.5907,
"eval_samples_per_second": 91.16,
"eval_steps_per_second": 5.701,
"step": 15000
},
{
"epoch": 4.383943136797949,
"grad_norm": 0.31807681918144226,
"learning_rate": 0.0005477155348294957,
"loss": 3.5722,
"step": 15050
},
{
"epoch": 4.398508506175717,
"grad_norm": 0.3249761462211609,
"learning_rate": 0.0005475406587000874,
"loss": 3.58,
"step": 15100
},
{
"epoch": 4.413073875553484,
"grad_norm": 0.33852142095565796,
"learning_rate": 0.000547365782570679,
"loss": 3.5796,
"step": 15150
},
{
"epoch": 4.427639244931251,
"grad_norm": 0.32763510942459106,
"learning_rate": 0.0005471909064412707,
"loss": 3.5777,
"step": 15200
},
{
"epoch": 4.442204614309019,
"grad_norm": 0.31176745891571045,
"learning_rate": 0.0005470160303118624,
"loss": 3.5721,
"step": 15250
},
{
"epoch": 4.456769983686787,
"grad_norm": 0.35347744822502136,
"learning_rate": 0.000546841154182454,
"loss": 3.5759,
"step": 15300
},
{
"epoch": 4.471335353064553,
"grad_norm": 0.33067938685417175,
"learning_rate": 0.0005466662780530458,
"loss": 3.5752,
"step": 15350
},
{
"epoch": 4.485900722442321,
"grad_norm": 0.3238064646720886,
"learning_rate": 0.0005464914019236374,
"loss": 3.5855,
"step": 15400
},
{
"epoch": 4.500466091820089,
"grad_norm": 0.33181995153427124,
"learning_rate": 0.000546316525794229,
"loss": 3.5676,
"step": 15450
},
{
"epoch": 4.515031461197856,
"grad_norm": 0.32865026593208313,
"learning_rate": 0.0005461416496648207,
"loss": 3.5663,
"step": 15500
},
{
"epoch": 4.529596830575623,
"grad_norm": 0.30539754033088684,
"learning_rate": 0.0005459667735354123,
"loss": 3.572,
"step": 15550
},
{
"epoch": 4.544162199953391,
"grad_norm": 0.33434492349624634,
"learning_rate": 0.000545791897406004,
"loss": 3.5709,
"step": 15600
},
{
"epoch": 4.558727569331158,
"grad_norm": 0.32164353132247925,
"learning_rate": 0.0005456170212765957,
"loss": 3.5783,
"step": 15650
},
{
"epoch": 4.573292938708926,
"grad_norm": 0.3319690525531769,
"learning_rate": 0.0005454421451471874,
"loss": 3.58,
"step": 15700
},
{
"epoch": 4.587858308086693,
"grad_norm": 0.3449385464191437,
"learning_rate": 0.000545267269017779,
"loss": 3.58,
"step": 15750
},
{
"epoch": 4.6024236774644605,
"grad_norm": 0.32032108306884766,
"learning_rate": 0.0005450923928883708,
"loss": 3.5837,
"step": 15800
},
{
"epoch": 4.616989046842228,
"grad_norm": 0.3191685974597931,
"learning_rate": 0.0005449175167589623,
"loss": 3.5717,
"step": 15850
},
{
"epoch": 4.631554416219995,
"grad_norm": 0.32119137048721313,
"learning_rate": 0.000544742640629554,
"loss": 3.5932,
"step": 15900
},
{
"epoch": 4.6461197855977625,
"grad_norm": 0.33280646800994873,
"learning_rate": 0.0005445677645001457,
"loss": 3.5691,
"step": 15950
},
{
"epoch": 4.66068515497553,
"grad_norm": 0.32261455059051514,
"learning_rate": 0.0005443928883707373,
"loss": 3.5819,
"step": 16000
},
{
"epoch": 4.66068515497553,
"eval_accuracy": 0.3582682928119667,
"eval_loss": 3.6463451385498047,
"eval_runtime": 182.103,
"eval_samples_per_second": 91.404,
"eval_steps_per_second": 5.717,
"step": 16000
},
{
"epoch": 4.675250524353298,
"grad_norm": 0.3126870095729828,
"learning_rate": 0.000544218012241329,
"loss": 3.5687,
"step": 16050
},
{
"epoch": 4.689815893731065,
"grad_norm": 0.3362468183040619,
"learning_rate": 0.0005440431361119207,
"loss": 3.5788,
"step": 16100
},
{
"epoch": 4.704381263108832,
"grad_norm": 0.30104732513427734,
"learning_rate": 0.0005438682599825123,
"loss": 3.5875,
"step": 16150
},
{
"epoch": 4.7189466324866,
"grad_norm": 0.3225014805793762,
"learning_rate": 0.000543693383853104,
"loss": 3.5542,
"step": 16200
},
{
"epoch": 4.7335120018643675,
"grad_norm": 0.3089386522769928,
"learning_rate": 0.0005435185077236957,
"loss": 3.5733,
"step": 16250
},
{
"epoch": 4.748077371242134,
"grad_norm": 0.32772549986839294,
"learning_rate": 0.0005433436315942873,
"loss": 3.5776,
"step": 16300
},
{
"epoch": 4.762642740619902,
"grad_norm": 0.3394605815410614,
"learning_rate": 0.000543168755464879,
"loss": 3.5714,
"step": 16350
},
{
"epoch": 4.7772081099976695,
"grad_norm": 0.31668463349342346,
"learning_rate": 0.0005429938793354706,
"loss": 3.5802,
"step": 16400
},
{
"epoch": 4.791773479375437,
"grad_norm": 0.30830904841423035,
"learning_rate": 0.0005428190032060623,
"loss": 3.5706,
"step": 16450
},
{
"epoch": 4.806338848753205,
"grad_norm": 0.3307313621044159,
"learning_rate": 0.000542644127076654,
"loss": 3.5669,
"step": 16500
},
{
"epoch": 4.820904218130972,
"grad_norm": 0.3045443892478943,
"learning_rate": 0.0005424692509472457,
"loss": 3.5737,
"step": 16550
},
{
"epoch": 4.835469587508739,
"grad_norm": 0.3446267247200012,
"learning_rate": 0.0005422943748178373,
"loss": 3.5815,
"step": 16600
},
{
"epoch": 4.850034956886507,
"grad_norm": 0.33735302090644836,
"learning_rate": 0.000542119498688429,
"loss": 3.5774,
"step": 16650
},
{
"epoch": 4.864600326264274,
"grad_norm": 0.3310108184814453,
"learning_rate": 0.0005419446225590207,
"loss": 3.5777,
"step": 16700
},
{
"epoch": 4.879165695642041,
"grad_norm": 0.3099808692932129,
"learning_rate": 0.0005417697464296122,
"loss": 3.5811,
"step": 16750
},
{
"epoch": 4.893731065019809,
"grad_norm": 0.3206506371498108,
"learning_rate": 0.000541594870300204,
"loss": 3.5822,
"step": 16800
},
{
"epoch": 4.908296434397577,
"grad_norm": 0.31750303506851196,
"learning_rate": 0.0005414199941707956,
"loss": 3.5802,
"step": 16850
},
{
"epoch": 4.922861803775344,
"grad_norm": 0.33419302105903625,
"learning_rate": 0.0005412451180413873,
"loss": 3.572,
"step": 16900
},
{
"epoch": 4.937427173153111,
"grad_norm": 0.35304707288742065,
"learning_rate": 0.000541070241911979,
"loss": 3.5736,
"step": 16950
},
{
"epoch": 4.951992542530879,
"grad_norm": 0.3392590284347534,
"learning_rate": 0.0005408953657825706,
"loss": 3.5792,
"step": 17000
},
{
"epoch": 4.951992542530879,
"eval_accuracy": 0.35947126567865034,
"eval_loss": 3.634124279022217,
"eval_runtime": 181.9213,
"eval_samples_per_second": 91.496,
"eval_steps_per_second": 5.722,
"step": 17000
},
{
"epoch": 4.966557911908646,
"grad_norm": 0.3133297264575958,
"learning_rate": 0.0005407204896531623,
"loss": 3.574,
"step": 17050
},
{
"epoch": 4.981123281286413,
"grad_norm": 0.31224194169044495,
"learning_rate": 0.000540545613523754,
"loss": 3.5708,
"step": 17100
},
{
"epoch": 4.995688650664181,
"grad_norm": 0.3288334310054779,
"learning_rate": 0.0005403707373943456,
"loss": 3.5705,
"step": 17150
},
{
"epoch": 5.010195758564437,
"grad_norm": 0.343101441860199,
"learning_rate": 0.0005401958612649372,
"loss": 3.5059,
"step": 17200
},
{
"epoch": 5.024761127942204,
"grad_norm": 0.31737393140792847,
"learning_rate": 0.000540020985135529,
"loss": 3.4676,
"step": 17250
},
{
"epoch": 5.039326497319972,
"grad_norm": 0.3343667685985565,
"learning_rate": 0.0005398461090061206,
"loss": 3.4729,
"step": 17300
},
{
"epoch": 5.0538918666977395,
"grad_norm": 0.3518417477607727,
"learning_rate": 0.0005396712328767123,
"loss": 3.4788,
"step": 17350
},
{
"epoch": 5.068457236075507,
"grad_norm": 0.32810088992118835,
"learning_rate": 0.000539496356747304,
"loss": 3.4653,
"step": 17400
},
{
"epoch": 5.083022605453274,
"grad_norm": 0.3590436279773712,
"learning_rate": 0.0005393214806178956,
"loss": 3.4829,
"step": 17450
},
{
"epoch": 5.0975879748310415,
"grad_norm": 0.3379361629486084,
"learning_rate": 0.0005391466044884873,
"loss": 3.4851,
"step": 17500
},
{
"epoch": 5.112153344208809,
"grad_norm": 0.3168104588985443,
"learning_rate": 0.000538971728359079,
"loss": 3.4889,
"step": 17550
},
{
"epoch": 5.126718713586577,
"grad_norm": 0.33108997344970703,
"learning_rate": 0.0005387968522296705,
"loss": 3.492,
"step": 17600
},
{
"epoch": 5.141284082964344,
"grad_norm": 0.3419332802295685,
"learning_rate": 0.0005386219761002622,
"loss": 3.4919,
"step": 17650
},
{
"epoch": 5.155849452342111,
"grad_norm": 0.34570637345314026,
"learning_rate": 0.0005384470999708539,
"loss": 3.4986,
"step": 17700
},
{
"epoch": 5.170414821719879,
"grad_norm": 0.3261895477771759,
"learning_rate": 0.0005382722238414456,
"loss": 3.5012,
"step": 17750
},
{
"epoch": 5.1849801910976465,
"grad_norm": 0.34492751955986023,
"learning_rate": 0.0005380973477120373,
"loss": 3.4965,
"step": 17800
},
{
"epoch": 5.199545560475413,
"grad_norm": 0.3237833082675934,
"learning_rate": 0.000537922471582629,
"loss": 3.4965,
"step": 17850
},
{
"epoch": 5.214110929853181,
"grad_norm": 0.31047049164772034,
"learning_rate": 0.0005377475954532206,
"loss": 3.5017,
"step": 17900
},
{
"epoch": 5.228676299230949,
"grad_norm": 0.32261335849761963,
"learning_rate": 0.0005375727193238123,
"loss": 3.5174,
"step": 17950
},
{
"epoch": 5.243241668608716,
"grad_norm": 0.3363330662250519,
"learning_rate": 0.000537397843194404,
"loss": 3.512,
"step": 18000
},
{
"epoch": 5.243241668608716,
"eval_accuracy": 0.3601516413607749,
"eval_loss": 3.6384644508361816,
"eval_runtime": 181.9458,
"eval_samples_per_second": 91.483,
"eval_steps_per_second": 5.721,
"step": 18000
},
{
"epoch": 5.257807037986483,
"grad_norm": 0.3440595269203186,
"learning_rate": 0.0005372229670649955,
"loss": 3.505,
"step": 18050
},
{
"epoch": 5.272372407364251,
"grad_norm": 0.3164835572242737,
"learning_rate": 0.0005370480909355872,
"loss": 3.5013,
"step": 18100
},
{
"epoch": 5.286937776742018,
"grad_norm": 0.3304733335971832,
"learning_rate": 0.0005368732148061789,
"loss": 3.5155,
"step": 18150
},
{
"epoch": 5.301503146119786,
"grad_norm": 0.3306984305381775,
"learning_rate": 0.0005366983386767705,
"loss": 3.5053,
"step": 18200
},
{
"epoch": 5.316068515497553,
"grad_norm": 0.32221606373786926,
"learning_rate": 0.0005365234625473623,
"loss": 3.5078,
"step": 18250
},
{
"epoch": 5.33063388487532,
"grad_norm": 0.30464252829551697,
"learning_rate": 0.0005363485864179539,
"loss": 3.5094,
"step": 18300
},
{
"epoch": 5.345199254253088,
"grad_norm": 0.32405513525009155,
"learning_rate": 0.0005361737102885456,
"loss": 3.507,
"step": 18350
},
{
"epoch": 5.359764623630856,
"grad_norm": 0.33651819825172424,
"learning_rate": 0.0005359988341591373,
"loss": 3.5145,
"step": 18400
},
{
"epoch": 5.374329993008622,
"grad_norm": 0.357702374458313,
"learning_rate": 0.000535823958029729,
"loss": 3.5132,
"step": 18450
},
{
"epoch": 5.38889536238639,
"grad_norm": 0.3228895962238312,
"learning_rate": 0.0005356490819003205,
"loss": 3.4973,
"step": 18500
},
{
"epoch": 5.403460731764158,
"grad_norm": 0.3350990414619446,
"learning_rate": 0.0005354742057709122,
"loss": 3.5208,
"step": 18550
},
{
"epoch": 5.418026101141925,
"grad_norm": 0.34133604168891907,
"learning_rate": 0.0005352993296415039,
"loss": 3.5177,
"step": 18600
},
{
"epoch": 5.432591470519692,
"grad_norm": 0.32490041851997375,
"learning_rate": 0.0005351244535120955,
"loss": 3.5159,
"step": 18650
},
{
"epoch": 5.44715683989746,
"grad_norm": 0.32596027851104736,
"learning_rate": 0.0005349495773826873,
"loss": 3.5274,
"step": 18700
},
{
"epoch": 5.461722209275227,
"grad_norm": 0.3423188626766205,
"learning_rate": 0.0005347747012532789,
"loss": 3.53,
"step": 18750
},
{
"epoch": 5.476287578652995,
"grad_norm": 0.31081074476242065,
"learning_rate": 0.0005345998251238706,
"loss": 3.5269,
"step": 18800
},
{
"epoch": 5.490852948030762,
"grad_norm": 0.34136995673179626,
"learning_rate": 0.0005344249489944623,
"loss": 3.5268,
"step": 18850
},
{
"epoch": 5.505418317408529,
"grad_norm": 0.34362757205963135,
"learning_rate": 0.0005342500728650538,
"loss": 3.5227,
"step": 18900
},
{
"epoch": 5.519983686786297,
"grad_norm": 0.30831918120384216,
"learning_rate": 0.0005340751967356455,
"loss": 3.5222,
"step": 18950
},
{
"epoch": 5.534549056164065,
"grad_norm": 0.3135395646095276,
"learning_rate": 0.0005339003206062372,
"loss": 3.5208,
"step": 19000
},
{
"epoch": 5.534549056164065,
"eval_accuracy": 0.3609769804464003,
"eval_loss": 3.626232385635376,
"eval_runtime": 181.8603,
"eval_samples_per_second": 91.526,
"eval_steps_per_second": 5.724,
"step": 19000
},
{
"epoch": 5.549114425541831,
"grad_norm": 0.3581465482711792,
"learning_rate": 0.0005337254444768288,
"loss": 3.531,
"step": 19050
},
{
"epoch": 5.563679794919599,
"grad_norm": 0.32384639978408813,
"learning_rate": 0.0005335505683474205,
"loss": 3.5197,
"step": 19100
},
{
"epoch": 5.578245164297367,
"grad_norm": 0.3450806736946106,
"learning_rate": 0.0005333756922180122,
"loss": 3.5301,
"step": 19150
},
{
"epoch": 5.592810533675134,
"grad_norm": 0.32282331585884094,
"learning_rate": 0.0005332008160886039,
"loss": 3.5306,
"step": 19200
},
{
"epoch": 5.607375903052901,
"grad_norm": 0.3486621677875519,
"learning_rate": 0.0005330259399591956,
"loss": 3.5208,
"step": 19250
},
{
"epoch": 5.621941272430669,
"grad_norm": 0.3094702363014221,
"learning_rate": 0.0005328510638297873,
"loss": 3.5239,
"step": 19300
},
{
"epoch": 5.636506641808436,
"grad_norm": 0.3274450898170471,
"learning_rate": 0.0005326761877003788,
"loss": 3.5336,
"step": 19350
},
{
"epoch": 5.651072011186204,
"grad_norm": 0.3350226879119873,
"learning_rate": 0.0005325013115709705,
"loss": 3.5326,
"step": 19400
},
{
"epoch": 5.665637380563971,
"grad_norm": 0.3588801622390747,
"learning_rate": 0.0005323264354415622,
"loss": 3.5271,
"step": 19450
},
{
"epoch": 5.6802027499417385,
"grad_norm": 0.3390669524669647,
"learning_rate": 0.0005321515593121538,
"loss": 3.5297,
"step": 19500
},
{
"epoch": 5.694768119319506,
"grad_norm": 0.322145938873291,
"learning_rate": 0.0005319766831827455,
"loss": 3.5217,
"step": 19550
},
{
"epoch": 5.709333488697274,
"grad_norm": 0.35364869236946106,
"learning_rate": 0.0005318018070533372,
"loss": 3.5149,
"step": 19600
},
{
"epoch": 5.7238988580750405,
"grad_norm": 0.32203230261802673,
"learning_rate": 0.0005316269309239288,
"loss": 3.5356,
"step": 19650
},
{
"epoch": 5.738464227452808,
"grad_norm": 0.352469265460968,
"learning_rate": 0.0005314520547945206,
"loss": 3.532,
"step": 19700
},
{
"epoch": 5.753029596830576,
"grad_norm": null,
"learning_rate": 0.0005312771786651121,
"loss": 3.5222,
"step": 19750
},
{
"epoch": 5.7675949662083426,
"grad_norm": 0.3287372589111328,
"learning_rate": 0.0005311023025357038,
"loss": 3.5235,
"step": 19800
},
{
"epoch": 5.78216033558611,
"grad_norm": 0.3262624442577362,
"learning_rate": 0.0005309274264062955,
"loss": 3.5397,
"step": 19850
},
{
"epoch": 5.796725704963878,
"grad_norm": 0.3266109228134155,
"learning_rate": 0.0005307525502768872,
"loss": 3.5228,
"step": 19900
},
{
"epoch": 5.8112910743416455,
"grad_norm": 0.34884291887283325,
"learning_rate": 0.0005305776741474788,
"loss": 3.5194,
"step": 19950
},
{
"epoch": 5.825856443719413,
"grad_norm": 0.3074500858783722,
"learning_rate": 0.0005304027980180705,
"loss": 3.5273,
"step": 20000
},
{
"epoch": 5.825856443719413,
"eval_accuracy": 0.3620989478102355,
"eval_loss": 3.6148364543914795,
"eval_runtime": 181.8652,
"eval_samples_per_second": 91.524,
"eval_steps_per_second": 5.724,
"step": 20000
},
{
"epoch": 5.84042181309718,
"grad_norm": 0.3382808566093445,
"learning_rate": 0.0005302279218886622,
"loss": 3.5279,
"step": 20050
},
{
"epoch": 5.8549871824749475,
"grad_norm": 0.3046127259731293,
"learning_rate": 0.0005300530457592538,
"loss": 3.5388,
"step": 20100
},
{
"epoch": 5.869552551852715,
"grad_norm": 0.3430224657058716,
"learning_rate": 0.0005298781696298456,
"loss": 3.5282,
"step": 20150
},
{
"epoch": 5.884117921230482,
"grad_norm": 0.36001190543174744,
"learning_rate": 0.0005297032935004371,
"loss": 3.5291,
"step": 20200
},
{
"epoch": 5.89868329060825,
"grad_norm": 0.3140873312950134,
"learning_rate": 0.0005295284173710288,
"loss": 3.5389,
"step": 20250
},
{
"epoch": 5.913248659986017,
"grad_norm": 0.34070631861686707,
"learning_rate": 0.0005293535412416205,
"loss": 3.5397,
"step": 20300
},
{
"epoch": 5.927814029363785,
"grad_norm": 0.3694857954978943,
"learning_rate": 0.0005291786651122121,
"loss": 3.5365,
"step": 20350
},
{
"epoch": 5.9423793987415525,
"grad_norm": 0.32443273067474365,
"learning_rate": 0.0005290037889828038,
"loss": 3.5392,
"step": 20400
},
{
"epoch": 5.956944768119319,
"grad_norm": 0.3110935389995575,
"learning_rate": 0.0005288289128533955,
"loss": 3.5326,
"step": 20450
},
{
"epoch": 5.971510137497087,
"grad_norm": 0.3232935965061188,
"learning_rate": 0.0005286540367239872,
"loss": 3.5381,
"step": 20500
},
{
"epoch": 5.986075506874855,
"grad_norm": 0.3413400948047638,
"learning_rate": 0.0005284791605945788,
"loss": 3.5416,
"step": 20550
},
{
"epoch": 6.0005826147751105,
"grad_norm": 0.3248980641365051,
"learning_rate": 0.0005283042844651704,
"loss": 3.5249,
"step": 20600
},
{
"epoch": 6.015147984152878,
"grad_norm": 0.3668578565120697,
"learning_rate": 0.0005281294083357621,
"loss": 3.4226,
"step": 20650
},
{
"epoch": 6.029713353530646,
"grad_norm": 0.32745733857154846,
"learning_rate": 0.0005279545322063538,
"loss": 3.4291,
"step": 20700
},
{
"epoch": 6.044278722908413,
"grad_norm": 0.3670319616794586,
"learning_rate": 0.0005277796560769455,
"loss": 3.4327,
"step": 20750
},
{
"epoch": 6.05884409228618,
"grad_norm": 0.3462134897708893,
"learning_rate": 0.0005276047799475371,
"loss": 3.4398,
"step": 20800
},
{
"epoch": 6.073409461663948,
"grad_norm": 0.3434312641620636,
"learning_rate": 0.0005274299038181288,
"loss": 3.4347,
"step": 20850
},
{
"epoch": 6.087974831041715,
"grad_norm": 0.3447525203227997,
"learning_rate": 0.0005272550276887205,
"loss": 3.448,
"step": 20900
},
{
"epoch": 6.102540200419483,
"grad_norm": 0.3339570462703705,
"learning_rate": 0.0005270801515593121,
"loss": 3.4478,
"step": 20950
},
{
"epoch": 6.11710556979725,
"grad_norm": 0.32857051491737366,
"learning_rate": 0.0005269052754299037,
"loss": 3.4475,
"step": 21000
},
{
"epoch": 6.11710556979725,
"eval_accuracy": 0.3623171571183439,
"eval_loss": 3.619291067123413,
"eval_runtime": 182.0455,
"eval_samples_per_second": 91.433,
"eval_steps_per_second": 5.718,
"step": 21000
},
{
"epoch": 6.1316709391750175,
"grad_norm": 0.33486610651016235,
"learning_rate": 0.0005267303993004954,
"loss": 3.435,
"step": 21050
},
{
"epoch": 6.146236308552785,
"grad_norm": 0.3245387077331543,
"learning_rate": 0.000526555523171087,
"loss": 3.4504,
"step": 21100
},
{
"epoch": 6.160801677930552,
"grad_norm": 0.325870543718338,
"learning_rate": 0.0005263806470416788,
"loss": 3.4532,
"step": 21150
},
{
"epoch": 6.1753670473083195,
"grad_norm": 0.35105100274086,
"learning_rate": 0.0005262057709122704,
"loss": 3.4521,
"step": 21200
},
{
"epoch": 6.189932416686087,
"grad_norm": 0.3488394320011139,
"learning_rate": 0.0005260308947828621,
"loss": 3.4584,
"step": 21250
},
{
"epoch": 6.204497786063855,
"grad_norm": 0.3601958453655243,
"learning_rate": 0.0005258560186534538,
"loss": 3.4632,
"step": 21300
},
{
"epoch": 6.219063155441622,
"grad_norm": 0.320527583360672,
"learning_rate": 0.0005256811425240455,
"loss": 3.4616,
"step": 21350
},
{
"epoch": 6.233628524819389,
"grad_norm": 0.3193604648113251,
"learning_rate": 0.0005255062663946371,
"loss": 3.4545,
"step": 21400
},
{
"epoch": 6.248193894197157,
"grad_norm": 0.32400959730148315,
"learning_rate": 0.0005253313902652287,
"loss": 3.46,
"step": 21450
},
{
"epoch": 6.2627592635749245,
"grad_norm": 0.36129894852638245,
"learning_rate": 0.0005251565141358204,
"loss": 3.4582,
"step": 21500
},
{
"epoch": 6.277324632952691,
"grad_norm": 0.34856846928596497,
"learning_rate": 0.000524981638006412,
"loss": 3.4597,
"step": 21550
},
{
"epoch": 6.291890002330459,
"grad_norm": 0.35143759846687317,
"learning_rate": 0.0005248067618770038,
"loss": 3.4663,
"step": 21600
},
{
"epoch": 6.306455371708227,
"grad_norm": 0.3181470036506653,
"learning_rate": 0.0005246318857475954,
"loss": 3.4571,
"step": 21650
},
{
"epoch": 6.321020741085994,
"grad_norm": 0.3355952799320221,
"learning_rate": 0.0005244570096181871,
"loss": 3.4644,
"step": 21700
},
{
"epoch": 6.335586110463761,
"grad_norm": 0.3471963405609131,
"learning_rate": 0.0005242821334887788,
"loss": 3.4665,
"step": 21750
},
{
"epoch": 6.350151479841529,
"grad_norm": 0.322955846786499,
"learning_rate": 0.0005241072573593704,
"loss": 3.4713,
"step": 21800
},
{
"epoch": 6.364716849219296,
"grad_norm": 0.35527798533439636,
"learning_rate": 0.000523932381229962,
"loss": 3.4744,
"step": 21850
},
{
"epoch": 6.379282218597064,
"grad_norm": 0.3321806490421295,
"learning_rate": 0.0005237575051005537,
"loss": 3.4808,
"step": 21900
},
{
"epoch": 6.393847587974831,
"grad_norm": 0.33331242203712463,
"learning_rate": 0.0005235826289711454,
"loss": 3.4621,
"step": 21950
},
{
"epoch": 6.408412957352598,
"grad_norm": 0.3406297266483307,
"learning_rate": 0.000523407752841737,
"loss": 3.4806,
"step": 22000
},
{
"epoch": 6.408412957352598,
"eval_accuracy": 0.3629372195595958,
"eval_loss": 3.6126840114593506,
"eval_runtime": 179.7323,
"eval_samples_per_second": 92.61,
"eval_steps_per_second": 5.792,
"step": 22000
},
{
"epoch": 6.422978326730366,
"grad_norm": 0.3514094352722168,
"learning_rate": 0.0005232328767123287,
"loss": 3.4762,
"step": 22050
},
{
"epoch": 6.437543696108134,
"grad_norm": 0.33424749970436096,
"learning_rate": 0.0005230580005829204,
"loss": 3.4723,
"step": 22100
},
{
"epoch": 6.4521090654859,
"grad_norm": 0.3223506808280945,
"learning_rate": 0.0005228831244535121,
"loss": 3.4735,
"step": 22150
},
{
"epoch": 6.466674434863668,
"grad_norm": 0.3629089891910553,
"learning_rate": 0.0005227082483241038,
"loss": 3.4835,
"step": 22200
},
{
"epoch": 6.481239804241436,
"grad_norm": 0.3067444860935211,
"learning_rate": 0.0005225333721946954,
"loss": 3.4759,
"step": 22250
},
{
"epoch": 6.495805173619203,
"grad_norm": 0.31871816515922546,
"learning_rate": 0.000522358496065287,
"loss": 3.4852,
"step": 22300
},
{
"epoch": 6.51037054299697,
"grad_norm": 0.3342389464378357,
"learning_rate": 0.0005221836199358787,
"loss": 3.4748,
"step": 22350
},
{
"epoch": 6.524935912374738,
"grad_norm": 0.334839403629303,
"learning_rate": 0.0005220087438064703,
"loss": 3.48,
"step": 22400
},
{
"epoch": 6.539501281752505,
"grad_norm": 0.3808937072753906,
"learning_rate": 0.000521833867677062,
"loss": 3.4808,
"step": 22450
},
{
"epoch": 6.554066651130273,
"grad_norm": 0.3652092516422272,
"learning_rate": 0.0005216589915476537,
"loss": 3.5021,
"step": 22500
},
{
"epoch": 6.56863202050804,
"grad_norm": 0.32643789052963257,
"learning_rate": 0.0005214841154182454,
"loss": 3.4911,
"step": 22550
},
{
"epoch": 6.583197389885807,
"grad_norm": 0.3469211459159851,
"learning_rate": 0.0005213092392888371,
"loss": 3.4757,
"step": 22600
},
{
"epoch": 6.597762759263575,
"grad_norm": 0.3310937285423279,
"learning_rate": 0.0005211343631594287,
"loss": 3.4829,
"step": 22650
},
{
"epoch": 6.612328128641343,
"grad_norm": 0.3375169634819031,
"learning_rate": 0.0005209594870300204,
"loss": 3.4889,
"step": 22700
},
{
"epoch": 6.626893498019109,
"grad_norm": 0.3277340531349182,
"learning_rate": 0.000520784610900612,
"loss": 3.4734,
"step": 22750
},
{
"epoch": 6.641458867396877,
"grad_norm": 0.35384461283683777,
"learning_rate": 0.0005206097347712037,
"loss": 3.5004,
"step": 22800
},
{
"epoch": 6.656024236774645,
"grad_norm": 0.33254358172416687,
"learning_rate": 0.0005204348586417953,
"loss": 3.4922,
"step": 22850
},
{
"epoch": 6.670589606152412,
"grad_norm": 0.3284110426902771,
"learning_rate": 0.000520259982512387,
"loss": 3.4888,
"step": 22900
},
{
"epoch": 6.685154975530179,
"grad_norm": 0.32339197397232056,
"learning_rate": 0.0005200851063829787,
"loss": 3.4905,
"step": 22950
},
{
"epoch": 6.699720344907947,
"grad_norm": 0.33628493547439575,
"learning_rate": 0.0005199102302535703,
"loss": 3.4748,
"step": 23000
},
{
"epoch": 6.699720344907947,
"eval_accuracy": 0.36394831872432204,
"eval_loss": 3.600461006164551,
"eval_runtime": 179.7946,
"eval_samples_per_second": 92.578,
"eval_steps_per_second": 5.79,
"step": 23000
},
{
"epoch": 6.714285714285714,
"grad_norm": 0.3240261375904083,
"learning_rate": 0.0005197353541241621,
"loss": 3.4955,
"step": 23050
},
{
"epoch": 6.728851083663482,
"grad_norm": 0.3188318610191345,
"learning_rate": 0.0005195604779947537,
"loss": 3.483,
"step": 23100
},
{
"epoch": 6.743416453041249,
"grad_norm": 0.3339631259441376,
"learning_rate": 0.0005193856018653454,
"loss": 3.4883,
"step": 23150
},
{
"epoch": 6.7579818224190165,
"grad_norm": 0.3179808557033539,
"learning_rate": 0.000519210725735937,
"loss": 3.4943,
"step": 23200
},
{
"epoch": 6.772547191796784,
"grad_norm": 0.3453110456466675,
"learning_rate": 0.0005190358496065286,
"loss": 3.496,
"step": 23250
},
{
"epoch": 6.787112561174552,
"grad_norm": 0.32360783219337463,
"learning_rate": 0.0005188609734771203,
"loss": 3.4991,
"step": 23300
},
{
"epoch": 6.8016779305523185,
"grad_norm": 0.3246710002422333,
"learning_rate": 0.000518686097347712,
"loss": 3.4903,
"step": 23350
},
{
"epoch": 6.816243299930086,
"grad_norm": 0.344545841217041,
"learning_rate": 0.0005185112212183037,
"loss": 3.4918,
"step": 23400
},
{
"epoch": 6.830808669307854,
"grad_norm": 0.32257169485092163,
"learning_rate": 0.0005183363450888953,
"loss": 3.487,
"step": 23450
},
{
"epoch": 6.845374038685621,
"grad_norm": 0.3380378484725952,
"learning_rate": 0.000518161468959487,
"loss": 3.4881,
"step": 23500
},
{
"epoch": 6.859939408063388,
"grad_norm": 0.34541237354278564,
"learning_rate": 0.0005179865928300787,
"loss": 3.4939,
"step": 23550
},
{
"epoch": 6.874504777441156,
"grad_norm": 0.3542953431606293,
"learning_rate": 0.0005178117167006703,
"loss": 3.4869,
"step": 23600
},
{
"epoch": 6.8890701468189235,
"grad_norm": 0.3760510981082916,
"learning_rate": 0.000517636840571262,
"loss": 3.5067,
"step": 23650
},
{
"epoch": 6.903635516196691,
"grad_norm": 0.33901602029800415,
"learning_rate": 0.0005174619644418536,
"loss": 3.4951,
"step": 23700
},
{
"epoch": 6.918200885574458,
"grad_norm": 0.33704662322998047,
"learning_rate": 0.0005172870883124453,
"loss": 3.4922,
"step": 23750
},
{
"epoch": 6.9327662549522255,
"grad_norm": 0.32309016585350037,
"learning_rate": 0.000517112212183037,
"loss": 3.4908,
"step": 23800
},
{
"epoch": 6.947331624329993,
"grad_norm": 0.3241852819919586,
"learning_rate": 0.0005169373360536286,
"loss": 3.4964,
"step": 23850
},
{
"epoch": 6.961896993707761,
"grad_norm": 0.3242267370223999,
"learning_rate": 0.0005167624599242203,
"loss": 3.4817,
"step": 23900
},
{
"epoch": 6.976462363085528,
"grad_norm": 0.3280220925807953,
"learning_rate": 0.000516587583794812,
"loss": 3.4932,
"step": 23950
},
{
"epoch": 6.991027732463295,
"grad_norm": 0.3425884544849396,
"learning_rate": 0.0005164127076654037,
"loss": 3.4908,
"step": 24000
},
{
"epoch": 6.991027732463295,
"eval_accuracy": 0.3643506421361469,
"eval_loss": 3.5942769050598145,
"eval_runtime": 179.5597,
"eval_samples_per_second": 92.699,
"eval_steps_per_second": 5.798,
"step": 24000
},
{
"epoch": 7.005534840363552,
"grad_norm": 0.3431827127933502,
"learning_rate": 0.0005162378315359953,
"loss": 3.45,
"step": 24050
},
{
"epoch": 7.020100209741319,
"grad_norm": 0.36127012968063354,
"learning_rate": 0.0005160629554065869,
"loss": 3.3704,
"step": 24100
},
{
"epoch": 7.034665579119086,
"grad_norm": 0.36049118638038635,
"learning_rate": 0.0005158880792771786,
"loss": 3.3914,
"step": 24150
},
{
"epoch": 7.049230948496854,
"grad_norm": 0.3440174162387848,
"learning_rate": 0.0005157132031477703,
"loss": 3.3868,
"step": 24200
},
{
"epoch": 7.063796317874622,
"grad_norm": 0.38178542256355286,
"learning_rate": 0.000515538327018362,
"loss": 3.3956,
"step": 24250
},
{
"epoch": 7.0783616872523885,
"grad_norm": 0.32825422286987305,
"learning_rate": 0.0005153634508889536,
"loss": 3.3839,
"step": 24300
},
{
"epoch": 7.092927056630156,
"grad_norm": 0.34752145409584045,
"learning_rate": 0.0005151885747595453,
"loss": 3.3979,
"step": 24350
},
{
"epoch": 7.107492426007924,
"grad_norm": 0.33364078402519226,
"learning_rate": 0.000515013698630137,
"loss": 3.404,
"step": 24400
},
{
"epoch": 7.122057795385691,
"grad_norm": 0.3302494287490845,
"learning_rate": 0.0005148388225007285,
"loss": 3.4225,
"step": 24450
},
{
"epoch": 7.136623164763458,
"grad_norm": 0.33415162563323975,
"learning_rate": 0.0005146639463713203,
"loss": 3.401,
"step": 24500
},
{
"epoch": 7.151188534141226,
"grad_norm": 0.33528947830200195,
"learning_rate": 0.0005144890702419119,
"loss": 3.4015,
"step": 24550
},
{
"epoch": 7.165753903518993,
"grad_norm": 0.3421080410480499,
"learning_rate": 0.0005143141941125036,
"loss": 3.4205,
"step": 24600
},
{
"epoch": 7.180319272896761,
"grad_norm": 0.3325115442276001,
"learning_rate": 0.0005141393179830953,
"loss": 3.4281,
"step": 24650
},
{
"epoch": 7.194884642274528,
"grad_norm": 0.31258365511894226,
"learning_rate": 0.0005139644418536869,
"loss": 3.426,
"step": 24700
},
{
"epoch": 7.2094500116522955,
"grad_norm": 0.31508442759513855,
"learning_rate": 0.0005137895657242786,
"loss": 3.4143,
"step": 24750
},
{
"epoch": 7.224015381030063,
"grad_norm": 0.3417088985443115,
"learning_rate": 0.0005136146895948703,
"loss": 3.4357,
"step": 24800
},
{
"epoch": 7.238580750407831,
"grad_norm": 0.3098302185535431,
"learning_rate": 0.000513439813465462,
"loss": 3.4303,
"step": 24850
},
{
"epoch": 7.2531461197855975,
"grad_norm": 0.31606337428092957,
"learning_rate": 0.0005132649373360535,
"loss": 3.4353,
"step": 24900
},
{
"epoch": 7.267711489163365,
"grad_norm": 0.33023601770401,
"learning_rate": 0.0005130900612066452,
"loss": 3.4206,
"step": 24950
},
{
"epoch": 7.282276858541133,
"grad_norm": 0.33378899097442627,
"learning_rate": 0.0005129151850772369,
"loss": 3.4153,
"step": 25000
},
{
"epoch": 7.282276858541133,
"eval_accuracy": 0.364236481986269,
"eval_loss": 3.6024866104125977,
"eval_runtime": 179.4604,
"eval_samples_per_second": 92.75,
"eval_steps_per_second": 5.801,
"step": 25000
},
{
"epoch": 7.2968422279189,
"grad_norm": 0.3245193064212799,
"learning_rate": 0.0005127403089478286,
"loss": 3.427,
"step": 25050
},
{
"epoch": 7.311407597296667,
"grad_norm": 0.35483458638191223,
"learning_rate": 0.0005125654328184203,
"loss": 3.4314,
"step": 25100
},
{
"epoch": 7.325972966674435,
"grad_norm": 0.3492553234100342,
"learning_rate": 0.0005123905566890119,
"loss": 3.4336,
"step": 25150
},
{
"epoch": 7.3405383360522025,
"grad_norm": 0.35173410177230835,
"learning_rate": 0.0005122156805596036,
"loss": 3.4165,
"step": 25200
},
{
"epoch": 7.35510370542997,
"grad_norm": 0.3420160412788391,
"learning_rate": 0.0005120408044301953,
"loss": 3.4402,
"step": 25250
},
{
"epoch": 7.369669074807737,
"grad_norm": 0.3215605318546295,
"learning_rate": 0.0005118659283007868,
"loss": 3.447,
"step": 25300
},
{
"epoch": 7.384234444185505,
"grad_norm": 0.3140503764152527,
"learning_rate": 0.0005116910521713785,
"loss": 3.4381,
"step": 25350
},
{
"epoch": 7.398799813563272,
"grad_norm": 0.32911545038223267,
"learning_rate": 0.0005115161760419702,
"loss": 3.4403,
"step": 25400
},
{
"epoch": 7.413365182941039,
"grad_norm": 0.3454091548919678,
"learning_rate": 0.0005113412999125619,
"loss": 3.4435,
"step": 25450
},
{
"epoch": 7.427930552318807,
"grad_norm": 0.3304098844528198,
"learning_rate": 0.0005111664237831536,
"loss": 3.4443,
"step": 25500
},
{
"epoch": 7.442495921696574,
"grad_norm": 0.32890447974205017,
"learning_rate": 0.0005109915476537452,
"loss": 3.4347,
"step": 25550
},
{
"epoch": 7.457061291074342,
"grad_norm": 0.3333839476108551,
"learning_rate": 0.0005108166715243369,
"loss": 3.4441,
"step": 25600
},
{
"epoch": 7.471626660452109,
"grad_norm": 0.3388593792915344,
"learning_rate": 0.0005106417953949286,
"loss": 3.4551,
"step": 25650
},
{
"epoch": 7.486192029829876,
"grad_norm": 0.3506496846675873,
"learning_rate": 0.0005104669192655203,
"loss": 3.4465,
"step": 25700
},
{
"epoch": 7.500757399207644,
"grad_norm": 0.35972943902015686,
"learning_rate": 0.0005102920431361118,
"loss": 3.455,
"step": 25750
},
{
"epoch": 7.515322768585412,
"grad_norm": 0.3275600075721741,
"learning_rate": 0.0005101171670067035,
"loss": 3.4599,
"step": 25800
},
{
"epoch": 7.529888137963178,
"grad_norm": 0.3396972417831421,
"learning_rate": 0.0005099422908772952,
"loss": 3.4512,
"step": 25850
},
{
"epoch": 7.544453507340946,
"grad_norm": 0.3468742072582245,
"learning_rate": 0.0005097674147478868,
"loss": 3.4439,
"step": 25900
},
{
"epoch": 7.559018876718714,
"grad_norm": 0.3341714143753052,
"learning_rate": 0.0005095925386184786,
"loss": 3.454,
"step": 25950
},
{
"epoch": 7.573584246096481,
"grad_norm": 0.33167895674705505,
"learning_rate": 0.0005094176624890702,
"loss": 3.4552,
"step": 26000
},
{
"epoch": 7.573584246096481,
"eval_accuracy": 0.3649543623932247,
"eval_loss": 3.5933761596679688,
"eval_runtime": 179.6882,
"eval_samples_per_second": 92.633,
"eval_steps_per_second": 5.793,
"step": 26000
},
{
"epoch": 7.588149615474248,
"grad_norm": 0.38421186804771423,
"learning_rate": 0.0005092427863596619,
"loss": 3.4496,
"step": 26050
},
{
"epoch": 7.602714984852016,
"grad_norm": 0.3296069800853729,
"learning_rate": 0.0005090679102302536,
"loss": 3.4484,
"step": 26100
},
{
"epoch": 7.617280354229783,
"grad_norm": 0.33456072211265564,
"learning_rate": 0.0005088930341008451,
"loss": 3.4478,
"step": 26150
},
{
"epoch": 7.631845723607551,
"grad_norm": 0.34444794058799744,
"learning_rate": 0.0005087181579714368,
"loss": 3.4491,
"step": 26200
},
{
"epoch": 7.646411092985318,
"grad_norm": 0.3780238628387451,
"learning_rate": 0.0005085432818420285,
"loss": 3.451,
"step": 26250
},
{
"epoch": 7.660976462363085,
"grad_norm": 0.33494746685028076,
"learning_rate": 0.0005083684057126202,
"loss": 3.4543,
"step": 26300
},
{
"epoch": 7.675541831740853,
"grad_norm": 0.40819284319877625,
"learning_rate": 0.0005081935295832118,
"loss": 3.4655,
"step": 26350
},
{
"epoch": 7.690107201118621,
"grad_norm": 0.3251825273036957,
"learning_rate": 0.0005080186534538035,
"loss": 3.4459,
"step": 26400
},
{
"epoch": 7.704672570496387,
"grad_norm": 0.3159500062465668,
"learning_rate": 0.0005078437773243952,
"loss": 3.4561,
"step": 26450
},
{
"epoch": 7.719237939874155,
"grad_norm": 0.3354164958000183,
"learning_rate": 0.0005076689011949869,
"loss": 3.4639,
"step": 26500
},
{
"epoch": 7.733803309251923,
"grad_norm": 0.3452058732509613,
"learning_rate": 0.0005074940250655786,
"loss": 3.4627,
"step": 26550
},
{
"epoch": 7.74836867862969,
"grad_norm": 0.3344949781894684,
"learning_rate": 0.0005073191489361701,
"loss": 3.4492,
"step": 26600
},
{
"epoch": 7.762934048007457,
"grad_norm": 0.35478341579437256,
"learning_rate": 0.0005071442728067618,
"loss": 3.4452,
"step": 26650
},
{
"epoch": 7.777499417385225,
"grad_norm": 0.3661314845085144,
"learning_rate": 0.0005069693966773535,
"loss": 3.4577,
"step": 26700
},
{
"epoch": 7.792064786762992,
"grad_norm": 0.34170424938201904,
"learning_rate": 0.0005067945205479451,
"loss": 3.4678,
"step": 26750
},
{
"epoch": 7.80663015614076,
"grad_norm": 0.31290966272354126,
"learning_rate": 0.0005066196444185368,
"loss": 3.4606,
"step": 26800
},
{
"epoch": 7.821195525518527,
"grad_norm": 0.35089555382728577,
"learning_rate": 0.0005064447682891285,
"loss": 3.4679,
"step": 26850
},
{
"epoch": 7.8357608948962945,
"grad_norm": 0.33421048521995544,
"learning_rate": 0.0005062698921597202,
"loss": 3.4708,
"step": 26900
},
{
"epoch": 7.850326264274062,
"grad_norm": 0.35330483317375183,
"learning_rate": 0.0005060950160303119,
"loss": 3.4581,
"step": 26950
},
{
"epoch": 7.86489163365183,
"grad_norm": 0.3339422941207886,
"learning_rate": 0.0005059201399009035,
"loss": 3.4603,
"step": 27000
},
{
"epoch": 7.86489163365183,
"eval_accuracy": 0.3656727130788616,
"eval_loss": 3.585218906402588,
"eval_runtime": 179.5703,
"eval_samples_per_second": 92.694,
"eval_steps_per_second": 5.797,
"step": 27000
},
{
"epoch": 7.8794570030295965,
"grad_norm": 0.335092157125473,
"learning_rate": 0.0005057452637714951,
"loss": 3.4618,
"step": 27050
},
{
"epoch": 7.894022372407364,
"grad_norm": 0.35167837142944336,
"learning_rate": 0.0005055703876420868,
"loss": 3.4618,
"step": 27100
},
{
"epoch": 7.908587741785132,
"grad_norm": 0.3454788327217102,
"learning_rate": 0.0005053955115126785,
"loss": 3.4624,
"step": 27150
},
{
"epoch": 7.923153111162899,
"grad_norm": 0.35379868745803833,
"learning_rate": 0.0005052206353832701,
"loss": 3.4515,
"step": 27200
},
{
"epoch": 7.937718480540666,
"grad_norm": 0.35463815927505493,
"learning_rate": 0.0005050457592538618,
"loss": 3.4616,
"step": 27250
},
{
"epoch": 7.952283849918434,
"grad_norm": 0.36919155716896057,
"learning_rate": 0.0005048708831244535,
"loss": 3.465,
"step": 27300
},
{
"epoch": 7.9668492192962015,
"grad_norm": 0.32335364818573,
"learning_rate": 0.0005046960069950451,
"loss": 3.4751,
"step": 27350
},
{
"epoch": 7.981414588673969,
"grad_norm": 0.32492557168006897,
"learning_rate": 0.0005045211308656369,
"loss": 3.4596,
"step": 27400
},
{
"epoch": 7.995979958051736,
"grad_norm": 0.32239827513694763,
"learning_rate": 0.0005043462547362284,
"loss": 3.4709,
"step": 27450
},
{
"epoch": 8.010487065951992,
"grad_norm": 0.3487697243690491,
"learning_rate": 0.0005041713786068201,
"loss": 3.3878,
"step": 27500
},
{
"epoch": 8.02505243532976,
"grad_norm": 0.339937299489975,
"learning_rate": 0.0005039965024774118,
"loss": 3.3456,
"step": 27550
},
{
"epoch": 8.039617804707527,
"grad_norm": 0.34511151909828186,
"learning_rate": 0.0005038216263480034,
"loss": 3.3591,
"step": 27600
},
{
"epoch": 8.054183174085296,
"grad_norm": 0.33415067195892334,
"learning_rate": 0.0005036467502185951,
"loss": 3.351,
"step": 27650
},
{
"epoch": 8.068748543463062,
"grad_norm": 0.33265748620033264,
"learning_rate": 0.0005034718740891868,
"loss": 3.3549,
"step": 27700
},
{
"epoch": 8.08331391284083,
"grad_norm": 0.3668820261955261,
"learning_rate": 0.0005032969979597785,
"loss": 3.3766,
"step": 27750
},
{
"epoch": 8.097879282218598,
"grad_norm": 0.35505983233451843,
"learning_rate": 0.0005031221218303701,
"loss": 3.3705,
"step": 27800
},
{
"epoch": 8.112444651596364,
"grad_norm": 0.3510807752609253,
"learning_rate": 0.0005029472457009618,
"loss": 3.3713,
"step": 27850
},
{
"epoch": 8.127010020974131,
"grad_norm": 0.3338639736175537,
"learning_rate": 0.0005027723695715534,
"loss": 3.3701,
"step": 27900
},
{
"epoch": 8.1415753903519,
"grad_norm": 0.327267587184906,
"learning_rate": 0.0005025974934421451,
"loss": 3.3841,
"step": 27950
},
{
"epoch": 8.156140759729666,
"grad_norm": 0.3316822052001953,
"learning_rate": 0.0005024226173127368,
"loss": 3.3745,
"step": 28000
},
{
"epoch": 8.156140759729666,
"eval_accuracy": 0.36602271798739533,
"eval_loss": 3.592015266418457,
"eval_runtime": 179.543,
"eval_samples_per_second": 92.708,
"eval_steps_per_second": 5.798,
"step": 28000
},
{
"epoch": 8.170706129107435,
"grad_norm": 0.34737464785575867,
"learning_rate": 0.0005022477411833284,
"loss": 3.3776,
"step": 28050
},
{
"epoch": 8.185271498485202,
"grad_norm": 0.35966166853904724,
"learning_rate": 0.0005020728650539201,
"loss": 3.3925,
"step": 28100
},
{
"epoch": 8.199836867862969,
"grad_norm": 0.3718971312046051,
"learning_rate": 0.0005018979889245118,
"loss": 3.3919,
"step": 28150
},
{
"epoch": 8.214402237240737,
"grad_norm": 0.36448919773101807,
"learning_rate": 0.0005017231127951034,
"loss": 3.383,
"step": 28200
},
{
"epoch": 8.228967606618504,
"grad_norm": 0.3384815752506256,
"learning_rate": 0.0005015482366656951,
"loss": 3.395,
"step": 28250
},
{
"epoch": 8.24353297599627,
"grad_norm": 0.35150644183158875,
"learning_rate": 0.0005013733605362868,
"loss": 3.392,
"step": 28300
},
{
"epoch": 8.258098345374039,
"grad_norm": 0.3527531027793884,
"learning_rate": 0.0005011984844068784,
"loss": 3.3917,
"step": 28350
},
{
"epoch": 8.272663714751806,
"grad_norm": 0.35610780119895935,
"learning_rate": 0.0005010236082774701,
"loss": 3.4048,
"step": 28400
},
{
"epoch": 8.287229084129574,
"grad_norm": 0.3675138056278229,
"learning_rate": 0.0005008487321480617,
"loss": 3.4005,
"step": 28450
},
{
"epoch": 8.301794453507341,
"grad_norm": 0.33330920338630676,
"learning_rate": 0.0005006738560186534,
"loss": 3.4086,
"step": 28500
},
{
"epoch": 8.316359822885108,
"grad_norm": 0.3794533312320709,
"learning_rate": 0.0005004989798892451,
"loss": 3.4062,
"step": 28550
},
{
"epoch": 8.330925192262876,
"grad_norm": 0.33697935938835144,
"learning_rate": 0.0005003241037598368,
"loss": 3.4017,
"step": 28600
},
{
"epoch": 8.345490561640643,
"grad_norm": 0.3670228123664856,
"learning_rate": 0.0005001492276304284,
"loss": 3.4173,
"step": 28650
},
{
"epoch": 8.36005593101841,
"grad_norm": 0.3506808578968048,
"learning_rate": 0.0004999743515010201,
"loss": 3.405,
"step": 28700
},
{
"epoch": 8.374621300396178,
"grad_norm": 0.31867876648902893,
"learning_rate": 0.0004997994753716117,
"loss": 3.423,
"step": 28750
},
{
"epoch": 8.389186669773945,
"grad_norm": 0.34136125445365906,
"learning_rate": 0.0004996245992422033,
"loss": 3.4096,
"step": 28800
},
{
"epoch": 8.403752039151712,
"grad_norm": 0.36181262135505676,
"learning_rate": 0.0004994497231127951,
"loss": 3.4094,
"step": 28850
},
{
"epoch": 8.41831740852948,
"grad_norm": 0.36319324374198914,
"learning_rate": 0.0004992748469833867,
"loss": 3.4074,
"step": 28900
},
{
"epoch": 8.432882777907247,
"grad_norm": 0.31755128502845764,
"learning_rate": 0.0004990999708539784,
"loss": 3.4134,
"step": 28950
},
{
"epoch": 8.447448147285016,
"grad_norm": 0.34557104110717773,
"learning_rate": 0.0004989250947245701,
"loss": 3.406,
"step": 29000
},
{
"epoch": 8.447448147285016,
"eval_accuracy": 0.36604270483134926,
"eval_loss": 3.5855937004089355,
"eval_runtime": 179.6608,
"eval_samples_per_second": 92.647,
"eval_steps_per_second": 5.794,
"step": 29000
},
{
"epoch": 8.462013516662783,
"grad_norm": 0.3253464102745056,
"learning_rate": 0.0004987502185951617,
"loss": 3.4097,
"step": 29050
},
{
"epoch": 8.47657888604055,
"grad_norm": 0.3373265564441681,
"learning_rate": 0.0004985753424657534,
"loss": 3.4196,
"step": 29100
},
{
"epoch": 8.491144255418318,
"grad_norm": 0.3448173701763153,
"learning_rate": 0.000498400466336345,
"loss": 3.423,
"step": 29150
},
{
"epoch": 8.505709624796085,
"grad_norm": 0.33007627725601196,
"learning_rate": 0.0004982255902069367,
"loss": 3.4089,
"step": 29200
},
{
"epoch": 8.520274994173853,
"grad_norm": 0.3380190432071686,
"learning_rate": 0.0004980507140775283,
"loss": 3.4091,
"step": 29250
},
{
"epoch": 8.53484036355162,
"grad_norm": 0.33369770646095276,
"learning_rate": 0.0004978758379481201,
"loss": 3.4299,
"step": 29300
},
{
"epoch": 8.549405732929387,
"grad_norm": 0.3573598265647888,
"learning_rate": 0.0004977009618187117,
"loss": 3.4095,
"step": 29350
},
{
"epoch": 8.563971102307155,
"grad_norm": 0.3663587272167206,
"learning_rate": 0.0004975260856893034,
"loss": 3.4357,
"step": 29400
},
{
"epoch": 8.578536471684922,
"grad_norm": 0.3414006233215332,
"learning_rate": 0.0004973512095598951,
"loss": 3.4199,
"step": 29450
},
{
"epoch": 8.593101841062689,
"grad_norm": 0.34500792622566223,
"learning_rate": 0.0004971763334304867,
"loss": 3.4219,
"step": 29500
},
{
"epoch": 8.607667210440457,
"grad_norm": 0.32384201884269714,
"learning_rate": 0.0004970014573010784,
"loss": 3.4209,
"step": 29550
},
{
"epoch": 8.622232579818224,
"grad_norm": 0.351113498210907,
"learning_rate": 0.00049682658117167,
"loss": 3.425,
"step": 29600
},
{
"epoch": 8.63679794919599,
"grad_norm": 0.3571644127368927,
"learning_rate": 0.0004966517050422616,
"loss": 3.4208,
"step": 29650
},
{
"epoch": 8.65136331857376,
"grad_norm": 0.34789296984672546,
"learning_rate": 0.0004964768289128533,
"loss": 3.4295,
"step": 29700
},
{
"epoch": 8.665928687951526,
"grad_norm": 0.34940293431282043,
"learning_rate": 0.000496301952783445,
"loss": 3.4267,
"step": 29750
},
{
"epoch": 8.680494057329295,
"grad_norm": 0.34850820899009705,
"learning_rate": 0.0004961270766540367,
"loss": 3.4153,
"step": 29800
},
{
"epoch": 8.695059426707061,
"grad_norm": 0.35262537002563477,
"learning_rate": 0.0004959522005246284,
"loss": 3.4276,
"step": 29850
},
{
"epoch": 8.709624796084828,
"grad_norm": 0.33390718698501587,
"learning_rate": 0.00049577732439522,
"loss": 3.4387,
"step": 29900
},
{
"epoch": 8.724190165462597,
"grad_norm": 0.33877861499786377,
"learning_rate": 0.0004956024482658117,
"loss": 3.4404,
"step": 29950
},
{
"epoch": 8.738755534840363,
"grad_norm": 0.3310054838657379,
"learning_rate": 0.0004954275721364034,
"loss": 3.429,
"step": 30000
},
{
"epoch": 8.738755534840363,
"eval_accuracy": 0.36692130297762826,
"eval_loss": 3.57623028755188,
"eval_runtime": 179.5297,
"eval_samples_per_second": 92.714,
"eval_steps_per_second": 5.798,
"step": 30000
},
{
"epoch": 8.753320904218132,
"grad_norm": 0.3291216790676117,
"learning_rate": 0.000495252696006995,
"loss": 3.4293,
"step": 30050
},
{
"epoch": 8.767886273595899,
"grad_norm": 0.336401104927063,
"learning_rate": 0.0004950778198775866,
"loss": 3.4227,
"step": 30100
},
{
"epoch": 8.782451642973665,
"grad_norm": 0.3546486794948578,
"learning_rate": 0.0004949029437481783,
"loss": 3.4305,
"step": 30150
},
{
"epoch": 8.797017012351434,
"grad_norm": 0.3768438398838043,
"learning_rate": 0.00049472806761877,
"loss": 3.4396,
"step": 30200
},
{
"epoch": 8.8115823817292,
"grad_norm": 0.36031797528266907,
"learning_rate": 0.0004945531914893616,
"loss": 3.4374,
"step": 30250
},
{
"epoch": 8.826147751106967,
"grad_norm": 0.338821679353714,
"learning_rate": 0.0004943783153599534,
"loss": 3.434,
"step": 30300
},
{
"epoch": 8.840713120484736,
"grad_norm": 0.3513728976249695,
"learning_rate": 0.000494203439230545,
"loss": 3.4468,
"step": 30350
},
{
"epoch": 8.855278489862503,
"grad_norm": 0.3318538963794708,
"learning_rate": 0.0004940285631011367,
"loss": 3.4349,
"step": 30400
},
{
"epoch": 8.86984385924027,
"grad_norm": 0.35798031091690063,
"learning_rate": 0.0004938536869717284,
"loss": 3.4249,
"step": 30450
},
{
"epoch": 8.884409228618038,
"grad_norm": 0.32275164127349854,
"learning_rate": 0.0004936788108423199,
"loss": 3.4337,
"step": 30500
},
{
"epoch": 8.898974597995805,
"grad_norm": 0.3375926613807678,
"learning_rate": 0.0004935039347129116,
"loss": 3.4416,
"step": 30550
},
{
"epoch": 8.913539967373573,
"grad_norm": 0.3426482677459717,
"learning_rate": 0.0004933290585835033,
"loss": 3.4399,
"step": 30600
},
{
"epoch": 8.92810533675134,
"grad_norm": 0.3388661742210388,
"learning_rate": 0.000493154182454095,
"loss": 3.4365,
"step": 30650
},
{
"epoch": 8.942670706129107,
"grad_norm": 0.33085867762565613,
"learning_rate": 0.0004929793063246866,
"loss": 3.4426,
"step": 30700
},
{
"epoch": 8.957236075506875,
"grad_norm": 0.34728479385375977,
"learning_rate": 0.0004928044301952783,
"loss": 3.4563,
"step": 30750
},
{
"epoch": 8.971801444884642,
"grad_norm": 0.37290677428245544,
"learning_rate": 0.00049262955406587,
"loss": 3.4409,
"step": 30800
},
{
"epoch": 8.986366814262409,
"grad_norm": 0.3528863787651062,
"learning_rate": 0.0004924546779364617,
"loss": 3.4321,
"step": 30850
},
{
"epoch": 9.000873922162667,
"grad_norm": 0.33041954040527344,
"learning_rate": 0.0004922798018070533,
"loss": 3.4225,
"step": 30900
},
{
"epoch": 9.015439291540433,
"grad_norm": 0.3403935730457306,
"learning_rate": 0.0004921049256776449,
"loss": 3.3243,
"step": 30950
},
{
"epoch": 9.0300046609182,
"grad_norm": 0.350462406873703,
"learning_rate": 0.0004919300495482366,
"loss": 3.317,
"step": 31000
},
{
"epoch": 9.0300046609182,
"eval_accuracy": 0.3672527318782519,
"eval_loss": 3.5818264484405518,
"eval_runtime": 179.6159,
"eval_samples_per_second": 92.67,
"eval_steps_per_second": 5.796,
"step": 31000
},
{
"epoch": 9.044570030295969,
"grad_norm": 0.3548542857170105,
"learning_rate": 0.0004917551734188283,
"loss": 3.3339,
"step": 31050
},
{
"epoch": 9.059135399673735,
"grad_norm": 0.34578046202659607,
"learning_rate": 0.0004915802972894199,
"loss": 3.3305,
"step": 31100
},
{
"epoch": 9.073700769051504,
"grad_norm": 0.335111141204834,
"learning_rate": 0.0004914054211600116,
"loss": 3.337,
"step": 31150
},
{
"epoch": 9.08826613842927,
"grad_norm": 0.32308679819107056,
"learning_rate": 0.0004912305450306033,
"loss": 3.3521,
"step": 31200
},
{
"epoch": 9.102831507807037,
"grad_norm": 0.359343558549881,
"learning_rate": 0.000491055668901195,
"loss": 3.3517,
"step": 31250
},
{
"epoch": 9.117396877184806,
"grad_norm": 0.3571661710739136,
"learning_rate": 0.0004908807927717865,
"loss": 3.3586,
"step": 31300
},
{
"epoch": 9.131962246562573,
"grad_norm": 0.3565060496330261,
"learning_rate": 0.0004907059166423783,
"loss": 3.3453,
"step": 31350
},
{
"epoch": 9.14652761594034,
"grad_norm": 0.39377450942993164,
"learning_rate": 0.0004905310405129699,
"loss": 3.3478,
"step": 31400
},
{
"epoch": 9.161092985318108,
"grad_norm": 0.3553165793418884,
"learning_rate": 0.0004903561643835616,
"loss": 3.3566,
"step": 31450
},
{
"epoch": 9.175658354695875,
"grad_norm": 0.34331372380256653,
"learning_rate": 0.0004901812882541533,
"loss": 3.3631,
"step": 31500
},
{
"epoch": 9.190223724073643,
"grad_norm": 0.3618679344654083,
"learning_rate": 0.0004900064121247449,
"loss": 3.3571,
"step": 31550
},
{
"epoch": 9.20478909345141,
"grad_norm": 0.37382376194000244,
"learning_rate": 0.0004898315359953366,
"loss": 3.3748,
"step": 31600
},
{
"epoch": 9.219354462829177,
"grad_norm": 0.35998404026031494,
"learning_rate": 0.0004896566598659283,
"loss": 3.3669,
"step": 31650
},
{
"epoch": 9.233919832206945,
"grad_norm": 0.33387935161590576,
"learning_rate": 0.0004894817837365199,
"loss": 3.3586,
"step": 31700
},
{
"epoch": 9.248485201584712,
"grad_norm": 0.3467971682548523,
"learning_rate": 0.0004893069076071115,
"loss": 3.37,
"step": 31750
},
{
"epoch": 9.263050570962479,
"grad_norm": 0.33971235156059265,
"learning_rate": 0.0004891320314777032,
"loss": 3.3652,
"step": 31800
},
{
"epoch": 9.277615940340247,
"grad_norm": 0.33422431349754333,
"learning_rate": 0.0004889571553482949,
"loss": 3.3646,
"step": 31850
},
{
"epoch": 9.292181309718014,
"grad_norm": 0.3441632390022278,
"learning_rate": 0.0004887822792188866,
"loss": 3.3774,
"step": 31900
},
{
"epoch": 9.306746679095783,
"grad_norm": 0.34111490845680237,
"learning_rate": 0.0004886074030894782,
"loss": 3.37,
"step": 31950
},
{
"epoch": 9.32131204847355,
"grad_norm": 0.35753685235977173,
"learning_rate": 0.0004884325269600699,
"loss": 3.3833,
"step": 32000
},
{
"epoch": 9.32131204847355,
"eval_accuracy": 0.36719994309627957,
"eval_loss": 3.580444812774658,
"eval_runtime": 179.6886,
"eval_samples_per_second": 92.632,
"eval_steps_per_second": 5.793,
"step": 32000
},
{
"epoch": 9.335877417851316,
"grad_norm": 0.3657474219799042,
"learning_rate": 0.0004882576508306615,
"loss": 3.3801,
"step": 32050
},
{
"epoch": 9.350442787229085,
"grad_norm": 0.360665500164032,
"learning_rate": 0.00048808277470125327,
"loss": 3.3826,
"step": 32100
},
{
"epoch": 9.365008156606851,
"grad_norm": 0.3531738817691803,
"learning_rate": 0.0004879078985718449,
"loss": 3.379,
"step": 32150
},
{
"epoch": 9.379573525984618,
"grad_norm": 0.3487440347671509,
"learning_rate": 0.0004877330224424366,
"loss": 3.3753,
"step": 32200
},
{
"epoch": 9.394138895362387,
"grad_norm": 0.3362681269645691,
"learning_rate": 0.00048755814631302823,
"loss": 3.3867,
"step": 32250
},
{
"epoch": 9.408704264740154,
"grad_norm": 0.38282108306884766,
"learning_rate": 0.00048738327018361987,
"loss": 3.3906,
"step": 32300
},
{
"epoch": 9.423269634117922,
"grad_norm": 0.3605978786945343,
"learning_rate": 0.00048720839405421156,
"loss": 3.3758,
"step": 32350
},
{
"epoch": 9.437835003495689,
"grad_norm": 0.33404749631881714,
"learning_rate": 0.0004870335179248032,
"loss": 3.3921,
"step": 32400
},
{
"epoch": 9.452400372873456,
"grad_norm": 0.3609205186367035,
"learning_rate": 0.0004868586417953949,
"loss": 3.3738,
"step": 32450
},
{
"epoch": 9.466965742251224,
"grad_norm": 0.3603569567203522,
"learning_rate": 0.0004866837656659865,
"loss": 3.3808,
"step": 32500
},
{
"epoch": 9.48153111162899,
"grad_norm": 0.34669244289398193,
"learning_rate": 0.00048650888953657816,
"loss": 3.4071,
"step": 32550
},
{
"epoch": 9.496096481006758,
"grad_norm": 0.3603197932243347,
"learning_rate": 0.0004863340134071699,
"loss": 3.3825,
"step": 32600
},
{
"epoch": 9.510661850384526,
"grad_norm": 0.33809515833854675,
"learning_rate": 0.00048615913727776154,
"loss": 3.3967,
"step": 32650
},
{
"epoch": 9.525227219762293,
"grad_norm": 0.33064547181129456,
"learning_rate": 0.00048598426114835323,
"loss": 3.4032,
"step": 32700
},
{
"epoch": 9.53979258914006,
"grad_norm": 0.343467116355896,
"learning_rate": 0.00048580938501894486,
"loss": 3.4028,
"step": 32750
},
{
"epoch": 9.554357958517828,
"grad_norm": 0.3574856221675873,
"learning_rate": 0.00048563450888953655,
"loss": 3.4011,
"step": 32800
},
{
"epoch": 9.568923327895595,
"grad_norm": 0.373276025056839,
"learning_rate": 0.0004854596327601282,
"loss": 3.3909,
"step": 32850
},
{
"epoch": 9.583488697273363,
"grad_norm": 0.372328519821167,
"learning_rate": 0.0004852847566307198,
"loss": 3.399,
"step": 32900
},
{
"epoch": 9.59805406665113,
"grad_norm": 0.3499378561973572,
"learning_rate": 0.0004851098805013115,
"loss": 3.4031,
"step": 32950
},
{
"epoch": 9.612619436028897,
"grad_norm": 0.36343520879745483,
"learning_rate": 0.00048493500437190315,
"loss": 3.3999,
"step": 33000
},
{
"epoch": 9.612619436028897,
"eval_accuracy": 0.3681005267709079,
"eval_loss": 3.5709691047668457,
"eval_runtime": 179.7111,
"eval_samples_per_second": 92.621,
"eval_steps_per_second": 5.793,
"step": 33000
},
{
"epoch": 9.627184805406666,
"grad_norm": 0.33371296525001526,
"learning_rate": 0.0004847601282424949,
"loss": 3.4052,
"step": 33050
},
{
"epoch": 9.641750174784432,
"grad_norm": 0.3303377032279968,
"learning_rate": 0.00048458525211308653,
"loss": 3.4074,
"step": 33100
},
{
"epoch": 9.6563155441622,
"grad_norm": 0.35207274556159973,
"learning_rate": 0.00048441037598367817,
"loss": 3.4158,
"step": 33150
},
{
"epoch": 9.670880913539968,
"grad_norm": 0.365450918674469,
"learning_rate": 0.00048423549985426986,
"loss": 3.4032,
"step": 33200
},
{
"epoch": 9.685446282917734,
"grad_norm": 0.33346548676490784,
"learning_rate": 0.0004840606237248615,
"loss": 3.4133,
"step": 33250
},
{
"epoch": 9.700011652295503,
"grad_norm": 0.3496672511100769,
"learning_rate": 0.0004838857475954532,
"loss": 3.4026,
"step": 33300
},
{
"epoch": 9.71457702167327,
"grad_norm": 0.3341895341873169,
"learning_rate": 0.0004837108714660448,
"loss": 3.4104,
"step": 33350
},
{
"epoch": 9.729142391051036,
"grad_norm": 0.35880976915359497,
"learning_rate": 0.0004835359953366365,
"loss": 3.419,
"step": 33400
},
{
"epoch": 9.743707760428805,
"grad_norm": 0.36180543899536133,
"learning_rate": 0.00048336111920722815,
"loss": 3.3987,
"step": 33450
},
{
"epoch": 9.758273129806572,
"grad_norm": 0.33391448855400085,
"learning_rate": 0.0004831862430778198,
"loss": 3.4069,
"step": 33500
},
{
"epoch": 9.772838499184338,
"grad_norm": 0.335786908864975,
"learning_rate": 0.00048301136694841153,
"loss": 3.4064,
"step": 33550
},
{
"epoch": 9.787403868562107,
"grad_norm": 0.3425864279270172,
"learning_rate": 0.00048283649081900317,
"loss": 3.4015,
"step": 33600
},
{
"epoch": 9.801969237939874,
"grad_norm": 0.3565329313278198,
"learning_rate": 0.00048266161468959486,
"loss": 3.4078,
"step": 33650
},
{
"epoch": 9.816534607317642,
"grad_norm": 0.3550211489200592,
"learning_rate": 0.0004824867385601865,
"loss": 3.3977,
"step": 33700
},
{
"epoch": 9.831099976695409,
"grad_norm": 0.33708900213241577,
"learning_rate": 0.00048231186243077813,
"loss": 3.4104,
"step": 33750
},
{
"epoch": 9.845665346073176,
"grad_norm": 0.3294453024864197,
"learning_rate": 0.0004821369863013698,
"loss": 3.4108,
"step": 33800
},
{
"epoch": 9.860230715450944,
"grad_norm": 0.373319536447525,
"learning_rate": 0.00048196211017196146,
"loss": 3.4074,
"step": 33850
},
{
"epoch": 9.874796084828711,
"grad_norm": 0.34141167998313904,
"learning_rate": 0.00048178723404255315,
"loss": 3.4147,
"step": 33900
},
{
"epoch": 9.88936145420648,
"grad_norm": 0.35247910022735596,
"learning_rate": 0.0004816123579131448,
"loss": 3.4067,
"step": 33950
},
{
"epoch": 9.903926823584246,
"grad_norm": 0.3380388021469116,
"learning_rate": 0.0004814374817837364,
"loss": 3.4035,
"step": 34000
},
{
"epoch": 9.903926823584246,
"eval_accuracy": 0.3680470325709136,
"eval_loss": 3.5645980834960938,
"eval_runtime": 179.6385,
"eval_samples_per_second": 92.658,
"eval_steps_per_second": 5.795,
"step": 34000
},
{
"epoch": 9.918492192962013,
"grad_norm": 0.33066990971565247,
"learning_rate": 0.00048126260565432816,
"loss": 3.4149,
"step": 34050
},
{
"epoch": 9.933057562339782,
"grad_norm": 0.36766499280929565,
"learning_rate": 0.0004810877295249198,
"loss": 3.4174,
"step": 34100
},
{
"epoch": 9.947622931717548,
"grad_norm": 0.32511964440345764,
"learning_rate": 0.0004809128533955115,
"loss": 3.4018,
"step": 34150
},
{
"epoch": 9.962188301095315,
"grad_norm": 0.3310585618019104,
"learning_rate": 0.0004807379772661031,
"loss": 3.4235,
"step": 34200
},
{
"epoch": 9.976753670473084,
"grad_norm": 0.34995537996292114,
"learning_rate": 0.0004805631011366948,
"loss": 3.405,
"step": 34250
},
{
"epoch": 9.99131903985085,
"grad_norm": 0.3458772301673889,
"learning_rate": 0.00048038822500728645,
"loss": 3.4196,
"step": 34300
},
{
"epoch": 10.005826147751106,
"grad_norm": 0.3567534387111664,
"learning_rate": 0.0004802133488778781,
"loss": 3.3754,
"step": 34350
},
{
"epoch": 10.020391517128875,
"grad_norm": 0.34907981753349304,
"learning_rate": 0.0004800384727484698,
"loss": 3.3042,
"step": 34400
},
{
"epoch": 10.034956886506642,
"grad_norm": 0.36751100420951843,
"learning_rate": 0.0004798635966190614,
"loss": 3.3046,
"step": 34450
},
{
"epoch": 10.049522255884408,
"grad_norm": 0.3411542773246765,
"learning_rate": 0.00047968872048965316,
"loss": 3.3093,
"step": 34500
},
{
"epoch": 10.064087625262177,
"grad_norm": 0.35095176100730896,
"learning_rate": 0.0004795138443602448,
"loss": 3.3179,
"step": 34550
},
{
"epoch": 10.078652994639944,
"grad_norm": 0.34343641996383667,
"learning_rate": 0.00047933896823083643,
"loss": 3.3087,
"step": 34600
},
{
"epoch": 10.093218364017712,
"grad_norm": 0.39370444416999817,
"learning_rate": 0.0004791640921014281,
"loss": 3.3124,
"step": 34650
},
{
"epoch": 10.107783733395479,
"grad_norm": 0.36577534675598145,
"learning_rate": 0.00047898921597201976,
"loss": 3.3235,
"step": 34700
},
{
"epoch": 10.122349102773246,
"grad_norm": 0.38264644145965576,
"learning_rate": 0.00047881433984261145,
"loss": 3.3297,
"step": 34750
},
{
"epoch": 10.136914472151014,
"grad_norm": 0.3537810742855072,
"learning_rate": 0.0004786394637132031,
"loss": 3.3233,
"step": 34800
},
{
"epoch": 10.151479841528781,
"grad_norm": 0.36147990822792053,
"learning_rate": 0.0004784645875837948,
"loss": 3.3265,
"step": 34850
},
{
"epoch": 10.166045210906548,
"grad_norm": 0.36201244592666626,
"learning_rate": 0.0004782897114543864,
"loss": 3.328,
"step": 34900
},
{
"epoch": 10.180610580284316,
"grad_norm": 0.3463992178440094,
"learning_rate": 0.00047811483532497805,
"loss": 3.3514,
"step": 34950
},
{
"epoch": 10.195175949662083,
"grad_norm": 0.3376941382884979,
"learning_rate": 0.0004779399591955698,
"loss": 3.3507,
"step": 35000
},
{
"epoch": 10.195175949662083,
"eval_accuracy": 0.3683125048864894,
"eval_loss": 3.574247121810913,
"eval_runtime": 179.5052,
"eval_samples_per_second": 92.727,
"eval_steps_per_second": 5.799,
"step": 35000
},
{
"epoch": 10.209741319039852,
"grad_norm": 0.3373648226261139,
"learning_rate": 0.00047776508306616143,
"loss": 3.339,
"step": 35050
},
{
"epoch": 10.224306688417618,
"grad_norm": 0.34261074662208557,
"learning_rate": 0.0004775902069367531,
"loss": 3.344,
"step": 35100
},
{
"epoch": 10.238872057795385,
"grad_norm": 0.3796720802783966,
"learning_rate": 0.00047741533080734476,
"loss": 3.3424,
"step": 35150
},
{
"epoch": 10.253437427173154,
"grad_norm": 0.36016082763671875,
"learning_rate": 0.0004772404546779364,
"loss": 3.3428,
"step": 35200
},
{
"epoch": 10.26800279655092,
"grad_norm": 0.350201815366745,
"learning_rate": 0.0004770655785485281,
"loss": 3.3538,
"step": 35250
},
{
"epoch": 10.282568165928687,
"grad_norm": 0.3484781086444855,
"learning_rate": 0.0004768907024191197,
"loss": 3.3387,
"step": 35300
},
{
"epoch": 10.297133535306456,
"grad_norm": 0.33689433336257935,
"learning_rate": 0.0004767158262897114,
"loss": 3.3516,
"step": 35350
},
{
"epoch": 10.311698904684222,
"grad_norm": 0.3487689793109894,
"learning_rate": 0.00047654095016030305,
"loss": 3.346,
"step": 35400
},
{
"epoch": 10.326264274061991,
"grad_norm": 0.34821704030036926,
"learning_rate": 0.0004763660740308948,
"loss": 3.3566,
"step": 35450
},
{
"epoch": 10.340829643439758,
"grad_norm": 0.3616722524166107,
"learning_rate": 0.0004761911979014864,
"loss": 3.3588,
"step": 35500
},
{
"epoch": 10.355395012817525,
"grad_norm": 0.390299528837204,
"learning_rate": 0.00047601632177207806,
"loss": 3.3664,
"step": 35550
},
{
"epoch": 10.369960382195293,
"grad_norm": 0.35546815395355225,
"learning_rate": 0.00047584144564266975,
"loss": 3.3594,
"step": 35600
},
{
"epoch": 10.38452575157306,
"grad_norm": 0.35586267709732056,
"learning_rate": 0.0004756665695132614,
"loss": 3.3715,
"step": 35650
},
{
"epoch": 10.399091120950827,
"grad_norm": 0.3343265950679779,
"learning_rate": 0.0004754916933838531,
"loss": 3.3525,
"step": 35700
},
{
"epoch": 10.413656490328595,
"grad_norm": 0.350169837474823,
"learning_rate": 0.0004753168172544447,
"loss": 3.3714,
"step": 35750
},
{
"epoch": 10.428221859706362,
"grad_norm": 0.347483366727829,
"learning_rate": 0.00047514194112503635,
"loss": 3.3678,
"step": 35800
},
{
"epoch": 10.44278722908413,
"grad_norm": 0.3664875328540802,
"learning_rate": 0.00047496706499562804,
"loss": 3.3738,
"step": 35850
},
{
"epoch": 10.457352598461897,
"grad_norm": 0.35261833667755127,
"learning_rate": 0.0004747921888662197,
"loss": 3.374,
"step": 35900
},
{
"epoch": 10.471917967839664,
"grad_norm": 0.3594660758972168,
"learning_rate": 0.0004746173127368114,
"loss": 3.3644,
"step": 35950
},
{
"epoch": 10.486483337217432,
"grad_norm": 0.3424801528453827,
"learning_rate": 0.00047444243660740306,
"loss": 3.3689,
"step": 36000
},
{
"epoch": 10.486483337217432,
"eval_accuracy": 0.36841914057746694,
"eval_loss": 3.5697360038757324,
"eval_runtime": 179.5198,
"eval_samples_per_second": 92.72,
"eval_steps_per_second": 5.799,
"step": 36000
},
{
"epoch": 10.5010487065952,
"grad_norm": 0.3455258905887604,
"learning_rate": 0.0004742675604779947,
"loss": 3.3751,
"step": 36050
},
{
"epoch": 10.515614075972966,
"grad_norm": 0.3722718358039856,
"learning_rate": 0.0004740926843485864,
"loss": 3.3761,
"step": 36100
},
{
"epoch": 10.530179445350734,
"grad_norm": 0.3568621575832367,
"learning_rate": 0.000473917808219178,
"loss": 3.3765,
"step": 36150
},
{
"epoch": 10.544744814728501,
"grad_norm": 0.3658199906349182,
"learning_rate": 0.0004737429320897697,
"loss": 3.3724,
"step": 36200
},
{
"epoch": 10.55931018410627,
"grad_norm": 0.34567561745643616,
"learning_rate": 0.00047356805596036135,
"loss": 3.3713,
"step": 36250
},
{
"epoch": 10.573875553484037,
"grad_norm": 0.3556523621082306,
"learning_rate": 0.00047339317983095304,
"loss": 3.3746,
"step": 36300
},
{
"epoch": 10.588440922861803,
"grad_norm": 0.3559434115886688,
"learning_rate": 0.0004732183037015447,
"loss": 3.3639,
"step": 36350
},
{
"epoch": 10.603006292239572,
"grad_norm": 0.36187922954559326,
"learning_rate": 0.0004730434275721363,
"loss": 3.3817,
"step": 36400
},
{
"epoch": 10.617571661617339,
"grad_norm": 0.3516775369644165,
"learning_rate": 0.00047286855144272806,
"loss": 3.3719,
"step": 36450
},
{
"epoch": 10.632137030995105,
"grad_norm": 0.34939226508140564,
"learning_rate": 0.0004726936753133197,
"loss": 3.3685,
"step": 36500
},
{
"epoch": 10.646702400372874,
"grad_norm": 0.35714349150657654,
"learning_rate": 0.0004725187991839114,
"loss": 3.3735,
"step": 36550
},
{
"epoch": 10.66126776975064,
"grad_norm": 0.38368478417396545,
"learning_rate": 0.000472343923054503,
"loss": 3.3903,
"step": 36600
},
{
"epoch": 10.675833139128407,
"grad_norm": 0.3668145537376404,
"learning_rate": 0.00047216904692509465,
"loss": 3.389,
"step": 36650
},
{
"epoch": 10.690398508506176,
"grad_norm": 0.3444463908672333,
"learning_rate": 0.00047199417079568634,
"loss": 3.3861,
"step": 36700
},
{
"epoch": 10.704963877883943,
"grad_norm": 0.36496496200561523,
"learning_rate": 0.000471819294666278,
"loss": 3.3841,
"step": 36750
},
{
"epoch": 10.719529247261711,
"grad_norm": 0.37470269203186035,
"learning_rate": 0.00047164441853686967,
"loss": 3.384,
"step": 36800
},
{
"epoch": 10.734094616639478,
"grad_norm": 0.379410982131958,
"learning_rate": 0.0004714695424074613,
"loss": 3.376,
"step": 36850
},
{
"epoch": 10.748659986017245,
"grad_norm": 0.34845808148384094,
"learning_rate": 0.00047129466627805305,
"loss": 3.386,
"step": 36900
},
{
"epoch": 10.763225355395013,
"grad_norm": 0.33667710423469543,
"learning_rate": 0.0004711197901486447,
"loss": 3.3802,
"step": 36950
},
{
"epoch": 10.77779072477278,
"grad_norm": 0.33371374011039734,
"learning_rate": 0.0004709449140192363,
"loss": 3.379,
"step": 37000
},
{
"epoch": 10.77779072477278,
"eval_accuracy": 0.36916417957826586,
"eval_loss": 3.5598151683807373,
"eval_runtime": 179.634,
"eval_samples_per_second": 92.661,
"eval_steps_per_second": 5.795,
"step": 37000
},
{
"epoch": 10.792356094150549,
"grad_norm": 0.3444075584411621,
"learning_rate": 0.000470770037889828,
"loss": 3.3856,
"step": 37050
},
{
"epoch": 10.806921463528315,
"grad_norm": 0.34622007608413696,
"learning_rate": 0.00047059516176041965,
"loss": 3.3919,
"step": 37100
},
{
"epoch": 10.821486832906082,
"grad_norm": 0.39845308661460876,
"learning_rate": 0.00047042028563101134,
"loss": 3.3956,
"step": 37150
},
{
"epoch": 10.83605220228385,
"grad_norm": 0.36669787764549255,
"learning_rate": 0.000470245409501603,
"loss": 3.3887,
"step": 37200
},
{
"epoch": 10.850617571661617,
"grad_norm": 0.34656545519828796,
"learning_rate": 0.0004700705333721946,
"loss": 3.4005,
"step": 37250
},
{
"epoch": 10.865182941039384,
"grad_norm": 0.35354891419410706,
"learning_rate": 0.0004698956572427863,
"loss": 3.3945,
"step": 37300
},
{
"epoch": 10.879748310417153,
"grad_norm": 0.33106672763824463,
"learning_rate": 0.00046972078111337794,
"loss": 3.3908,
"step": 37350
},
{
"epoch": 10.89431367979492,
"grad_norm": 0.3322337567806244,
"learning_rate": 0.0004695459049839697,
"loss": 3.3792,
"step": 37400
},
{
"epoch": 10.908879049172686,
"grad_norm": 0.35176587104797363,
"learning_rate": 0.0004693710288545613,
"loss": 3.3877,
"step": 37450
},
{
"epoch": 10.923444418550455,
"grad_norm": 0.34353724122047424,
"learning_rate": 0.000469196152725153,
"loss": 3.3928,
"step": 37500
},
{
"epoch": 10.938009787928221,
"grad_norm": 0.36018845438957214,
"learning_rate": 0.00046902127659574465,
"loss": 3.3907,
"step": 37550
},
{
"epoch": 10.95257515730599,
"grad_norm": 0.3528198003768921,
"learning_rate": 0.0004688464004663363,
"loss": 3.3962,
"step": 37600
},
{
"epoch": 10.967140526683757,
"grad_norm": 0.34510186314582825,
"learning_rate": 0.000468671524336928,
"loss": 3.3842,
"step": 37650
},
{
"epoch": 10.981705896061523,
"grad_norm": 0.3542734384536743,
"learning_rate": 0.0004684966482075196,
"loss": 3.4001,
"step": 37700
},
{
"epoch": 10.996271265439292,
"grad_norm": 0.3544139266014099,
"learning_rate": 0.0004683217720781113,
"loss": 3.3989,
"step": 37750
},
{
"epoch": 11.010778373339548,
"grad_norm": 0.35023751854896545,
"learning_rate": 0.00046814689594870294,
"loss": 3.3096,
"step": 37800
},
{
"epoch": 11.025343742717315,
"grad_norm": 0.36776822805404663,
"learning_rate": 0.0004679720198192946,
"loss": 3.2977,
"step": 37850
},
{
"epoch": 11.039909112095083,
"grad_norm": 0.3768557608127594,
"learning_rate": 0.0004677971436898863,
"loss": 3.2838,
"step": 37900
},
{
"epoch": 11.05447448147285,
"grad_norm": 0.34867051243782043,
"learning_rate": 0.00046762226756047795,
"loss": 3.2998,
"step": 37950
},
{
"epoch": 11.069039850850617,
"grad_norm": 0.3468160331249237,
"learning_rate": 0.00046744739143106964,
"loss": 3.2905,
"step": 38000
},
{
"epoch": 11.069039850850617,
"eval_accuracy": 0.368822286976984,
"eval_loss": 3.57218337059021,
"eval_runtime": 179.7316,
"eval_samples_per_second": 92.61,
"eval_steps_per_second": 5.792,
"step": 38000
},
{
"epoch": 11.083605220228385,
"grad_norm": 0.35249587893486023,
"learning_rate": 0.0004672725153016613,
"loss": 3.3018,
"step": 38050
},
{
"epoch": 11.098170589606152,
"grad_norm": 0.3808625340461731,
"learning_rate": 0.00046709763917225297,
"loss": 3.297,
"step": 38100
},
{
"epoch": 11.11273595898392,
"grad_norm": 0.3879368007183075,
"learning_rate": 0.0004669227630428446,
"loss": 3.3146,
"step": 38150
},
{
"epoch": 11.127301328361687,
"grad_norm": 0.34145820140838623,
"learning_rate": 0.00046674788691343624,
"loss": 3.3006,
"step": 38200
},
{
"epoch": 11.141866697739454,
"grad_norm": 0.3693101108074188,
"learning_rate": 0.00046657301078402793,
"loss": 3.3096,
"step": 38250
},
{
"epoch": 11.156432067117223,
"grad_norm": 0.3697426915168762,
"learning_rate": 0.00046639813465461957,
"loss": 3.3111,
"step": 38300
},
{
"epoch": 11.17099743649499,
"grad_norm": 0.37008944153785706,
"learning_rate": 0.0004662232585252113,
"loss": 3.3214,
"step": 38350
},
{
"epoch": 11.185562805872756,
"grad_norm": 0.3367568254470825,
"learning_rate": 0.00046604838239580295,
"loss": 3.3141,
"step": 38400
},
{
"epoch": 11.200128175250525,
"grad_norm": 0.36272698640823364,
"learning_rate": 0.0004658735062663946,
"loss": 3.3173,
"step": 38450
},
{
"epoch": 11.214693544628291,
"grad_norm": 0.35574871301651,
"learning_rate": 0.0004656986301369863,
"loss": 3.3162,
"step": 38500
},
{
"epoch": 11.22925891400606,
"grad_norm": 0.3292505443096161,
"learning_rate": 0.0004655237540075779,
"loss": 3.3112,
"step": 38550
},
{
"epoch": 11.243824283383827,
"grad_norm": 0.36970898509025574,
"learning_rate": 0.0004653488778781696,
"loss": 3.3333,
"step": 38600
},
{
"epoch": 11.258389652761593,
"grad_norm": 0.3610363304615021,
"learning_rate": 0.00046517400174876124,
"loss": 3.3249,
"step": 38650
},
{
"epoch": 11.272955022139362,
"grad_norm": 0.38531872630119324,
"learning_rate": 0.0004649991256193529,
"loss": 3.3284,
"step": 38700
},
{
"epoch": 11.287520391517129,
"grad_norm": 0.37316545844078064,
"learning_rate": 0.00046482424948994457,
"loss": 3.3431,
"step": 38750
},
{
"epoch": 11.302085760894895,
"grad_norm": 0.4000358581542969,
"learning_rate": 0.0004646493733605362,
"loss": 3.3247,
"step": 38800
},
{
"epoch": 11.316651130272664,
"grad_norm": 0.3880660831928253,
"learning_rate": 0.00046447449723112795,
"loss": 3.337,
"step": 38850
},
{
"epoch": 11.33121649965043,
"grad_norm": 0.38193467259407043,
"learning_rate": 0.0004642996211017196,
"loss": 3.3412,
"step": 38900
},
{
"epoch": 11.3457818690282,
"grad_norm": 0.3892691433429718,
"learning_rate": 0.0004641247449723113,
"loss": 3.3442,
"step": 38950
},
{
"epoch": 11.360347238405966,
"grad_norm": 0.3644644618034363,
"learning_rate": 0.0004639498688429029,
"loss": 3.3332,
"step": 39000
},
{
"epoch": 11.360347238405966,
"eval_accuracy": 0.36906248181344165,
"eval_loss": 3.567659616470337,
"eval_runtime": 179.7805,
"eval_samples_per_second": 92.585,
"eval_steps_per_second": 5.79,
"step": 39000
},
{
"epoch": 11.374912607783733,
"grad_norm": 0.38239893317222595,
"learning_rate": 0.00046377499271349455,
"loss": 3.334,
"step": 39050
},
{
"epoch": 11.389477977161501,
"grad_norm": 0.34430813789367676,
"learning_rate": 0.00046360011658408624,
"loss": 3.3331,
"step": 39100
},
{
"epoch": 11.404043346539268,
"grad_norm": 0.3366773724555969,
"learning_rate": 0.00046342524045467787,
"loss": 3.3524,
"step": 39150
},
{
"epoch": 11.418608715917035,
"grad_norm": 0.3577640950679779,
"learning_rate": 0.00046325036432526956,
"loss": 3.3418,
"step": 39200
},
{
"epoch": 11.433174085294803,
"grad_norm": 0.3685474395751953,
"learning_rate": 0.0004630754881958612,
"loss": 3.348,
"step": 39250
},
{
"epoch": 11.44773945467257,
"grad_norm": 0.36754393577575684,
"learning_rate": 0.00046290061206645284,
"loss": 3.3386,
"step": 39300
},
{
"epoch": 11.462304824050339,
"grad_norm": 0.36672037839889526,
"learning_rate": 0.0004627257359370446,
"loss": 3.3531,
"step": 39350
},
{
"epoch": 11.476870193428105,
"grad_norm": 0.3429676592350006,
"learning_rate": 0.0004625508598076362,
"loss": 3.3398,
"step": 39400
},
{
"epoch": 11.491435562805872,
"grad_norm": 0.40767839550971985,
"learning_rate": 0.0004623759836782279,
"loss": 3.3367,
"step": 39450
},
{
"epoch": 11.50600093218364,
"grad_norm": 0.3494288921356201,
"learning_rate": 0.00046220110754881954,
"loss": 3.3617,
"step": 39500
},
{
"epoch": 11.520566301561407,
"grad_norm": 0.35878896713256836,
"learning_rate": 0.00046202623141941123,
"loss": 3.354,
"step": 39550
},
{
"epoch": 11.535131670939174,
"grad_norm": 0.3389423191547394,
"learning_rate": 0.00046185135529000287,
"loss": 3.3341,
"step": 39600
},
{
"epoch": 11.549697040316943,
"grad_norm": 0.35176849365234375,
"learning_rate": 0.0004616764791605945,
"loss": 3.3581,
"step": 39650
},
{
"epoch": 11.56426240969471,
"grad_norm": 0.33921658992767334,
"learning_rate": 0.0004615016030311862,
"loss": 3.3701,
"step": 39700
},
{
"epoch": 11.578827779072478,
"grad_norm": 0.35132601857185364,
"learning_rate": 0.00046132672690177783,
"loss": 3.3563,
"step": 39750
},
{
"epoch": 11.593393148450245,
"grad_norm": 0.34590718150138855,
"learning_rate": 0.0004611518507723696,
"loss": 3.3562,
"step": 39800
},
{
"epoch": 11.607958517828012,
"grad_norm": 0.3639914393424988,
"learning_rate": 0.0004609769746429612,
"loss": 3.36,
"step": 39850
},
{
"epoch": 11.62252388720578,
"grad_norm": 0.3800894021987915,
"learning_rate": 0.00046080209851355285,
"loss": 3.3671,
"step": 39900
},
{
"epoch": 11.637089256583547,
"grad_norm": 0.3536291718482971,
"learning_rate": 0.00046062722238414454,
"loss": 3.3553,
"step": 39950
},
{
"epoch": 11.651654625961314,
"grad_norm": 0.35299673676490784,
"learning_rate": 0.0004604523462547362,
"loss": 3.3602,
"step": 40000
},
{
"epoch": 11.651654625961314,
"eval_accuracy": 0.3690838794934393,
"eval_loss": 3.560605764389038,
"eval_runtime": 179.7311,
"eval_samples_per_second": 92.611,
"eval_steps_per_second": 5.792,
"step": 40000
},
{
"epoch": 11.666219995339082,
"grad_norm": 0.37060144543647766,
"learning_rate": 0.00046027747012532787,
"loss": 3.3573,
"step": 40050
},
{
"epoch": 11.680785364716849,
"grad_norm": 0.4053195118904114,
"learning_rate": 0.0004601025939959195,
"loss": 3.3733,
"step": 40100
},
{
"epoch": 11.695350734094617,
"grad_norm": 0.3602805733680725,
"learning_rate": 0.0004599277178665112,
"loss": 3.3565,
"step": 40150
},
{
"epoch": 11.709916103472384,
"grad_norm": 0.3374777138233185,
"learning_rate": 0.00045975284173710283,
"loss": 3.357,
"step": 40200
},
{
"epoch": 11.724481472850151,
"grad_norm": 0.369104266166687,
"learning_rate": 0.00045957796560769446,
"loss": 3.371,
"step": 40250
},
{
"epoch": 11.73904684222792,
"grad_norm": 0.3783121407032013,
"learning_rate": 0.0004594030894782862,
"loss": 3.3587,
"step": 40300
},
{
"epoch": 11.753612211605686,
"grad_norm": 0.3741343021392822,
"learning_rate": 0.00045922821334887785,
"loss": 3.3759,
"step": 40350
},
{
"epoch": 11.768177580983453,
"grad_norm": 0.40091201663017273,
"learning_rate": 0.00045905333721946954,
"loss": 3.363,
"step": 40400
},
{
"epoch": 11.782742950361222,
"grad_norm": 0.3418520390987396,
"learning_rate": 0.00045887846109006117,
"loss": 3.3579,
"step": 40450
},
{
"epoch": 11.797308319738988,
"grad_norm": 0.328216016292572,
"learning_rate": 0.0004587035849606528,
"loss": 3.368,
"step": 40500
},
{
"epoch": 11.811873689116755,
"grad_norm": 0.3601894676685333,
"learning_rate": 0.0004585287088312445,
"loss": 3.3796,
"step": 40550
},
{
"epoch": 11.826439058494524,
"grad_norm": 0.3456202447414398,
"learning_rate": 0.00045835383270183613,
"loss": 3.3721,
"step": 40600
},
{
"epoch": 11.84100442787229,
"grad_norm": 0.3935994803905487,
"learning_rate": 0.0004581789565724278,
"loss": 3.3641,
"step": 40650
},
{
"epoch": 11.855569797250059,
"grad_norm": 0.38910239934921265,
"learning_rate": 0.00045800408044301946,
"loss": 3.3686,
"step": 40700
},
{
"epoch": 11.870135166627826,
"grad_norm": 0.37248024344444275,
"learning_rate": 0.0004578292043136111,
"loss": 3.3777,
"step": 40750
},
{
"epoch": 11.884700536005592,
"grad_norm": 0.36761152744293213,
"learning_rate": 0.00045765432818420284,
"loss": 3.359,
"step": 40800
},
{
"epoch": 11.899265905383361,
"grad_norm": 0.36878830194473267,
"learning_rate": 0.0004574794520547945,
"loss": 3.374,
"step": 40850
},
{
"epoch": 11.913831274761128,
"grad_norm": 0.36202317476272583,
"learning_rate": 0.00045730457592538617,
"loss": 3.3762,
"step": 40900
},
{
"epoch": 11.928396644138896,
"grad_norm": 0.37495362758636475,
"learning_rate": 0.0004571296997959778,
"loss": 3.3604,
"step": 40950
},
{
"epoch": 11.942962013516663,
"grad_norm": 0.3607545495033264,
"learning_rate": 0.0004569548236665695,
"loss": 3.3818,
"step": 41000
},
{
"epoch": 11.942962013516663,
"eval_accuracy": 0.37025463827045607,
"eval_loss": 3.5513434410095215,
"eval_runtime": 179.6988,
"eval_samples_per_second": 92.627,
"eval_steps_per_second": 5.793,
"step": 41000
},
{
"epoch": 11.95752738289443,
"grad_norm": 0.3468360900878906,
"learning_rate": 0.00045677994753716113,
"loss": 3.3849,
"step": 41050
},
{
"epoch": 11.972092752272198,
"grad_norm": 0.3485865294933319,
"learning_rate": 0.00045660507140775277,
"loss": 3.3785,
"step": 41100
},
{
"epoch": 11.986658121649965,
"grad_norm": 0.34709101915359497,
"learning_rate": 0.00045643019527834446,
"loss": 3.3663,
"step": 41150
},
{
"epoch": 12.001165229550221,
"grad_norm": 0.38278666138648987,
"learning_rate": 0.0004562553191489361,
"loss": 3.3766,
"step": 41200
},
{
"epoch": 12.01573059892799,
"grad_norm": 0.34984317421913147,
"learning_rate": 0.00045608044301952784,
"loss": 3.2617,
"step": 41250
},
{
"epoch": 12.030295968305756,
"grad_norm": 0.329671174287796,
"learning_rate": 0.0004559055668901195,
"loss": 3.265,
"step": 41300
},
{
"epoch": 12.044861337683523,
"grad_norm": 0.377273291349411,
"learning_rate": 0.0004557306907607111,
"loss": 3.274,
"step": 41350
},
{
"epoch": 12.059426707061291,
"grad_norm": 0.353500097990036,
"learning_rate": 0.0004555558146313028,
"loss": 3.2773,
"step": 41400
},
{
"epoch": 12.073992076439058,
"grad_norm": 0.33617493510246277,
"learning_rate": 0.00045538093850189444,
"loss": 3.2767,
"step": 41450
},
{
"epoch": 12.088557445816827,
"grad_norm": 0.37292763590812683,
"learning_rate": 0.00045520606237248613,
"loss": 3.2795,
"step": 41500
},
{
"epoch": 12.103122815194594,
"grad_norm": 0.3755471706390381,
"learning_rate": 0.00045503118624307776,
"loss": 3.2793,
"step": 41550
},
{
"epoch": 12.11768818457236,
"grad_norm": 0.4093469977378845,
"learning_rate": 0.00045485631011366945,
"loss": 3.2801,
"step": 41600
},
{
"epoch": 12.132253553950129,
"grad_norm": 0.3457607924938202,
"learning_rate": 0.0004546814339842611,
"loss": 3.2882,
"step": 41650
},
{
"epoch": 12.146818923327896,
"grad_norm": 0.37875697016716003,
"learning_rate": 0.0004545065578548527,
"loss": 3.2868,
"step": 41700
},
{
"epoch": 12.161384292705662,
"grad_norm": 0.3649895489215851,
"learning_rate": 0.00045433168172544447,
"loss": 3.2959,
"step": 41750
},
{
"epoch": 12.17594966208343,
"grad_norm": 0.391001433134079,
"learning_rate": 0.0004541568055960361,
"loss": 3.3045,
"step": 41800
},
{
"epoch": 12.190515031461198,
"grad_norm": 0.3820909857749939,
"learning_rate": 0.0004539819294666278,
"loss": 3.3038,
"step": 41850
},
{
"epoch": 12.205080400838966,
"grad_norm": 0.37390992045402527,
"learning_rate": 0.00045380705333721943,
"loss": 3.2909,
"step": 41900
},
{
"epoch": 12.219645770216733,
"grad_norm": 0.38024622201919556,
"learning_rate": 0.00045363217720781107,
"loss": 3.3109,
"step": 41950
},
{
"epoch": 12.2342111395945,
"grad_norm": 0.3541949391365051,
"learning_rate": 0.00045345730107840276,
"loss": 3.2924,
"step": 42000
},
{
"epoch": 12.2342111395945,
"eval_accuracy": 0.3695281752775673,
"eval_loss": 3.5650668144226074,
"eval_runtime": 179.7999,
"eval_samples_per_second": 92.575,
"eval_steps_per_second": 5.79,
"step": 42000
},
{
"epoch": 12.248776508972268,
"grad_norm": 0.35464340448379517,
"learning_rate": 0.0004532824249489944,
"loss": 3.3094,
"step": 42050
},
{
"epoch": 12.263341878350035,
"grad_norm": 0.3588162362575531,
"learning_rate": 0.0004531075488195861,
"loss": 3.3044,
"step": 42100
},
{
"epoch": 12.277907247727802,
"grad_norm": 0.35945770144462585,
"learning_rate": 0.0004529326726901777,
"loss": 3.3149,
"step": 42150
},
{
"epoch": 12.29247261710557,
"grad_norm": 0.3775635063648224,
"learning_rate": 0.00045275779656076947,
"loss": 3.3133,
"step": 42200
},
{
"epoch": 12.307037986483337,
"grad_norm": 0.36539244651794434,
"learning_rate": 0.0004525829204313611,
"loss": 3.3236,
"step": 42250
},
{
"epoch": 12.321603355861104,
"grad_norm": 0.36154550313949585,
"learning_rate": 0.00045240804430195274,
"loss": 3.3229,
"step": 42300
},
{
"epoch": 12.336168725238872,
"grad_norm": 0.36169642210006714,
"learning_rate": 0.00045223316817254443,
"loss": 3.3229,
"step": 42350
},
{
"epoch": 12.350734094616639,
"grad_norm": 0.3900752067565918,
"learning_rate": 0.00045205829204313607,
"loss": 3.3163,
"step": 42400
},
{
"epoch": 12.365299463994408,
"grad_norm": 0.36182570457458496,
"learning_rate": 0.00045188341591372776,
"loss": 3.3282,
"step": 42450
},
{
"epoch": 12.379864833372174,
"grad_norm": 0.36705368757247925,
"learning_rate": 0.0004517085397843194,
"loss": 3.3301,
"step": 42500
},
{
"epoch": 12.394430202749941,
"grad_norm": 0.35039055347442627,
"learning_rate": 0.00045153366365491103,
"loss": 3.3307,
"step": 42550
},
{
"epoch": 12.40899557212771,
"grad_norm": 0.38655978441238403,
"learning_rate": 0.0004513587875255027,
"loss": 3.3337,
"step": 42600
},
{
"epoch": 12.423560941505476,
"grad_norm": 0.34374991059303284,
"learning_rate": 0.00045118391139609436,
"loss": 3.3308,
"step": 42650
},
{
"epoch": 12.438126310883243,
"grad_norm": 0.366639643907547,
"learning_rate": 0.0004510090352666861,
"loss": 3.3421,
"step": 42700
},
{
"epoch": 12.452691680261012,
"grad_norm": 0.3672907054424286,
"learning_rate": 0.00045083415913727774,
"loss": 3.3184,
"step": 42750
},
{
"epoch": 12.467257049638778,
"grad_norm": 0.36934059858322144,
"learning_rate": 0.0004506592830078694,
"loss": 3.3262,
"step": 42800
},
{
"epoch": 12.481822419016547,
"grad_norm": 0.36116307973861694,
"learning_rate": 0.00045048440687846106,
"loss": 3.3253,
"step": 42850
},
{
"epoch": 12.496387788394314,
"grad_norm": 0.3898486793041229,
"learning_rate": 0.0004503095307490527,
"loss": 3.3426,
"step": 42900
},
{
"epoch": 12.51095315777208,
"grad_norm": 0.3628914952278137,
"learning_rate": 0.0004501346546196444,
"loss": 3.3299,
"step": 42950
},
{
"epoch": 12.525518527149849,
"grad_norm": 0.3678210973739624,
"learning_rate": 0.000449959778490236,
"loss": 3.3339,
"step": 43000
},
{
"epoch": 12.525518527149849,
"eval_accuracy": 0.3700859257935512,
"eval_loss": 3.558652877807617,
"eval_runtime": 179.6498,
"eval_samples_per_second": 92.652,
"eval_steps_per_second": 5.795,
"step": 43000
},
{
"epoch": 12.540083896527616,
"grad_norm": 0.3682061433792114,
"learning_rate": 0.0004497849023608277,
"loss": 3.3348,
"step": 43050
},
{
"epoch": 12.554649265905383,
"grad_norm": 0.3561127781867981,
"learning_rate": 0.00044961002623141935,
"loss": 3.332,
"step": 43100
},
{
"epoch": 12.569214635283151,
"grad_norm": 0.36310920119285583,
"learning_rate": 0.000449435150102011,
"loss": 3.3311,
"step": 43150
},
{
"epoch": 12.583780004660918,
"grad_norm": 0.378019779920578,
"learning_rate": 0.00044926027397260273,
"loss": 3.3486,
"step": 43200
},
{
"epoch": 12.598345374038686,
"grad_norm": 0.35761335492134094,
"learning_rate": 0.00044908539784319437,
"loss": 3.3456,
"step": 43250
},
{
"epoch": 12.612910743416453,
"grad_norm": 0.38079050183296204,
"learning_rate": 0.00044891052171378606,
"loss": 3.34,
"step": 43300
},
{
"epoch": 12.62747611279422,
"grad_norm": 0.36224061250686646,
"learning_rate": 0.0004487356455843777,
"loss": 3.3454,
"step": 43350
},
{
"epoch": 12.642041482171988,
"grad_norm": 0.3658839762210846,
"learning_rate": 0.00044856076945496933,
"loss": 3.3426,
"step": 43400
},
{
"epoch": 12.656606851549755,
"grad_norm": 0.3703918755054474,
"learning_rate": 0.000448385893325561,
"loss": 3.3413,
"step": 43450
},
{
"epoch": 12.671172220927522,
"grad_norm": 0.39403098821640015,
"learning_rate": 0.00044821101719615266,
"loss": 3.3435,
"step": 43500
},
{
"epoch": 12.68573759030529,
"grad_norm": 0.3617226481437683,
"learning_rate": 0.00044803614106674435,
"loss": 3.3455,
"step": 43550
},
{
"epoch": 12.700302959683057,
"grad_norm": 0.38985195755958557,
"learning_rate": 0.000447861264937336,
"loss": 3.347,
"step": 43600
},
{
"epoch": 12.714868329060826,
"grad_norm": 0.3814113438129425,
"learning_rate": 0.00044768638880792773,
"loss": 3.345,
"step": 43650
},
{
"epoch": 12.729433698438593,
"grad_norm": 0.33327075839042664,
"learning_rate": 0.00044751151267851937,
"loss": 3.3439,
"step": 43700
},
{
"epoch": 12.74399906781636,
"grad_norm": 0.39450183510780334,
"learning_rate": 0.000447336636549111,
"loss": 3.354,
"step": 43750
},
{
"epoch": 12.758564437194128,
"grad_norm": 0.34784385561943054,
"learning_rate": 0.0004471617604197027,
"loss": 3.3438,
"step": 43800
},
{
"epoch": 12.773129806571895,
"grad_norm": 0.3895801305770874,
"learning_rate": 0.00044698688429029433,
"loss": 3.3385,
"step": 43850
},
{
"epoch": 12.787695175949661,
"grad_norm": 0.37325412034988403,
"learning_rate": 0.000446812008160886,
"loss": 3.3382,
"step": 43900
},
{
"epoch": 12.80226054532743,
"grad_norm": 0.3635622262954712,
"learning_rate": 0.00044663713203147766,
"loss": 3.351,
"step": 43950
},
{
"epoch": 12.816825914705197,
"grad_norm": 0.34390226006507874,
"learning_rate": 0.0004464622559020693,
"loss": 3.352,
"step": 44000
},
{
"epoch": 12.816825914705197,
"eval_accuracy": 0.37086952764621406,
"eval_loss": 3.5511374473571777,
"eval_runtime": 179.8312,
"eval_samples_per_second": 92.559,
"eval_steps_per_second": 5.789,
"step": 44000
},
{
"epoch": 12.831391284082965,
"grad_norm": 0.3434717059135437,
"learning_rate": 0.000446287379772661,
"loss": 3.358,
"step": 44050
},
{
"epoch": 12.845956653460732,
"grad_norm": 0.3896610140800476,
"learning_rate": 0.0004461125036432526,
"loss": 3.349,
"step": 44100
},
{
"epoch": 12.860522022838499,
"grad_norm": 0.38952627778053284,
"learning_rate": 0.00044593762751384436,
"loss": 3.3503,
"step": 44150
},
{
"epoch": 12.875087392216267,
"grad_norm": 0.3934302031993866,
"learning_rate": 0.000445762751384436,
"loss": 3.365,
"step": 44200
},
{
"epoch": 12.889652761594034,
"grad_norm": 0.4013819694519043,
"learning_rate": 0.0004455878752550277,
"loss": 3.3597,
"step": 44250
},
{
"epoch": 12.9042181309718,
"grad_norm": 0.3661031126976013,
"learning_rate": 0.0004454129991256193,
"loss": 3.3674,
"step": 44300
},
{
"epoch": 12.91878350034957,
"grad_norm": 0.417216420173645,
"learning_rate": 0.00044523812299621096,
"loss": 3.3518,
"step": 44350
},
{
"epoch": 12.933348869727336,
"grad_norm": 0.3435131907463074,
"learning_rate": 0.00044506324686680265,
"loss": 3.3626,
"step": 44400
},
{
"epoch": 12.947914239105105,
"grad_norm": 0.3419662415981293,
"learning_rate": 0.0004448883707373943,
"loss": 3.3505,
"step": 44450
},
{
"epoch": 12.962479608482871,
"grad_norm": 0.370420902967453,
"learning_rate": 0.000444713494607986,
"loss": 3.3523,
"step": 44500
},
{
"epoch": 12.977044977860638,
"grad_norm": 0.37728220224380493,
"learning_rate": 0.0004445386184785776,
"loss": 3.344,
"step": 44550
},
{
"epoch": 12.991610347238407,
"grad_norm": 0.34244418144226074,
"learning_rate": 0.00044436374234916925,
"loss": 3.363,
"step": 44600
},
{
"epoch": 13.006117455138662,
"grad_norm": 0.37950703501701355,
"learning_rate": 0.000444188866219761,
"loss": 3.3071,
"step": 44650
},
{
"epoch": 13.02068282451643,
"grad_norm": 0.3414568305015564,
"learning_rate": 0.00044401399009035263,
"loss": 3.2523,
"step": 44700
},
{
"epoch": 13.035248193894198,
"grad_norm": 0.3659200966358185,
"learning_rate": 0.0004438391139609443,
"loss": 3.2501,
"step": 44750
},
{
"epoch": 13.049813563271965,
"grad_norm": 0.3586159348487854,
"learning_rate": 0.00044366423783153596,
"loss": 3.2587,
"step": 44800
},
{
"epoch": 13.064378932649731,
"grad_norm": 0.3674415051937103,
"learning_rate": 0.0004434893617021276,
"loss": 3.265,
"step": 44850
},
{
"epoch": 13.0789443020275,
"grad_norm": 0.36518362164497375,
"learning_rate": 0.0004433144855727193,
"loss": 3.2717,
"step": 44900
},
{
"epoch": 13.093509671405267,
"grad_norm": 0.35845455527305603,
"learning_rate": 0.0004431396094433109,
"loss": 3.2737,
"step": 44950
},
{
"epoch": 13.108075040783035,
"grad_norm": 0.3665563464164734,
"learning_rate": 0.0004429647333139026,
"loss": 3.2587,
"step": 45000
},
{
"epoch": 13.108075040783035,
"eval_accuracy": 0.37041758983351547,
"eval_loss": 3.561434268951416,
"eval_runtime": 179.821,
"eval_samples_per_second": 92.564,
"eval_steps_per_second": 5.789,
"step": 45000
},
{
"epoch": 13.122640410160802,
"grad_norm": 0.34525081515312195,
"learning_rate": 0.00044278985718449425,
"loss": 3.2747,
"step": 45050
},
{
"epoch": 13.137205779538569,
"grad_norm": 0.3698437809944153,
"learning_rate": 0.000442614981055086,
"loss": 3.2796,
"step": 45100
},
{
"epoch": 13.151771148916337,
"grad_norm": 0.34682697057724,
"learning_rate": 0.00044244010492567763,
"loss": 3.2792,
"step": 45150
},
{
"epoch": 13.166336518294104,
"grad_norm": 0.371136337518692,
"learning_rate": 0.00044226522879626927,
"loss": 3.2857,
"step": 45200
},
{
"epoch": 13.18090188767187,
"grad_norm": 0.3699894845485687,
"learning_rate": 0.00044209035266686096,
"loss": 3.2826,
"step": 45250
},
{
"epoch": 13.19546725704964,
"grad_norm": 0.35856735706329346,
"learning_rate": 0.0004419154765374526,
"loss": 3.2864,
"step": 45300
},
{
"epoch": 13.210032626427406,
"grad_norm": 0.37214043736457825,
"learning_rate": 0.0004417406004080443,
"loss": 3.2817,
"step": 45350
},
{
"epoch": 13.224597995805174,
"grad_norm": 0.35830315947532654,
"learning_rate": 0.0004415657242786359,
"loss": 3.2916,
"step": 45400
},
{
"epoch": 13.239163365182941,
"grad_norm": 0.38184452056884766,
"learning_rate": 0.00044139084814922755,
"loss": 3.2905,
"step": 45450
},
{
"epoch": 13.253728734560708,
"grad_norm": 0.36729663610458374,
"learning_rate": 0.00044121597201981924,
"loss": 3.2988,
"step": 45500
},
{
"epoch": 13.268294103938477,
"grad_norm": 0.360363632440567,
"learning_rate": 0.0004410410958904109,
"loss": 3.2856,
"step": 45550
},
{
"epoch": 13.282859473316243,
"grad_norm": 0.3617470860481262,
"learning_rate": 0.0004408662197610026,
"loss": 3.2866,
"step": 45600
},
{
"epoch": 13.29742484269401,
"grad_norm": 0.35599926114082336,
"learning_rate": 0.00044069134363159426,
"loss": 3.2948,
"step": 45650
},
{
"epoch": 13.311990212071779,
"grad_norm": 0.3696223199367523,
"learning_rate": 0.00044051646750218595,
"loss": 3.3045,
"step": 45700
},
{
"epoch": 13.326555581449545,
"grad_norm": 0.3685641288757324,
"learning_rate": 0.0004403415913727776,
"loss": 3.2931,
"step": 45750
},
{
"epoch": 13.341120950827314,
"grad_norm": 0.36321672797203064,
"learning_rate": 0.0004401667152433692,
"loss": 3.3018,
"step": 45800
},
{
"epoch": 13.35568632020508,
"grad_norm": 0.3663840591907501,
"learning_rate": 0.0004399918391139609,
"loss": 3.3055,
"step": 45850
},
{
"epoch": 13.370251689582847,
"grad_norm": 0.37315475940704346,
"learning_rate": 0.00043981696298455255,
"loss": 3.3086,
"step": 45900
},
{
"epoch": 13.384817058960616,
"grad_norm": 0.404310941696167,
"learning_rate": 0.00043964208685514424,
"loss": 3.3042,
"step": 45950
},
{
"epoch": 13.399382428338383,
"grad_norm": 0.3406934142112732,
"learning_rate": 0.0004394672107257359,
"loss": 3.3059,
"step": 46000
},
{
"epoch": 13.399382428338383,
"eval_accuracy": 0.37039325291175984,
"eval_loss": 3.558974266052246,
"eval_runtime": 179.7471,
"eval_samples_per_second": 92.602,
"eval_steps_per_second": 5.791,
"step": 46000
},
{
"epoch": 13.41394779771615,
"grad_norm": 0.3774344325065613,
"learning_rate": 0.0004392923345963275,
"loss": 3.3012,
"step": 46050
},
{
"epoch": 13.428513167093918,
"grad_norm": 0.36368629336357117,
"learning_rate": 0.00043911745846691926,
"loss": 3.3133,
"step": 46100
},
{
"epoch": 13.443078536471685,
"grad_norm": 0.34740936756134033,
"learning_rate": 0.0004389425823375109,
"loss": 3.3042,
"step": 46150
},
{
"epoch": 13.457643905849451,
"grad_norm": 0.3726678192615509,
"learning_rate": 0.0004387677062081026,
"loss": 3.319,
"step": 46200
},
{
"epoch": 13.47220927522722,
"grad_norm": 0.3507010340690613,
"learning_rate": 0.0004385928300786942,
"loss": 3.3134,
"step": 46250
},
{
"epoch": 13.486774644604987,
"grad_norm": 0.36555829644203186,
"learning_rate": 0.0004384179539492859,
"loss": 3.3205,
"step": 46300
},
{
"epoch": 13.501340013982755,
"grad_norm": 0.34969812631607056,
"learning_rate": 0.00043824307781987755,
"loss": 3.3253,
"step": 46350
},
{
"epoch": 13.515905383360522,
"grad_norm": 0.3921741545200348,
"learning_rate": 0.0004380682016904692,
"loss": 3.3118,
"step": 46400
},
{
"epoch": 13.530470752738289,
"grad_norm": 0.4136374592781067,
"learning_rate": 0.0004378933255610609,
"loss": 3.3229,
"step": 46450
},
{
"epoch": 13.545036122116057,
"grad_norm": 0.39142510294914246,
"learning_rate": 0.0004377184494316525,
"loss": 3.3253,
"step": 46500
},
{
"epoch": 13.559601491493824,
"grad_norm": 0.35085681080818176,
"learning_rate": 0.00043754357330224426,
"loss": 3.3399,
"step": 46550
},
{
"epoch": 13.574166860871593,
"grad_norm": 0.38441339135169983,
"learning_rate": 0.0004373686971728359,
"loss": 3.3177,
"step": 46600
},
{
"epoch": 13.58873223024936,
"grad_norm": 0.3715854287147522,
"learning_rate": 0.00043719382104342753,
"loss": 3.3232,
"step": 46650
},
{
"epoch": 13.603297599627126,
"grad_norm": 0.37551915645599365,
"learning_rate": 0.0004370189449140192,
"loss": 3.319,
"step": 46700
},
{
"epoch": 13.617862969004895,
"grad_norm": 0.41187357902526855,
"learning_rate": 0.00043684406878461085,
"loss": 3.3137,
"step": 46750
},
{
"epoch": 13.632428338382661,
"grad_norm": 0.37525227665901184,
"learning_rate": 0.00043666919265520254,
"loss": 3.319,
"step": 46800
},
{
"epoch": 13.646993707760428,
"grad_norm": 0.3764234483242035,
"learning_rate": 0.0004364943165257942,
"loss": 3.3301,
"step": 46850
},
{
"epoch": 13.661559077138197,
"grad_norm": 0.3498331904411316,
"learning_rate": 0.0004363194403963858,
"loss": 3.3309,
"step": 46900
},
{
"epoch": 13.676124446515963,
"grad_norm": 0.35479307174682617,
"learning_rate": 0.0004361445642669775,
"loss": 3.3297,
"step": 46950
},
{
"epoch": 13.69068981589373,
"grad_norm": 0.36635255813598633,
"learning_rate": 0.00043596968813756914,
"loss": 3.3337,
"step": 47000
},
{
"epoch": 13.69068981589373,
"eval_accuracy": 0.370927724633021,
"eval_loss": 3.551501989364624,
"eval_runtime": 179.6084,
"eval_samples_per_second": 92.674,
"eval_steps_per_second": 5.796,
"step": 47000
},
{
"epoch": 13.705255185271499,
"grad_norm": 0.3506696820259094,
"learning_rate": 0.0004357948120081609,
"loss": 3.3283,
"step": 47050
},
{
"epoch": 13.719820554649266,
"grad_norm": 0.33802589774131775,
"learning_rate": 0.0004356199358787525,
"loss": 3.3193,
"step": 47100
},
{
"epoch": 13.734385924027034,
"grad_norm": 0.4081648886203766,
"learning_rate": 0.0004354450597493442,
"loss": 3.3291,
"step": 47150
},
{
"epoch": 13.7489512934048,
"grad_norm": 0.35251572728157043,
"learning_rate": 0.00043527018361993585,
"loss": 3.3204,
"step": 47200
},
{
"epoch": 13.763516662782568,
"grad_norm": 0.34620070457458496,
"learning_rate": 0.0004350953074905275,
"loss": 3.3274,
"step": 47250
},
{
"epoch": 13.778082032160336,
"grad_norm": 0.36664706468582153,
"learning_rate": 0.0004349204313611192,
"loss": 3.3266,
"step": 47300
},
{
"epoch": 13.792647401538103,
"grad_norm": 0.35269954800605774,
"learning_rate": 0.0004347455552317108,
"loss": 3.3388,
"step": 47350
},
{
"epoch": 13.80721277091587,
"grad_norm": 0.39205700159072876,
"learning_rate": 0.0004345706791023025,
"loss": 3.3248,
"step": 47400
},
{
"epoch": 13.821778140293638,
"grad_norm": 0.39721786975860596,
"learning_rate": 0.00043439580297289414,
"loss": 3.3325,
"step": 47450
},
{
"epoch": 13.836343509671405,
"grad_norm": 0.39120006561279297,
"learning_rate": 0.0004342209268434858,
"loss": 3.3322,
"step": 47500
},
{
"epoch": 13.850908879049173,
"grad_norm": 0.34740549325942993,
"learning_rate": 0.0004340460507140775,
"loss": 3.3376,
"step": 47550
},
{
"epoch": 13.86547424842694,
"grad_norm": 0.3894191086292267,
"learning_rate": 0.00043387117458466916,
"loss": 3.3419,
"step": 47600
},
{
"epoch": 13.880039617804707,
"grad_norm": 0.36777186393737793,
"learning_rate": 0.00043369629845526085,
"loss": 3.3491,
"step": 47650
},
{
"epoch": 13.894604987182475,
"grad_norm": 0.3732227683067322,
"learning_rate": 0.0004335214223258525,
"loss": 3.3383,
"step": 47700
},
{
"epoch": 13.909170356560242,
"grad_norm": 0.3650364279747009,
"learning_rate": 0.0004333465461964442,
"loss": 3.3382,
"step": 47750
},
{
"epoch": 13.923735725938009,
"grad_norm": 0.36800310015678406,
"learning_rate": 0.0004331716700670358,
"loss": 3.3333,
"step": 47800
},
{
"epoch": 13.938301095315778,
"grad_norm": 0.4116499125957489,
"learning_rate": 0.00043299679393762745,
"loss": 3.3534,
"step": 47850
},
{
"epoch": 13.952866464693544,
"grad_norm": 0.3539418876171112,
"learning_rate": 0.00043282191780821914,
"loss": 3.3383,
"step": 47900
},
{
"epoch": 13.967431834071313,
"grad_norm": 0.35670432448387146,
"learning_rate": 0.00043264704167881077,
"loss": 3.347,
"step": 47950
},
{
"epoch": 13.98199720344908,
"grad_norm": 0.3599216639995575,
"learning_rate": 0.0004324721655494025,
"loss": 3.3343,
"step": 48000
},
{
"epoch": 13.98199720344908,
"eval_accuracy": 0.3713798975850602,
"eval_loss": 3.5442428588867188,
"eval_runtime": 179.6319,
"eval_samples_per_second": 92.662,
"eval_steps_per_second": 5.795,
"step": 48000
},
{
"epoch": 13.996562572826846,
"grad_norm": 0.3528788387775421,
"learning_rate": 0.00043229728941999415,
"loss": 3.345,
"step": 48050
},
{
"epoch": 14.011069680727104,
"grad_norm": 0.3962458670139313,
"learning_rate": 0.0004321224132905858,
"loss": 3.2571,
"step": 48100
},
{
"epoch": 14.02563505010487,
"grad_norm": 0.3570566475391388,
"learning_rate": 0.0004319475371611775,
"loss": 3.2367,
"step": 48150
},
{
"epoch": 14.040200419482638,
"grad_norm": 0.3566766679286957,
"learning_rate": 0.0004317726610317691,
"loss": 3.2429,
"step": 48200
},
{
"epoch": 14.054765788860406,
"grad_norm": 0.38148075342178345,
"learning_rate": 0.0004315977849023608,
"loss": 3.2336,
"step": 48250
},
{
"epoch": 14.069331158238173,
"grad_norm": 0.36465924978256226,
"learning_rate": 0.00043142290877295244,
"loss": 3.2448,
"step": 48300
},
{
"epoch": 14.08389652761594,
"grad_norm": 0.4034234583377838,
"learning_rate": 0.00043124803264354413,
"loss": 3.2529,
"step": 48350
},
{
"epoch": 14.098461896993708,
"grad_norm": 0.38046795129776,
"learning_rate": 0.00043107315651413577,
"loss": 3.2572,
"step": 48400
},
{
"epoch": 14.113027266371475,
"grad_norm": 0.37367624044418335,
"learning_rate": 0.0004308982803847274,
"loss": 3.2464,
"step": 48450
},
{
"epoch": 14.127592635749243,
"grad_norm": 0.3575690984725952,
"learning_rate": 0.00043072340425531915,
"loss": 3.2573,
"step": 48500
},
{
"epoch": 14.14215800512701,
"grad_norm": 0.37947431206703186,
"learning_rate": 0.0004305485281259108,
"loss": 3.2611,
"step": 48550
},
{
"epoch": 14.156723374504777,
"grad_norm": 0.4311124086380005,
"learning_rate": 0.0004303736519965025,
"loss": 3.2644,
"step": 48600
},
{
"epoch": 14.171288743882545,
"grad_norm": 0.40697988867759705,
"learning_rate": 0.0004301987758670941,
"loss": 3.271,
"step": 48650
},
{
"epoch": 14.185854113260312,
"grad_norm": 0.3614986538887024,
"learning_rate": 0.00043002389973768575,
"loss": 3.2703,
"step": 48700
},
{
"epoch": 14.200419482638079,
"grad_norm": 0.40103664994239807,
"learning_rate": 0.00042984902360827744,
"loss": 3.2759,
"step": 48750
},
{
"epoch": 14.214984852015847,
"grad_norm": 0.3614042401313782,
"learning_rate": 0.0004296741474788691,
"loss": 3.2763,
"step": 48800
},
{
"epoch": 14.229550221393614,
"grad_norm": 0.3694717586040497,
"learning_rate": 0.00042949927134946077,
"loss": 3.2836,
"step": 48850
},
{
"epoch": 14.244115590771383,
"grad_norm": 0.3669103682041168,
"learning_rate": 0.0004293243952200524,
"loss": 3.2802,
"step": 48900
},
{
"epoch": 14.25868096014915,
"grad_norm": 0.38352200388908386,
"learning_rate": 0.00042914951909064415,
"loss": 3.2663,
"step": 48950
},
{
"epoch": 14.273246329526916,
"grad_norm": 0.44323334097862244,
"learning_rate": 0.0004289746429612358,
"loss": 3.2804,
"step": 49000
},
{
"epoch": 14.273246329526916,
"eval_accuracy": 0.3708143874708354,
"eval_loss": 3.558420181274414,
"eval_runtime": 179.5689,
"eval_samples_per_second": 92.694,
"eval_steps_per_second": 5.797,
"step": 49000
},
{
"epoch": 14.287811698904685,
"grad_norm": 0.3794345259666443,
"learning_rate": 0.0004287997668318274,
"loss": 3.2769,
"step": 49050
},
{
"epoch": 14.302377068282452,
"grad_norm": 0.38060110807418823,
"learning_rate": 0.0004286248907024191,
"loss": 3.2725,
"step": 49100
},
{
"epoch": 14.316942437660218,
"grad_norm": 0.43410491943359375,
"learning_rate": 0.00042845001457301075,
"loss": 3.2889,
"step": 49150
},
{
"epoch": 14.331507807037987,
"grad_norm": 0.39754602313041687,
"learning_rate": 0.00042827513844360244,
"loss": 3.2825,
"step": 49200
},
{
"epoch": 14.346073176415754,
"grad_norm": 0.41671112179756165,
"learning_rate": 0.00042810026231419407,
"loss": 3.2873,
"step": 49250
},
{
"epoch": 14.360638545793522,
"grad_norm": 0.37623023986816406,
"learning_rate": 0.0004279253861847857,
"loss": 3.2924,
"step": 49300
},
{
"epoch": 14.375203915171289,
"grad_norm": 0.421653687953949,
"learning_rate": 0.0004277505100553774,
"loss": 3.2735,
"step": 49350
},
{
"epoch": 14.389769284549056,
"grad_norm": 0.3558456301689148,
"learning_rate": 0.00042757563392596904,
"loss": 3.3055,
"step": 49400
},
{
"epoch": 14.404334653926824,
"grad_norm": 0.3729119300842285,
"learning_rate": 0.0004274007577965608,
"loss": 3.3128,
"step": 49450
},
{
"epoch": 14.418900023304591,
"grad_norm": 0.3821575939655304,
"learning_rate": 0.0004272258816671524,
"loss": 3.2866,
"step": 49500
},
{
"epoch": 14.433465392682358,
"grad_norm": 0.38078463077545166,
"learning_rate": 0.00042705100553774405,
"loss": 3.2985,
"step": 49550
},
{
"epoch": 14.448030762060126,
"grad_norm": 0.38333752751350403,
"learning_rate": 0.00042687612940833574,
"loss": 3.2894,
"step": 49600
},
{
"epoch": 14.462596131437893,
"grad_norm": 0.343722939491272,
"learning_rate": 0.0004267012532789274,
"loss": 3.307,
"step": 49650
},
{
"epoch": 14.477161500815662,
"grad_norm": 0.34256860613822937,
"learning_rate": 0.00042652637714951907,
"loss": 3.3008,
"step": 49700
},
{
"epoch": 14.491726870193428,
"grad_norm": 0.37949851155281067,
"learning_rate": 0.0004263515010201107,
"loss": 3.3107,
"step": 49750
},
{
"epoch": 14.506292239571195,
"grad_norm": 0.3749626874923706,
"learning_rate": 0.0004261766248907024,
"loss": 3.2993,
"step": 49800
},
{
"epoch": 14.520857608948964,
"grad_norm": 0.36548712849617004,
"learning_rate": 0.00042600174876129403,
"loss": 3.3102,
"step": 49850
},
{
"epoch": 14.53542297832673,
"grad_norm": 0.404715895652771,
"learning_rate": 0.00042582687263188567,
"loss": 3.3081,
"step": 49900
},
{
"epoch": 14.549988347704497,
"grad_norm": 0.37166038155555725,
"learning_rate": 0.0004256519965024774,
"loss": 3.3047,
"step": 49950
},
{
"epoch": 14.564553717082266,
"grad_norm": 0.3786543011665344,
"learning_rate": 0.00042547712037306905,
"loss": 3.293,
"step": 50000
},
{
"epoch": 14.564553717082266,
"eval_accuracy": 0.37125997652133685,
"eval_loss": 3.553053140640259,
"eval_runtime": 179.5811,
"eval_samples_per_second": 92.688,
"eval_steps_per_second": 5.797,
"step": 50000
},
{
"epoch": 14.579119086460032,
"grad_norm": 0.35522982478141785,
"learning_rate": 0.00042530224424366074,
"loss": 3.3022,
"step": 50050
},
{
"epoch": 14.5936844558378,
"grad_norm": 0.3993748426437378,
"learning_rate": 0.0004251273681142524,
"loss": 3.312,
"step": 50100
},
{
"epoch": 14.608249825215568,
"grad_norm": 0.39016029238700867,
"learning_rate": 0.000424952491984844,
"loss": 3.3076,
"step": 50150
},
{
"epoch": 14.622815194593334,
"grad_norm": 0.38183167576789856,
"learning_rate": 0.0004247776158554357,
"loss": 3.2985,
"step": 50200
},
{
"epoch": 14.637380563971103,
"grad_norm": 0.3808605968952179,
"learning_rate": 0.00042460273972602734,
"loss": 3.3138,
"step": 50250
},
{
"epoch": 14.65194593334887,
"grad_norm": 0.366777241230011,
"learning_rate": 0.00042442786359661903,
"loss": 3.3074,
"step": 50300
},
{
"epoch": 14.666511302726637,
"grad_norm": 0.3863094449043274,
"learning_rate": 0.00042425298746721066,
"loss": 3.3151,
"step": 50350
},
{
"epoch": 14.681076672104405,
"grad_norm": 0.35356074571609497,
"learning_rate": 0.0004240781113378024,
"loss": 3.3114,
"step": 50400
},
{
"epoch": 14.695642041482172,
"grad_norm": 0.38444754481315613,
"learning_rate": 0.00042390323520839405,
"loss": 3.3152,
"step": 50450
},
{
"epoch": 14.71020741085994,
"grad_norm": 0.3628937602043152,
"learning_rate": 0.0004237283590789857,
"loss": 3.3073,
"step": 50500
},
{
"epoch": 14.724772780237707,
"grad_norm": 0.3597457706928253,
"learning_rate": 0.00042355348294957737,
"loss": 3.3091,
"step": 50550
},
{
"epoch": 14.739338149615474,
"grad_norm": 0.40730124711990356,
"learning_rate": 0.000423378606820169,
"loss": 3.303,
"step": 50600
},
{
"epoch": 14.753903518993242,
"grad_norm": 0.3871900737285614,
"learning_rate": 0.0004232037306907607,
"loss": 3.323,
"step": 50650
},
{
"epoch": 14.76846888837101,
"grad_norm": 0.3685663938522339,
"learning_rate": 0.00042302885456135233,
"loss": 3.3253,
"step": 50700
},
{
"epoch": 14.783034257748776,
"grad_norm": 0.358916699886322,
"learning_rate": 0.00042285397843194397,
"loss": 3.3162,
"step": 50750
},
{
"epoch": 14.797599627126544,
"grad_norm": 0.37842485308647156,
"learning_rate": 0.00042267910230253566,
"loss": 3.335,
"step": 50800
},
{
"epoch": 14.812164996504311,
"grad_norm": 0.36957690119743347,
"learning_rate": 0.0004225042261731273,
"loss": 3.3302,
"step": 50850
},
{
"epoch": 14.826730365882078,
"grad_norm": 0.3704380989074707,
"learning_rate": 0.00042232935004371904,
"loss": 3.324,
"step": 50900
},
{
"epoch": 14.841295735259846,
"grad_norm": 0.3660496175289154,
"learning_rate": 0.0004221544739143107,
"loss": 3.3219,
"step": 50950
},
{
"epoch": 14.855861104637613,
"grad_norm": 0.3719576299190521,
"learning_rate": 0.00042197959778490237,
"loss": 3.3208,
"step": 51000
},
{
"epoch": 14.855861104637613,
"eval_accuracy": 0.3716361994663513,
"eval_loss": 3.5443522930145264,
"eval_runtime": 179.6673,
"eval_samples_per_second": 92.643,
"eval_steps_per_second": 5.794,
"step": 51000
},
{
"epoch": 14.870426474015382,
"grad_norm": 0.39968937635421753,
"learning_rate": 0.000421804721655494,
"loss": 3.332,
"step": 51050
},
{
"epoch": 14.884991843393149,
"grad_norm": 0.3981848359107971,
"learning_rate": 0.00042162984552608564,
"loss": 3.3107,
"step": 51100
},
{
"epoch": 14.899557212770915,
"grad_norm": 0.3961758613586426,
"learning_rate": 0.00042145496939667733,
"loss": 3.3288,
"step": 51150
},
{
"epoch": 14.914122582148684,
"grad_norm": 0.3636086881160736,
"learning_rate": 0.00042128009326726897,
"loss": 3.3134,
"step": 51200
},
{
"epoch": 14.92868795152645,
"grad_norm": 0.36392343044281006,
"learning_rate": 0.00042110521713786066,
"loss": 3.3325,
"step": 51250
},
{
"epoch": 14.943253320904217,
"grad_norm": 0.36686888337135315,
"learning_rate": 0.0004209303410084523,
"loss": 3.3204,
"step": 51300
},
{
"epoch": 14.957818690281986,
"grad_norm": 0.37597978115081787,
"learning_rate": 0.00042075546487904393,
"loss": 3.3212,
"step": 51350
},
{
"epoch": 14.972384059659753,
"grad_norm": 0.3588141202926636,
"learning_rate": 0.0004205805887496357,
"loss": 3.3291,
"step": 51400
},
{
"epoch": 14.986949429037521,
"grad_norm": 0.37139445543289185,
"learning_rate": 0.0004204057126202273,
"loss": 3.3212,
"step": 51450
},
{
"epoch": 15.001456536937777,
"grad_norm": 0.393344908952713,
"learning_rate": 0.000420230836490819,
"loss": 3.3173,
"step": 51500
},
{
"epoch": 15.016021906315544,
"grad_norm": 0.4330257475376129,
"learning_rate": 0.00042005596036141064,
"loss": 3.2146,
"step": 51550
},
{
"epoch": 15.030587275693312,
"grad_norm": 0.3898090422153473,
"learning_rate": 0.0004198810842320023,
"loss": 3.2189,
"step": 51600
},
{
"epoch": 15.045152645071079,
"grad_norm": 0.4010067880153656,
"learning_rate": 0.00041970620810259396,
"loss": 3.2251,
"step": 51650
},
{
"epoch": 15.059718014448846,
"grad_norm": 0.40854838490486145,
"learning_rate": 0.0004195313319731856,
"loss": 3.2156,
"step": 51700
},
{
"epoch": 15.074283383826614,
"grad_norm": 0.36628204584121704,
"learning_rate": 0.0004193564558437773,
"loss": 3.234,
"step": 51750
},
{
"epoch": 15.088848753204381,
"grad_norm": 0.38783887028694153,
"learning_rate": 0.0004191815797143689,
"loss": 3.2315,
"step": 51800
},
{
"epoch": 15.103414122582148,
"grad_norm": 0.3718164265155792,
"learning_rate": 0.00041900670358496067,
"loss": 3.2369,
"step": 51850
},
{
"epoch": 15.117979491959916,
"grad_norm": 0.42094212770462036,
"learning_rate": 0.0004188318274555523,
"loss": 3.2398,
"step": 51900
},
{
"epoch": 15.132544861337683,
"grad_norm": 0.36034852266311646,
"learning_rate": 0.00041865695132614394,
"loss": 3.2404,
"step": 51950
},
{
"epoch": 15.147110230715452,
"grad_norm": 0.3888159692287445,
"learning_rate": 0.00041848207519673563,
"loss": 3.2449,
"step": 52000
},
{
"epoch": 15.147110230715452,
"eval_accuracy": 0.3711453460927778,
"eval_loss": 3.5572781562805176,
"eval_runtime": 179.6315,
"eval_samples_per_second": 92.662,
"eval_steps_per_second": 5.795,
"step": 52000
},
{
"epoch": 15.161675600093218,
"grad_norm": 0.3690231740474701,
"learning_rate": 0.00041830719906732727,
"loss": 3.2507,
"step": 52050
},
{
"epoch": 15.176240969470985,
"grad_norm": 0.37178748846054077,
"learning_rate": 0.00041813232293791896,
"loss": 3.2635,
"step": 52100
},
{
"epoch": 15.190806338848754,
"grad_norm": 0.40822193026542664,
"learning_rate": 0.0004179574468085106,
"loss": 3.2505,
"step": 52150
},
{
"epoch": 15.20537170822652,
"grad_norm": 0.40897294878959656,
"learning_rate": 0.00041778257067910223,
"loss": 3.2578,
"step": 52200
},
{
"epoch": 15.219937077604287,
"grad_norm": 0.416759729385376,
"learning_rate": 0.0004176076945496939,
"loss": 3.2576,
"step": 52250
},
{
"epoch": 15.234502446982056,
"grad_norm": 0.3542684018611908,
"learning_rate": 0.00041743281842028556,
"loss": 3.248,
"step": 52300
},
{
"epoch": 15.249067816359823,
"grad_norm": 0.3839828670024872,
"learning_rate": 0.0004172579422908773,
"loss": 3.2648,
"step": 52350
},
{
"epoch": 15.263633185737591,
"grad_norm": 0.36714503169059753,
"learning_rate": 0.00041708306616146894,
"loss": 3.257,
"step": 52400
},
{
"epoch": 15.278198555115358,
"grad_norm": 0.38585343956947327,
"learning_rate": 0.00041690819003206063,
"loss": 3.263,
"step": 52450
},
{
"epoch": 15.292763924493125,
"grad_norm": 0.3717619776725769,
"learning_rate": 0.00041673331390265227,
"loss": 3.2767,
"step": 52500
},
{
"epoch": 15.307329293870893,
"grad_norm": 0.3753516972064972,
"learning_rate": 0.0004165584377732439,
"loss": 3.2765,
"step": 52550
},
{
"epoch": 15.32189466324866,
"grad_norm": 0.3641180098056793,
"learning_rate": 0.0004163835616438356,
"loss": 3.2676,
"step": 52600
},
{
"epoch": 15.336460032626427,
"grad_norm": 0.37075987458229065,
"learning_rate": 0.00041620868551442723,
"loss": 3.2739,
"step": 52650
},
{
"epoch": 15.351025402004195,
"grad_norm": 0.3723219633102417,
"learning_rate": 0.0004160338093850189,
"loss": 3.2761,
"step": 52700
},
{
"epoch": 15.365590771381962,
"grad_norm": 0.3730946183204651,
"learning_rate": 0.00041585893325561056,
"loss": 3.2739,
"step": 52750
},
{
"epoch": 15.38015614075973,
"grad_norm": 0.3803166449069977,
"learning_rate": 0.0004156840571262022,
"loss": 3.2923,
"step": 52800
},
{
"epoch": 15.394721510137497,
"grad_norm": 0.3986593782901764,
"learning_rate": 0.00041550918099679394,
"loss": 3.2797,
"step": 52850
},
{
"epoch": 15.409286879515264,
"grad_norm": 0.3798179626464844,
"learning_rate": 0.0004153343048673856,
"loss": 3.2717,
"step": 52900
},
{
"epoch": 15.423852248893033,
"grad_norm": 0.4205482304096222,
"learning_rate": 0.00041515942873797726,
"loss": 3.2857,
"step": 52950
},
{
"epoch": 15.4384176182708,
"grad_norm": 0.35909244418144226,
"learning_rate": 0.0004149845526085689,
"loss": 3.2765,
"step": 53000
},
{
"epoch": 15.4384176182708,
"eval_accuracy": 0.3717464798171086,
"eval_loss": 3.5521633625030518,
"eval_runtime": 193.9407,
"eval_samples_per_second": 85.825,
"eval_steps_per_second": 5.368,
"step": 53000
},
{
"epoch": 15.452982987648566,
"grad_norm": 0.361979216337204,
"learning_rate": 0.0004148096764791606,
"loss": 3.2751,
"step": 53050
},
{
"epoch": 15.467548357026335,
"grad_norm": 0.36735597252845764,
"learning_rate": 0.0004146348003497522,
"loss": 3.3015,
"step": 53100
},
{
"epoch": 15.482113726404101,
"grad_norm": 0.3767015039920807,
"learning_rate": 0.00041445992422034386,
"loss": 3.2938,
"step": 53150
},
{
"epoch": 15.49667909578187,
"grad_norm": 0.38670143485069275,
"learning_rate": 0.00041428504809093555,
"loss": 3.2936,
"step": 53200
},
{
"epoch": 15.511244465159637,
"grad_norm": 0.39119359850883484,
"learning_rate": 0.0004141101719615272,
"loss": 3.2893,
"step": 53250
},
{
"epoch": 15.525809834537403,
"grad_norm": 0.36352699995040894,
"learning_rate": 0.00041393529583211893,
"loss": 3.2955,
"step": 53300
},
{
"epoch": 15.540375203915172,
"grad_norm": 0.38741451501846313,
"learning_rate": 0.00041376041970271057,
"loss": 3.299,
"step": 53350
},
{
"epoch": 15.554940573292939,
"grad_norm": 0.3951430916786194,
"learning_rate": 0.0004135855435733022,
"loss": 3.2996,
"step": 53400
},
{
"epoch": 15.569505942670705,
"grad_norm": 0.36441171169281006,
"learning_rate": 0.0004134106674438939,
"loss": 3.2938,
"step": 53450
},
{
"epoch": 15.584071312048474,
"grad_norm": 0.3774093985557556,
"learning_rate": 0.00041323579131448553,
"loss": 3.2882,
"step": 53500
},
{
"epoch": 15.59863668142624,
"grad_norm": 0.3849200904369354,
"learning_rate": 0.0004130609151850772,
"loss": 3.3071,
"step": 53550
},
{
"epoch": 15.61320205080401,
"grad_norm": 0.3753909468650818,
"learning_rate": 0.00041288603905566886,
"loss": 3.2861,
"step": 53600
},
{
"epoch": 15.627767420181776,
"grad_norm": 0.3853233754634857,
"learning_rate": 0.0004127111629262605,
"loss": 3.3072,
"step": 53650
},
{
"epoch": 15.642332789559543,
"grad_norm": 0.3988652229309082,
"learning_rate": 0.0004125362867968522,
"loss": 3.3094,
"step": 53700
},
{
"epoch": 15.656898158937311,
"grad_norm": 0.3708445429801941,
"learning_rate": 0.0004123614106674438,
"loss": 3.2967,
"step": 53750
},
{
"epoch": 15.671463528315078,
"grad_norm": 0.36685454845428467,
"learning_rate": 0.00041218653453803557,
"loss": 3.2913,
"step": 53800
},
{
"epoch": 15.686028897692845,
"grad_norm": 0.38278666138648987,
"learning_rate": 0.0004120116584086272,
"loss": 3.2861,
"step": 53850
},
{
"epoch": 15.700594267070613,
"grad_norm": 0.384741872549057,
"learning_rate": 0.0004118367822792189,
"loss": 3.304,
"step": 53900
},
{
"epoch": 15.71515963644838,
"grad_norm": 0.3768286108970642,
"learning_rate": 0.00041166190614981053,
"loss": 3.2982,
"step": 53950
},
{
"epoch": 15.729725005826147,
"grad_norm": 0.3943612575531006,
"learning_rate": 0.00041148703002040217,
"loss": 3.2996,
"step": 54000
},
{
"epoch": 15.729725005826147,
"eval_accuracy": 0.3718921486386314,
"eval_loss": 3.544917345046997,
"eval_runtime": 220.4654,
"eval_samples_per_second": 75.499,
"eval_steps_per_second": 4.722,
"step": 54000
},
{
"epoch": 15.744290375203915,
"grad_norm": 0.3631158769130707,
"learning_rate": 0.00041131215389099386,
"loss": 3.2903,
"step": 54050
},
{
"epoch": 15.758855744581682,
"grad_norm": 0.40076887607574463,
"learning_rate": 0.0004111372777615855,
"loss": 3.3015,
"step": 54100
},
{
"epoch": 15.77342111395945,
"grad_norm": 0.3838764429092407,
"learning_rate": 0.0004109624016321772,
"loss": 3.2982,
"step": 54150
},
{
"epoch": 15.787986483337217,
"grad_norm": 0.3836144804954529,
"learning_rate": 0.0004107875255027688,
"loss": 3.3118,
"step": 54200
},
{
"epoch": 15.802551852714984,
"grad_norm": 0.39159563183784485,
"learning_rate": 0.00041061264937336045,
"loss": 3.2957,
"step": 54250
},
{
"epoch": 15.817117222092753,
"grad_norm": 0.3700462281703949,
"learning_rate": 0.0004104377732439522,
"loss": 3.3081,
"step": 54300
},
{
"epoch": 15.83168259147052,
"grad_norm": 0.37243711948394775,
"learning_rate": 0.00041026289711454384,
"loss": 3.3134,
"step": 54350
},
{
"epoch": 15.846247960848288,
"grad_norm": 0.38975927233695984,
"learning_rate": 0.0004100880209851355,
"loss": 3.3046,
"step": 54400
},
{
"epoch": 15.860813330226055,
"grad_norm": 0.39330175518989563,
"learning_rate": 0.00040991314485572716,
"loss": 3.3103,
"step": 54450
},
{
"epoch": 15.875378699603822,
"grad_norm": 0.36677080392837524,
"learning_rate": 0.00040973826872631885,
"loss": 3.3041,
"step": 54500
},
{
"epoch": 15.88994406898159,
"grad_norm": 0.38371795415878296,
"learning_rate": 0.0004095633925969105,
"loss": 3.3192,
"step": 54550
},
{
"epoch": 15.904509438359357,
"grad_norm": 0.37720179557800293,
"learning_rate": 0.0004093885164675021,
"loss": 3.3182,
"step": 54600
},
{
"epoch": 15.919074807737124,
"grad_norm": 0.372707724571228,
"learning_rate": 0.0004092136403380938,
"loss": 3.3182,
"step": 54650
},
{
"epoch": 15.933640177114892,
"grad_norm": 0.38466477394104004,
"learning_rate": 0.00040903876420868545,
"loss": 3.3171,
"step": 54700
},
{
"epoch": 15.948205546492659,
"grad_norm": 0.4322209656238556,
"learning_rate": 0.00040886388807927714,
"loss": 3.3102,
"step": 54750
},
{
"epoch": 15.962770915870426,
"grad_norm": 0.3643110394477844,
"learning_rate": 0.00040868901194986883,
"loss": 3.3154,
"step": 54800
},
{
"epoch": 15.977336285248194,
"grad_norm": 0.3549572229385376,
"learning_rate": 0.00040851413582046047,
"loss": 3.3118,
"step": 54850
},
{
"epoch": 15.991901654625961,
"grad_norm": 0.35710573196411133,
"learning_rate": 0.00040833925969105216,
"loss": 3.3088,
"step": 54900
},
{
"epoch": 16.006408762526217,
"grad_norm": 0.3736666738986969,
"learning_rate": 0.0004081643835616438,
"loss": 3.2679,
"step": 54950
},
{
"epoch": 16.020974131903984,
"grad_norm": 0.39405354857444763,
"learning_rate": 0.0004079895074322355,
"loss": 3.195,
"step": 55000
},
{
"epoch": 16.020974131903984,
"eval_accuracy": 0.37159434466371843,
"eval_loss": 3.5555102825164795,
"eval_runtime": 179.8603,
"eval_samples_per_second": 92.544,
"eval_steps_per_second": 5.788,
"step": 55000
},
{
"epoch": 16.035539501281754,
"grad_norm": 0.38940897583961487,
"learning_rate": 0.0004078146313028271,
"loss": 3.2109,
"step": 55050
},
{
"epoch": 16.05010487065952,
"grad_norm": 0.3882853388786316,
"learning_rate": 0.0004076397551734188,
"loss": 3.2045,
"step": 55100
},
{
"epoch": 16.064670240037287,
"grad_norm": 0.39605289697647095,
"learning_rate": 0.00040746487904401045,
"loss": 3.2356,
"step": 55150
},
{
"epoch": 16.079235609415054,
"grad_norm": 0.3754449486732483,
"learning_rate": 0.0004072900029146021,
"loss": 3.2203,
"step": 55200
},
{
"epoch": 16.09380097879282,
"grad_norm": 0.4028746783733368,
"learning_rate": 0.0004071151267851938,
"loss": 3.2177,
"step": 55250
},
{
"epoch": 16.10836634817059,
"grad_norm": 0.36337772011756897,
"learning_rate": 0.00040694025065578546,
"loss": 3.2235,
"step": 55300
},
{
"epoch": 16.122931717548358,
"grad_norm": 0.3819507360458374,
"learning_rate": 0.00040676537452637716,
"loss": 3.2269,
"step": 55350
},
{
"epoch": 16.137497086926125,
"grad_norm": 0.38409850001335144,
"learning_rate": 0.0004065904983969688,
"loss": 3.2293,
"step": 55400
},
{
"epoch": 16.15206245630389,
"grad_norm": 0.4041096866130829,
"learning_rate": 0.00040641562226756043,
"loss": 3.2422,
"step": 55450
},
{
"epoch": 16.16662782568166,
"grad_norm": 0.3929169774055481,
"learning_rate": 0.0004062407461381521,
"loss": 3.2401,
"step": 55500
},
{
"epoch": 16.181193195059425,
"grad_norm": 0.379218190908432,
"learning_rate": 0.00040606587000874375,
"loss": 3.2338,
"step": 55550
},
{
"epoch": 16.195758564437195,
"grad_norm": 0.39579394459724426,
"learning_rate": 0.00040589099387933544,
"loss": 3.2271,
"step": 55600
},
{
"epoch": 16.210323933814962,
"grad_norm": 0.38522908091545105,
"learning_rate": 0.0004057161177499271,
"loss": 3.2319,
"step": 55650
},
{
"epoch": 16.22488930319273,
"grad_norm": 0.3886246085166931,
"learning_rate": 0.0004055412416205187,
"loss": 3.25,
"step": 55700
},
{
"epoch": 16.239454672570496,
"grad_norm": 0.387268990278244,
"learning_rate": 0.0004053663654911104,
"loss": 3.2485,
"step": 55750
},
{
"epoch": 16.254020041948262,
"grad_norm": 0.3706577718257904,
"learning_rate": 0.0004051914893617021,
"loss": 3.2549,
"step": 55800
},
{
"epoch": 16.268585411326033,
"grad_norm": 0.36555173993110657,
"learning_rate": 0.0004050166132322938,
"loss": 3.2546,
"step": 55850
},
{
"epoch": 16.2831507807038,
"grad_norm": 0.4174744486808777,
"learning_rate": 0.0004048417371028854,
"loss": 3.2461,
"step": 55900
},
{
"epoch": 16.297716150081566,
"grad_norm": 0.3815324604511261,
"learning_rate": 0.0004046668609734771,
"loss": 3.2584,
"step": 55950
},
{
"epoch": 16.312281519459333,
"grad_norm": 0.3781425654888153,
"learning_rate": 0.00040449198484406875,
"loss": 3.2685,
"step": 56000
},
{
"epoch": 16.312281519459333,
"eval_accuracy": 0.3717649382553484,
"eval_loss": 3.551072835922241,
"eval_runtime": 179.583,
"eval_samples_per_second": 92.687,
"eval_steps_per_second": 5.797,
"step": 56000
},
{
"epoch": 16.3268468888371,
"grad_norm": 0.3846278190612793,
"learning_rate": 0.0004043171087146604,
"loss": 3.2671,
"step": 56050
},
{
"epoch": 16.34141225821487,
"grad_norm": 0.3843114674091339,
"learning_rate": 0.0004041422325852521,
"loss": 3.2572,
"step": 56100
},
{
"epoch": 16.355977627592637,
"grad_norm": 0.3832460343837738,
"learning_rate": 0.0004039673564558437,
"loss": 3.2482,
"step": 56150
},
{
"epoch": 16.370542996970403,
"grad_norm": 0.39614608883857727,
"learning_rate": 0.0004037924803264354,
"loss": 3.2568,
"step": 56200
},
{
"epoch": 16.38510836634817,
"grad_norm": 0.4128139615058899,
"learning_rate": 0.00040361760419702704,
"loss": 3.2602,
"step": 56250
},
{
"epoch": 16.399673735725937,
"grad_norm": 0.41927552223205566,
"learning_rate": 0.00040344272806761873,
"loss": 3.2699,
"step": 56300
},
{
"epoch": 16.414239105103704,
"grad_norm": 0.4142034351825714,
"learning_rate": 0.0004032678519382104,
"loss": 3.2651,
"step": 56350
},
{
"epoch": 16.428804474481474,
"grad_norm": 0.4234794080257416,
"learning_rate": 0.00040309297580880206,
"loss": 3.2693,
"step": 56400
},
{
"epoch": 16.44336984385924,
"grad_norm": 0.379566490650177,
"learning_rate": 0.00040291809967939375,
"loss": 3.259,
"step": 56450
},
{
"epoch": 16.457935213237008,
"grad_norm": 0.3937167525291443,
"learning_rate": 0.0004027432235499854,
"loss": 3.2841,
"step": 56500
},
{
"epoch": 16.472500582614774,
"grad_norm": 0.386248379945755,
"learning_rate": 0.0004025683474205771,
"loss": 3.277,
"step": 56550
},
{
"epoch": 16.48706595199254,
"grad_norm": 0.38750800490379333,
"learning_rate": 0.0004023934712911687,
"loss": 3.2801,
"step": 56600
},
{
"epoch": 16.50163132137031,
"grad_norm": 0.39586499333381653,
"learning_rate": 0.00040221859516176035,
"loss": 3.2722,
"step": 56650
},
{
"epoch": 16.516196690748078,
"grad_norm": 0.37789252400398254,
"learning_rate": 0.00040204371903235204,
"loss": 3.2748,
"step": 56700
},
{
"epoch": 16.530762060125845,
"grad_norm": 0.3938862085342407,
"learning_rate": 0.0004018688429029437,
"loss": 3.277,
"step": 56750
},
{
"epoch": 16.54532742950361,
"grad_norm": 0.3977769613265991,
"learning_rate": 0.0004016939667735354,
"loss": 3.291,
"step": 56800
},
{
"epoch": 16.55989279888138,
"grad_norm": 0.3525155782699585,
"learning_rate": 0.00040151909064412705,
"loss": 3.2735,
"step": 56850
},
{
"epoch": 16.57445816825915,
"grad_norm": 0.362099826335907,
"learning_rate": 0.0004013442145147187,
"loss": 3.2917,
"step": 56900
},
{
"epoch": 16.589023537636916,
"grad_norm": 0.37509220838546753,
"learning_rate": 0.0004011693383853104,
"loss": 3.2782,
"step": 56950
},
{
"epoch": 16.603588907014682,
"grad_norm": 0.3895016312599182,
"learning_rate": 0.000400994462255902,
"loss": 3.2776,
"step": 57000
},
{
"epoch": 16.603588907014682,
"eval_accuracy": 0.3718954405894003,
"eval_loss": 3.5490047931671143,
"eval_runtime": 185.9028,
"eval_samples_per_second": 89.536,
"eval_steps_per_second": 5.6,
"step": 57000
},
{
"epoch": 16.61815427639245,
"grad_norm": 0.3511142432689667,
"learning_rate": 0.0004008195861264937,
"loss": 3.2816,
"step": 57050
},
{
"epoch": 16.632719645770216,
"grad_norm": 0.42816150188446045,
"learning_rate": 0.00040064470999708534,
"loss": 3.2748,
"step": 57100
},
{
"epoch": 16.647285015147983,
"grad_norm": 0.370182067155838,
"learning_rate": 0.00040046983386767703,
"loss": 3.2864,
"step": 57150
},
{
"epoch": 16.661850384525753,
"grad_norm": 0.39222970604896545,
"learning_rate": 0.00040029495773826867,
"loss": 3.2856,
"step": 57200
},
{
"epoch": 16.67641575390352,
"grad_norm": 0.3937409818172455,
"learning_rate": 0.0004001200816088603,
"loss": 3.2803,
"step": 57250
},
{
"epoch": 16.690981123281286,
"grad_norm": 0.38916105031967163,
"learning_rate": 0.00039994520547945205,
"loss": 3.2942,
"step": 57300
},
{
"epoch": 16.705546492659053,
"grad_norm": 0.37478119134902954,
"learning_rate": 0.0003997703293500437,
"loss": 3.3025,
"step": 57350
},
{
"epoch": 16.72011186203682,
"grad_norm": 0.3683931827545166,
"learning_rate": 0.0003995954532206354,
"loss": 3.2842,
"step": 57400
},
{
"epoch": 16.73467723141459,
"grad_norm": 0.4007303714752197,
"learning_rate": 0.000399420577091227,
"loss": 3.2829,
"step": 57450
},
{
"epoch": 16.749242600792357,
"grad_norm": 0.3843965232372284,
"learning_rate": 0.00039924570096181865,
"loss": 3.2901,
"step": 57500
},
{
"epoch": 16.763807970170124,
"grad_norm": 0.3941800594329834,
"learning_rate": 0.00039907082483241034,
"loss": 3.2858,
"step": 57550
},
{
"epoch": 16.77837333954789,
"grad_norm": 0.37438079714775085,
"learning_rate": 0.000398895948703002,
"loss": 3.2916,
"step": 57600
},
{
"epoch": 16.792938708925657,
"grad_norm": 0.3703000545501709,
"learning_rate": 0.00039872107257359367,
"loss": 3.3016,
"step": 57650
},
{
"epoch": 16.807504078303424,
"grad_norm": 0.3948332369327545,
"learning_rate": 0.0003985461964441853,
"loss": 3.3057,
"step": 57700
},
{
"epoch": 16.822069447681194,
"grad_norm": 0.38669082522392273,
"learning_rate": 0.00039837132031477694,
"loss": 3.2897,
"step": 57750
},
{
"epoch": 16.83663481705896,
"grad_norm": 0.3628772497177124,
"learning_rate": 0.0003981964441853687,
"loss": 3.2891,
"step": 57800
},
{
"epoch": 16.851200186436728,
"grad_norm": 0.39237385988235474,
"learning_rate": 0.0003980215680559603,
"loss": 3.3021,
"step": 57850
},
{
"epoch": 16.865765555814495,
"grad_norm": 0.3908953070640564,
"learning_rate": 0.000397846691926552,
"loss": 3.2962,
"step": 57900
},
{
"epoch": 16.88033092519226,
"grad_norm": 0.3867229223251343,
"learning_rate": 0.00039767181579714365,
"loss": 3.2861,
"step": 57950
},
{
"epoch": 16.89489629457003,
"grad_norm": 0.3902886211872101,
"learning_rate": 0.00039749693966773534,
"loss": 3.3034,
"step": 58000
},
{
"epoch": 16.89489629457003,
"eval_accuracy": 0.3726240198363548,
"eval_loss": 3.5381805896759033,
"eval_runtime": 441.5045,
"eval_samples_per_second": 37.701,
"eval_steps_per_second": 2.358,
"step": 58000
},
{
"epoch": 16.9094616639478,
"grad_norm": 0.38796380162239075,
"learning_rate": 0.00039732206353832697,
"loss": 3.2994,
"step": 58050
},
{
"epoch": 16.924027033325565,
"grad_norm": 0.35192742943763733,
"learning_rate": 0.0003971471874089186,
"loss": 3.2959,
"step": 58100
},
{
"epoch": 16.938592402703332,
"grad_norm": 0.372641384601593,
"learning_rate": 0.0003969723112795103,
"loss": 3.3026,
"step": 58150
},
{
"epoch": 16.9531577720811,
"grad_norm": 0.37450307607650757,
"learning_rate": 0.00039679743515010194,
"loss": 3.3098,
"step": 58200
},
{
"epoch": 16.96772314145887,
"grad_norm": 0.38844752311706543,
"learning_rate": 0.0003966225590206937,
"loss": 3.3031,
"step": 58250
},
{
"epoch": 16.982288510836636,
"grad_norm": 0.37731024622917175,
"learning_rate": 0.0003964476828912853,
"loss": 3.3081,
"step": 58300
},
{
"epoch": 16.996853880214402,
"grad_norm": 0.37375837564468384,
"learning_rate": 0.00039627280676187695,
"loss": 3.2973,
"step": 58350
},
{
"epoch": 17.01136098811466,
"grad_norm": 0.4000365436077118,
"learning_rate": 0.00039609793063246864,
"loss": 3.2205,
"step": 58400
},
{
"epoch": 17.025926357492427,
"grad_norm": 0.3670046031475067,
"learning_rate": 0.0003959230545030603,
"loss": 3.1889,
"step": 58450
},
{
"epoch": 17.040491726870194,
"grad_norm": 0.39600327610969543,
"learning_rate": 0.00039574817837365197,
"loss": 3.2056,
"step": 58500
},
{
"epoch": 17.05505709624796,
"grad_norm": 0.38830217719078064,
"learning_rate": 0.0003955733022442436,
"loss": 3.2047,
"step": 58550
},
{
"epoch": 17.069622465625727,
"grad_norm": 0.394195556640625,
"learning_rate": 0.0003953984261148353,
"loss": 3.2156,
"step": 58600
},
{
"epoch": 17.084187835003497,
"grad_norm": 0.3784361183643341,
"learning_rate": 0.00039522354998542693,
"loss": 3.2088,
"step": 58650
},
{
"epoch": 17.098753204381264,
"grad_norm": 0.4057703912258148,
"learning_rate": 0.00039504867385601857,
"loss": 3.2158,
"step": 58700
},
{
"epoch": 17.11331857375903,
"grad_norm": 0.37357842922210693,
"learning_rate": 0.0003948737977266103,
"loss": 3.2155,
"step": 58750
},
{
"epoch": 17.127883943136798,
"grad_norm": 0.3923245370388031,
"learning_rate": 0.00039469892159720195,
"loss": 3.2217,
"step": 58800
},
{
"epoch": 17.142449312514564,
"grad_norm": 0.4075673520565033,
"learning_rate": 0.00039452404546779364,
"loss": 3.2176,
"step": 58850
},
{
"epoch": 17.15701468189233,
"grad_norm": 0.37942689657211304,
"learning_rate": 0.0003943491693383853,
"loss": 3.2212,
"step": 58900
},
{
"epoch": 17.1715800512701,
"grad_norm": 0.39208337664604187,
"learning_rate": 0.0003941742932089769,
"loss": 3.2204,
"step": 58950
},
{
"epoch": 17.18614542064787,
"grad_norm": 0.3947177529335022,
"learning_rate": 0.0003939994170795686,
"loss": 3.2318,
"step": 59000
},
{
"epoch": 17.18614542064787,
"eval_accuracy": 0.37187474832742445,
"eval_loss": 3.5549113750457764,
"eval_runtime": 179.6498,
"eval_samples_per_second": 92.653,
"eval_steps_per_second": 5.795,
"step": 59000
},
{
"epoch": 17.200710790025635,
"grad_norm": 0.4263794720172882,
"learning_rate": 0.00039382454095016024,
"loss": 3.2376,
"step": 59050
},
{
"epoch": 17.215276159403402,
"grad_norm": 0.37080931663513184,
"learning_rate": 0.00039364966482075193,
"loss": 3.2378,
"step": 59100
},
{
"epoch": 17.22984152878117,
"grad_norm": 0.39057183265686035,
"learning_rate": 0.00039347478869134356,
"loss": 3.2199,
"step": 59150
},
{
"epoch": 17.24440689815894,
"grad_norm": 0.41177624464035034,
"learning_rate": 0.0003932999125619353,
"loss": 3.2296,
"step": 59200
},
{
"epoch": 17.258972267536706,
"grad_norm": 0.4065467417240143,
"learning_rate": 0.00039312503643252695,
"loss": 3.2364,
"step": 59250
},
{
"epoch": 17.273537636914472,
"grad_norm": 0.37535977363586426,
"learning_rate": 0.0003929501603031186,
"loss": 3.2309,
"step": 59300
},
{
"epoch": 17.28810300629224,
"grad_norm": 0.4139235019683838,
"learning_rate": 0.00039277528417371027,
"loss": 3.2482,
"step": 59350
},
{
"epoch": 17.302668375670006,
"grad_norm": 0.3840341866016388,
"learning_rate": 0.0003926004080443019,
"loss": 3.2422,
"step": 59400
},
{
"epoch": 17.317233745047773,
"grad_norm": 0.3817002475261688,
"learning_rate": 0.0003924255319148936,
"loss": 3.2387,
"step": 59450
},
{
"epoch": 17.331799114425543,
"grad_norm": 0.3794045150279999,
"learning_rate": 0.00039225065578548523,
"loss": 3.2531,
"step": 59500
},
{
"epoch": 17.34636448380331,
"grad_norm": 0.3869137465953827,
"learning_rate": 0.00039207577965607687,
"loss": 3.2437,
"step": 59550
},
{
"epoch": 17.360929853181077,
"grad_norm": 0.39294636249542236,
"learning_rate": 0.00039190090352666856,
"loss": 3.247,
"step": 59600
},
{
"epoch": 17.375495222558843,
"grad_norm": 0.37759485840797424,
"learning_rate": 0.0003917260273972602,
"loss": 3.2565,
"step": 59650
},
{
"epoch": 17.39006059193661,
"grad_norm": 0.379200279712677,
"learning_rate": 0.00039155115126785194,
"loss": 3.245,
"step": 59700
},
{
"epoch": 17.40462596131438,
"grad_norm": 0.40147554874420166,
"learning_rate": 0.0003913762751384436,
"loss": 3.2482,
"step": 59750
},
{
"epoch": 17.419191330692147,
"grad_norm": 0.38646212220191956,
"learning_rate": 0.00039120139900903527,
"loss": 3.254,
"step": 59800
},
{
"epoch": 17.433756700069914,
"grad_norm": 0.3718118965625763,
"learning_rate": 0.0003910265228796269,
"loss": 3.2701,
"step": 59850
},
{
"epoch": 17.44832206944768,
"grad_norm": 0.4207517206668854,
"learning_rate": 0.00039085164675021854,
"loss": 3.2703,
"step": 59900
},
{
"epoch": 17.462887438825447,
"grad_norm": 0.41934168338775635,
"learning_rate": 0.00039067677062081023,
"loss": 3.2493,
"step": 59950
},
{
"epoch": 17.477452808203218,
"grad_norm": 0.377540647983551,
"learning_rate": 0.00039050189449140187,
"loss": 3.2693,
"step": 60000
},
{
"epoch": 17.477452808203218,
"eval_accuracy": 0.37255406588251616,
"eval_loss": 3.5508663654327393,
"eval_runtime": 179.5955,
"eval_samples_per_second": 92.681,
"eval_steps_per_second": 5.796,
"step": 60000
},
{
"epoch": 17.492018177580984,
"grad_norm": 0.39113548398017883,
"learning_rate": 0.00039032701836199356,
"loss": 3.2626,
"step": 60050
},
{
"epoch": 17.50658354695875,
"grad_norm": 0.4056381285190582,
"learning_rate": 0.0003901521422325852,
"loss": 3.263,
"step": 60100
},
{
"epoch": 17.521148916336518,
"grad_norm": 0.3636176884174347,
"learning_rate": 0.00038997726610317683,
"loss": 3.2644,
"step": 60150
},
{
"epoch": 17.535714285714285,
"grad_norm": 0.3688655495643616,
"learning_rate": 0.0003898023899737686,
"loss": 3.2731,
"step": 60200
},
{
"epoch": 17.55027965509205,
"grad_norm": 0.39269739389419556,
"learning_rate": 0.0003896275138443602,
"loss": 3.2603,
"step": 60250
},
{
"epoch": 17.56484502446982,
"grad_norm": 0.3798394799232483,
"learning_rate": 0.0003894526377149519,
"loss": 3.2662,
"step": 60300
},
{
"epoch": 17.57941039384759,
"grad_norm": 0.3824335038661957,
"learning_rate": 0.00038927776158554354,
"loss": 3.2774,
"step": 60350
},
{
"epoch": 17.593975763225355,
"grad_norm": 0.38308337330818176,
"learning_rate": 0.0003891028854561352,
"loss": 3.2772,
"step": 60400
},
{
"epoch": 17.608541132603122,
"grad_norm": 0.3748604655265808,
"learning_rate": 0.00038892800932672686,
"loss": 3.2586,
"step": 60450
},
{
"epoch": 17.62310650198089,
"grad_norm": 0.40975135564804077,
"learning_rate": 0.0003887531331973185,
"loss": 3.2655,
"step": 60500
},
{
"epoch": 17.63767187135866,
"grad_norm": 0.38740789890289307,
"learning_rate": 0.0003885782570679102,
"loss": 3.2761,
"step": 60550
},
{
"epoch": 17.652237240736426,
"grad_norm": 0.3646203577518463,
"learning_rate": 0.0003884033809385018,
"loss": 3.2751,
"step": 60600
},
{
"epoch": 17.666802610114193,
"grad_norm": 0.37517863512039185,
"learning_rate": 0.00038822850480909357,
"loss": 3.2792,
"step": 60650
},
{
"epoch": 17.68136797949196,
"grad_norm": 0.3650130033493042,
"learning_rate": 0.0003880536286796852,
"loss": 3.2693,
"step": 60700
},
{
"epoch": 17.695933348869726,
"grad_norm": 0.4000101089477539,
"learning_rate": 0.00038787875255027684,
"loss": 3.2749,
"step": 60750
},
{
"epoch": 17.710498718247496,
"grad_norm": 0.35344168543815613,
"learning_rate": 0.00038770387642086853,
"loss": 3.2679,
"step": 60800
},
{
"epoch": 17.725064087625263,
"grad_norm": 0.40958935022354126,
"learning_rate": 0.00038752900029146017,
"loss": 3.2857,
"step": 60850
},
{
"epoch": 17.73962945700303,
"grad_norm": 0.377948135137558,
"learning_rate": 0.00038735412416205186,
"loss": 3.2691,
"step": 60900
},
{
"epoch": 17.754194826380797,
"grad_norm": 0.4192025065422058,
"learning_rate": 0.0003871792480326435,
"loss": 3.2764,
"step": 60950
},
{
"epoch": 17.768760195758563,
"grad_norm": 0.3829701244831085,
"learning_rate": 0.00038700437190323513,
"loss": 3.2845,
"step": 61000
},
{
"epoch": 17.768760195758563,
"eval_accuracy": 0.3727979053787536,
"eval_loss": 3.5411858558654785,
"eval_runtime": 179.8099,
"eval_samples_per_second": 92.57,
"eval_steps_per_second": 5.789,
"step": 61000
},
{
"epoch": 17.78332556513633,
"grad_norm": 0.34948283433914185,
"learning_rate": 0.0003868294957738268,
"loss": 3.2742,
"step": 61050
},
{
"epoch": 17.7978909345141,
"grad_norm": 0.3973924219608307,
"learning_rate": 0.00038665461964441846,
"loss": 3.2799,
"step": 61100
},
{
"epoch": 17.812456303891867,
"grad_norm": 0.39604613184928894,
"learning_rate": 0.0003864797435150102,
"loss": 3.2907,
"step": 61150
},
{
"epoch": 17.827021673269634,
"grad_norm": 0.3890770971775055,
"learning_rate": 0.00038630486738560184,
"loss": 3.2818,
"step": 61200
},
{
"epoch": 17.8415870426474,
"grad_norm": 0.39360764622688293,
"learning_rate": 0.00038612999125619353,
"loss": 3.2866,
"step": 61250
},
{
"epoch": 17.856152412025168,
"grad_norm": 0.3879394233226776,
"learning_rate": 0.00038595511512678517,
"loss": 3.2828,
"step": 61300
},
{
"epoch": 17.870717781402938,
"grad_norm": 0.3946910500526428,
"learning_rate": 0.0003857802389973768,
"loss": 3.2841,
"step": 61350
},
{
"epoch": 17.885283150780705,
"grad_norm": 0.3722352981567383,
"learning_rate": 0.0003856053628679685,
"loss": 3.2906,
"step": 61400
},
{
"epoch": 17.89984852015847,
"grad_norm": 0.37943729758262634,
"learning_rate": 0.00038543048673856013,
"loss": 3.2828,
"step": 61450
},
{
"epoch": 17.914413889536238,
"grad_norm": 0.3946760594844818,
"learning_rate": 0.0003852556106091518,
"loss": 3.2874,
"step": 61500
},
{
"epoch": 17.928979258914005,
"grad_norm": 0.38303500413894653,
"learning_rate": 0.00038508073447974346,
"loss": 3.2743,
"step": 61550
},
{
"epoch": 17.943544628291775,
"grad_norm": 0.35602617263793945,
"learning_rate": 0.0003849058583503351,
"loss": 3.2839,
"step": 61600
},
{
"epoch": 17.958109997669542,
"grad_norm": 0.364681601524353,
"learning_rate": 0.00038473098222092684,
"loss": 3.2894,
"step": 61650
},
{
"epoch": 17.97267536704731,
"grad_norm": 0.3929082155227661,
"learning_rate": 0.0003845561060915185,
"loss": 3.2816,
"step": 61700
},
{
"epoch": 17.987240736425075,
"grad_norm": 0.37357085943222046,
"learning_rate": 0.00038438122996211016,
"loss": 3.295,
"step": 61750
},
{
"epoch": 18.001747844325333,
"grad_norm": 0.4023423492908478,
"learning_rate": 0.0003842063538327018,
"loss": 3.2778,
"step": 61800
},
{
"epoch": 18.0163132137031,
"grad_norm": 0.3764852285385132,
"learning_rate": 0.0003840314777032935,
"loss": 3.175,
"step": 61850
},
{
"epoch": 18.030878583080867,
"grad_norm": 0.4034103453159332,
"learning_rate": 0.0003838566015738851,
"loss": 3.1882,
"step": 61900
},
{
"epoch": 18.045443952458633,
"grad_norm": 0.3662955164909363,
"learning_rate": 0.00038368172544447676,
"loss": 3.1941,
"step": 61950
},
{
"epoch": 18.0600093218364,
"grad_norm": 0.3883397579193115,
"learning_rate": 0.00038350684931506845,
"loss": 3.2022,
"step": 62000
},
{
"epoch": 18.0600093218364,
"eval_accuracy": 0.3724488410275824,
"eval_loss": 3.5511727333068848,
"eval_runtime": 179.5651,
"eval_samples_per_second": 92.696,
"eval_steps_per_second": 5.797,
"step": 62000
},
{
"epoch": 18.07457469121417,
"grad_norm": 0.39406126737594604,
"learning_rate": 0.0003833319731856601,
"loss": 3.1855,
"step": 62050
},
{
"epoch": 18.089140060591937,
"grad_norm": 0.39789289236068726,
"learning_rate": 0.00038315709705625183,
"loss": 3.1935,
"step": 62100
},
{
"epoch": 18.103705429969704,
"grad_norm": 0.38348227739334106,
"learning_rate": 0.00038298222092684347,
"loss": 3.1978,
"step": 62150
},
{
"epoch": 18.11827079934747,
"grad_norm": 0.3813340663909912,
"learning_rate": 0.0003828073447974351,
"loss": 3.2017,
"step": 62200
},
{
"epoch": 18.132836168725238,
"grad_norm": 0.4284285008907318,
"learning_rate": 0.0003826324686680268,
"loss": 3.2196,
"step": 62250
},
{
"epoch": 18.147401538103008,
"grad_norm": 0.39629238843917847,
"learning_rate": 0.00038245759253861843,
"loss": 3.2077,
"step": 62300
},
{
"epoch": 18.161966907480775,
"grad_norm": 0.40169140696525574,
"learning_rate": 0.0003822827164092101,
"loss": 3.2137,
"step": 62350
},
{
"epoch": 18.17653227685854,
"grad_norm": 0.3855275511741638,
"learning_rate": 0.00038210784027980176,
"loss": 3.2136,
"step": 62400
},
{
"epoch": 18.191097646236308,
"grad_norm": 0.3961770236492157,
"learning_rate": 0.0003819329641503934,
"loss": 3.2086,
"step": 62450
},
{
"epoch": 18.205663015614075,
"grad_norm": 0.37826651334762573,
"learning_rate": 0.0003817580880209851,
"loss": 3.2079,
"step": 62500
},
{
"epoch": 18.22022838499184,
"grad_norm": 0.4296334385871887,
"learning_rate": 0.0003815832118915767,
"loss": 3.2111,
"step": 62550
},
{
"epoch": 18.234793754369612,
"grad_norm": 0.40578290820121765,
"learning_rate": 0.00038140833576216847,
"loss": 3.2258,
"step": 62600
},
{
"epoch": 18.24935912374738,
"grad_norm": 0.4326179027557373,
"learning_rate": 0.0003812334596327601,
"loss": 3.2318,
"step": 62650
},
{
"epoch": 18.263924493125145,
"grad_norm": 0.40364038944244385,
"learning_rate": 0.0003810585835033518,
"loss": 3.2235,
"step": 62700
},
{
"epoch": 18.278489862502912,
"grad_norm": 0.41583460569381714,
"learning_rate": 0.00038088370737394343,
"loss": 3.2385,
"step": 62750
},
{
"epoch": 18.29305523188068,
"grad_norm": 0.3775072693824768,
"learning_rate": 0.00038070883124453507,
"loss": 3.2353,
"step": 62800
},
{
"epoch": 18.30762060125845,
"grad_norm": 0.4191801846027374,
"learning_rate": 0.00038053395511512676,
"loss": 3.2385,
"step": 62850
},
{
"epoch": 18.322185970636216,
"grad_norm": 0.3996080458164215,
"learning_rate": 0.0003803590789857184,
"loss": 3.22,
"step": 62900
},
{
"epoch": 18.336751340013983,
"grad_norm": 0.3744161128997803,
"learning_rate": 0.0003801842028563101,
"loss": 3.242,
"step": 62950
},
{
"epoch": 18.35131670939175,
"grad_norm": 0.3929766118526459,
"learning_rate": 0.0003800093267269017,
"loss": 3.2434,
"step": 63000
},
{
"epoch": 18.35131670939175,
"eval_accuracy": 0.3726039154227306,
"eval_loss": 3.551494836807251,
"eval_runtime": 179.6728,
"eval_samples_per_second": 92.641,
"eval_steps_per_second": 5.794,
"step": 63000
},
{
"epoch": 18.365882078769516,
"grad_norm": 0.37301939725875854,
"learning_rate": 0.00037983445059749335,
"loss": 3.2367,
"step": 63050
},
{
"epoch": 18.380447448147287,
"grad_norm": 0.3646920323371887,
"learning_rate": 0.0003796595744680851,
"loss": 3.2411,
"step": 63100
},
{
"epoch": 18.395012817525053,
"grad_norm": 0.41786086559295654,
"learning_rate": 0.00037948469833867674,
"loss": 3.238,
"step": 63150
},
{
"epoch": 18.40957818690282,
"grad_norm": 0.39375847578048706,
"learning_rate": 0.0003793098222092684,
"loss": 3.2392,
"step": 63200
},
{
"epoch": 18.424143556280587,
"grad_norm": 0.3906821310520172,
"learning_rate": 0.00037913494607986006,
"loss": 3.2303,
"step": 63250
},
{
"epoch": 18.438708925658354,
"grad_norm": 0.38584890961647034,
"learning_rate": 0.00037896006995045175,
"loss": 3.2507,
"step": 63300
},
{
"epoch": 18.45327429503612,
"grad_norm": 0.37560147047042847,
"learning_rate": 0.0003787851938210434,
"loss": 3.2401,
"step": 63350
},
{
"epoch": 18.46783966441389,
"grad_norm": 0.39870715141296387,
"learning_rate": 0.000378610317691635,
"loss": 3.2559,
"step": 63400
},
{
"epoch": 18.482405033791657,
"grad_norm": 0.416790634393692,
"learning_rate": 0.0003784354415622267,
"loss": 3.252,
"step": 63450
},
{
"epoch": 18.496970403169424,
"grad_norm": 0.3930261731147766,
"learning_rate": 0.00037826056543281835,
"loss": 3.2556,
"step": 63500
},
{
"epoch": 18.51153577254719,
"grad_norm": 0.38571596145629883,
"learning_rate": 0.0003780856893034101,
"loss": 3.2523,
"step": 63550
},
{
"epoch": 18.526101141924958,
"grad_norm": 0.3790442943572998,
"learning_rate": 0.00037791081317400173,
"loss": 3.2502,
"step": 63600
},
{
"epoch": 18.540666511302728,
"grad_norm": 0.40154215693473816,
"learning_rate": 0.00037773593704459337,
"loss": 3.2554,
"step": 63650
},
{
"epoch": 18.555231880680495,
"grad_norm": 0.3869607746601105,
"learning_rate": 0.00037756106091518506,
"loss": 3.2674,
"step": 63700
},
{
"epoch": 18.56979725005826,
"grad_norm": 0.36808493733406067,
"learning_rate": 0.0003773861847857767,
"loss": 3.2566,
"step": 63750
},
{
"epoch": 18.58436261943603,
"grad_norm": 0.4031069278717041,
"learning_rate": 0.0003772113086563684,
"loss": 3.2647,
"step": 63800
},
{
"epoch": 18.598927988813795,
"grad_norm": 0.39664480090141296,
"learning_rate": 0.00037703643252696,
"loss": 3.2611,
"step": 63850
},
{
"epoch": 18.613493358191565,
"grad_norm": 0.4211257994174957,
"learning_rate": 0.0003768615563975517,
"loss": 3.2466,
"step": 63900
},
{
"epoch": 18.628058727569332,
"grad_norm": 0.37485969066619873,
"learning_rate": 0.00037668668026814335,
"loss": 3.2698,
"step": 63950
},
{
"epoch": 18.6426240969471,
"grad_norm": 0.3820188343524933,
"learning_rate": 0.000376511804138735,
"loss": 3.2583,
"step": 64000
},
{
"epoch": 18.6426240969471,
"eval_accuracy": 0.37283764392732077,
"eval_loss": 3.542705535888672,
"eval_runtime": 179.7091,
"eval_samples_per_second": 92.622,
"eval_steps_per_second": 5.793,
"step": 64000
},
{
"epoch": 18.657189466324866,
"grad_norm": 0.3915201425552368,
"learning_rate": 0.00037633692800932673,
"loss": 3.2704,
"step": 64050
},
{
"epoch": 18.671754835702632,
"grad_norm": 0.36770007014274597,
"learning_rate": 0.00037616205187991837,
"loss": 3.2531,
"step": 64100
},
{
"epoch": 18.6863202050804,
"grad_norm": 0.4022904336452484,
"learning_rate": 0.00037598717575051006,
"loss": 3.2499,
"step": 64150
},
{
"epoch": 18.70088557445817,
"grad_norm": 0.36411207914352417,
"learning_rate": 0.0003758122996211017,
"loss": 3.2623,
"step": 64200
},
{
"epoch": 18.715450943835936,
"grad_norm": 0.37535756826400757,
"learning_rate": 0.00037563742349169333,
"loss": 3.2548,
"step": 64250
},
{
"epoch": 18.730016313213703,
"grad_norm": 0.3946349322795868,
"learning_rate": 0.000375462547362285,
"loss": 3.2668,
"step": 64300
},
{
"epoch": 18.74458168259147,
"grad_norm": 0.4044114053249359,
"learning_rate": 0.00037528767123287665,
"loss": 3.2717,
"step": 64350
},
{
"epoch": 18.759147051969236,
"grad_norm": 0.3657906949520111,
"learning_rate": 0.00037511279510346834,
"loss": 3.2569,
"step": 64400
},
{
"epoch": 18.773712421347007,
"grad_norm": 0.3859136402606964,
"learning_rate": 0.00037493791897406,
"loss": 3.2739,
"step": 64450
},
{
"epoch": 18.788277790724774,
"grad_norm": 0.38982921838760376,
"learning_rate": 0.0003747630428446516,
"loss": 3.2765,
"step": 64500
},
{
"epoch": 18.80284316010254,
"grad_norm": 0.3761852979660034,
"learning_rate": 0.00037458816671524336,
"loss": 3.2637,
"step": 64550
},
{
"epoch": 18.817408529480307,
"grad_norm": 0.3764474093914032,
"learning_rate": 0.000374413290585835,
"loss": 3.2715,
"step": 64600
},
{
"epoch": 18.831973898858074,
"grad_norm": 0.37012961506843567,
"learning_rate": 0.0003742384144564267,
"loss": 3.2756,
"step": 64650
},
{
"epoch": 18.846539268235844,
"grad_norm": 0.4159339964389801,
"learning_rate": 0.0003740635383270183,
"loss": 3.2728,
"step": 64700
},
{
"epoch": 18.86110463761361,
"grad_norm": 0.3688717484474182,
"learning_rate": 0.00037388866219761,
"loss": 3.2715,
"step": 64750
},
{
"epoch": 18.875670006991378,
"grad_norm": 0.4111153185367584,
"learning_rate": 0.00037371378606820165,
"loss": 3.283,
"step": 64800
},
{
"epoch": 18.890235376369144,
"grad_norm": 0.4147163927555084,
"learning_rate": 0.0003735389099387933,
"loss": 3.2711,
"step": 64850
},
{
"epoch": 18.90480074574691,
"grad_norm": 0.36633679270744324,
"learning_rate": 0.000373364033809385,
"loss": 3.2735,
"step": 64900
},
{
"epoch": 18.919366115124678,
"grad_norm": 0.3624868094921112,
"learning_rate": 0.0003731891576799766,
"loss": 3.2772,
"step": 64950
},
{
"epoch": 18.93393148450245,
"grad_norm": 0.38773536682128906,
"learning_rate": 0.00037301428155056836,
"loss": 3.2754,
"step": 65000
},
{
"epoch": 18.93393148450245,
"eval_accuracy": 0.3732179818107963,
"eval_loss": 3.5359609127044678,
"eval_runtime": 179.6247,
"eval_samples_per_second": 92.665,
"eval_steps_per_second": 5.795,
"step": 65000
},
{
"epoch": 18.948496853880215,
"grad_norm": 0.37369510531425476,
"learning_rate": 0.00037283940542116,
"loss": 3.2842,
"step": 65050
},
{
"epoch": 18.96306222325798,
"grad_norm": 0.4039534032344818,
"learning_rate": 0.00037266452929175163,
"loss": 3.2827,
"step": 65100
},
{
"epoch": 18.97762759263575,
"grad_norm": 0.36381030082702637,
"learning_rate": 0.0003724896531623433,
"loss": 3.2838,
"step": 65150
},
{
"epoch": 18.992192962013515,
"grad_norm": 0.38101911544799805,
"learning_rate": 0.00037231477703293496,
"loss": 3.277,
"step": 65200
},
{
"epoch": 19.006700069913773,
"grad_norm": 0.3940986692905426,
"learning_rate": 0.00037213990090352665,
"loss": 3.2168,
"step": 65250
},
{
"epoch": 19.02126543929154,
"grad_norm": 0.44007158279418945,
"learning_rate": 0.0003719650247741183,
"loss": 3.1807,
"step": 65300
},
{
"epoch": 19.035830808669306,
"grad_norm": 0.3865497410297394,
"learning_rate": 0.00037179014864471,
"loss": 3.1847,
"step": 65350
},
{
"epoch": 19.050396178047077,
"grad_norm": 0.40062960982322693,
"learning_rate": 0.0003716152725153016,
"loss": 3.1729,
"step": 65400
},
{
"epoch": 19.064961547424844,
"grad_norm": 0.3954075276851654,
"learning_rate": 0.00037144039638589325,
"loss": 3.1875,
"step": 65450
},
{
"epoch": 19.07952691680261,
"grad_norm": 0.37583020329475403,
"learning_rate": 0.000371265520256485,
"loss": 3.1852,
"step": 65500
},
{
"epoch": 19.094092286180377,
"grad_norm": 0.38721004128456116,
"learning_rate": 0.00037109064412707663,
"loss": 3.1877,
"step": 65550
},
{
"epoch": 19.108657655558144,
"grad_norm": 0.4319014847278595,
"learning_rate": 0.0003709157679976683,
"loss": 3.2013,
"step": 65600
},
{
"epoch": 19.123223024935914,
"grad_norm": 0.36834290623664856,
"learning_rate": 0.00037074089186825995,
"loss": 3.1977,
"step": 65650
},
{
"epoch": 19.13778839431368,
"grad_norm": 0.4034636318683624,
"learning_rate": 0.0003705660157388516,
"loss": 3.1942,
"step": 65700
},
{
"epoch": 19.152353763691448,
"grad_norm": 0.3813159763813019,
"learning_rate": 0.0003703911396094433,
"loss": 3.1803,
"step": 65750
},
{
"epoch": 19.166919133069214,
"grad_norm": 0.3532137870788574,
"learning_rate": 0.0003702162634800349,
"loss": 3.2012,
"step": 65800
},
{
"epoch": 19.18148450244698,
"grad_norm": 0.38538220524787903,
"learning_rate": 0.0003700413873506266,
"loss": 3.2056,
"step": 65850
},
{
"epoch": 19.196049871824748,
"grad_norm": 0.36290931701660156,
"learning_rate": 0.00036986651122121824,
"loss": 3.214,
"step": 65900
},
{
"epoch": 19.210615241202518,
"grad_norm": 0.4123310446739197,
"learning_rate": 0.00036969163509181,
"loss": 3.2063,
"step": 65950
},
{
"epoch": 19.225180610580285,
"grad_norm": 0.41633448004722595,
"learning_rate": 0.0003695167589624016,
"loss": 3.1981,
"step": 66000
},
{
"epoch": 19.225180610580285,
"eval_accuracy": 0.3726028572956977,
"eval_loss": 3.5563158988952637,
"eval_runtime": 179.6893,
"eval_samples_per_second": 92.632,
"eval_steps_per_second": 5.793,
"step": 66000
},
{
"epoch": 19.23974597995805,
"grad_norm": 0.4212048053741455,
"learning_rate": 0.00036934188283299326,
"loss": 3.221,
"step": 66050
},
{
"epoch": 19.25431134933582,
"grad_norm": 0.4007203280925751,
"learning_rate": 0.00036916700670358495,
"loss": 3.2141,
"step": 66100
},
{
"epoch": 19.268876718713585,
"grad_norm": 0.4050043523311615,
"learning_rate": 0.0003689921305741766,
"loss": 3.2165,
"step": 66150
},
{
"epoch": 19.283442088091356,
"grad_norm": 0.41472339630126953,
"learning_rate": 0.0003688172544447683,
"loss": 3.2188,
"step": 66200
},
{
"epoch": 19.298007457469122,
"grad_norm": 0.3750508725643158,
"learning_rate": 0.0003686423783153599,
"loss": 3.2196,
"step": 66250
},
{
"epoch": 19.31257282684689,
"grad_norm": 0.3770619034767151,
"learning_rate": 0.00036846750218595155,
"loss": 3.2164,
"step": 66300
},
{
"epoch": 19.327138196224656,
"grad_norm": 0.36972129344940186,
"learning_rate": 0.00036829262605654324,
"loss": 3.2416,
"step": 66350
},
{
"epoch": 19.341703565602423,
"grad_norm": 0.3639586567878723,
"learning_rate": 0.0003681177499271349,
"loss": 3.2269,
"step": 66400
},
{
"epoch": 19.356268934980193,
"grad_norm": 0.410324364900589,
"learning_rate": 0.0003679428737977266,
"loss": 3.2258,
"step": 66450
},
{
"epoch": 19.37083430435796,
"grad_norm": 0.4071573317050934,
"learning_rate": 0.00036776799766831826,
"loss": 3.2323,
"step": 66500
},
{
"epoch": 19.385399673735726,
"grad_norm": 0.3902466595172882,
"learning_rate": 0.0003675931215389099,
"loss": 3.2259,
"step": 66550
},
{
"epoch": 19.399965043113493,
"grad_norm": 0.37968191504478455,
"learning_rate": 0.0003674182454095016,
"loss": 3.2274,
"step": 66600
},
{
"epoch": 19.41453041249126,
"grad_norm": 0.39837968349456787,
"learning_rate": 0.0003672433692800932,
"loss": 3.2345,
"step": 66650
},
{
"epoch": 19.429095781869027,
"grad_norm": 0.38949036598205566,
"learning_rate": 0.0003670684931506849,
"loss": 3.2259,
"step": 66700
},
{
"epoch": 19.443661151246797,
"grad_norm": 0.4259556531906128,
"learning_rate": 0.00036689361702127655,
"loss": 3.2506,
"step": 66750
},
{
"epoch": 19.458226520624564,
"grad_norm": 0.3879312574863434,
"learning_rate": 0.00036671874089186824,
"loss": 3.2416,
"step": 66800
},
{
"epoch": 19.47279189000233,
"grad_norm": 0.3887031078338623,
"learning_rate": 0.00036654386476245987,
"loss": 3.2328,
"step": 66850
},
{
"epoch": 19.487357259380097,
"grad_norm": 0.41111478209495544,
"learning_rate": 0.0003663689886330515,
"loss": 3.2329,
"step": 66900
},
{
"epoch": 19.501922628757864,
"grad_norm": 0.406820684671402,
"learning_rate": 0.00036619411250364325,
"loss": 3.2492,
"step": 66950
},
{
"epoch": 19.516487998135634,
"grad_norm": 0.3900870382785797,
"learning_rate": 0.0003660192363742349,
"loss": 3.2406,
"step": 67000
},
{
"epoch": 19.516487998135634,
"eval_accuracy": 0.37308265912026145,
"eval_loss": 3.544171094894409,
"eval_runtime": 179.6633,
"eval_samples_per_second": 92.646,
"eval_steps_per_second": 5.794,
"step": 67000
},
{
"epoch": 19.5310533675134,
"grad_norm": 0.3849544823169708,
"learning_rate": 0.0003658443602448266,
"loss": 3.2406,
"step": 67050
},
{
"epoch": 19.545618736891168,
"grad_norm": 0.4000382423400879,
"learning_rate": 0.0003656694841154182,
"loss": 3.2455,
"step": 67100
},
{
"epoch": 19.560184106268935,
"grad_norm": 0.39659613370895386,
"learning_rate": 0.00036549460798600985,
"loss": 3.2367,
"step": 67150
},
{
"epoch": 19.5747494756467,
"grad_norm": 0.36925145983695984,
"learning_rate": 0.00036531973185660154,
"loss": 3.2491,
"step": 67200
},
{
"epoch": 19.589314845024468,
"grad_norm": 0.3747584819793701,
"learning_rate": 0.0003651448557271932,
"loss": 3.2431,
"step": 67250
},
{
"epoch": 19.60388021440224,
"grad_norm": 0.371640682220459,
"learning_rate": 0.00036496997959778487,
"loss": 3.2545,
"step": 67300
},
{
"epoch": 19.618445583780005,
"grad_norm": 0.38793015480041504,
"learning_rate": 0.0003647951034683765,
"loss": 3.2471,
"step": 67350
},
{
"epoch": 19.633010953157772,
"grad_norm": 0.4079042375087738,
"learning_rate": 0.00036462022733896825,
"loss": 3.2497,
"step": 67400
},
{
"epoch": 19.64757632253554,
"grad_norm": 0.39877283573150635,
"learning_rate": 0.0003644453512095599,
"loss": 3.2512,
"step": 67450
},
{
"epoch": 19.662141691913305,
"grad_norm": 0.40305206179618835,
"learning_rate": 0.0003642704750801515,
"loss": 3.2577,
"step": 67500
},
{
"epoch": 19.676707061291076,
"grad_norm": 0.3949699103832245,
"learning_rate": 0.0003640955989507432,
"loss": 3.2556,
"step": 67550
},
{
"epoch": 19.691272430668842,
"grad_norm": 0.3933976888656616,
"learning_rate": 0.00036392072282133485,
"loss": 3.2607,
"step": 67600
},
{
"epoch": 19.70583780004661,
"grad_norm": 0.4123631417751312,
"learning_rate": 0.00036374584669192654,
"loss": 3.2436,
"step": 67650
},
{
"epoch": 19.720403169424376,
"grad_norm": 0.36142420768737793,
"learning_rate": 0.0003635709705625182,
"loss": 3.2589,
"step": 67700
},
{
"epoch": 19.734968538802143,
"grad_norm": 0.4008404612541199,
"learning_rate": 0.0003633960944331098,
"loss": 3.2537,
"step": 67750
},
{
"epoch": 19.749533908179913,
"grad_norm": 0.36675870418548584,
"learning_rate": 0.0003632212183037015,
"loss": 3.256,
"step": 67800
},
{
"epoch": 19.76409927755768,
"grad_norm": 0.3671972155570984,
"learning_rate": 0.00036304634217429314,
"loss": 3.242,
"step": 67850
},
{
"epoch": 19.778664646935447,
"grad_norm": 0.40335404872894287,
"learning_rate": 0.0003628714660448849,
"loss": 3.2533,
"step": 67900
},
{
"epoch": 19.793230016313213,
"grad_norm": 0.4161832630634308,
"learning_rate": 0.0003626965899154765,
"loss": 3.2499,
"step": 67950
},
{
"epoch": 19.80779538569098,
"grad_norm": 0.4160614311695099,
"learning_rate": 0.0003625217137860682,
"loss": 3.2603,
"step": 68000
},
{
"epoch": 19.80779538569098,
"eval_accuracy": 0.3735364780476851,
"eval_loss": 3.539116859436035,
"eval_runtime": 179.5961,
"eval_samples_per_second": 92.68,
"eval_steps_per_second": 5.796,
"step": 68000
},
{
"epoch": 19.822360755068747,
"grad_norm": 0.3749236464500427,
"learning_rate": 0.00036234683765665985,
"loss": 3.2614,
"step": 68050
},
{
"epoch": 19.836926124446517,
"grad_norm": 0.39177680015563965,
"learning_rate": 0.0003621719615272515,
"loss": 3.257,
"step": 68100
},
{
"epoch": 19.851491493824284,
"grad_norm": 0.3916817307472229,
"learning_rate": 0.00036199708539784317,
"loss": 3.264,
"step": 68150
},
{
"epoch": 19.86605686320205,
"grad_norm": 0.43153202533721924,
"learning_rate": 0.0003618222092684348,
"loss": 3.2513,
"step": 68200
},
{
"epoch": 19.880622232579817,
"grad_norm": 0.40334609150886536,
"learning_rate": 0.0003616473331390265,
"loss": 3.2527,
"step": 68250
},
{
"epoch": 19.895187601957584,
"grad_norm": 0.4108611047267914,
"learning_rate": 0.00036147245700961813,
"loss": 3.2539,
"step": 68300
},
{
"epoch": 19.909752971335354,
"grad_norm": 0.39317360520362854,
"learning_rate": 0.00036129758088020977,
"loss": 3.2706,
"step": 68350
},
{
"epoch": 19.92431834071312,
"grad_norm": 0.3866609036922455,
"learning_rate": 0.0003611227047508015,
"loss": 3.2614,
"step": 68400
},
{
"epoch": 19.938883710090888,
"grad_norm": 0.4029618501663208,
"learning_rate": 0.00036094782862139315,
"loss": 3.2586,
"step": 68450
},
{
"epoch": 19.953449079468655,
"grad_norm": 0.3921782672405243,
"learning_rate": 0.00036077295249198484,
"loss": 3.2668,
"step": 68500
},
{
"epoch": 19.96801444884642,
"grad_norm": 0.37679243087768555,
"learning_rate": 0.0003605980763625765,
"loss": 3.2669,
"step": 68550
},
{
"epoch": 19.982579818224192,
"grad_norm": 0.39157822728157043,
"learning_rate": 0.0003604232002331681,
"loss": 3.2711,
"step": 68600
},
{
"epoch": 19.99714518760196,
"grad_norm": 0.37484461069107056,
"learning_rate": 0.0003602483241037598,
"loss": 3.2748,
"step": 68650
},
{
"epoch": 20.011652295502213,
"grad_norm": 0.3911686837673187,
"learning_rate": 0.00036007344797435144,
"loss": 3.184,
"step": 68700
},
{
"epoch": 20.026217664879983,
"grad_norm": 0.37566766142845154,
"learning_rate": 0.00035989857184494313,
"loss": 3.1588,
"step": 68750
},
{
"epoch": 20.04078303425775,
"grad_norm": 0.3786637485027313,
"learning_rate": 0.00035972369571553477,
"loss": 3.1597,
"step": 68800
},
{
"epoch": 20.055348403635517,
"grad_norm": 0.4000002145767212,
"learning_rate": 0.0003595488195861265,
"loss": 3.1753,
"step": 68850
},
{
"epoch": 20.069913773013283,
"grad_norm": 0.41424959897994995,
"learning_rate": 0.00035937394345671815,
"loss": 3.1716,
"step": 68900
},
{
"epoch": 20.08447914239105,
"grad_norm": 0.4069176912307739,
"learning_rate": 0.0003591990673273098,
"loss": 3.1743,
"step": 68950
},
{
"epoch": 20.099044511768817,
"grad_norm": 0.39178600907325745,
"learning_rate": 0.0003590241911979015,
"loss": 3.1759,
"step": 69000
},
{
"epoch": 20.099044511768817,
"eval_accuracy": 0.3731403858283871,
"eval_loss": 3.5491995811462402,
"eval_runtime": 179.7147,
"eval_samples_per_second": 92.619,
"eval_steps_per_second": 5.793,
"step": 69000
},
{
"epoch": 20.113609881146587,
"grad_norm": 0.42330384254455566,
"learning_rate": 0.0003588493150684931,
"loss": 3.1797,
"step": 69050
},
{
"epoch": 20.128175250524354,
"grad_norm": 0.3965478241443634,
"learning_rate": 0.0003586744389390848,
"loss": 3.1863,
"step": 69100
},
{
"epoch": 20.14274061990212,
"grad_norm": 0.41020357608795166,
"learning_rate": 0.00035849956280967644,
"loss": 3.1812,
"step": 69150
},
{
"epoch": 20.157305989279887,
"grad_norm": 0.42909374833106995,
"learning_rate": 0.0003583246866802681,
"loss": 3.2021,
"step": 69200
},
{
"epoch": 20.171871358657654,
"grad_norm": 0.38205522298812866,
"learning_rate": 0.00035814981055085976,
"loss": 3.2022,
"step": 69250
},
{
"epoch": 20.186436728035424,
"grad_norm": 0.3973395526409149,
"learning_rate": 0.0003579749344214514,
"loss": 3.2015,
"step": 69300
},
{
"epoch": 20.20100209741319,
"grad_norm": 0.41039976477622986,
"learning_rate": 0.00035780005829204315,
"loss": 3.1961,
"step": 69350
},
{
"epoch": 20.215567466790958,
"grad_norm": 0.3577198088169098,
"learning_rate": 0.0003576251821626348,
"loss": 3.2056,
"step": 69400
},
{
"epoch": 20.230132836168725,
"grad_norm": 0.43166354298591614,
"learning_rate": 0.00035745030603322647,
"loss": 3.2002,
"step": 69450
},
{
"epoch": 20.24469820554649,
"grad_norm": 0.3968643546104431,
"learning_rate": 0.0003572754299038181,
"loss": 3.2117,
"step": 69500
},
{
"epoch": 20.25926357492426,
"grad_norm": 0.3748406171798706,
"learning_rate": 0.00035710055377440974,
"loss": 3.199,
"step": 69550
},
{
"epoch": 20.27382894430203,
"grad_norm": 0.41351601481437683,
"learning_rate": 0.00035692567764500143,
"loss": 3.1976,
"step": 69600
},
{
"epoch": 20.288394313679795,
"grad_norm": 0.40381181240081787,
"learning_rate": 0.00035675080151559307,
"loss": 3.1988,
"step": 69650
},
{
"epoch": 20.302959683057562,
"grad_norm": 0.38123536109924316,
"learning_rate": 0.00035657592538618476,
"loss": 3.2122,
"step": 69700
},
{
"epoch": 20.31752505243533,
"grad_norm": 0.40208685398101807,
"learning_rate": 0.0003564010492567764,
"loss": 3.2187,
"step": 69750
},
{
"epoch": 20.332090421813096,
"grad_norm": 0.40056926012039185,
"learning_rate": 0.00035622617312736803,
"loss": 3.2165,
"step": 69800
},
{
"epoch": 20.346655791190866,
"grad_norm": 0.40889421105384827,
"learning_rate": 0.0003560512969979598,
"loss": 3.2228,
"step": 69850
},
{
"epoch": 20.361221160568633,
"grad_norm": 0.3698402941226959,
"learning_rate": 0.0003558764208685514,
"loss": 3.2188,
"step": 69900
},
{
"epoch": 20.3757865299464,
"grad_norm": 0.4034403860569,
"learning_rate": 0.0003557015447391431,
"loss": 3.2169,
"step": 69950
},
{
"epoch": 20.390351899324166,
"grad_norm": 0.37178969383239746,
"learning_rate": 0.00035552666860973474,
"loss": 3.2213,
"step": 70000
},
{
"epoch": 20.390351899324166,
"eval_accuracy": 0.3731590794059675,
"eval_loss": 3.5475502014160156,
"eval_runtime": 179.6766,
"eval_samples_per_second": 92.639,
"eval_steps_per_second": 5.794,
"step": 70000
},
{
"epoch": 20.404917268701933,
"grad_norm": 0.40938612818717957,
"learning_rate": 0.00035535179248032643,
"loss": 3.2173,
"step": 70050
},
{
"epoch": 20.419482638079703,
"grad_norm": 0.4097348749637604,
"learning_rate": 0.00035517691635091807,
"loss": 3.225,
"step": 70100
},
{
"epoch": 20.43404800745747,
"grad_norm": 0.37782126665115356,
"learning_rate": 0.0003550020402215097,
"loss": 3.2252,
"step": 70150
},
{
"epoch": 20.448613376835237,
"grad_norm": 0.36499133706092834,
"learning_rate": 0.0003548271640921014,
"loss": 3.2237,
"step": 70200
},
{
"epoch": 20.463178746213003,
"grad_norm": 0.39261844754219055,
"learning_rate": 0.00035465228796269303,
"loss": 3.2361,
"step": 70250
},
{
"epoch": 20.47774411559077,
"grad_norm": 0.42958515882492065,
"learning_rate": 0.0003544774118332848,
"loss": 3.2253,
"step": 70300
},
{
"epoch": 20.49230948496854,
"grad_norm": 0.3969309628009796,
"learning_rate": 0.0003543025357038764,
"loss": 3.2322,
"step": 70350
},
{
"epoch": 20.506874854346307,
"grad_norm": 0.37618428468704224,
"learning_rate": 0.00035412765957446805,
"loss": 3.2311,
"step": 70400
},
{
"epoch": 20.521440223724074,
"grad_norm": 0.413291871547699,
"learning_rate": 0.00035395278344505974,
"loss": 3.2176,
"step": 70450
},
{
"epoch": 20.53600559310184,
"grad_norm": 0.38615167140960693,
"learning_rate": 0.0003537779073156514,
"loss": 3.236,
"step": 70500
},
{
"epoch": 20.550570962479608,
"grad_norm": 0.39638757705688477,
"learning_rate": 0.00035360303118624306,
"loss": 3.2306,
"step": 70550
},
{
"epoch": 20.565136331857374,
"grad_norm": 0.4069354236125946,
"learning_rate": 0.0003534281550568347,
"loss": 3.228,
"step": 70600
},
{
"epoch": 20.579701701235145,
"grad_norm": 0.38820722699165344,
"learning_rate": 0.0003532532789274264,
"loss": 3.2352,
"step": 70650
},
{
"epoch": 20.59426707061291,
"grad_norm": 0.47593021392822266,
"learning_rate": 0.000353078402798018,
"loss": 3.2209,
"step": 70700
},
{
"epoch": 20.608832439990678,
"grad_norm": 0.38806480169296265,
"learning_rate": 0.00035290352666860966,
"loss": 3.2287,
"step": 70750
},
{
"epoch": 20.623397809368445,
"grad_norm": 0.399444580078125,
"learning_rate": 0.0003527286505392014,
"loss": 3.2371,
"step": 70800
},
{
"epoch": 20.63796317874621,
"grad_norm": 0.3905348479747772,
"learning_rate": 0.00035255377440979304,
"loss": 3.2353,
"step": 70850
},
{
"epoch": 20.652528548123982,
"grad_norm": 0.42255914211273193,
"learning_rate": 0.00035237889828038473,
"loss": 3.2346,
"step": 70900
},
{
"epoch": 20.66709391750175,
"grad_norm": 0.40701016783714294,
"learning_rate": 0.00035220402215097637,
"loss": 3.2384,
"step": 70950
},
{
"epoch": 20.681659286879515,
"grad_norm": 0.39621832966804504,
"learning_rate": 0.000352029146021568,
"loss": 3.2306,
"step": 71000
},
{
"epoch": 20.681659286879515,
"eval_accuracy": 0.3734405411967064,
"eval_loss": 3.54192852973938,
"eval_runtime": 179.7033,
"eval_samples_per_second": 92.625,
"eval_steps_per_second": 5.793,
"step": 71000
},
{
"epoch": 20.696224656257282,
"grad_norm": 0.43927934765815735,
"learning_rate": 0.0003518542698921597,
"loss": 3.2395,
"step": 71050
},
{
"epoch": 20.71079002563505,
"grad_norm": 0.4314327538013458,
"learning_rate": 0.00035167939376275133,
"loss": 3.2466,
"step": 71100
},
{
"epoch": 20.72535539501282,
"grad_norm": 0.4066350758075714,
"learning_rate": 0.000351504517633343,
"loss": 3.2542,
"step": 71150
},
{
"epoch": 20.739920764390586,
"grad_norm": 0.3890044093132019,
"learning_rate": 0.00035132964150393466,
"loss": 3.2435,
"step": 71200
},
{
"epoch": 20.754486133768353,
"grad_norm": 0.38141369819641113,
"learning_rate": 0.0003511547653745263,
"loss": 3.2417,
"step": 71250
},
{
"epoch": 20.76905150314612,
"grad_norm": 0.3846435546875,
"learning_rate": 0.00035097988924511804,
"loss": 3.237,
"step": 71300
},
{
"epoch": 20.783616872523886,
"grad_norm": 0.3892623782157898,
"learning_rate": 0.0003508050131157097,
"loss": 3.2461,
"step": 71350
},
{
"epoch": 20.798182241901653,
"grad_norm": 0.39577746391296387,
"learning_rate": 0.00035063013698630137,
"loss": 3.2561,
"step": 71400
},
{
"epoch": 20.812747611279423,
"grad_norm": 0.43198904395103455,
"learning_rate": 0.000350455260856893,
"loss": 3.2542,
"step": 71450
},
{
"epoch": 20.82731298065719,
"grad_norm": 0.3951154947280884,
"learning_rate": 0.0003502803847274847,
"loss": 3.2507,
"step": 71500
},
{
"epoch": 20.841878350034957,
"grad_norm": 0.3816114068031311,
"learning_rate": 0.00035010550859807633,
"loss": 3.2455,
"step": 71550
},
{
"epoch": 20.856443719412724,
"grad_norm": 0.4048319160938263,
"learning_rate": 0.00034993063246866797,
"loss": 3.2574,
"step": 71600
},
{
"epoch": 20.87100908879049,
"grad_norm": 0.41361382603645325,
"learning_rate": 0.00034975575633925966,
"loss": 3.2607,
"step": 71650
},
{
"epoch": 20.88557445816826,
"grad_norm": 0.3653562068939209,
"learning_rate": 0.0003495808802098513,
"loss": 3.2585,
"step": 71700
},
{
"epoch": 20.900139827546028,
"grad_norm": 0.3662196099758148,
"learning_rate": 0.00034940600408044304,
"loss": 3.2576,
"step": 71750
},
{
"epoch": 20.914705196923794,
"grad_norm": 0.4024188220500946,
"learning_rate": 0.0003492311279510347,
"loss": 3.2611,
"step": 71800
},
{
"epoch": 20.92927056630156,
"grad_norm": 0.4076612889766693,
"learning_rate": 0.0003490562518216263,
"loss": 3.2563,
"step": 71850
},
{
"epoch": 20.943835935679328,
"grad_norm": 0.3958513140678406,
"learning_rate": 0.000348881375692218,
"loss": 3.2524,
"step": 71900
},
{
"epoch": 20.958401305057095,
"grad_norm": 0.40390610694885254,
"learning_rate": 0.00034870649956280964,
"loss": 3.2448,
"step": 71950
},
{
"epoch": 20.972966674434865,
"grad_norm": 0.41278237104415894,
"learning_rate": 0.0003485316234334013,
"loss": 3.259,
"step": 72000
},
{
"epoch": 20.972966674434865,
"eval_accuracy": 0.3740693037935618,
"eval_loss": 3.5322375297546387,
"eval_runtime": 179.5665,
"eval_samples_per_second": 92.695,
"eval_steps_per_second": 5.797,
"step": 72000
},
{
"epoch": 20.98753204381263,
"grad_norm": 0.3775636553764343,
"learning_rate": 0.00034835674730399296,
"loss": 3.262,
"step": 72050
},
{
"epoch": 21.002039151712886,
"grad_norm": 0.4008495509624481,
"learning_rate": 0.00034818187117458465,
"loss": 3.2479,
"step": 72100
},
{
"epoch": 21.016604521090656,
"grad_norm": 0.38264980912208557,
"learning_rate": 0.0003480069950451763,
"loss": 3.1549,
"step": 72150
},
{
"epoch": 21.031169890468423,
"grad_norm": 0.40952587127685547,
"learning_rate": 0.0003478321189157679,
"loss": 3.1519,
"step": 72200
},
{
"epoch": 21.04573525984619,
"grad_norm": 0.40433329343795776,
"learning_rate": 0.00034765724278635967,
"loss": 3.159,
"step": 72250
},
{
"epoch": 21.060300629223956,
"grad_norm": 0.40058109164237976,
"learning_rate": 0.0003474823666569513,
"loss": 3.155,
"step": 72300
},
{
"epoch": 21.074865998601723,
"grad_norm": 0.39246848225593567,
"learning_rate": 0.000347307490527543,
"loss": 3.1671,
"step": 72350
},
{
"epoch": 21.089431367979493,
"grad_norm": 0.40194451808929443,
"learning_rate": 0.00034713261439813463,
"loss": 3.1781,
"step": 72400
},
{
"epoch": 21.10399673735726,
"grad_norm": 0.3998311161994934,
"learning_rate": 0.00034695773826872627,
"loss": 3.1801,
"step": 72450
},
{
"epoch": 21.118562106735027,
"grad_norm": 0.3767092525959015,
"learning_rate": 0.00034678286213931796,
"loss": 3.1863,
"step": 72500
},
{
"epoch": 21.133127476112794,
"grad_norm": 0.3758101761341095,
"learning_rate": 0.0003466079860099096,
"loss": 3.1936,
"step": 72550
},
{
"epoch": 21.14769284549056,
"grad_norm": 0.37861061096191406,
"learning_rate": 0.0003464331098805013,
"loss": 3.1772,
"step": 72600
},
{
"epoch": 21.16225821486833,
"grad_norm": 0.39717918634414673,
"learning_rate": 0.0003462582337510929,
"loss": 3.1924,
"step": 72650
},
{
"epoch": 21.176823584246097,
"grad_norm": 0.37728017568588257,
"learning_rate": 0.00034608335762168467,
"loss": 3.1804,
"step": 72700
},
{
"epoch": 21.191388953623864,
"grad_norm": 0.3761899769306183,
"learning_rate": 0.0003459084814922763,
"loss": 3.1765,
"step": 72750
},
{
"epoch": 21.20595432300163,
"grad_norm": 0.3872700333595276,
"learning_rate": 0.00034573360536286794,
"loss": 3.1829,
"step": 72800
},
{
"epoch": 21.220519692379398,
"grad_norm": 0.4359159469604492,
"learning_rate": 0.00034555872923345963,
"loss": 3.184,
"step": 72850
},
{
"epoch": 21.235085061757164,
"grad_norm": 0.42497533559799194,
"learning_rate": 0.00034538385310405127,
"loss": 3.1963,
"step": 72900
},
{
"epoch": 21.249650431134935,
"grad_norm": 0.41894423961639404,
"learning_rate": 0.00034520897697464296,
"loss": 3.1796,
"step": 72950
},
{
"epoch": 21.2642158005127,
"grad_norm": 0.5145085453987122,
"learning_rate": 0.0003450341008452346,
"loss": 3.2027,
"step": 73000
},
{
"epoch": 21.2642158005127,
"eval_accuracy": 0.3728435224108366,
"eval_loss": 3.552633047103882,
"eval_runtime": 179.6553,
"eval_samples_per_second": 92.65,
"eval_steps_per_second": 5.794,
"step": 73000
},
{
"epoch": 21.27878116989047,
"grad_norm": 0.4553317427635193,
"learning_rate": 0.00034485922471582623,
"loss": 3.2022,
"step": 73050
},
{
"epoch": 21.293346539268235,
"grad_norm": 0.3895247280597687,
"learning_rate": 0.0003446843485864179,
"loss": 3.2031,
"step": 73100
},
{
"epoch": 21.307911908646002,
"grad_norm": 0.4008367955684662,
"learning_rate": 0.00034450947245700955,
"loss": 3.1877,
"step": 73150
},
{
"epoch": 21.322477278023772,
"grad_norm": 0.39398273825645447,
"learning_rate": 0.0003443345963276013,
"loss": 3.1962,
"step": 73200
},
{
"epoch": 21.33704264740154,
"grad_norm": 0.42520371079444885,
"learning_rate": 0.00034415972019819294,
"loss": 3.2137,
"step": 73250
},
{
"epoch": 21.351608016779306,
"grad_norm": 0.3961343467235565,
"learning_rate": 0.00034398484406878457,
"loss": 3.2061,
"step": 73300
},
{
"epoch": 21.366173386157072,
"grad_norm": 0.40649378299713135,
"learning_rate": 0.00034380996793937626,
"loss": 3.2057,
"step": 73350
},
{
"epoch": 21.38073875553484,
"grad_norm": 0.40000006556510925,
"learning_rate": 0.0003436350918099679,
"loss": 3.2122,
"step": 73400
},
{
"epoch": 21.39530412491261,
"grad_norm": 0.386966347694397,
"learning_rate": 0.0003434602156805596,
"loss": 3.2013,
"step": 73450
},
{
"epoch": 21.409869494290376,
"grad_norm": 0.3852634131908417,
"learning_rate": 0.0003432853395511512,
"loss": 3.2136,
"step": 73500
},
{
"epoch": 21.424434863668143,
"grad_norm": 0.4111328721046448,
"learning_rate": 0.0003431104634217429,
"loss": 3.2111,
"step": 73550
},
{
"epoch": 21.43900023304591,
"grad_norm": 0.36920034885406494,
"learning_rate": 0.00034293558729233455,
"loss": 3.2099,
"step": 73600
},
{
"epoch": 21.453565602423676,
"grad_norm": 0.3971029818058014,
"learning_rate": 0.0003427607111629262,
"loss": 3.2053,
"step": 73650
},
{
"epoch": 21.468130971801443,
"grad_norm": 0.38907453417778015,
"learning_rate": 0.00034258583503351793,
"loss": 3.2086,
"step": 73700
},
{
"epoch": 21.482696341179214,
"grad_norm": 0.4156521260738373,
"learning_rate": 0.00034241095890410957,
"loss": 3.2186,
"step": 73750
},
{
"epoch": 21.49726171055698,
"grad_norm": 0.37278106808662415,
"learning_rate": 0.00034223608277470126,
"loss": 3.2098,
"step": 73800
},
{
"epoch": 21.511827079934747,
"grad_norm": 0.39464330673217773,
"learning_rate": 0.0003420612066452929,
"loss": 3.2295,
"step": 73850
},
{
"epoch": 21.526392449312514,
"grad_norm": 0.4381686747074127,
"learning_rate": 0.00034188633051588453,
"loss": 3.2146,
"step": 73900
},
{
"epoch": 21.54095781869028,
"grad_norm": 0.3839596211910248,
"learning_rate": 0.0003417114543864762,
"loss": 3.2248,
"step": 73950
},
{
"epoch": 21.55552318806805,
"grad_norm": 0.4019978940486908,
"learning_rate": 0.00034153657825706786,
"loss": 3.2301,
"step": 74000
},
{
"epoch": 21.55552318806805,
"eval_accuracy": 0.3734910961549427,
"eval_loss": 3.544201612472534,
"eval_runtime": 179.6317,
"eval_samples_per_second": 92.662,
"eval_steps_per_second": 5.795,
"step": 74000
},
{
"epoch": 21.570088557445818,
"grad_norm": 0.3982439935207367,
"learning_rate": 0.00034136170212765955,
"loss": 3.2257,
"step": 74050
},
{
"epoch": 21.584653926823584,
"grad_norm": 0.40451157093048096,
"learning_rate": 0.0003411868259982512,
"loss": 3.223,
"step": 74100
},
{
"epoch": 21.59921929620135,
"grad_norm": 0.40273866057395935,
"learning_rate": 0.00034101194986884293,
"loss": 3.2108,
"step": 74150
},
{
"epoch": 21.613784665579118,
"grad_norm": 0.4382562041282654,
"learning_rate": 0.00034083707373943456,
"loss": 3.217,
"step": 74200
},
{
"epoch": 21.62835003495689,
"grad_norm": 0.4187087118625641,
"learning_rate": 0.0003406621976100262,
"loss": 3.225,
"step": 74250
},
{
"epoch": 21.642915404334655,
"grad_norm": 0.42689085006713867,
"learning_rate": 0.0003404873214806179,
"loss": 3.2501,
"step": 74300
},
{
"epoch": 21.65748077371242,
"grad_norm": 0.4216347336769104,
"learning_rate": 0.00034031244535120953,
"loss": 3.2472,
"step": 74350
},
{
"epoch": 21.67204614309019,
"grad_norm": 0.36499667167663574,
"learning_rate": 0.0003401375692218012,
"loss": 3.228,
"step": 74400
},
{
"epoch": 21.686611512467955,
"grad_norm": 0.38649997115135193,
"learning_rate": 0.00033996269309239285,
"loss": 3.2288,
"step": 74450
},
{
"epoch": 21.701176881845722,
"grad_norm": 0.41683053970336914,
"learning_rate": 0.0003397878169629845,
"loss": 3.2331,
"step": 74500
},
{
"epoch": 21.715742251223492,
"grad_norm": 0.3885643780231476,
"learning_rate": 0.0003396129408335762,
"loss": 3.2369,
"step": 74550
},
{
"epoch": 21.73030762060126,
"grad_norm": 0.37198543548583984,
"learning_rate": 0.0003394380647041678,
"loss": 3.2315,
"step": 74600
},
{
"epoch": 21.744872989979026,
"grad_norm": 0.3774627149105072,
"learning_rate": 0.00033926318857475956,
"loss": 3.2361,
"step": 74650
},
{
"epoch": 21.759438359356793,
"grad_norm": 0.3771449327468872,
"learning_rate": 0.0003390883124453512,
"loss": 3.2282,
"step": 74700
},
{
"epoch": 21.77400372873456,
"grad_norm": 0.4178008437156677,
"learning_rate": 0.0003389134363159429,
"loss": 3.2358,
"step": 74750
},
{
"epoch": 21.78856909811233,
"grad_norm": 0.3946438133716583,
"learning_rate": 0.0003387385601865345,
"loss": 3.2344,
"step": 74800
},
{
"epoch": 21.803134467490096,
"grad_norm": 0.39481014013290405,
"learning_rate": 0.00033856368405712616,
"loss": 3.2308,
"step": 74850
},
{
"epoch": 21.817699836867863,
"grad_norm": 0.39908483624458313,
"learning_rate": 0.00033838880792771785,
"loss": 3.2348,
"step": 74900
},
{
"epoch": 21.83226520624563,
"grad_norm": 0.37811824679374695,
"learning_rate": 0.0003382139317983095,
"loss": 3.2501,
"step": 74950
},
{
"epoch": 21.846830575623397,
"grad_norm": 0.4090801775455475,
"learning_rate": 0.0003380390556689012,
"loss": 3.2353,
"step": 75000
},
{
"epoch": 21.846830575623397,
"eval_accuracy": 0.37407236060499,
"eval_loss": 3.5360472202301025,
"eval_runtime": 180.4467,
"eval_samples_per_second": 92.243,
"eval_steps_per_second": 5.769,
"step": 75000
},
{
"epoch": 21.861395945001163,
"grad_norm": 0.37465283274650574,
"learning_rate": 0.0003378641795394928,
"loss": 3.238,
"step": 75050
},
{
"epoch": 21.875961314378934,
"grad_norm": 0.39519503712654114,
"learning_rate": 0.00033768930341008445,
"loss": 3.233,
"step": 75100
},
{
"epoch": 21.8905266837567,
"grad_norm": 0.41474273800849915,
"learning_rate": 0.0003375144272806762,
"loss": 3.2464,
"step": 75150
},
{
"epoch": 21.905092053134467,
"grad_norm": 0.42501193284988403,
"learning_rate": 0.00033733955115126783,
"loss": 3.2312,
"step": 75200
},
{
"epoch": 21.919657422512234,
"grad_norm": 0.3937487006187439,
"learning_rate": 0.0003371646750218595,
"loss": 3.2464,
"step": 75250
},
{
"epoch": 21.93422279189,
"grad_norm": 0.38560178875923157,
"learning_rate": 0.00033698979889245116,
"loss": 3.2429,
"step": 75300
},
{
"epoch": 21.94878816126777,
"grad_norm": 0.40010857582092285,
"learning_rate": 0.0003368149227630428,
"loss": 3.2451,
"step": 75350
},
{
"epoch": 21.963353530645538,
"grad_norm": 0.3742867410182953,
"learning_rate": 0.0003366400466336345,
"loss": 3.2484,
"step": 75400
},
{
"epoch": 21.977918900023305,
"grad_norm": 0.3764432370662689,
"learning_rate": 0.0003364651705042261,
"loss": 3.2551,
"step": 75450
},
{
"epoch": 21.99248426940107,
"grad_norm": 0.39993754029273987,
"learning_rate": 0.0003362902943748178,
"loss": 3.2462,
"step": 75500
},
{
"epoch": 22.00699137730133,
"grad_norm": 0.415519654750824,
"learning_rate": 0.00033611541824540945,
"loss": 3.1819,
"step": 75550
},
{
"epoch": 22.021556746679096,
"grad_norm": 0.4040631651878357,
"learning_rate": 0.0003359405421160012,
"loss": 3.1398,
"step": 75600
},
{
"epoch": 22.036122116056863,
"grad_norm": 0.3827342689037323,
"learning_rate": 0.0003357656659865928,
"loss": 3.1443,
"step": 75650
},
{
"epoch": 22.05068748543463,
"grad_norm": 0.42973458766937256,
"learning_rate": 0.00033559078985718446,
"loss": 3.1524,
"step": 75700
},
{
"epoch": 22.0652528548124,
"grad_norm": 0.3999570906162262,
"learning_rate": 0.00033541591372777615,
"loss": 3.1535,
"step": 75750
},
{
"epoch": 22.079818224190166,
"grad_norm": 0.4020233750343323,
"learning_rate": 0.0003352410375983678,
"loss": 3.1444,
"step": 75800
},
{
"epoch": 22.094383593567933,
"grad_norm": 0.39696866273880005,
"learning_rate": 0.0003350661614689595,
"loss": 3.1719,
"step": 75850
},
{
"epoch": 22.1089489629457,
"grad_norm": 0.3695342242717743,
"learning_rate": 0.0003348912853395511,
"loss": 3.1644,
"step": 75900
},
{
"epoch": 22.123514332323467,
"grad_norm": 0.40176528692245483,
"learning_rate": 0.00033471640921014275,
"loss": 3.1743,
"step": 75950
},
{
"epoch": 22.138079701701233,
"grad_norm": 0.39200496673583984,
"learning_rate": 0.00033454153308073444,
"loss": 3.1793,
"step": 76000
},
{
"epoch": 22.138079701701233,
"eval_accuracy": 0.37305079773960553,
"eval_loss": 3.5540237426757812,
"eval_runtime": 180.2832,
"eval_samples_per_second": 92.327,
"eval_steps_per_second": 5.774,
"step": 76000
},
{
"epoch": 22.152645071079004,
"grad_norm": 0.39109188318252563,
"learning_rate": 0.0003343666569513261,
"loss": 3.1723,
"step": 76050
},
{
"epoch": 22.16721044045677,
"grad_norm": 0.4028526544570923,
"learning_rate": 0.0003341917808219178,
"loss": 3.1775,
"step": 76100
},
{
"epoch": 22.181775809834537,
"grad_norm": 0.4098780155181885,
"learning_rate": 0.00033401690469250946,
"loss": 3.1614,
"step": 76150
},
{
"epoch": 22.196341179212304,
"grad_norm": 0.4064423441886902,
"learning_rate": 0.00033384202856310115,
"loss": 3.1635,
"step": 76200
},
{
"epoch": 22.21090654859007,
"grad_norm": 0.3825031816959381,
"learning_rate": 0.0003336671524336928,
"loss": 3.1861,
"step": 76250
},
{
"epoch": 22.22547191796784,
"grad_norm": 0.41401293873786926,
"learning_rate": 0.0003334922763042844,
"loss": 3.183,
"step": 76300
},
{
"epoch": 22.240037287345608,
"grad_norm": 0.404306560754776,
"learning_rate": 0.0003333174001748761,
"loss": 3.1818,
"step": 76350
},
{
"epoch": 22.254602656723375,
"grad_norm": 0.43731221556663513,
"learning_rate": 0.00033314252404546775,
"loss": 3.1921,
"step": 76400
},
{
"epoch": 22.26916802610114,
"grad_norm": 0.42374587059020996,
"learning_rate": 0.00033296764791605944,
"loss": 3.1832,
"step": 76450
},
{
"epoch": 22.283733395478908,
"grad_norm": 0.4058934450149536,
"learning_rate": 0.0003327927717866511,
"loss": 3.1896,
"step": 76500
},
{
"epoch": 22.29829876485668,
"grad_norm": 0.4124061167240143,
"learning_rate": 0.0003326178956572427,
"loss": 3.1953,
"step": 76550
},
{
"epoch": 22.312864134234445,
"grad_norm": 0.37550002336502075,
"learning_rate": 0.00033244301952783446,
"loss": 3.1904,
"step": 76600
},
{
"epoch": 22.327429503612212,
"grad_norm": 0.38566455245018005,
"learning_rate": 0.0003322681433984261,
"loss": 3.1946,
"step": 76650
},
{
"epoch": 22.34199487298998,
"grad_norm": 0.41883614659309387,
"learning_rate": 0.0003320932672690178,
"loss": 3.1925,
"step": 76700
},
{
"epoch": 22.356560242367745,
"grad_norm": 0.4119344651699066,
"learning_rate": 0.0003319183911396094,
"loss": 3.177,
"step": 76750
},
{
"epoch": 22.371125611745512,
"grad_norm": 0.4048214852809906,
"learning_rate": 0.0003317435150102011,
"loss": 3.2043,
"step": 76800
},
{
"epoch": 22.385690981123282,
"grad_norm": 0.43698737025260925,
"learning_rate": 0.00033156863888079275,
"loss": 3.1895,
"step": 76850
},
{
"epoch": 22.40025635050105,
"grad_norm": 0.40300291776657104,
"learning_rate": 0.0003313937627513844,
"loss": 3.2016,
"step": 76900
},
{
"epoch": 22.414821719878816,
"grad_norm": 0.39613571763038635,
"learning_rate": 0.00033121888662197607,
"loss": 3.1993,
"step": 76950
},
{
"epoch": 22.429387089256583,
"grad_norm": 0.4303956627845764,
"learning_rate": 0.0003310440104925677,
"loss": 3.2118,
"step": 77000
},
{
"epoch": 22.429387089256583,
"eval_accuracy": 0.3733571843004516,
"eval_loss": 3.5473194122314453,
"eval_runtime": 180.1499,
"eval_samples_per_second": 92.395,
"eval_steps_per_second": 5.779,
"step": 77000
},
{
"epoch": 22.44395245863435,
"grad_norm": 0.40987488627433777,
"learning_rate": 0.00033086913436315945,
"loss": 3.2011,
"step": 77050
},
{
"epoch": 22.45851782801212,
"grad_norm": 0.5552119612693787,
"learning_rate": 0.0003306942582337511,
"loss": 3.2065,
"step": 77100
},
{
"epoch": 22.473083197389887,
"grad_norm": 0.4037717878818512,
"learning_rate": 0.0003305193821043427,
"loss": 3.1915,
"step": 77150
},
{
"epoch": 22.487648566767653,
"grad_norm": 0.39489609003067017,
"learning_rate": 0.0003303445059749344,
"loss": 3.2056,
"step": 77200
},
{
"epoch": 22.50221393614542,
"grad_norm": 0.4151366353034973,
"learning_rate": 0.00033016962984552605,
"loss": 3.2079,
"step": 77250
},
{
"epoch": 22.516779305523187,
"grad_norm": 0.3901776373386383,
"learning_rate": 0.00032999475371611774,
"loss": 3.2011,
"step": 77300
},
{
"epoch": 22.531344674900957,
"grad_norm": 0.39502573013305664,
"learning_rate": 0.0003298198775867094,
"loss": 3.2107,
"step": 77350
},
{
"epoch": 22.545910044278724,
"grad_norm": 0.4188506305217743,
"learning_rate": 0.000329645001457301,
"loss": 3.2172,
"step": 77400
},
{
"epoch": 22.56047541365649,
"grad_norm": 0.40013113617897034,
"learning_rate": 0.0003294701253278927,
"loss": 3.2247,
"step": 77450
},
{
"epoch": 22.575040783034257,
"grad_norm": 0.42045021057128906,
"learning_rate": 0.00032929524919848434,
"loss": 3.2161,
"step": 77500
},
{
"epoch": 22.589606152412024,
"grad_norm": 0.44547539949417114,
"learning_rate": 0.0003291203730690761,
"loss": 3.2248,
"step": 77550
},
{
"epoch": 22.60417152178979,
"grad_norm": 0.4550730884075165,
"learning_rate": 0.0003289454969396677,
"loss": 3.219,
"step": 77600
},
{
"epoch": 22.61873689116756,
"grad_norm": 0.40038976073265076,
"learning_rate": 0.0003287706208102594,
"loss": 3.2128,
"step": 77650
},
{
"epoch": 22.633302260545328,
"grad_norm": 0.4054413139820099,
"learning_rate": 0.00032859574468085105,
"loss": 3.2153,
"step": 77700
},
{
"epoch": 22.647867629923095,
"grad_norm": 0.39146846532821655,
"learning_rate": 0.0003284208685514427,
"loss": 3.2212,
"step": 77750
},
{
"epoch": 22.66243299930086,
"grad_norm": 0.402034193277359,
"learning_rate": 0.0003282459924220344,
"loss": 3.2227,
"step": 77800
},
{
"epoch": 22.67699836867863,
"grad_norm": 0.38476046919822693,
"learning_rate": 0.000328071116292626,
"loss": 3.2133,
"step": 77850
},
{
"epoch": 22.6915637380564,
"grad_norm": 0.412173867225647,
"learning_rate": 0.0003278962401632177,
"loss": 3.2176,
"step": 77900
},
{
"epoch": 22.706129107434165,
"grad_norm": 0.3986635208129883,
"learning_rate": 0.00032772136403380934,
"loss": 3.2216,
"step": 77950
},
{
"epoch": 22.720694476811932,
"grad_norm": 0.4033834934234619,
"learning_rate": 0.000327546487904401,
"loss": 3.2325,
"step": 78000
},
{
"epoch": 22.720694476811932,
"eval_accuracy": 0.37393880145950986,
"eval_loss": 3.539693832397461,
"eval_runtime": 180.3316,
"eval_samples_per_second": 92.302,
"eval_steps_per_second": 5.773,
"step": 78000
},
{
"epoch": 22.7352598461897,
"grad_norm": 0.3904629945755005,
"learning_rate": 0.0003273716117749927,
"loss": 3.2313,
"step": 78050
},
{
"epoch": 22.749825215567466,
"grad_norm": 0.4374755322933197,
"learning_rate": 0.00032719673564558435,
"loss": 3.2222,
"step": 78100
},
{
"epoch": 22.764390584945236,
"grad_norm": 0.3812626302242279,
"learning_rate": 0.00032702185951617605,
"loss": 3.2216,
"step": 78150
},
{
"epoch": 22.778955954323003,
"grad_norm": 0.41498416662216187,
"learning_rate": 0.0003268469833867677,
"loss": 3.2297,
"step": 78200
},
{
"epoch": 22.79352132370077,
"grad_norm": 0.37831243872642517,
"learning_rate": 0.00032667210725735937,
"loss": 3.2222,
"step": 78250
},
{
"epoch": 22.808086693078536,
"grad_norm": 0.4045485258102417,
"learning_rate": 0.000326497231127951,
"loss": 3.2285,
"step": 78300
},
{
"epoch": 22.822652062456303,
"grad_norm": 0.4422939717769623,
"learning_rate": 0.00032632235499854264,
"loss": 3.2316,
"step": 78350
},
{
"epoch": 22.83721743183407,
"grad_norm": 0.42193302512168884,
"learning_rate": 0.00032614747886913433,
"loss": 3.2325,
"step": 78400
},
{
"epoch": 22.85178280121184,
"grad_norm": 0.40983685851097107,
"learning_rate": 0.00032597260273972597,
"loss": 3.2343,
"step": 78450
},
{
"epoch": 22.866348170589607,
"grad_norm": 0.40650826692581177,
"learning_rate": 0.0003257977266103177,
"loss": 3.2459,
"step": 78500
},
{
"epoch": 22.880913539967374,
"grad_norm": 0.4187788963317871,
"learning_rate": 0.00032562285048090935,
"loss": 3.2262,
"step": 78550
},
{
"epoch": 22.89547890934514,
"grad_norm": 0.4191761314868927,
"learning_rate": 0.000325447974351501,
"loss": 3.2322,
"step": 78600
},
{
"epoch": 22.910044278722907,
"grad_norm": 0.4233628511428833,
"learning_rate": 0.0003252730982220927,
"loss": 3.2224,
"step": 78650
},
{
"epoch": 22.924609648100677,
"grad_norm": 0.406550794839859,
"learning_rate": 0.0003250982220926843,
"loss": 3.2343,
"step": 78700
},
{
"epoch": 22.939175017478444,
"grad_norm": 0.4227450489997864,
"learning_rate": 0.000324923345963276,
"loss": 3.2362,
"step": 78750
},
{
"epoch": 22.95374038685621,
"grad_norm": 0.4387967884540558,
"learning_rate": 0.00032474846983386764,
"loss": 3.2464,
"step": 78800
},
{
"epoch": 22.968305756233978,
"grad_norm": 0.4019660949707031,
"learning_rate": 0.00032457359370445933,
"loss": 3.2288,
"step": 78850
},
{
"epoch": 22.982871125611744,
"grad_norm": 0.3797237277030945,
"learning_rate": 0.00032439871757505097,
"loss": 3.2282,
"step": 78900
},
{
"epoch": 22.997436494989515,
"grad_norm": 0.4136102497577667,
"learning_rate": 0.0003242238414456426,
"loss": 3.2305,
"step": 78950
},
{
"epoch": 23.01194360288977,
"grad_norm": 0.3978010416030884,
"learning_rate": 0.00032404896531623435,
"loss": 3.1549,
"step": 79000
},
{
"epoch": 23.01194360288977,
"eval_accuracy": 0.37382040880150064,
"eval_loss": 3.54584002494812,
"eval_runtime": 180.1859,
"eval_samples_per_second": 92.377,
"eval_steps_per_second": 5.777,
"step": 79000
},
{
"epoch": 23.026508972267536,
"grad_norm": 0.4399334788322449,
"learning_rate": 0.000323874089186826,
"loss": 3.1371,
"step": 79050
},
{
"epoch": 23.041074341645306,
"grad_norm": 0.4500805139541626,
"learning_rate": 0.0003236992130574177,
"loss": 3.1506,
"step": 79100
},
{
"epoch": 23.055639711023073,
"grad_norm": 0.42291468381881714,
"learning_rate": 0.0003235243369280093,
"loss": 3.1273,
"step": 79150
},
{
"epoch": 23.07020508040084,
"grad_norm": 0.3858034610748291,
"learning_rate": 0.00032334946079860095,
"loss": 3.1407,
"step": 79200
},
{
"epoch": 23.084770449778606,
"grad_norm": 0.4042983651161194,
"learning_rate": 0.00032317458466919264,
"loss": 3.1527,
"step": 79250
},
{
"epoch": 23.099335819156373,
"grad_norm": 0.39408230781555176,
"learning_rate": 0.0003229997085397843,
"loss": 3.1524,
"step": 79300
},
{
"epoch": 23.11390118853414,
"grad_norm": 0.3985508680343628,
"learning_rate": 0.00032282483241037596,
"loss": 3.1637,
"step": 79350
},
{
"epoch": 23.12846655791191,
"grad_norm": 0.4050256907939911,
"learning_rate": 0.0003226499562809676,
"loss": 3.1487,
"step": 79400
},
{
"epoch": 23.143031927289677,
"grad_norm": 0.469816118478775,
"learning_rate": 0.00032247508015155924,
"loss": 3.1614,
"step": 79450
},
{
"epoch": 23.157597296667443,
"grad_norm": 0.4118306636810303,
"learning_rate": 0.000322300204022151,
"loss": 3.1541,
"step": 79500
},
{
"epoch": 23.17216266604521,
"grad_norm": 0.3904615640640259,
"learning_rate": 0.0003221253278927426,
"loss": 3.1667,
"step": 79550
},
{
"epoch": 23.186728035422977,
"grad_norm": 0.40412455797195435,
"learning_rate": 0.0003219504517633343,
"loss": 3.1724,
"step": 79600
},
{
"epoch": 23.201293404800747,
"grad_norm": 0.43276599049568176,
"learning_rate": 0.00032177557563392594,
"loss": 3.1651,
"step": 79650
},
{
"epoch": 23.215858774178514,
"grad_norm": 0.38202792406082153,
"learning_rate": 0.00032160069950451763,
"loss": 3.1693,
"step": 79700
},
{
"epoch": 23.23042414355628,
"grad_norm": 0.4077218472957611,
"learning_rate": 0.00032142582337510927,
"loss": 3.1715,
"step": 79750
},
{
"epoch": 23.244989512934048,
"grad_norm": 0.42455849051475525,
"learning_rate": 0.0003212509472457009,
"loss": 3.1702,
"step": 79800
},
{
"epoch": 23.259554882311814,
"grad_norm": 0.4102165400981903,
"learning_rate": 0.0003210760711162926,
"loss": 3.1808,
"step": 79850
},
{
"epoch": 23.27412025168958,
"grad_norm": 0.4249970316886902,
"learning_rate": 0.00032090119498688423,
"loss": 3.1731,
"step": 79900
},
{
"epoch": 23.28868562106735,
"grad_norm": 0.39110127091407776,
"learning_rate": 0.0003207263188574759,
"loss": 3.1802,
"step": 79950
},
{
"epoch": 23.303250990445118,
"grad_norm": 0.43393391370773315,
"learning_rate": 0.0003205514427280676,
"loss": 3.1875,
"step": 80000
},
{
"epoch": 23.303250990445118,
"eval_accuracy": 0.3732344415646407,
"eval_loss": 3.552424907684326,
"eval_runtime": 180.3512,
"eval_samples_per_second": 92.292,
"eval_steps_per_second": 5.772,
"step": 80000
},
{
"epoch": 23.317816359822885,
"grad_norm": 0.3961530029773712,
"learning_rate": 0.00032037656659865925,
"loss": 3.128,
"step": 80050
},
{
"epoch": 23.33238172920065,
"grad_norm": 0.43098706007003784,
"learning_rate": 0.00032020169046925094,
"loss": 3.1388,
"step": 80100
},
{
"epoch": 23.34694709857842,
"grad_norm": 0.4158017337322235,
"learning_rate": 0.0003200268143398426,
"loss": 3.1452,
"step": 80150
},
{
"epoch": 23.36151246795619,
"grad_norm": 0.40117818117141724,
"learning_rate": 0.00031985193821043427,
"loss": 3.1548,
"step": 80200
},
{
"epoch": 23.376077837333956,
"grad_norm": 0.4437515139579773,
"learning_rate": 0.0003196770620810259,
"loss": 3.1432,
"step": 80250
},
{
"epoch": 23.390643206711722,
"grad_norm": 0.41832467913627625,
"learning_rate": 0.0003195021859516176,
"loss": 3.1679,
"step": 80300
},
{
"epoch": 23.40520857608949,
"grad_norm": 0.41394323110580444,
"learning_rate": 0.00031932730982220923,
"loss": 3.1668,
"step": 80350
},
{
"epoch": 23.419773945467256,
"grad_norm": 0.4065753221511841,
"learning_rate": 0.00031915243369280087,
"loss": 3.1507,
"step": 80400
},
{
"epoch": 23.434339314845026,
"grad_norm": 0.41089168190956116,
"learning_rate": 0.00031897755756339256,
"loss": 3.1685,
"step": 80450
},
{
"epoch": 23.448904684222793,
"grad_norm": 0.4270135164260864,
"learning_rate": 0.0003188026814339842,
"loss": 3.1697,
"step": 80500
},
{
"epoch": 23.46347005360056,
"grad_norm": 0.4168857932090759,
"learning_rate": 0.00031862780530457594,
"loss": 3.1673,
"step": 80550
},
{
"epoch": 23.478035422978326,
"grad_norm": 0.39209380745887756,
"learning_rate": 0.0003184529291751676,
"loss": 3.1786,
"step": 80600
},
{
"epoch": 23.492600792356093,
"grad_norm": 0.45542216300964355,
"learning_rate": 0.0003182780530457592,
"loss": 3.1646,
"step": 80650
},
{
"epoch": 23.50716616173386,
"grad_norm": 0.45212727785110474,
"learning_rate": 0.0003181031769163509,
"loss": 3.1683,
"step": 80700
},
{
"epoch": 23.52173153111163,
"grad_norm": 0.4164752960205078,
"learning_rate": 0.00031792830078694254,
"loss": 3.1663,
"step": 80750
},
{
"epoch": 23.536296900489397,
"grad_norm": 0.4018906354904175,
"learning_rate": 0.0003177534246575342,
"loss": 3.1639,
"step": 80800
},
{
"epoch": 23.550862269867164,
"grad_norm": 0.41436582803726196,
"learning_rate": 0.00031757854852812586,
"loss": 3.1684,
"step": 80850
},
{
"epoch": 23.56542763924493,
"grad_norm": 0.4052976369857788,
"learning_rate": 0.00031740367239871755,
"loss": 3.1658,
"step": 80900
},
{
"epoch": 23.579993008622697,
"grad_norm": 0.39007437229156494,
"learning_rate": 0.0003172287962693092,
"loss": 3.167,
"step": 80950
},
{
"epoch": 23.594558378000468,
"grad_norm": 0.41448211669921875,
"learning_rate": 0.0003170539201399008,
"loss": 3.1815,
"step": 81000
},
{
"epoch": 23.594558378000468,
"eval_accuracy": 0.3736459354107502,
"eval_loss": 3.551520347595215,
"eval_runtime": 180.7918,
"eval_samples_per_second": 92.067,
"eval_steps_per_second": 5.758,
"step": 81000
},
{
"epoch": 23.609123747378234,
"grad_norm": 0.42460495233535767,
"learning_rate": 0.00031687904401049257,
"loss": 3.1911,
"step": 81050
},
{
"epoch": 23.623689116756,
"grad_norm": 0.44785287976264954,
"learning_rate": 0.0003167041678810842,
"loss": 3.1938,
"step": 81100
},
{
"epoch": 23.638254486133768,
"grad_norm": 0.4176446497440338,
"learning_rate": 0.0003165292917516759,
"loss": 3.188,
"step": 81150
},
{
"epoch": 23.652819855511535,
"grad_norm": 0.40342187881469727,
"learning_rate": 0.00031635441562226753,
"loss": 3.1827,
"step": 81200
},
{
"epoch": 23.667385224889305,
"grad_norm": 0.4379565119743347,
"learning_rate": 0.00031617953949285917,
"loss": 3.1919,
"step": 81250
},
{
"epoch": 23.68195059426707,
"grad_norm": 0.4336557388305664,
"learning_rate": 0.00031600466336345086,
"loss": 3.1789,
"step": 81300
},
{
"epoch": 23.69651596364484,
"grad_norm": 0.4131733775138855,
"learning_rate": 0.0003158297872340425,
"loss": 3.1905,
"step": 81350
},
{
"epoch": 23.711081333022605,
"grad_norm": 0.4182552397251129,
"learning_rate": 0.0003156549111046342,
"loss": 3.1819,
"step": 81400
},
{
"epoch": 23.725646702400372,
"grad_norm": 0.42562443017959595,
"learning_rate": 0.0003154800349752258,
"loss": 3.1956,
"step": 81450
},
{
"epoch": 23.74021207177814,
"grad_norm": 0.40080058574676514,
"learning_rate": 0.00031530515884581757,
"loss": 3.2053,
"step": 81500
},
{
"epoch": 23.75477744115591,
"grad_norm": 0.3980943560600281,
"learning_rate": 0.0003151302827164092,
"loss": 3.2019,
"step": 81550
},
{
"epoch": 23.769342810533676,
"grad_norm": 0.41131657361984253,
"learning_rate": 0.00031495540658700084,
"loss": 3.1842,
"step": 81600
},
{
"epoch": 23.783908179911442,
"grad_norm": 0.4096752405166626,
"learning_rate": 0.00031478053045759253,
"loss": 3.1931,
"step": 81650
},
{
"epoch": 23.79847354928921,
"grad_norm": 0.4075625538825989,
"learning_rate": 0.00031460565432818417,
"loss": 3.1858,
"step": 81700
},
{
"epoch": 23.813038918666976,
"grad_norm": 0.3922245502471924,
"learning_rate": 0.00031443077819877586,
"loss": 3.1979,
"step": 81750
},
{
"epoch": 23.827604288044746,
"grad_norm": 0.4519116282463074,
"learning_rate": 0.0003142559020693675,
"loss": 3.2028,
"step": 81800
},
{
"epoch": 23.842169657422513,
"grad_norm": 0.3990562856197357,
"learning_rate": 0.00031408102593995913,
"loss": 3.1981,
"step": 81850
},
{
"epoch": 23.85673502680028,
"grad_norm": 0.4091752767562866,
"learning_rate": 0.0003139061498105508,
"loss": 3.2087,
"step": 81900
},
{
"epoch": 23.871300396178047,
"grad_norm": 0.43344610929489136,
"learning_rate": 0.00031373127368114245,
"loss": 3.2012,
"step": 81950
},
{
"epoch": 23.885865765555813,
"grad_norm": 0.42658382654190063,
"learning_rate": 0.0003135563975517342,
"loss": 3.2017,
"step": 82000
},
{
"epoch": 23.885865765555813,
"eval_accuracy": 0.3737183583276655,
"eval_loss": 3.5459020137786865,
"eval_runtime": 180.2368,
"eval_samples_per_second": 92.351,
"eval_steps_per_second": 5.776,
"step": 82000
},
{
"epoch": 23.900431134933584,
"grad_norm": 0.41615602374076843,
"learning_rate": 0.00031338152142232584,
"loss": 3.2067,
"step": 82050
},
{
"epoch": 23.91499650431135,
"grad_norm": 0.4384036958217621,
"learning_rate": 0.00031320664529291747,
"loss": 3.2036,
"step": 82100
},
{
"epoch": 23.929561873689117,
"grad_norm": 0.40373918414115906,
"learning_rate": 0.00031303176916350916,
"loss": 3.2073,
"step": 82150
},
{
"epoch": 23.944127243066884,
"grad_norm": 0.4283479154109955,
"learning_rate": 0.0003128568930341008,
"loss": 3.2195,
"step": 82200
},
{
"epoch": 23.95869261244465,
"grad_norm": 0.42980676889419556,
"learning_rate": 0.0003126820169046925,
"loss": 3.2104,
"step": 82250
},
{
"epoch": 23.973257981822417,
"grad_norm": 0.4395185708999634,
"learning_rate": 0.0003125071407752841,
"loss": 3.2022,
"step": 82300
},
{
"epoch": 23.987823351200188,
"grad_norm": 0.4349308907985687,
"learning_rate": 0.0003123322646458758,
"loss": 3.2058,
"step": 82350
},
{
"epoch": 24.002621766487998,
"grad_norm": 0.37979841232299805,
"learning_rate": 0.00031215738851646745,
"loss": 3.2653,
"step": 82400
},
{
"epoch": 24.017187135865765,
"grad_norm": 0.39357176423072815,
"learning_rate": 0.0003119825123870591,
"loss": 3.1359,
"step": 82450
},
{
"epoch": 24.03175250524353,
"grad_norm": 0.40956372022628784,
"learning_rate": 0.00031180763625765083,
"loss": 3.1277,
"step": 82500
},
{
"epoch": 24.0463178746213,
"grad_norm": 0.3902159631252289,
"learning_rate": 0.00031163276012824247,
"loss": 3.1391,
"step": 82550
},
{
"epoch": 24.06088324399907,
"grad_norm": 0.43166887760162354,
"learning_rate": 0.00031145788399883416,
"loss": 3.1339,
"step": 82600
},
{
"epoch": 24.075448613376835,
"grad_norm": 0.40408581495285034,
"learning_rate": 0.0003112830078694258,
"loss": 3.1298,
"step": 82650
},
{
"epoch": 24.090013982754602,
"grad_norm": 0.44965606927871704,
"learning_rate": 0.00031110813174001743,
"loss": 3.1439,
"step": 82700
},
{
"epoch": 24.10457935213237,
"grad_norm": 0.43857842683792114,
"learning_rate": 0.0003109332556106091,
"loss": 3.1466,
"step": 82750
},
{
"epoch": 24.11914472151014,
"grad_norm": 0.42380571365356445,
"learning_rate": 0.00031075837948120076,
"loss": 3.1439,
"step": 82800
},
{
"epoch": 24.133710090887906,
"grad_norm": 0.4155460298061371,
"learning_rate": 0.00031058350335179245,
"loss": 3.1448,
"step": 82850
},
{
"epoch": 24.148275460265673,
"grad_norm": 0.42540451884269714,
"learning_rate": 0.0003104086272223841,
"loss": 3.1484,
"step": 82900
},
{
"epoch": 24.16284082964344,
"grad_norm": 0.4119025468826294,
"learning_rate": 0.00031023375109297583,
"loss": 3.1508,
"step": 82950
},
{
"epoch": 24.177406199021206,
"grad_norm": 0.4322400391101837,
"learning_rate": 0.00031005887496356746,
"loss": 3.1608,
"step": 83000
},
{
"epoch": 24.177406199021206,
"eval_accuracy": 0.3733360217597946,
"eval_loss": 3.553809642791748,
"eval_runtime": 180.1586,
"eval_samples_per_second": 92.391,
"eval_steps_per_second": 5.778,
"step": 83000
},
{
"epoch": 24.191971568398973,
"grad_norm": 0.416153222322464,
"learning_rate": 0.0003098839988341591,
"loss": 3.1627,
"step": 83050
},
{
"epoch": 24.206536937776743,
"grad_norm": 0.4277220070362091,
"learning_rate": 0.0003097091227047508,
"loss": 3.1689,
"step": 83100
},
{
"epoch": 24.22110230715451,
"grad_norm": 0.3882681727409363,
"learning_rate": 0.00030953424657534243,
"loss": 3.168,
"step": 83150
},
{
"epoch": 24.235667676532277,
"grad_norm": 0.41531258821487427,
"learning_rate": 0.0003093593704459341,
"loss": 3.168,
"step": 83200
},
{
"epoch": 24.250233045910043,
"grad_norm": 0.44125837087631226,
"learning_rate": 0.00030918449431652575,
"loss": 3.1814,
"step": 83250
},
{
"epoch": 24.26479841528781,
"grad_norm": 0.4468991160392761,
"learning_rate": 0.0003090096181871174,
"loss": 3.1816,
"step": 83300
},
{
"epoch": 24.27936378466558,
"grad_norm": 0.40799108147621155,
"learning_rate": 0.0003088347420577091,
"loss": 3.1606,
"step": 83350
},
{
"epoch": 24.293929154043347,
"grad_norm": 0.4418475329875946,
"learning_rate": 0.0003086598659283007,
"loss": 3.1716,
"step": 83400
},
{
"epoch": 24.308494523421114,
"grad_norm": 0.435165137052536,
"learning_rate": 0.00030848498979889246,
"loss": 3.1786,
"step": 83450
},
{
"epoch": 24.32305989279888,
"grad_norm": 0.40027478337287903,
"learning_rate": 0.0003083101136694841,
"loss": 3.1831,
"step": 83500
},
{
"epoch": 24.337625262176648,
"grad_norm": 0.4050601124763489,
"learning_rate": 0.0003081352375400758,
"loss": 3.169,
"step": 83550
},
{
"epoch": 24.352190631554418,
"grad_norm": 0.4074668288230896,
"learning_rate": 0.0003079603614106674,
"loss": 3.1695,
"step": 83600
},
{
"epoch": 24.366756000932185,
"grad_norm": 0.4010626971721649,
"learning_rate": 0.00030778548528125906,
"loss": 3.1853,
"step": 83650
},
{
"epoch": 24.38132137030995,
"grad_norm": 0.44930556416511536,
"learning_rate": 0.00030761060915185075,
"loss": 3.1887,
"step": 83700
},
{
"epoch": 24.395886739687718,
"grad_norm": 0.42292553186416626,
"learning_rate": 0.0003074357330224424,
"loss": 3.1904,
"step": 83750
},
{
"epoch": 24.410452109065485,
"grad_norm": 0.4097443222999573,
"learning_rate": 0.0003072608568930341,
"loss": 3.1923,
"step": 83800
},
{
"epoch": 24.42501747844325,
"grad_norm": 0.3933105766773224,
"learning_rate": 0.0003070859807636257,
"loss": 3.197,
"step": 83850
},
{
"epoch": 24.439582847821022,
"grad_norm": 0.3940275311470032,
"learning_rate": 0.00030691110463421735,
"loss": 3.1935,
"step": 83900
},
{
"epoch": 24.45414821719879,
"grad_norm": 0.4267207086086273,
"learning_rate": 0.0003067362285048091,
"loss": 3.1905,
"step": 83950
},
{
"epoch": 24.468713586576555,
"grad_norm": 0.4132387936115265,
"learning_rate": 0.00030656135237540073,
"loss": 3.1977,
"step": 84000
},
{
"epoch": 24.468713586576555,
"eval_accuracy": 0.37396490192632026,
"eval_loss": 3.546501398086548,
"eval_runtime": 180.189,
"eval_samples_per_second": 92.375,
"eval_steps_per_second": 5.777,
"step": 84000
},
{
"epoch": 24.483278955954322,
"grad_norm": 0.4435485303401947,
"learning_rate": 0.0003063864762459924,
"loss": 3.1924,
"step": 84050
},
{
"epoch": 24.49784432533209,
"grad_norm": 0.402010440826416,
"learning_rate": 0.00030621160011658406,
"loss": 3.1967,
"step": 84100
},
{
"epoch": 24.51240969470986,
"grad_norm": 0.42332908511161804,
"learning_rate": 0.0003060367239871757,
"loss": 3.2062,
"step": 84150
},
{
"epoch": 24.526975064087626,
"grad_norm": 0.40317755937576294,
"learning_rate": 0.0003058618478577674,
"loss": 3.1848,
"step": 84200
},
{
"epoch": 24.541540433465393,
"grad_norm": 0.4025932848453522,
"learning_rate": 0.000305686971728359,
"loss": 3.2035,
"step": 84250
},
{
"epoch": 24.55610580284316,
"grad_norm": 0.41158631443977356,
"learning_rate": 0.0003055120955989507,
"loss": 3.2026,
"step": 84300
},
{
"epoch": 24.570671172220926,
"grad_norm": 0.4377208650112152,
"learning_rate": 0.00030533721946954235,
"loss": 3.2001,
"step": 84350
},
{
"epoch": 24.585236541598697,
"grad_norm": 0.3986060917377472,
"learning_rate": 0.0003051623433401341,
"loss": 3.195,
"step": 84400
},
{
"epoch": 24.599801910976463,
"grad_norm": 0.4415377080440521,
"learning_rate": 0.00030498746721072573,
"loss": 3.1899,
"step": 84450
},
{
"epoch": 24.61436728035423,
"grad_norm": 0.43405723571777344,
"learning_rate": 0.00030481259108131736,
"loss": 3.2059,
"step": 84500
},
{
"epoch": 24.628932649731997,
"grad_norm": 0.42428821325302124,
"learning_rate": 0.00030463771495190905,
"loss": 3.1971,
"step": 84550
},
{
"epoch": 24.643498019109764,
"grad_norm": 0.40802910923957825,
"learning_rate": 0.0003044628388225007,
"loss": 3.1907,
"step": 84600
},
{
"epoch": 24.65806338848753,
"grad_norm": 0.4088672995567322,
"learning_rate": 0.0003042879626930924,
"loss": 3.2058,
"step": 84650
},
{
"epoch": 24.6726287578653,
"grad_norm": 0.44554194808006287,
"learning_rate": 0.000304113086563684,
"loss": 3.2052,
"step": 84700
},
{
"epoch": 24.687194127243067,
"grad_norm": 0.40504729747772217,
"learning_rate": 0.00030393821043427565,
"loss": 3.2203,
"step": 84750
},
{
"epoch": 24.701759496620834,
"grad_norm": 0.40326762199401855,
"learning_rate": 0.00030376333430486734,
"loss": 3.2086,
"step": 84800
},
{
"epoch": 24.7163248659986,
"grad_norm": 0.4128248989582062,
"learning_rate": 0.000303588458175459,
"loss": 3.1938,
"step": 84850
},
{
"epoch": 24.730890235376368,
"grad_norm": 0.4104618430137634,
"learning_rate": 0.0003034135820460507,
"loss": 3.2093,
"step": 84900
},
{
"epoch": 24.745455604754138,
"grad_norm": 0.43903848528862,
"learning_rate": 0.00030323870591664236,
"loss": 3.2078,
"step": 84950
},
{
"epoch": 24.760020974131905,
"grad_norm": 0.4496263861656189,
"learning_rate": 0.00030306382978723405,
"loss": 3.2052,
"step": 85000
},
{
"epoch": 24.760020974131905,
"eval_accuracy": 0.3744878518198903,
"eval_loss": 3.5395586490631104,
"eval_runtime": 180.1456,
"eval_samples_per_second": 92.397,
"eval_steps_per_second": 5.779,
"step": 85000
},
{
"epoch": 24.77458634350967,
"grad_norm": 0.39450597763061523,
"learning_rate": 0.0003028889536578257,
"loss": 3.1969,
"step": 85050
},
{
"epoch": 24.78915171288744,
"grad_norm": 0.4104599058628082,
"learning_rate": 0.0003027140775284173,
"loss": 3.2143,
"step": 85100
},
{
"epoch": 24.803717082265205,
"grad_norm": 0.435557097196579,
"learning_rate": 0.000302539201399009,
"loss": 3.2116,
"step": 85150
},
{
"epoch": 24.818282451642972,
"grad_norm": 0.3877793848514557,
"learning_rate": 0.00030236432526960065,
"loss": 3.204,
"step": 85200
},
{
"epoch": 24.832847821020742,
"grad_norm": 0.41204720735549927,
"learning_rate": 0.00030218944914019234,
"loss": 3.2179,
"step": 85250
},
{
"epoch": 24.84741319039851,
"grad_norm": 0.4013805389404297,
"learning_rate": 0.000302014573010784,
"loss": 3.2141,
"step": 85300
},
{
"epoch": 24.861978559776276,
"grad_norm": 0.42889299988746643,
"learning_rate": 0.0003018396968813756,
"loss": 3.212,
"step": 85350
},
{
"epoch": 24.876543929154042,
"grad_norm": 0.403367280960083,
"learning_rate": 0.00030166482075196736,
"loss": 3.2101,
"step": 85400
},
{
"epoch": 24.89110929853181,
"grad_norm": 0.4000912010669708,
"learning_rate": 0.000301489944622559,
"loss": 3.2086,
"step": 85450
},
{
"epoch": 24.90567466790958,
"grad_norm": 0.43617841601371765,
"learning_rate": 0.0003013150684931507,
"loss": 3.2146,
"step": 85500
},
{
"epoch": 24.920240037287346,
"grad_norm": 0.39104828238487244,
"learning_rate": 0.0003011401923637423,
"loss": 3.2112,
"step": 85550
},
{
"epoch": 24.934805406665113,
"grad_norm": 0.3803448975086212,
"learning_rate": 0.000300965316234334,
"loss": 3.2246,
"step": 85600
},
{
"epoch": 24.94937077604288,
"grad_norm": 0.40348923206329346,
"learning_rate": 0.00030079044010492565,
"loss": 3.2231,
"step": 85650
},
{
"epoch": 24.963936145420647,
"grad_norm": 0.40725383162498474,
"learning_rate": 0.0003006155639755173,
"loss": 3.2106,
"step": 85700
},
{
"epoch": 24.978501514798417,
"grad_norm": 1.2001488208770752,
"learning_rate": 0.00030044068784610897,
"loss": 3.2361,
"step": 85750
},
{
"epoch": 24.993066884176184,
"grad_norm": 0.4001549780368805,
"learning_rate": 0.0003002658117167006,
"loss": 3.226,
"step": 85800
},
{
"epoch": 25.007573992076438,
"grad_norm": 0.44032537937164307,
"learning_rate": 0.00030009093558729235,
"loss": 3.1571,
"step": 85850
},
{
"epoch": 25.022139361454208,
"grad_norm": 0.3807680606842041,
"learning_rate": 0.000299916059457884,
"loss": 3.1191,
"step": 85900
},
{
"epoch": 25.036704730831975,
"grad_norm": 0.412787526845932,
"learning_rate": 0.0002997411833284756,
"loss": 3.1055,
"step": 85950
},
{
"epoch": 25.05127010020974,
"grad_norm": 0.4120820462703705,
"learning_rate": 0.0002995663071990673,
"loss": 3.122,
"step": 86000
},
{
"epoch": 25.05127010020974,
"eval_accuracy": 0.37377279308502226,
"eval_loss": 3.5503273010253906,
"eval_runtime": 180.5607,
"eval_samples_per_second": 92.185,
"eval_steps_per_second": 5.765,
"step": 86000
},
{
"epoch": 25.06583546958751,
"grad_norm": 0.4001462757587433,
"learning_rate": 0.00029939143106965895,
"loss": 3.123,
"step": 86050
},
{
"epoch": 25.080400838965275,
"grad_norm": 0.38697749376296997,
"learning_rate": 0.00029921655494025064,
"loss": 3.1187,
"step": 86100
},
{
"epoch": 25.094966208343042,
"grad_norm": 0.4431687891483307,
"learning_rate": 0.0002990416788108423,
"loss": 3.1332,
"step": 86150
},
{
"epoch": 25.109531577720812,
"grad_norm": 0.39642491936683655,
"learning_rate": 0.00029886680268143397,
"loss": 3.1289,
"step": 86200
},
{
"epoch": 25.12409694709858,
"grad_norm": 0.4230706989765167,
"learning_rate": 0.0002986919265520256,
"loss": 3.1309,
"step": 86250
},
{
"epoch": 25.138662316476346,
"grad_norm": 0.41169217228889465,
"learning_rate": 0.0002985170504226173,
"loss": 3.1399,
"step": 86300
},
{
"epoch": 25.153227685854112,
"grad_norm": 0.42555660009384155,
"learning_rate": 0.00029834217429320893,
"loss": 3.1517,
"step": 86350
},
{
"epoch": 25.16779305523188,
"grad_norm": 0.4415445029735565,
"learning_rate": 0.0002981672981638006,
"loss": 3.138,
"step": 86400
},
{
"epoch": 25.18235842460965,
"grad_norm": 0.3974837064743042,
"learning_rate": 0.00029799242203439226,
"loss": 3.154,
"step": 86450
},
{
"epoch": 25.196923793987416,
"grad_norm": 0.3946952521800995,
"learning_rate": 0.00029781754590498395,
"loss": 3.1533,
"step": 86500
},
{
"epoch": 25.211489163365183,
"grad_norm": 0.40269792079925537,
"learning_rate": 0.00029764266977557564,
"loss": 3.1555,
"step": 86550
},
{
"epoch": 25.22605453274295,
"grad_norm": 0.42831283807754517,
"learning_rate": 0.0002974677936461673,
"loss": 3.16,
"step": 86600
},
{
"epoch": 25.240619902120716,
"grad_norm": 0.4184077978134155,
"learning_rate": 0.0002972929175167589,
"loss": 3.1594,
"step": 86650
},
{
"epoch": 25.255185271498487,
"grad_norm": 0.40978071093559265,
"learning_rate": 0.0002971180413873506,
"loss": 3.1551,
"step": 86700
},
{
"epoch": 25.269750640876254,
"grad_norm": 0.4196624755859375,
"learning_rate": 0.00029694316525794224,
"loss": 3.164,
"step": 86750
},
{
"epoch": 25.28431601025402,
"grad_norm": 0.4315820336341858,
"learning_rate": 0.00029676828912853393,
"loss": 3.1674,
"step": 86800
},
{
"epoch": 25.298881379631787,
"grad_norm": 0.3970293700695038,
"learning_rate": 0.0002965934129991256,
"loss": 3.173,
"step": 86850
},
{
"epoch": 25.313446749009554,
"grad_norm": 0.4287879168987274,
"learning_rate": 0.00029641853686971726,
"loss": 3.1667,
"step": 86900
},
{
"epoch": 25.32801211838732,
"grad_norm": 0.4140596091747284,
"learning_rate": 0.0002962436607403089,
"loss": 3.1729,
"step": 86950
},
{
"epoch": 25.34257748776509,
"grad_norm": 0.43017467856407166,
"learning_rate": 0.0002960687846109006,
"loss": 3.1656,
"step": 87000
},
{
"epoch": 25.34257748776509,
"eval_accuracy": 0.3738958885298442,
"eval_loss": 3.5481033325195312,
"eval_runtime": 180.694,
"eval_samples_per_second": 92.117,
"eval_steps_per_second": 5.761,
"step": 87000
},
{
"epoch": 25.357142857142858,
"grad_norm": 0.4702874422073364,
"learning_rate": 0.00029589390848149227,
"loss": 3.1753,
"step": 87050
},
{
"epoch": 25.371708226520624,
"grad_norm": 0.4354191720485687,
"learning_rate": 0.0002957190323520839,
"loss": 3.1733,
"step": 87100
},
{
"epoch": 25.38627359589839,
"grad_norm": 0.42500242590904236,
"learning_rate": 0.0002955441562226756,
"loss": 3.1809,
"step": 87150
},
{
"epoch": 25.400838965276158,
"grad_norm": 0.41088682413101196,
"learning_rate": 0.00029536928009326723,
"loss": 3.1821,
"step": 87200
},
{
"epoch": 25.41540433465393,
"grad_norm": 0.4109274446964264,
"learning_rate": 0.00029519440396385887,
"loss": 3.1791,
"step": 87250
},
{
"epoch": 25.429969704031695,
"grad_norm": 0.41495808959007263,
"learning_rate": 0.00029501952783445056,
"loss": 3.1909,
"step": 87300
},
{
"epoch": 25.44453507340946,
"grad_norm": 0.4120579659938812,
"learning_rate": 0.00029484465170504225,
"loss": 3.1717,
"step": 87350
},
{
"epoch": 25.45910044278723,
"grad_norm": 0.4141676425933838,
"learning_rate": 0.0002946697755756339,
"loss": 3.1782,
"step": 87400
},
{
"epoch": 25.473665812164995,
"grad_norm": 0.47025105357170105,
"learning_rate": 0.0002944948994462256,
"loss": 3.1839,
"step": 87450
},
{
"epoch": 25.488231181542766,
"grad_norm": 0.40863239765167236,
"learning_rate": 0.00029432002331681727,
"loss": 3.1862,
"step": 87500
},
{
"epoch": 25.502796550920532,
"grad_norm": 0.4443865120410919,
"learning_rate": 0.0002941451471874089,
"loss": 3.1814,
"step": 87550
},
{
"epoch": 25.5173619202983,
"grad_norm": 0.416560560464859,
"learning_rate": 0.00029397027105800054,
"loss": 3.1751,
"step": 87600
},
{
"epoch": 25.531927289676066,
"grad_norm": 0.4450867474079132,
"learning_rate": 0.00029379539492859223,
"loss": 3.1802,
"step": 87650
},
{
"epoch": 25.546492659053833,
"grad_norm": 0.39942240715026855,
"learning_rate": 0.00029362051879918387,
"loss": 3.176,
"step": 87700
},
{
"epoch": 25.5610580284316,
"grad_norm": 0.4375326931476593,
"learning_rate": 0.00029344564266977556,
"loss": 3.1886,
"step": 87750
},
{
"epoch": 25.57562339780937,
"grad_norm": 0.4080294668674469,
"learning_rate": 0.00029327076654036725,
"loss": 3.194,
"step": 87800
},
{
"epoch": 25.590188767187136,
"grad_norm": 0.438039630651474,
"learning_rate": 0.0002930958904109589,
"loss": 3.1934,
"step": 87850
},
{
"epoch": 25.604754136564903,
"grad_norm": 0.3943357765674591,
"learning_rate": 0.0002929210142815505,
"loss": 3.1864,
"step": 87900
},
{
"epoch": 25.61931950594267,
"grad_norm": 0.42448991537094116,
"learning_rate": 0.0002927461381521422,
"loss": 3.1733,
"step": 87950
},
{
"epoch": 25.633884875320437,
"grad_norm": 0.4249454140663147,
"learning_rate": 0.0002925712620227339,
"loss": 3.1894,
"step": 88000
},
{
"epoch": 25.633884875320437,
"eval_accuracy": 0.3742290809755226,
"eval_loss": 3.5432162284851074,
"eval_runtime": 180.1053,
"eval_samples_per_second": 92.418,
"eval_steps_per_second": 5.78,
"step": 88000
},
{
"epoch": 25.648450244698207,
"grad_norm": 0.4210646152496338,
"learning_rate": 0.00029239638589332554,
"loss": 3.1882,
"step": 88050
},
{
"epoch": 25.663015614075974,
"grad_norm": 0.423607736825943,
"learning_rate": 0.0002922215097639172,
"loss": 3.1924,
"step": 88100
},
{
"epoch": 25.67758098345374,
"grad_norm": 0.40322986245155334,
"learning_rate": 0.00029204663363450886,
"loss": 3.1921,
"step": 88150
},
{
"epoch": 25.692146352831507,
"grad_norm": 0.4051462709903717,
"learning_rate": 0.0002918717575051005,
"loss": 3.1993,
"step": 88200
},
{
"epoch": 25.706711722209274,
"grad_norm": 0.46788325905799866,
"learning_rate": 0.0002916968813756922,
"loss": 3.196,
"step": 88250
},
{
"epoch": 25.721277091587044,
"grad_norm": 0.42460089921951294,
"learning_rate": 0.0002915220052462839,
"loss": 3.1927,
"step": 88300
},
{
"epoch": 25.73584246096481,
"grad_norm": 0.40352416038513184,
"learning_rate": 0.0002913471291168755,
"loss": 3.2058,
"step": 88350
},
{
"epoch": 25.750407830342578,
"grad_norm": 0.4355858266353607,
"learning_rate": 0.00029117225298746715,
"loss": 3.1969,
"step": 88400
},
{
"epoch": 25.764973199720345,
"grad_norm": 0.4477996528148651,
"learning_rate": 0.00029099737685805884,
"loss": 3.2066,
"step": 88450
},
{
"epoch": 25.77953856909811,
"grad_norm": 0.41174259781837463,
"learning_rate": 0.00029082250072865053,
"loss": 3.1981,
"step": 88500
},
{
"epoch": 25.794103938475878,
"grad_norm": 0.41174444556236267,
"learning_rate": 0.00029064762459924217,
"loss": 3.2023,
"step": 88550
},
{
"epoch": 25.80866930785365,
"grad_norm": 0.426297664642334,
"learning_rate": 0.00029047274846983386,
"loss": 3.2076,
"step": 88600
},
{
"epoch": 25.823234677231415,
"grad_norm": 0.4066264033317566,
"learning_rate": 0.0002902978723404255,
"loss": 3.2045,
"step": 88650
},
{
"epoch": 25.837800046609182,
"grad_norm": 0.4121791124343872,
"learning_rate": 0.00029012299621101713,
"loss": 3.2059,
"step": 88700
},
{
"epoch": 25.85236541598695,
"grad_norm": 0.3962363302707672,
"learning_rate": 0.0002899481200816088,
"loss": 3.2033,
"step": 88750
},
{
"epoch": 25.866930785364715,
"grad_norm": 0.3978932201862335,
"learning_rate": 0.0002897732439522005,
"loss": 3.2063,
"step": 88800
},
{
"epoch": 25.881496154742486,
"grad_norm": 0.4174967408180237,
"learning_rate": 0.00028959836782279215,
"loss": 3.1895,
"step": 88850
},
{
"epoch": 25.896061524120253,
"grad_norm": 0.40950289368629456,
"learning_rate": 0.00028942349169338384,
"loss": 3.2054,
"step": 88900
},
{
"epoch": 25.91062689349802,
"grad_norm": 0.4066673517227173,
"learning_rate": 0.00028924861556397553,
"loss": 3.1945,
"step": 88950
},
{
"epoch": 25.925192262875786,
"grad_norm": 0.4098004102706909,
"learning_rate": 0.00028907373943456717,
"loss": 3.2088,
"step": 89000
},
{
"epoch": 25.925192262875786,
"eval_accuracy": 0.37464927497723555,
"eval_loss": 3.532310724258423,
"eval_runtime": 180.277,
"eval_samples_per_second": 92.33,
"eval_steps_per_second": 5.774,
"step": 89000
},
{
"epoch": 25.939757632253553,
"grad_norm": 0.41714373230934143,
"learning_rate": 0.0002888988633051588,
"loss": 3.2092,
"step": 89050
},
{
"epoch": 25.954323001631323,
"grad_norm": 0.4226542115211487,
"learning_rate": 0.0002887239871757505,
"loss": 3.2017,
"step": 89100
},
{
"epoch": 25.96888837100909,
"grad_norm": 0.4572153091430664,
"learning_rate": 0.00028854911104634213,
"loss": 3.2083,
"step": 89150
},
{
"epoch": 25.983453740386857,
"grad_norm": 0.42790505290031433,
"learning_rate": 0.0002883742349169338,
"loss": 3.2144,
"step": 89200
},
{
"epoch": 25.998019109764623,
"grad_norm": 0.4000281095504761,
"learning_rate": 0.0002881993587875255,
"loss": 3.2087,
"step": 89250
},
{
"epoch": 26.01252621766488,
"grad_norm": 0.40959304571151733,
"learning_rate": 0.00028802448265811715,
"loss": 3.1098,
"step": 89300
},
{
"epoch": 26.027091587042648,
"grad_norm": 0.43922939896583557,
"learning_rate": 0.0002878496065287088,
"loss": 3.1144,
"step": 89350
},
{
"epoch": 26.041656956420415,
"grad_norm": 0.43358200788497925,
"learning_rate": 0.0002876747303993005,
"loss": 3.1118,
"step": 89400
},
{
"epoch": 26.05622232579818,
"grad_norm": 0.4154283106327057,
"learning_rate": 0.00028749985426989216,
"loss": 3.1089,
"step": 89450
},
{
"epoch": 26.070787695175948,
"grad_norm": 0.445342093706131,
"learning_rate": 0.0002873249781404838,
"loss": 3.1359,
"step": 89500
},
{
"epoch": 26.08535306455372,
"grad_norm": 0.4268057644367218,
"learning_rate": 0.0002871501020110755,
"loss": 3.11,
"step": 89550
},
{
"epoch": 26.099918433931485,
"grad_norm": 0.41992899775505066,
"learning_rate": 0.0002869752258816671,
"loss": 3.1328,
"step": 89600
},
{
"epoch": 26.114483803309252,
"grad_norm": 0.4173597991466522,
"learning_rate": 0.00028680034975225876,
"loss": 3.1256,
"step": 89650
},
{
"epoch": 26.12904917268702,
"grad_norm": 0.4342476427555084,
"learning_rate": 0.00028662547362285045,
"loss": 3.1181,
"step": 89700
},
{
"epoch": 26.143614542064785,
"grad_norm": 0.4344112277030945,
"learning_rate": 0.00028645059749344214,
"loss": 3.1259,
"step": 89750
},
{
"epoch": 26.158179911442556,
"grad_norm": 0.4283551573753357,
"learning_rate": 0.0002862757213640338,
"loss": 3.141,
"step": 89800
},
{
"epoch": 26.172745280820322,
"grad_norm": 0.4363539516925812,
"learning_rate": 0.00028610084523462547,
"loss": 3.1399,
"step": 89850
},
{
"epoch": 26.18731065019809,
"grad_norm": 0.403045654296875,
"learning_rate": 0.0002859259691052171,
"loss": 3.1433,
"step": 89900
},
{
"epoch": 26.201876019575856,
"grad_norm": 0.39886170625686646,
"learning_rate": 0.0002857510929758088,
"loss": 3.1444,
"step": 89950
},
{
"epoch": 26.216441388953623,
"grad_norm": 0.419612854719162,
"learning_rate": 0.00028557621684640043,
"loss": 3.1508,
"step": 90000
},
{
"epoch": 26.216441388953623,
"eval_accuracy": 0.3737163596432701,
"eval_loss": 3.5523500442504883,
"eval_runtime": 180.3889,
"eval_samples_per_second": 92.273,
"eval_steps_per_second": 5.771,
"step": 90000
},
{
"epoch": 26.23100675833139,
"grad_norm": 0.42642802000045776,
"learning_rate": 0.0002854013407169921,
"loss": 3.1504,
"step": 90050
},
{
"epoch": 26.24557212770916,
"grad_norm": 0.4583907127380371,
"learning_rate": 0.00028522646458758376,
"loss": 3.1457,
"step": 90100
},
{
"epoch": 26.260137497086927,
"grad_norm": 0.42439937591552734,
"learning_rate": 0.00028505158845817545,
"loss": 3.1559,
"step": 90150
},
{
"epoch": 26.274702866464693,
"grad_norm": 0.44137027859687805,
"learning_rate": 0.0002848767123287671,
"loss": 3.1467,
"step": 90200
},
{
"epoch": 26.28926823584246,
"grad_norm": 0.42748647928237915,
"learning_rate": 0.0002847018361993588,
"loss": 3.157,
"step": 90250
},
{
"epoch": 26.303833605220227,
"grad_norm": 0.4357030391693115,
"learning_rate": 0.0002845269600699504,
"loss": 3.1568,
"step": 90300
},
{
"epoch": 26.318398974597997,
"grad_norm": 0.4434165358543396,
"learning_rate": 0.0002843520839405421,
"loss": 3.1537,
"step": 90350
},
{
"epoch": 26.332964343975764,
"grad_norm": 0.4163769781589508,
"learning_rate": 0.0002841772078111338,
"loss": 3.1612,
"step": 90400
},
{
"epoch": 26.34752971335353,
"grad_norm": 0.4222979247570038,
"learning_rate": 0.00028400233168172543,
"loss": 3.1499,
"step": 90450
},
{
"epoch": 26.362095082731297,
"grad_norm": 0.42296892404556274,
"learning_rate": 0.00028382745555231707,
"loss": 3.1515,
"step": 90500
},
{
"epoch": 26.376660452109064,
"grad_norm": 0.4424600899219513,
"learning_rate": 0.00028365257942290876,
"loss": 3.1693,
"step": 90550
},
{
"epoch": 26.391225821486834,
"grad_norm": 0.49197620153427124,
"learning_rate": 0.0002834777032935004,
"loss": 3.1578,
"step": 90600
},
{
"epoch": 26.4057911908646,
"grad_norm": 0.42167529463768005,
"learning_rate": 0.0002833028271640921,
"loss": 3.16,
"step": 90650
},
{
"epoch": 26.420356560242368,
"grad_norm": 0.41751259565353394,
"learning_rate": 0.00028312795103468377,
"loss": 3.1661,
"step": 90700
},
{
"epoch": 26.434921929620135,
"grad_norm": 0.4054003357887268,
"learning_rate": 0.0002829530749052754,
"loss": 3.1695,
"step": 90750
},
{
"epoch": 26.4494872989979,
"grad_norm": 0.4270538091659546,
"learning_rate": 0.00028277819877586705,
"loss": 3.1721,
"step": 90800
},
{
"epoch": 26.46405266837567,
"grad_norm": 0.4425770342350006,
"learning_rate": 0.00028260332264645874,
"loss": 3.1652,
"step": 90850
},
{
"epoch": 26.47861803775344,
"grad_norm": 0.45323696732521057,
"learning_rate": 0.0002824284465170504,
"loss": 3.1668,
"step": 90900
},
{
"epoch": 26.493183407131205,
"grad_norm": 0.4280482232570648,
"learning_rate": 0.00028225357038764206,
"loss": 3.1749,
"step": 90950
},
{
"epoch": 26.507748776508972,
"grad_norm": 0.40407219529151917,
"learning_rate": 0.00028207869425823375,
"loss": 3.1713,
"step": 91000
},
{
"epoch": 26.507748776508972,
"eval_accuracy": 0.3742687019544194,
"eval_loss": 3.5423803329467773,
"eval_runtime": 180.32,
"eval_samples_per_second": 92.308,
"eval_steps_per_second": 5.773,
"step": 91000
},
{
"epoch": 26.52231414588674,
"grad_norm": 0.44610825181007385,
"learning_rate": 0.0002819038181288254,
"loss": 3.1699,
"step": 91050
},
{
"epoch": 26.536879515264506,
"grad_norm": 0.4127073585987091,
"learning_rate": 0.000281728941999417,
"loss": 3.1862,
"step": 91100
},
{
"epoch": 26.551444884642276,
"grad_norm": 0.5033323764801025,
"learning_rate": 0.0002815540658700087,
"loss": 3.1827,
"step": 91150
},
{
"epoch": 26.566010254020043,
"grad_norm": 0.40992677211761475,
"learning_rate": 0.0002813791897406004,
"loss": 3.1775,
"step": 91200
},
{
"epoch": 26.58057562339781,
"grad_norm": 0.42564913630485535,
"learning_rate": 0.00028120431361119204,
"loss": 3.1795,
"step": 91250
},
{
"epoch": 26.595140992775576,
"grad_norm": 0.4116276502609253,
"learning_rate": 0.00028102943748178373,
"loss": 3.1807,
"step": 91300
},
{
"epoch": 26.609706362153343,
"grad_norm": 0.41622668504714966,
"learning_rate": 0.00028085456135237537,
"loss": 3.1831,
"step": 91350
},
{
"epoch": 26.624271731531113,
"grad_norm": 0.409170925617218,
"learning_rate": 0.00028067968522296706,
"loss": 3.1773,
"step": 91400
},
{
"epoch": 26.63883710090888,
"grad_norm": 0.43525442481040955,
"learning_rate": 0.0002805048090935587,
"loss": 3.1753,
"step": 91450
},
{
"epoch": 26.653402470286647,
"grad_norm": 0.4610871970653534,
"learning_rate": 0.0002803299329641504,
"loss": 3.167,
"step": 91500
},
{
"epoch": 26.667967839664414,
"grad_norm": 0.4206131100654602,
"learning_rate": 0.000280155056834742,
"loss": 3.192,
"step": 91550
},
{
"epoch": 26.68253320904218,
"grad_norm": 0.41161221265792847,
"learning_rate": 0.0002799801807053337,
"loss": 3.1936,
"step": 91600
},
{
"epoch": 26.697098578419947,
"grad_norm": 0.46236079931259155,
"learning_rate": 0.00027980530457592535,
"loss": 3.1873,
"step": 91650
},
{
"epoch": 26.711663947797717,
"grad_norm": 0.43499431014060974,
"learning_rate": 0.00027963042844651704,
"loss": 3.1807,
"step": 91700
},
{
"epoch": 26.726229317175484,
"grad_norm": 0.41808217763900757,
"learning_rate": 0.0002794555523171087,
"loss": 3.1958,
"step": 91750
},
{
"epoch": 26.74079468655325,
"grad_norm": 0.4619711637496948,
"learning_rate": 0.00027928067618770037,
"loss": 3.2044,
"step": 91800
},
{
"epoch": 26.755360055931018,
"grad_norm": 0.4205552935600281,
"learning_rate": 0.00027910580005829206,
"loss": 3.1972,
"step": 91850
},
{
"epoch": 26.769925425308784,
"grad_norm": 0.448356568813324,
"learning_rate": 0.0002789309239288837,
"loss": 3.1929,
"step": 91900
},
{
"epoch": 26.784490794686555,
"grad_norm": 0.443820595741272,
"learning_rate": 0.00027875604779947533,
"loss": 3.1916,
"step": 91950
},
{
"epoch": 26.79905616406432,
"grad_norm": 0.39487943053245544,
"learning_rate": 0.000278581171670067,
"loss": 3.1843,
"step": 92000
},
{
"epoch": 26.79905616406432,
"eval_accuracy": 0.37467796197679293,
"eval_loss": 3.5352799892425537,
"eval_runtime": 180.1788,
"eval_samples_per_second": 92.38,
"eval_steps_per_second": 5.778,
"step": 92000
},
{
"epoch": 26.813621533442088,
"grad_norm": 0.42585235834121704,
"learning_rate": 0.00027840629554065865,
"loss": 3.2017,
"step": 92050
},
{
"epoch": 26.828186902819855,
"grad_norm": 0.45854058861732483,
"learning_rate": 0.00027823141941125034,
"loss": 3.1888,
"step": 92100
},
{
"epoch": 26.84275227219762,
"grad_norm": 0.4178307354450226,
"learning_rate": 0.00027805654328184204,
"loss": 3.1904,
"step": 92150
},
{
"epoch": 26.857317641575392,
"grad_norm": 0.4266476631164551,
"learning_rate": 0.00027788166715243367,
"loss": 3.1942,
"step": 92200
},
{
"epoch": 26.87188301095316,
"grad_norm": 0.42667528986930847,
"learning_rate": 0.0002777067910230253,
"loss": 3.1938,
"step": 92250
},
{
"epoch": 26.886448380330926,
"grad_norm": 0.44742685556411743,
"learning_rate": 0.000277531914893617,
"loss": 3.2002,
"step": 92300
},
{
"epoch": 26.901013749708692,
"grad_norm": 0.4107263386249542,
"learning_rate": 0.00027735703876420863,
"loss": 3.1956,
"step": 92350
},
{
"epoch": 26.91557911908646,
"grad_norm": 0.4186893403530121,
"learning_rate": 0.0002771821626348003,
"loss": 3.2089,
"step": 92400
},
{
"epoch": 26.930144488464226,
"grad_norm": 0.42867034673690796,
"learning_rate": 0.000277007286505392,
"loss": 3.1904,
"step": 92450
},
{
"epoch": 26.944709857841996,
"grad_norm": 0.4075056314468384,
"learning_rate": 0.00027683241037598365,
"loss": 3.1973,
"step": 92500
},
{
"epoch": 26.959275227219763,
"grad_norm": 0.435605525970459,
"learning_rate": 0.0002766575342465753,
"loss": 3.1938,
"step": 92550
},
{
"epoch": 26.97384059659753,
"grad_norm": 0.4103272259235382,
"learning_rate": 0.000276482658117167,
"loss": 3.2019,
"step": 92600
},
{
"epoch": 26.988405965975296,
"grad_norm": 0.42094647884368896,
"learning_rate": 0.00027630778198775867,
"loss": 3.1926,
"step": 92650
},
{
"epoch": 27.002913073875554,
"grad_norm": 0.44302475452423096,
"learning_rate": 0.0002761329058583503,
"loss": 3.1699,
"step": 92700
},
{
"epoch": 27.01747844325332,
"grad_norm": 0.41190701723098755,
"learning_rate": 0.000275958029728942,
"loss": 3.1075,
"step": 92750
},
{
"epoch": 27.032043812631088,
"grad_norm": 0.40939658880233765,
"learning_rate": 0.00027578315359953363,
"loss": 3.0978,
"step": 92800
},
{
"epoch": 27.046609182008854,
"grad_norm": 0.470214307308197,
"learning_rate": 0.00027560827747012527,
"loss": 3.1039,
"step": 92850
},
{
"epoch": 27.061174551386625,
"grad_norm": 0.4300399720668793,
"learning_rate": 0.00027543340134071696,
"loss": 3.1086,
"step": 92900
},
{
"epoch": 27.07573992076439,
"grad_norm": 0.4189159572124481,
"learning_rate": 0.00027525852521130865,
"loss": 3.1083,
"step": 92950
},
{
"epoch": 27.090305290142158,
"grad_norm": 0.42655813694000244,
"learning_rate": 0.0002750836490819003,
"loss": 3.1242,
"step": 93000
},
{
"epoch": 27.090305290142158,
"eval_accuracy": 0.3741485457513554,
"eval_loss": 3.5492360591888428,
"eval_runtime": 180.2167,
"eval_samples_per_second": 92.361,
"eval_steps_per_second": 5.776,
"step": 93000
},
{
"epoch": 27.104870659519925,
"grad_norm": 0.42484626173973083,
"learning_rate": 0.000274908772952492,
"loss": 3.1262,
"step": 93050
},
{
"epoch": 27.11943602889769,
"grad_norm": 0.4413082003593445,
"learning_rate": 0.0002747338968230836,
"loss": 3.119,
"step": 93100
},
{
"epoch": 27.134001398275462,
"grad_norm": 0.467857301235199,
"learning_rate": 0.0002745590206936753,
"loss": 3.1178,
"step": 93150
},
{
"epoch": 27.14856676765323,
"grad_norm": 0.44680628180503845,
"learning_rate": 0.00027438414456426694,
"loss": 3.1284,
"step": 93200
},
{
"epoch": 27.163132137030995,
"grad_norm": 0.4301452040672302,
"learning_rate": 0.00027420926843485863,
"loss": 3.1252,
"step": 93250
},
{
"epoch": 27.177697506408762,
"grad_norm": 0.4273969531059265,
"learning_rate": 0.00027403439230545026,
"loss": 3.1284,
"step": 93300
},
{
"epoch": 27.19226287578653,
"grad_norm": 0.4401126801967621,
"learning_rate": 0.00027385951617604195,
"loss": 3.1346,
"step": 93350
},
{
"epoch": 27.206828245164296,
"grad_norm": 0.4096077084541321,
"learning_rate": 0.0002736846400466336,
"loss": 3.1379,
"step": 93400
},
{
"epoch": 27.221393614542066,
"grad_norm": 0.4174330532550812,
"learning_rate": 0.0002735097639172253,
"loss": 3.1322,
"step": 93450
},
{
"epoch": 27.235958983919833,
"grad_norm": 0.4422391653060913,
"learning_rate": 0.0002733348877878169,
"loss": 3.1311,
"step": 93500
},
{
"epoch": 27.2505243532976,
"grad_norm": 0.4375896453857422,
"learning_rate": 0.0002731600116584086,
"loss": 3.1417,
"step": 93550
},
{
"epoch": 27.265089722675366,
"grad_norm": 0.4318002164363861,
"learning_rate": 0.0002729851355290003,
"loss": 3.1503,
"step": 93600
},
{
"epoch": 27.279655092053133,
"grad_norm": 0.421117901802063,
"learning_rate": 0.00027281025939959193,
"loss": 3.1242,
"step": 93650
},
{
"epoch": 27.294220461430903,
"grad_norm": 0.4362134635448456,
"learning_rate": 0.00027263538327018357,
"loss": 3.1582,
"step": 93700
},
{
"epoch": 27.30878583080867,
"grad_norm": 0.40824374556541443,
"learning_rate": 0.00027246050714077526,
"loss": 3.1412,
"step": 93750
},
{
"epoch": 27.323351200186437,
"grad_norm": 0.4372468590736389,
"learning_rate": 0.0002722856310113669,
"loss": 3.1593,
"step": 93800
},
{
"epoch": 27.337916569564204,
"grad_norm": 0.4370090365409851,
"learning_rate": 0.0002721107548819586,
"loss": 3.1438,
"step": 93850
},
{
"epoch": 27.35248193894197,
"grad_norm": 0.4227096736431122,
"learning_rate": 0.0002719358787525503,
"loss": 3.1577,
"step": 93900
},
{
"epoch": 27.36704730831974,
"grad_norm": 0.44676777720451355,
"learning_rate": 0.0002717610026231419,
"loss": 3.1549,
"step": 93950
},
{
"epoch": 27.381612677697508,
"grad_norm": 0.42205673456192017,
"learning_rate": 0.00027158612649373355,
"loss": 3.1564,
"step": 94000
},
{
"epoch": 27.381612677697508,
"eval_accuracy": 0.37452947148318255,
"eval_loss": 3.5454885959625244,
"eval_runtime": 180.3004,
"eval_samples_per_second": 92.318,
"eval_steps_per_second": 5.774,
"step": 94000
},
{
"epoch": 27.396178047075274,
"grad_norm": 0.41671884059906006,
"learning_rate": 0.00027141125036432524,
"loss": 3.1507,
"step": 94050
},
{
"epoch": 27.41074341645304,
"grad_norm": 0.4220370352268219,
"learning_rate": 0.00027123637423491693,
"loss": 3.1518,
"step": 94100
},
{
"epoch": 27.425308785830808,
"grad_norm": 0.41778427362442017,
"learning_rate": 0.00027106149810550857,
"loss": 3.1651,
"step": 94150
},
{
"epoch": 27.439874155208575,
"grad_norm": 0.44522127509117126,
"learning_rate": 0.00027088662197610026,
"loss": 3.1566,
"step": 94200
},
{
"epoch": 27.454439524586345,
"grad_norm": 0.41769787669181824,
"learning_rate": 0.0002707117458466919,
"loss": 3.1643,
"step": 94250
},
{
"epoch": 27.46900489396411,
"grad_norm": 0.4191843569278717,
"learning_rate": 0.00027053686971728353,
"loss": 3.1574,
"step": 94300
},
{
"epoch": 27.48357026334188,
"grad_norm": 0.3989506661891937,
"learning_rate": 0.0002703619935878752,
"loss": 3.1625,
"step": 94350
},
{
"epoch": 27.498135632719645,
"grad_norm": 0.4043886661529541,
"learning_rate": 0.0002701871174584669,
"loss": 3.1676,
"step": 94400
},
{
"epoch": 27.512701002097412,
"grad_norm": 0.4305398166179657,
"learning_rate": 0.00027001224132905855,
"loss": 3.1538,
"step": 94450
},
{
"epoch": 27.527266371475182,
"grad_norm": 0.43826940655708313,
"learning_rate": 0.00026983736519965024,
"loss": 3.1715,
"step": 94500
},
{
"epoch": 27.54183174085295,
"grad_norm": 0.41890406608581543,
"learning_rate": 0.0002696624890702419,
"loss": 3.1744,
"step": 94550
},
{
"epoch": 27.556397110230716,
"grad_norm": 0.42319968342781067,
"learning_rate": 0.00026948761294083356,
"loss": 3.1698,
"step": 94600
},
{
"epoch": 27.570962479608482,
"grad_norm": 0.45210587978363037,
"learning_rate": 0.0002693127368114252,
"loss": 3.1734,
"step": 94650
},
{
"epoch": 27.58552784898625,
"grad_norm": 0.43893709778785706,
"learning_rate": 0.0002691378606820169,
"loss": 3.1757,
"step": 94700
},
{
"epoch": 27.600093218364016,
"grad_norm": 0.41676828265190125,
"learning_rate": 0.0002689629845526085,
"loss": 3.1645,
"step": 94750
},
{
"epoch": 27.614658587741786,
"grad_norm": 0.4389956295490265,
"learning_rate": 0.0002687881084232002,
"loss": 3.1572,
"step": 94800
},
{
"epoch": 27.629223957119553,
"grad_norm": 0.45625096559524536,
"learning_rate": 0.00026861323229379185,
"loss": 3.1791,
"step": 94850
},
{
"epoch": 27.64378932649732,
"grad_norm": 0.43963149189949036,
"learning_rate": 0.00026843835616438354,
"loss": 3.1701,
"step": 94900
},
{
"epoch": 27.658354695875087,
"grad_norm": 0.4541909694671631,
"learning_rate": 0.0002682634800349752,
"loss": 3.1809,
"step": 94950
},
{
"epoch": 27.672920065252853,
"grad_norm": 0.4407740831375122,
"learning_rate": 0.00026808860390556687,
"loss": 3.1625,
"step": 95000
},
{
"epoch": 27.672920065252853,
"eval_accuracy": 0.3749729442796183,
"eval_loss": 3.5385658740997314,
"eval_runtime": 180.3064,
"eval_samples_per_second": 92.315,
"eval_steps_per_second": 5.774,
"step": 95000
},
{
"epoch": 27.687485434630624,
"grad_norm": 0.4178466200828552,
"learning_rate": 0.00026791372777615856,
"loss": 3.1759,
"step": 95050
},
{
"epoch": 27.70205080400839,
"grad_norm": 0.42705100774765015,
"learning_rate": 0.0002677388516467502,
"loss": 3.1759,
"step": 95100
},
{
"epoch": 27.716616173386157,
"grad_norm": 0.4739421308040619,
"learning_rate": 0.00026756397551734183,
"loss": 3.1809,
"step": 95150
},
{
"epoch": 27.731181542763924,
"grad_norm": 0.4194541871547699,
"learning_rate": 0.0002673890993879335,
"loss": 3.1845,
"step": 95200
},
{
"epoch": 27.74574691214169,
"grad_norm": 0.440497487783432,
"learning_rate": 0.00026721422325852516,
"loss": 3.1882,
"step": 95250
},
{
"epoch": 27.76031228151946,
"grad_norm": 0.42385539412498474,
"learning_rate": 0.00026703934712911685,
"loss": 3.1663,
"step": 95300
},
{
"epoch": 27.774877650897228,
"grad_norm": 0.4445149898529053,
"learning_rate": 0.00026686447099970854,
"loss": 3.186,
"step": 95350
},
{
"epoch": 27.789443020274994,
"grad_norm": 0.4726870059967041,
"learning_rate": 0.0002666895948703002,
"loss": 3.1859,
"step": 95400
},
{
"epoch": 27.80400838965276,
"grad_norm": 0.44043588638305664,
"learning_rate": 0.0002665147187408918,
"loss": 3.1927,
"step": 95450
},
{
"epoch": 27.818573759030528,
"grad_norm": 0.4248933792114258,
"learning_rate": 0.0002663398426114835,
"loss": 3.1934,
"step": 95500
},
{
"epoch": 27.833139128408295,
"grad_norm": 0.4195023775100708,
"learning_rate": 0.0002661649664820752,
"loss": 3.1849,
"step": 95550
},
{
"epoch": 27.847704497786065,
"grad_norm": 0.4301508963108063,
"learning_rate": 0.00026599009035266683,
"loss": 3.1771,
"step": 95600
},
{
"epoch": 27.862269867163832,
"grad_norm": 0.4297967851161957,
"learning_rate": 0.0002658152142232585,
"loss": 3.1853,
"step": 95650
},
{
"epoch": 27.8768352365416,
"grad_norm": 0.42951589822769165,
"learning_rate": 0.00026564033809385016,
"loss": 3.1878,
"step": 95700
},
{
"epoch": 27.891400605919365,
"grad_norm": 0.39632225036621094,
"learning_rate": 0.0002654654619644418,
"loss": 3.1832,
"step": 95750
},
{
"epoch": 27.905965975297132,
"grad_norm": 0.4643055200576782,
"learning_rate": 0.0002652905858350335,
"loss": 3.1983,
"step": 95800
},
{
"epoch": 27.920531344674902,
"grad_norm": 0.41126561164855957,
"learning_rate": 0.00026511570970562517,
"loss": 3.1842,
"step": 95850
},
{
"epoch": 27.93509671405267,
"grad_norm": 0.40602535009384155,
"learning_rate": 0.0002649408335762168,
"loss": 3.1804,
"step": 95900
},
{
"epoch": 27.949662083430436,
"grad_norm": 0.42574018239974976,
"learning_rate": 0.0002647659574468085,
"loss": 3.1809,
"step": 95950
},
{
"epoch": 27.964227452808203,
"grad_norm": 0.4175705313682556,
"learning_rate": 0.0002645910813174002,
"loss": 3.1875,
"step": 96000
},
{
"epoch": 27.964227452808203,
"eval_accuracy": 0.37480446694205405,
"eval_loss": 3.5358710289001465,
"eval_runtime": 180.3859,
"eval_samples_per_second": 92.274,
"eval_steps_per_second": 5.771,
"step": 96000
},
{
"epoch": 27.97879282218597,
"grad_norm": 0.4382724165916443,
"learning_rate": 0.0002644162051879918,
"loss": 3.1871,
"step": 96050
},
{
"epoch": 27.99335819156374,
"grad_norm": 0.4022817611694336,
"learning_rate": 0.00026424132905858346,
"loss": 3.1892,
"step": 96100
},
{
"epoch": 28.007865299463994,
"grad_norm": 0.41491270065307617,
"learning_rate": 0.00026406645292917515,
"loss": 3.1331,
"step": 96150
},
{
"epoch": 28.02243066884176,
"grad_norm": 0.45356419682502747,
"learning_rate": 0.0002638915767997668,
"loss": 3.0958,
"step": 96200
},
{
"epoch": 28.03699603821953,
"grad_norm": 0.4329795837402344,
"learning_rate": 0.0002637167006703585,
"loss": 3.0965,
"step": 96250
},
{
"epoch": 28.051561407597298,
"grad_norm": 0.43281567096710205,
"learning_rate": 0.00026354182454095017,
"loss": 3.0937,
"step": 96300
},
{
"epoch": 28.066126776975064,
"grad_norm": 0.42949551343917847,
"learning_rate": 0.0002633669484115418,
"loss": 3.0962,
"step": 96350
},
{
"epoch": 28.08069214635283,
"grad_norm": 0.46434319019317627,
"learning_rate": 0.00026319207228213344,
"loss": 3.0916,
"step": 96400
},
{
"epoch": 28.095257515730598,
"grad_norm": 0.43190398812294006,
"learning_rate": 0.00026301719615272513,
"loss": 3.1137,
"step": 96450
},
{
"epoch": 28.109822885108365,
"grad_norm": 0.44686824083328247,
"learning_rate": 0.0002628423200233168,
"loss": 3.1218,
"step": 96500
},
{
"epoch": 28.124388254486135,
"grad_norm": 0.4280760884284973,
"learning_rate": 0.00026266744389390846,
"loss": 3.1093,
"step": 96550
},
{
"epoch": 28.1389536238639,
"grad_norm": 0.41021299362182617,
"learning_rate": 0.00026249256776450015,
"loss": 3.1239,
"step": 96600
},
{
"epoch": 28.15351899324167,
"grad_norm": 0.4637337923049927,
"learning_rate": 0.0002623176916350918,
"loss": 3.1269,
"step": 96650
},
{
"epoch": 28.168084362619435,
"grad_norm": 0.4205281436443329,
"learning_rate": 0.0002621428155056834,
"loss": 3.1244,
"step": 96700
},
{
"epoch": 28.182649731997202,
"grad_norm": 0.43963325023651123,
"learning_rate": 0.0002619679393762751,
"loss": 3.1256,
"step": 96750
},
{
"epoch": 28.197215101374972,
"grad_norm": 0.40746745467185974,
"learning_rate": 0.0002617930632468668,
"loss": 3.1245,
"step": 96800
},
{
"epoch": 28.21178047075274,
"grad_norm": 0.43309372663497925,
"learning_rate": 0.00026161818711745844,
"loss": 3.1279,
"step": 96850
},
{
"epoch": 28.226345840130506,
"grad_norm": 0.46507173776626587,
"learning_rate": 0.0002614433109880501,
"loss": 3.1247,
"step": 96900
},
{
"epoch": 28.240911209508273,
"grad_norm": 0.4328254759311676,
"learning_rate": 0.00026126843485864176,
"loss": 3.1296,
"step": 96950
},
{
"epoch": 28.25547657888604,
"grad_norm": 0.42537787556648254,
"learning_rate": 0.00026109355872923345,
"loss": 3.1266,
"step": 97000
},
{
"epoch": 28.25547657888604,
"eval_accuracy": 0.37432748678957795,
"eval_loss": 3.5481479167938232,
"eval_runtime": 180.2202,
"eval_samples_per_second": 92.359,
"eval_steps_per_second": 5.776,
"step": 97000
},
{
"epoch": 28.27004194826381,
"grad_norm": 0.4779875576496124,
"learning_rate": 0.0002609186825998251,
"loss": 3.1381,
"step": 97050
},
{
"epoch": 28.284607317641576,
"grad_norm": 0.45657962560653687,
"learning_rate": 0.0002607438064704168,
"loss": 3.1405,
"step": 97100
},
{
"epoch": 28.299172687019343,
"grad_norm": 0.44699814915657043,
"learning_rate": 0.0002605689303410084,
"loss": 3.1292,
"step": 97150
},
{
"epoch": 28.31373805639711,
"grad_norm": 0.42782288789749146,
"learning_rate": 0.00026039405421160005,
"loss": 3.1424,
"step": 97200
},
{
"epoch": 28.328303425774877,
"grad_norm": 0.43705010414123535,
"learning_rate": 0.00026021917808219174,
"loss": 3.1352,
"step": 97250
},
{
"epoch": 28.342868795152643,
"grad_norm": 0.4696052074432373,
"learning_rate": 0.00026004430195278343,
"loss": 3.1416,
"step": 97300
},
{
"epoch": 28.357434164530414,
"grad_norm": 0.4281631112098694,
"learning_rate": 0.00025986942582337507,
"loss": 3.1427,
"step": 97350
},
{
"epoch": 28.37199953390818,
"grad_norm": 0.4597308337688446,
"learning_rate": 0.00025969454969396676,
"loss": 3.1431,
"step": 97400
},
{
"epoch": 28.386564903285947,
"grad_norm": 0.43386340141296387,
"learning_rate": 0.00025951967356455845,
"loss": 3.1386,
"step": 97450
},
{
"epoch": 28.401130272663714,
"grad_norm": 0.44734376668930054,
"learning_rate": 0.0002593447974351501,
"loss": 3.1467,
"step": 97500
},
{
"epoch": 28.41569564204148,
"grad_norm": 0.4462273120880127,
"learning_rate": 0.0002591699213057417,
"loss": 3.1383,
"step": 97550
},
{
"epoch": 28.43026101141925,
"grad_norm": 0.4251599907875061,
"learning_rate": 0.0002589950451763334,
"loss": 3.1555,
"step": 97600
},
{
"epoch": 28.444826380797018,
"grad_norm": 0.42566782236099243,
"learning_rate": 0.00025882016904692505,
"loss": 3.1627,
"step": 97650
},
{
"epoch": 28.459391750174785,
"grad_norm": 0.4228300452232361,
"learning_rate": 0.00025864529291751674,
"loss": 3.1494,
"step": 97700
},
{
"epoch": 28.47395711955255,
"grad_norm": 0.44921255111694336,
"learning_rate": 0.00025847041678810843,
"loss": 3.1556,
"step": 97750
},
{
"epoch": 28.488522488930318,
"grad_norm": 0.4171012043952942,
"learning_rate": 0.00025829554065870007,
"loss": 3.1618,
"step": 97800
},
{
"epoch": 28.503087858308085,
"grad_norm": 0.4369617700576782,
"learning_rate": 0.0002581206645292917,
"loss": 3.157,
"step": 97850
},
{
"epoch": 28.517653227685855,
"grad_norm": 0.4576139748096466,
"learning_rate": 0.0002579457883998834,
"loss": 3.1497,
"step": 97900
},
{
"epoch": 28.532218597063622,
"grad_norm": 0.46645256876945496,
"learning_rate": 0.0002577709122704751,
"loss": 3.1585,
"step": 97950
},
{
"epoch": 28.54678396644139,
"grad_norm": 0.4609380066394806,
"learning_rate": 0.0002575960361410667,
"loss": 3.1698,
"step": 98000
},
{
"epoch": 28.54678396644139,
"eval_accuracy": 0.37467278891129896,
"eval_loss": 3.543879747390747,
"eval_runtime": 180.2696,
"eval_samples_per_second": 92.334,
"eval_steps_per_second": 5.775,
"step": 98000
},
{
"epoch": 28.561349335819155,
"grad_norm": 0.4351586401462555,
"learning_rate": 0.0002574211600116584,
"loss": 3.1573,
"step": 98050
},
{
"epoch": 28.575914705196922,
"grad_norm": 0.4162726402282715,
"learning_rate": 0.00025724628388225005,
"loss": 3.1631,
"step": 98100
},
{
"epoch": 28.590480074574693,
"grad_norm": 0.44690102338790894,
"learning_rate": 0.0002570714077528417,
"loss": 3.1584,
"step": 98150
},
{
"epoch": 28.60504544395246,
"grad_norm": 0.43216705322265625,
"learning_rate": 0.0002568965316234334,
"loss": 3.1693,
"step": 98200
},
{
"epoch": 28.619610813330226,
"grad_norm": 0.45666563510894775,
"learning_rate": 0.00025672165549402506,
"loss": 3.1635,
"step": 98250
},
{
"epoch": 28.634176182707993,
"grad_norm": 0.44742050766944885,
"learning_rate": 0.0002565467793646167,
"loss": 3.1559,
"step": 98300
},
{
"epoch": 28.64874155208576,
"grad_norm": 0.4800066351890564,
"learning_rate": 0.0002563719032352084,
"loss": 3.1656,
"step": 98350
},
{
"epoch": 28.66330692146353,
"grad_norm": 0.4308624267578125,
"learning_rate": 0.0002561970271058,
"loss": 3.1618,
"step": 98400
},
{
"epoch": 28.677872290841297,
"grad_norm": 0.4453083276748657,
"learning_rate": 0.0002560221509763917,
"loss": 3.1673,
"step": 98450
},
{
"epoch": 28.692437660219063,
"grad_norm": 0.4215611219406128,
"learning_rate": 0.00025584727484698335,
"loss": 3.1757,
"step": 98500
},
{
"epoch": 28.70700302959683,
"grad_norm": 0.466304749250412,
"learning_rate": 0.00025567239871757504,
"loss": 3.1556,
"step": 98550
},
{
"epoch": 28.721568398974597,
"grad_norm": 0.4174833297729492,
"learning_rate": 0.0002554975225881667,
"loss": 3.168,
"step": 98600
},
{
"epoch": 28.736133768352367,
"grad_norm": 0.44293835759162903,
"learning_rate": 0.00025532264645875837,
"loss": 3.1706,
"step": 98650
},
{
"epoch": 28.750699137730134,
"grad_norm": 0.4146621525287628,
"learning_rate": 0.00025514777032935,
"loss": 3.1773,
"step": 98700
},
{
"epoch": 28.7652645071079,
"grad_norm": 0.42804571986198425,
"learning_rate": 0.0002549728941999417,
"loss": 3.1804,
"step": 98750
},
{
"epoch": 28.779829876485667,
"grad_norm": 0.3940410315990448,
"learning_rate": 0.00025479801807053333,
"loss": 3.1746,
"step": 98800
},
{
"epoch": 28.794395245863434,
"grad_norm": 0.4341360330581665,
"learning_rate": 0.000254623141941125,
"loss": 3.1759,
"step": 98850
},
{
"epoch": 28.8089606152412,
"grad_norm": 0.4237947165966034,
"learning_rate": 0.0002544482658117167,
"loss": 3.1689,
"step": 98900
},
{
"epoch": 28.82352598461897,
"grad_norm": 0.4510186016559601,
"learning_rate": 0.00025427338968230835,
"loss": 3.1761,
"step": 98950
},
{
"epoch": 28.838091353996738,
"grad_norm": 0.47351089119911194,
"learning_rate": 0.0002540985135529,
"loss": 3.1624,
"step": 99000
},
{
"epoch": 28.838091353996738,
"eval_accuracy": 0.3749749429640137,
"eval_loss": 3.540158271789551,
"eval_runtime": 180.0523,
"eval_samples_per_second": 92.445,
"eval_steps_per_second": 5.782,
"step": 99000
},
{
"epoch": 28.852656723374505,
"grad_norm": 0.4320347011089325,
"learning_rate": 0.0002539236374234917,
"loss": 3.1782,
"step": 99050
},
{
"epoch": 28.86722209275227,
"grad_norm": 0.4034181833267212,
"learning_rate": 0.0002537487612940833,
"loss": 3.1781,
"step": 99100
},
{
"epoch": 28.88178746213004,
"grad_norm": 0.4507603943347931,
"learning_rate": 0.000253573885164675,
"loss": 3.1755,
"step": 99150
},
{
"epoch": 28.89635283150781,
"grad_norm": 0.41703903675079346,
"learning_rate": 0.0002533990090352667,
"loss": 3.1741,
"step": 99200
},
{
"epoch": 28.910918200885575,
"grad_norm": 0.4325399100780487,
"learning_rate": 0.00025322413290585833,
"loss": 3.1772,
"step": 99250
},
{
"epoch": 28.925483570263342,
"grad_norm": 0.4348280131816864,
"learning_rate": 0.00025304925677644997,
"loss": 3.1738,
"step": 99300
},
{
"epoch": 28.94004893964111,
"grad_norm": 0.423967182636261,
"learning_rate": 0.00025287438064704166,
"loss": 3.184,
"step": 99350
},
{
"epoch": 28.954614309018876,
"grad_norm": 0.4055376350879669,
"learning_rate": 0.00025269950451763335,
"loss": 3.1833,
"step": 99400
},
{
"epoch": 28.969179678396642,
"grad_norm": 0.40919673442840576,
"learning_rate": 0.000252524628388225,
"loss": 3.1864,
"step": 99450
},
{
"epoch": 28.983745047774413,
"grad_norm": 0.45877301692962646,
"learning_rate": 0.0002523497522588167,
"loss": 3.177,
"step": 99500
},
{
"epoch": 28.99831041715218,
"grad_norm": 0.4333646297454834,
"learning_rate": 0.0002521748761294083,
"loss": 3.1833,
"step": 99550
},
{
"epoch": 29.012817525052434,
"grad_norm": 0.4438258409500122,
"learning_rate": 0.00025199999999999995,
"loss": 3.0928,
"step": 99600
},
{
"epoch": 29.027382894430204,
"grad_norm": 0.44858071208000183,
"learning_rate": 0.00025182512387059164,
"loss": 3.092,
"step": 99650
},
{
"epoch": 29.04194826380797,
"grad_norm": 0.4630739390850067,
"learning_rate": 0.0002516502477411833,
"loss": 3.0938,
"step": 99700
},
{
"epoch": 29.056513633185737,
"grad_norm": 0.45132312178611755,
"learning_rate": 0.00025147537161177496,
"loss": 3.0957,
"step": 99750
},
{
"epoch": 29.071079002563504,
"grad_norm": 0.44013890624046326,
"learning_rate": 0.00025130049548236665,
"loss": 3.092,
"step": 99800
},
{
"epoch": 29.08564437194127,
"grad_norm": 0.4376552700996399,
"learning_rate": 0.0002511256193529583,
"loss": 3.0954,
"step": 99850
},
{
"epoch": 29.10020974131904,
"grad_norm": 0.4498368799686432,
"learning_rate": 0.00025095074322355,
"loss": 3.1093,
"step": 99900
},
{
"epoch": 29.114775110696808,
"grad_norm": 0.4516642093658447,
"learning_rate": 0.0002507758670941416,
"loss": 3.1087,
"step": 99950
},
{
"epoch": 29.129340480074575,
"grad_norm": 0.46183279156684875,
"learning_rate": 0.0002506009909647333,
"loss": 3.1128,
"step": 100000
},
{
"epoch": 29.129340480074575,
"eval_accuracy": 0.3746281124365785,
"eval_loss": 3.5496273040771484,
"eval_runtime": 180.3174,
"eval_samples_per_second": 92.309,
"eval_steps_per_second": 5.773,
"step": 100000
},
{
"epoch": 29.129340480074575,
"step": 100000,
"total_flos": 2.090213187452928e+18,
"train_loss": 0.6336719000244141,
"train_runtime": 39937.4764,
"train_samples_per_second": 343.814,
"train_steps_per_second": 4.298
}
],
"logging_steps": 50,
"max_steps": 171650,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 10000,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 20,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 20
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.090213187452928e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}