{
"best_global_step": 79000,
"best_metric": 3.5263609886169434,
"best_model_checkpoint": "/scratch/cl5625/exceptions/models/last_to_drop_frequency_1032/checkpoint-40000",
"epoch": 28.821151942228177,
"eval_steps": 1000,
"global_step": 99000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014559431599790344,
"grad_norm": 0.9666687846183777,
"learning_rate": 0.000294,
"loss": 8.4834,
"step": 50
},
{
"epoch": 0.029118863199580687,
"grad_norm": 0.6833519339561462,
"learning_rate": 0.0005939999999999999,
"loss": 6.7077,
"step": 100
},
{
"epoch": 0.043678294799371034,
"grad_norm": 0.4742502272129059,
"learning_rate": 0.0005998287212350713,
"loss": 6.3505,
"step": 150
},
{
"epoch": 0.058237726399161374,
"grad_norm": 0.44300776720046997,
"learning_rate": 0.0005996539469851441,
"loss": 6.126,
"step": 200
},
{
"epoch": 0.07279715799895171,
"grad_norm": 0.6068394780158997,
"learning_rate": 0.000599479172735217,
"loss": 6.0028,
"step": 250
},
{
"epoch": 0.08735658959874207,
"grad_norm": 0.3999798893928528,
"learning_rate": 0.0005993043984852897,
"loss": 5.8771,
"step": 300
},
{
"epoch": 0.10191602119853241,
"grad_norm": 0.47448229789733887,
"learning_rate": 0.0005991296242353626,
"loss": 5.7297,
"step": 350
},
{
"epoch": 0.11647545279832275,
"grad_norm": 0.4407774806022644,
"learning_rate": 0.0005989548499854355,
"loss": 5.6118,
"step": 400
},
{
"epoch": 0.1310348843981131,
"grad_norm": 0.4476652145385742,
"learning_rate": 0.0005987800757355083,
"loss": 5.5129,
"step": 450
},
{
"epoch": 0.14559431599790343,
"grad_norm": 0.44474974274635315,
"learning_rate": 0.0005986053014855811,
"loss": 5.4224,
"step": 500
},
{
"epoch": 0.1601537475976938,
"grad_norm": 0.41966310143470764,
"learning_rate": 0.000598430527235654,
"loss": 5.3379,
"step": 550
},
{
"epoch": 0.17471317919748414,
"grad_norm": 0.4177263081073761,
"learning_rate": 0.0005982557529857267,
"loss": 5.2512,
"step": 600
},
{
"epoch": 0.18927261079727448,
"grad_norm": 0.44888487458229065,
"learning_rate": 0.0005980809787357995,
"loss": 5.1803,
"step": 650
},
{
"epoch": 0.20383204239706482,
"grad_norm": 0.43577608466148376,
"learning_rate": 0.0005979062044858724,
"loss": 5.1268,
"step": 700
},
{
"epoch": 0.21839147399685516,
"grad_norm": 0.43620362877845764,
"learning_rate": 0.0005977314302359452,
"loss": 5.0768,
"step": 750
},
{
"epoch": 0.2329509055966455,
"grad_norm": 0.44033217430114746,
"learning_rate": 0.0005975566559860181,
"loss": 5.0148,
"step": 800
},
{
"epoch": 0.24751033719643586,
"grad_norm": 0.4907797574996948,
"learning_rate": 0.0005973818817360908,
"loss": 4.9808,
"step": 850
},
{
"epoch": 0.2620697687962262,
"grad_norm": 0.45115190744400024,
"learning_rate": 0.0005972071074861636,
"loss": 4.9143,
"step": 900
},
{
"epoch": 0.2766292003960165,
"grad_norm": 0.42089030146598816,
"learning_rate": 0.0005970323332362365,
"loss": 4.8585,
"step": 950
},
{
"epoch": 0.29118863199580686,
"grad_norm": 0.441190630197525,
"learning_rate": 0.0005968575589863093,
"loss": 4.8164,
"step": 1000
},
{
"epoch": 0.29118863199580686,
"eval_accuracy": 0.2562200359735874,
"eval_loss": 4.745169639587402,
"eval_runtime": 55.0786,
"eval_samples_per_second": 302.168,
"eval_steps_per_second": 18.9,
"step": 1000
},
{
"epoch": 0.30574806359559725,
"grad_norm": 0.4603753387928009,
"learning_rate": 0.0005966827847363822,
"loss": 4.7669,
"step": 1050
},
{
"epoch": 0.3203074951953876,
"grad_norm": 0.46023908257484436,
"learning_rate": 0.000596508010486455,
"loss": 4.7283,
"step": 1100
},
{
"epoch": 0.33486692679517793,
"grad_norm": 0.47654032707214355,
"learning_rate": 0.0005963332362365277,
"loss": 4.688,
"step": 1150
},
{
"epoch": 0.3494263583949683,
"grad_norm": 0.5129820108413696,
"learning_rate": 0.0005961584619866006,
"loss": 4.6455,
"step": 1200
},
{
"epoch": 0.3639857899947586,
"grad_norm": 0.4901754856109619,
"learning_rate": 0.0005959836877366734,
"loss": 4.6235,
"step": 1250
},
{
"epoch": 0.37854522159454895,
"grad_norm": 0.4569827914237976,
"learning_rate": 0.0005958089134867463,
"loss": 4.5877,
"step": 1300
},
{
"epoch": 0.3931046531943393,
"grad_norm": 0.44484788179397583,
"learning_rate": 0.0005956341392368191,
"loss": 4.5622,
"step": 1350
},
{
"epoch": 0.40766408479412963,
"grad_norm": 0.45020967721939087,
"learning_rate": 0.0005954593649868918,
"loss": 4.5348,
"step": 1400
},
{
"epoch": 0.42222351639392,
"grad_norm": 0.44121989607810974,
"learning_rate": 0.0005952845907369647,
"loss": 4.5162,
"step": 1450
},
{
"epoch": 0.4367829479937103,
"grad_norm": 0.45869946479797363,
"learning_rate": 0.0005951098164870375,
"loss": 4.4997,
"step": 1500
},
{
"epoch": 0.45134237959350065,
"grad_norm": 0.4464726150035858,
"learning_rate": 0.0005949350422371104,
"loss": 4.4767,
"step": 1550
},
{
"epoch": 0.465901811193291,
"grad_norm": 0.40169212222099304,
"learning_rate": 0.0005947602679871832,
"loss": 4.4618,
"step": 1600
},
{
"epoch": 0.48046124279308133,
"grad_norm": 0.40386444330215454,
"learning_rate": 0.000594585493737256,
"loss": 4.4517,
"step": 1650
},
{
"epoch": 0.49502067439287173,
"grad_norm": 0.40597283840179443,
"learning_rate": 0.0005944107194873288,
"loss": 4.4269,
"step": 1700
},
{
"epoch": 0.509580105992662,
"grad_norm": 0.41445210576057434,
"learning_rate": 0.0005942359452374016,
"loss": 4.4003,
"step": 1750
},
{
"epoch": 0.5241395375924524,
"grad_norm": 0.4420674741268158,
"learning_rate": 0.0005940611709874745,
"loss": 4.393,
"step": 1800
},
{
"epoch": 0.5386989691922427,
"grad_norm": 0.4193710386753082,
"learning_rate": 0.0005938863967375473,
"loss": 4.3779,
"step": 1850
},
{
"epoch": 0.553258400792033,
"grad_norm": 0.4820527732372284,
"learning_rate": 0.0005937116224876201,
"loss": 4.3614,
"step": 1900
},
{
"epoch": 0.5678178323918234,
"grad_norm": 0.4935331642627716,
"learning_rate": 0.000593536848237693,
"loss": 4.3505,
"step": 1950
},
{
"epoch": 0.5823772639916137,
"grad_norm": 0.3989650011062622,
"learning_rate": 0.0005933620739877657,
"loss": 4.339,
"step": 2000
},
{
"epoch": 0.5823772639916137,
"eval_accuracy": 0.3002242440625767,
"eval_loss": 4.28311824798584,
"eval_runtime": 54.2851,
"eval_samples_per_second": 306.585,
"eval_steps_per_second": 19.177,
"step": 2000
},
{
"epoch": 0.5969366955914042,
"grad_norm": 0.38092532753944397,
"learning_rate": 0.0005931872997378385,
"loss": 4.3209,
"step": 2050
},
{
"epoch": 0.6114961271911945,
"grad_norm": 0.3823917508125305,
"learning_rate": 0.0005930125254879114,
"loss": 4.3103,
"step": 2100
},
{
"epoch": 0.6260555587909848,
"grad_norm": 0.4137718677520752,
"learning_rate": 0.0005928377512379842,
"loss": 4.301,
"step": 2150
},
{
"epoch": 0.6406149903907752,
"grad_norm": 0.38565006852149963,
"learning_rate": 0.0005926629769880571,
"loss": 4.2898,
"step": 2200
},
{
"epoch": 0.6551744219905655,
"grad_norm": 0.41142529249191284,
"learning_rate": 0.0005924882027381298,
"loss": 4.2786,
"step": 2250
},
{
"epoch": 0.6697338535903559,
"grad_norm": 0.3580577075481415,
"learning_rate": 0.0005923134284882026,
"loss": 4.2655,
"step": 2300
},
{
"epoch": 0.6842932851901462,
"grad_norm": 0.3694271147251129,
"learning_rate": 0.0005921386542382755,
"loss": 4.2603,
"step": 2350
},
{
"epoch": 0.6988527167899365,
"grad_norm": 0.3978845477104187,
"learning_rate": 0.0005919638799883483,
"loss": 4.2569,
"step": 2400
},
{
"epoch": 0.7134121483897269,
"grad_norm": 0.40943846106529236,
"learning_rate": 0.0005917891057384212,
"loss": 4.2379,
"step": 2450
},
{
"epoch": 0.7279715799895172,
"grad_norm": 0.3747680187225342,
"learning_rate": 0.000591614331488494,
"loss": 4.2269,
"step": 2500
},
{
"epoch": 0.7425310115893076,
"grad_norm": 0.36069026589393616,
"learning_rate": 0.0005914395572385667,
"loss": 4.2199,
"step": 2550
},
{
"epoch": 0.7570904431890979,
"grad_norm": 0.3404024541378021,
"learning_rate": 0.0005912647829886396,
"loss": 4.208,
"step": 2600
},
{
"epoch": 0.7716498747888882,
"grad_norm": 0.3656750023365021,
"learning_rate": 0.0005910900087387124,
"loss": 4.1972,
"step": 2650
},
{
"epoch": 0.7862093063886786,
"grad_norm": 0.3920387327671051,
"learning_rate": 0.0005909152344887853,
"loss": 4.1947,
"step": 2700
},
{
"epoch": 0.8007687379884689,
"grad_norm": 0.3375900089740753,
"learning_rate": 0.0005907404602388581,
"loss": 4.1886,
"step": 2750
},
{
"epoch": 0.8153281695882593,
"grad_norm": 0.3581906855106354,
"learning_rate": 0.0005905656859889308,
"loss": 4.1761,
"step": 2800
},
{
"epoch": 0.8298876011880496,
"grad_norm": 0.37578803300857544,
"learning_rate": 0.0005903909117390037,
"loss": 4.1768,
"step": 2850
},
{
"epoch": 0.84444703278784,
"grad_norm": 0.34982478618621826,
"learning_rate": 0.0005902161374890766,
"loss": 4.1671,
"step": 2900
},
{
"epoch": 0.8590064643876303,
"grad_norm": 0.36295685172080994,
"learning_rate": 0.0005900413632391494,
"loss": 4.1643,
"step": 2950
},
{
"epoch": 0.8735658959874206,
"grad_norm": 0.35229530930519104,
"learning_rate": 0.0005898665889892223,
"loss": 4.1293,
"step": 3000
},
{
"epoch": 0.8735658959874206,
"eval_accuracy": 0.3151443347008721,
"eval_loss": 4.097636699676514,
"eval_runtime": 54.4875,
"eval_samples_per_second": 305.446,
"eval_steps_per_second": 19.105,
"step": 3000
},
{
"epoch": 0.888125327587211,
"grad_norm": 0.3671216368675232,
"learning_rate": 0.0005896918147392951,
"loss": 4.1345,
"step": 3050
},
{
"epoch": 0.9026847591870013,
"grad_norm": 0.3501938581466675,
"learning_rate": 0.0005895170404893678,
"loss": 4.1387,
"step": 3100
},
{
"epoch": 0.9172441907867916,
"grad_norm": 0.3753458261489868,
"learning_rate": 0.0005893422662394407,
"loss": 4.1391,
"step": 3150
},
{
"epoch": 0.931803622386582,
"grad_norm": 0.32294249534606934,
"learning_rate": 0.0005891674919895135,
"loss": 4.111,
"step": 3200
},
{
"epoch": 0.9463630539863723,
"grad_norm": 0.3349330425262451,
"learning_rate": 0.0005889927177395864,
"loss": 4.1282,
"step": 3250
},
{
"epoch": 0.9609224855861627,
"grad_norm": 0.3496231138706207,
"learning_rate": 0.0005888179434896592,
"loss": 4.1165,
"step": 3300
},
{
"epoch": 0.975481917185953,
"grad_norm": 0.3345687985420227,
"learning_rate": 0.000588643169239732,
"loss": 4.1009,
"step": 3350
},
{
"epoch": 0.9900413487857435,
"grad_norm": 0.3242470920085907,
"learning_rate": 0.0005884683949898048,
"loss": 4.0959,
"step": 3400
},
{
"epoch": 1.0043678294799372,
"grad_norm": 0.34464624524116516,
"learning_rate": 0.0005882936207398776,
"loss": 4.0755,
"step": 3450
},
{
"epoch": 1.0189272610797275,
"grad_norm": 0.3738979399204254,
"learning_rate": 0.0005881188464899504,
"loss": 4.0271,
"step": 3500
},
{
"epoch": 1.0334866926795179,
"grad_norm": 0.32653385400772095,
"learning_rate": 0.0005879440722400233,
"loss": 4.0212,
"step": 3550
},
{
"epoch": 1.0480461242793082,
"grad_norm": 0.3483313024044037,
"learning_rate": 0.0005877692979900961,
"loss": 4.0193,
"step": 3600
},
{
"epoch": 1.0626055558790986,
"grad_norm": 0.3408651351928711,
"learning_rate": 0.000587594523740169,
"loss": 4.0118,
"step": 3650
},
{
"epoch": 1.077164987478889,
"grad_norm": 0.33649924397468567,
"learning_rate": 0.0005874197494902417,
"loss": 4.0068,
"step": 3700
},
{
"epoch": 1.0917244190786792,
"grad_norm": 0.3527706563472748,
"learning_rate": 0.0005872449752403145,
"loss": 4.0024,
"step": 3750
},
{
"epoch": 1.1062838506784696,
"grad_norm": 0.35581859946250916,
"learning_rate": 0.0005870702009903874,
"loss": 4.0074,
"step": 3800
},
{
"epoch": 1.12084328227826,
"grad_norm": 0.3438279628753662,
"learning_rate": 0.0005868954267404602,
"loss": 4.0077,
"step": 3850
},
{
"epoch": 1.1354027138780503,
"grad_norm": 0.3411901593208313,
"learning_rate": 0.0005867206524905331,
"loss": 3.995,
"step": 3900
},
{
"epoch": 1.1499621454778406,
"grad_norm": 0.3471081852912903,
"learning_rate": 0.0005865458782406058,
"loss": 4.0004,
"step": 3950
},
{
"epoch": 1.164521577077631,
"grad_norm": 0.3431474566459656,
"learning_rate": 0.0005863711039906786,
"loss": 3.9912,
"step": 4000
},
{
"epoch": 1.164521577077631,
"eval_accuracy": 0.3247543409880778,
"eval_loss": 3.995392322540283,
"eval_runtime": 53.9545,
"eval_samples_per_second": 308.464,
"eval_steps_per_second": 19.294,
"step": 4000
},
{
"epoch": 1.1790810086774213,
"grad_norm": 0.33514344692230225,
"learning_rate": 0.0005861963297407515,
"loss": 4.0057,
"step": 4050
},
{
"epoch": 1.1936404402772116,
"grad_norm": 0.33262795209884644,
"learning_rate": 0.0005860215554908243,
"loss": 3.9851,
"step": 4100
},
{
"epoch": 1.208199871877002,
"grad_norm": 0.3248565196990967,
"learning_rate": 0.0005858467812408972,
"loss": 3.9799,
"step": 4150
},
{
"epoch": 1.2227593034767923,
"grad_norm": 0.33513668179512024,
"learning_rate": 0.00058567200699097,
"loss": 3.9747,
"step": 4200
},
{
"epoch": 1.2373187350765826,
"grad_norm": 0.3263963460922241,
"learning_rate": 0.0005854972327410427,
"loss": 3.9791,
"step": 4250
},
{
"epoch": 1.251878166676373,
"grad_norm": 0.34506794810295105,
"learning_rate": 0.0005853224584911156,
"loss": 3.9817,
"step": 4300
},
{
"epoch": 1.2664375982761633,
"grad_norm": 0.319431871175766,
"learning_rate": 0.0005851476842411884,
"loss": 3.9671,
"step": 4350
},
{
"epoch": 1.2809970298759537,
"grad_norm": 0.33263441920280457,
"learning_rate": 0.0005849729099912613,
"loss": 3.9745,
"step": 4400
},
{
"epoch": 1.295556461475744,
"grad_norm": 0.33536776900291443,
"learning_rate": 0.0005847981357413341,
"loss": 3.9744,
"step": 4450
},
{
"epoch": 1.3101158930755343,
"grad_norm": 0.34813007712364197,
"learning_rate": 0.0005846233614914068,
"loss": 3.9551,
"step": 4500
},
{
"epoch": 1.3246753246753247,
"grad_norm": 0.34729769825935364,
"learning_rate": 0.0005844485872414797,
"loss": 3.955,
"step": 4550
},
{
"epoch": 1.339234756275115,
"grad_norm": 0.34845030307769775,
"learning_rate": 0.0005842738129915525,
"loss": 3.9533,
"step": 4600
},
{
"epoch": 1.3537941878749054,
"grad_norm": 0.32806530594825745,
"learning_rate": 0.0005840990387416253,
"loss": 3.9656,
"step": 4650
},
{
"epoch": 1.3683536194746957,
"grad_norm": 0.3774058222770691,
"learning_rate": 0.0005839242644916982,
"loss": 3.9569,
"step": 4700
},
{
"epoch": 1.382913051074486,
"grad_norm": 0.3156019449234009,
"learning_rate": 0.000583749490241771,
"loss": 3.9416,
"step": 4750
},
{
"epoch": 1.3974724826742764,
"grad_norm": 0.3090326488018036,
"learning_rate": 0.0005835747159918438,
"loss": 3.9483,
"step": 4800
},
{
"epoch": 1.4120319142740667,
"grad_norm": 0.33362695574760437,
"learning_rate": 0.0005833999417419166,
"loss": 3.9398,
"step": 4850
},
{
"epoch": 1.426591345873857,
"grad_norm": 0.32493874430656433,
"learning_rate": 0.0005832251674919894,
"loss": 3.9381,
"step": 4900
},
{
"epoch": 1.4411507774736474,
"grad_norm": 0.32908692955970764,
"learning_rate": 0.0005830503932420623,
"loss": 3.9488,
"step": 4950
},
{
"epoch": 1.4557102090734377,
"grad_norm": 0.31451448798179626,
"learning_rate": 0.0005828756189921351,
"loss": 3.9401,
"step": 5000
},
{
"epoch": 1.4557102090734377,
"eval_accuracy": 0.33137442644092774,
"eval_loss": 3.9215424060821533,
"eval_runtime": 54.124,
"eval_samples_per_second": 307.497,
"eval_steps_per_second": 19.234,
"step": 5000
},
{
"epoch": 1.470269640673228,
"grad_norm": 0.3171522319316864,
"learning_rate": 0.000582700844742208,
"loss": 3.9336,
"step": 5050
},
{
"epoch": 1.4848290722730184,
"grad_norm": 0.3146921694278717,
"learning_rate": 0.0005825260704922807,
"loss": 3.9335,
"step": 5100
},
{
"epoch": 1.4993885038728088,
"grad_norm": 0.354244589805603,
"learning_rate": 0.0005823512962423535,
"loss": 3.9259,
"step": 5150
},
{
"epoch": 1.5139479354725993,
"grad_norm": 0.3321908116340637,
"learning_rate": 0.0005821765219924264,
"loss": 3.9218,
"step": 5200
},
{
"epoch": 1.5285073670723897,
"grad_norm": 0.3170032203197479,
"learning_rate": 0.0005820017477424992,
"loss": 3.9297,
"step": 5250
},
{
"epoch": 1.54306679867218,
"grad_norm": 0.3232501149177551,
"learning_rate": 0.0005818269734925721,
"loss": 3.9106,
"step": 5300
},
{
"epoch": 1.5576262302719703,
"grad_norm": 0.321432888507843,
"learning_rate": 0.0005816521992426448,
"loss": 3.924,
"step": 5350
},
{
"epoch": 1.5721856618717607,
"grad_norm": 0.3379034399986267,
"learning_rate": 0.0005814774249927176,
"loss": 3.9177,
"step": 5400
},
{
"epoch": 1.586745093471551,
"grad_norm": 0.31285661458969116,
"learning_rate": 0.0005813026507427905,
"loss": 3.9152,
"step": 5450
},
{
"epoch": 1.6013045250713414,
"grad_norm": 0.3218041658401489,
"learning_rate": 0.0005811278764928634,
"loss": 3.9095,
"step": 5500
},
{
"epoch": 1.6158639566711317,
"grad_norm": 0.29677674174308777,
"learning_rate": 0.0005809531022429362,
"loss": 3.907,
"step": 5550
},
{
"epoch": 1.630423388270922,
"grad_norm": 0.33540773391723633,
"learning_rate": 0.0005807783279930091,
"loss": 3.9074,
"step": 5600
},
{
"epoch": 1.6449828198707124,
"grad_norm": 0.3314766585826874,
"learning_rate": 0.0005806035537430818,
"loss": 3.9054,
"step": 5650
},
{
"epoch": 1.6595422514705027,
"grad_norm": 0.3158515989780426,
"learning_rate": 0.0005804287794931546,
"loss": 3.8869,
"step": 5700
},
{
"epoch": 1.674101683070293,
"grad_norm": 0.30713674426078796,
"learning_rate": 0.0005802540052432275,
"loss": 3.8962,
"step": 5750
},
{
"epoch": 1.6886611146700834,
"grad_norm": 0.334526389837265,
"learning_rate": 0.0005800792309933003,
"loss": 3.8883,
"step": 5800
},
{
"epoch": 1.7032205462698737,
"grad_norm": 0.31845974922180176,
"learning_rate": 0.0005799044567433732,
"loss": 3.9004,
"step": 5850
},
{
"epoch": 1.717779977869664,
"grad_norm": 0.31406348943710327,
"learning_rate": 0.000579729682493446,
"loss": 3.8783,
"step": 5900
},
{
"epoch": 1.7323394094694544,
"grad_norm": 0.30887269973754883,
"learning_rate": 0.0005795549082435187,
"loss": 3.8818,
"step": 5950
},
{
"epoch": 1.7468988410692448,
"grad_norm": 0.3162541687488556,
"learning_rate": 0.0005793801339935916,
"loss": 3.8958,
"step": 6000
},
{
"epoch": 1.7468988410692448,
"eval_accuracy": 0.33682020249576317,
"eval_loss": 3.8626632690429688,
"eval_runtime": 54.4766,
"eval_samples_per_second": 305.507,
"eval_steps_per_second": 19.109,
"step": 6000
},
{
"epoch": 1.761458272669035,
"grad_norm": 0.32636117935180664,
"learning_rate": 0.0005792053597436644,
"loss": 3.8833,
"step": 6050
},
{
"epoch": 1.7760177042688254,
"grad_norm": 0.3097558319568634,
"learning_rate": 0.0005790305854937372,
"loss": 3.8691,
"step": 6100
},
{
"epoch": 1.7905771358686158,
"grad_norm": 0.34034818410873413,
"learning_rate": 0.0005788558112438101,
"loss": 3.8802,
"step": 6150
},
{
"epoch": 1.8051365674684061,
"grad_norm": 0.32233792543411255,
"learning_rate": 0.0005786810369938828,
"loss": 3.8883,
"step": 6200
},
{
"epoch": 1.8196959990681965,
"grad_norm": 0.3292987048625946,
"learning_rate": 0.0005785062627439557,
"loss": 3.8728,
"step": 6250
},
{
"epoch": 1.8342554306679868,
"grad_norm": 0.31761112809181213,
"learning_rate": 0.0005783314884940285,
"loss": 3.8788,
"step": 6300
},
{
"epoch": 1.8488148622677771,
"grad_norm": 0.3174941837787628,
"learning_rate": 0.0005781567142441013,
"loss": 3.8706,
"step": 6350
},
{
"epoch": 1.8633742938675675,
"grad_norm": 0.2970981299877167,
"learning_rate": 0.0005779819399941742,
"loss": 3.8679,
"step": 6400
},
{
"epoch": 1.8779337254673578,
"grad_norm": 0.32310914993286133,
"learning_rate": 0.000577807165744247,
"loss": 3.8601,
"step": 6450
},
{
"epoch": 1.8924931570671482,
"grad_norm": 0.3130098879337311,
"learning_rate": 0.0005776323914943198,
"loss": 3.864,
"step": 6500
},
{
"epoch": 1.9070525886669385,
"grad_norm": 0.2963304817676544,
"learning_rate": 0.0005774576172443926,
"loss": 3.8588,
"step": 6550
},
{
"epoch": 1.9216120202667288,
"grad_norm": 0.32023951411247253,
"learning_rate": 0.0005772828429944654,
"loss": 3.8634,
"step": 6600
},
{
"epoch": 1.9361714518665192,
"grad_norm": 0.306027889251709,
"learning_rate": 0.0005771080687445383,
"loss": 3.8449,
"step": 6650
},
{
"epoch": 1.9507308834663095,
"grad_norm": 0.3124430775642395,
"learning_rate": 0.0005769332944946111,
"loss": 3.8598,
"step": 6700
},
{
"epoch": 1.9652903150660999,
"grad_norm": 0.3299570083618164,
"learning_rate": 0.0005767585202446839,
"loss": 3.848,
"step": 6750
},
{
"epoch": 1.9798497466658902,
"grad_norm": 0.31828752160072327,
"learning_rate": 0.0005765837459947567,
"loss": 3.8571,
"step": 6800
},
{
"epoch": 1.9944091782656805,
"grad_norm": 0.3087295591831207,
"learning_rate": 0.0005764089717448295,
"loss": 3.8415,
"step": 6850
},
{
"epoch": 2.0087356589598744,
"grad_norm": 0.3276985287666321,
"learning_rate": 0.0005762341974949024,
"loss": 3.7926,
"step": 6900
},
{
"epoch": 2.0232950905596647,
"grad_norm": 0.3149563670158386,
"learning_rate": 0.0005760594232449752,
"loss": 3.7356,
"step": 6950
},
{
"epoch": 2.037854522159455,
"grad_norm": 0.2993158996105194,
"learning_rate": 0.0005758846489950481,
"loss": 3.7519,
"step": 7000
},
{
"epoch": 2.037854522159455,
"eval_accuracy": 0.340481644404722,
"eval_loss": 3.821302890777588,
"eval_runtime": 54.3871,
"eval_samples_per_second": 306.01,
"eval_steps_per_second": 19.141,
"step": 7000
},
{
"epoch": 2.0524139537592454,
"grad_norm": 0.31537163257598877,
"learning_rate": 0.0005757098747451208,
"loss": 3.7487,
"step": 7050
},
{
"epoch": 2.0669733853590357,
"grad_norm": 0.30981120467185974,
"learning_rate": 0.0005755351004951936,
"loss": 3.7554,
"step": 7100
},
{
"epoch": 2.081532816958826,
"grad_norm": 0.3070833086967468,
"learning_rate": 0.0005753603262452665,
"loss": 3.7464,
"step": 7150
},
{
"epoch": 2.0960922485586164,
"grad_norm": 0.3225395977497101,
"learning_rate": 0.0005751855519953393,
"loss": 3.7526,
"step": 7200
},
{
"epoch": 2.1106516801584068,
"grad_norm": 0.3187791407108307,
"learning_rate": 0.0005750107777454121,
"loss": 3.7594,
"step": 7250
},
{
"epoch": 2.125211111758197,
"grad_norm": 0.32253599166870117,
"learning_rate": 0.0005748360034954849,
"loss": 3.759,
"step": 7300
},
{
"epoch": 2.1397705433579874,
"grad_norm": 0.3304608464241028,
"learning_rate": 0.0005746612292455577,
"loss": 3.755,
"step": 7350
},
{
"epoch": 2.154329974957778,
"grad_norm": 0.31175366044044495,
"learning_rate": 0.0005744864549956306,
"loss": 3.7636,
"step": 7400
},
{
"epoch": 2.168889406557568,
"grad_norm": 0.3209310472011566,
"learning_rate": 0.0005743116807457034,
"loss": 3.7474,
"step": 7450
},
{
"epoch": 2.1834488381573585,
"grad_norm": 0.32411980628967285,
"learning_rate": 0.0005741369064957762,
"loss": 3.7611,
"step": 7500
},
{
"epoch": 2.198008269757149,
"grad_norm": 0.31499746441841125,
"learning_rate": 0.0005739621322458491,
"loss": 3.7567,
"step": 7550
},
{
"epoch": 2.212567701356939,
"grad_norm": 0.3134678602218628,
"learning_rate": 0.0005737873579959218,
"loss": 3.7556,
"step": 7600
},
{
"epoch": 2.2271271329567295,
"grad_norm": 0.31595176458358765,
"learning_rate": 0.0005736125837459947,
"loss": 3.7522,
"step": 7650
},
{
"epoch": 2.24168656455652,
"grad_norm": 0.33326753973960876,
"learning_rate": 0.0005734378094960675,
"loss": 3.7659,
"step": 7700
},
{
"epoch": 2.25624599615631,
"grad_norm": 0.3168000876903534,
"learning_rate": 0.0005732630352461403,
"loss": 3.7552,
"step": 7750
},
{
"epoch": 2.2708054277561005,
"grad_norm": 0.3116115629673004,
"learning_rate": 0.0005730882609962132,
"loss": 3.7474,
"step": 7800
},
{
"epoch": 2.285364859355891,
"grad_norm": 0.30287280678749084,
"learning_rate": 0.0005729134867462859,
"loss": 3.7573,
"step": 7850
},
{
"epoch": 2.299924290955681,
"grad_norm": 0.3045010566711426,
"learning_rate": 0.0005727387124963588,
"loss": 3.7762,
"step": 7900
},
{
"epoch": 2.3144837225554715,
"grad_norm": 0.3148520290851593,
"learning_rate": 0.0005725639382464317,
"loss": 3.748,
"step": 7950
},
{
"epoch": 2.329043154155262,
"grad_norm": 0.31695887446403503,
"learning_rate": 0.0005723891639965045,
"loss": 3.756,
"step": 8000
},
{
"epoch": 2.329043154155262,
"eval_accuracy": 0.3437476519985189,
"eval_loss": 3.7891645431518555,
"eval_runtime": 53.9955,
"eval_samples_per_second": 308.229,
"eval_steps_per_second": 19.279,
"step": 8000
},
{
"epoch": 2.343602585755052,
"grad_norm": 0.3124977648258209,
"learning_rate": 0.0005722143897465773,
"loss": 3.7641,
"step": 8050
},
{
"epoch": 2.3581620173548425,
"grad_norm": 0.3077414631843567,
"learning_rate": 0.0005720396154966502,
"loss": 3.7611,
"step": 8100
},
{
"epoch": 2.372721448954633,
"grad_norm": 0.31669649481773376,
"learning_rate": 0.0005718648412467229,
"loss": 3.7623,
"step": 8150
},
{
"epoch": 2.3872808805544232,
"grad_norm": 0.3225698173046112,
"learning_rate": 0.0005716900669967958,
"loss": 3.7578,
"step": 8200
},
{
"epoch": 2.4018403121542136,
"grad_norm": 0.31290560960769653,
"learning_rate": 0.0005715152927468686,
"loss": 3.7603,
"step": 8250
},
{
"epoch": 2.416399743754004,
"grad_norm": 0.3104917109012604,
"learning_rate": 0.0005713405184969414,
"loss": 3.7553,
"step": 8300
},
{
"epoch": 2.4309591753537942,
"grad_norm": 0.3150444030761719,
"learning_rate": 0.0005711657442470143,
"loss": 3.7486,
"step": 8350
},
{
"epoch": 2.4455186069535846,
"grad_norm": 0.31220582127571106,
"learning_rate": 0.000570990969997087,
"loss": 3.7551,
"step": 8400
},
{
"epoch": 2.460078038553375,
"grad_norm": 0.29195117950439453,
"learning_rate": 0.0005708161957471599,
"loss": 3.7531,
"step": 8450
},
{
"epoch": 2.4746374701531653,
"grad_norm": 0.31497254967689514,
"learning_rate": 0.0005706414214972327,
"loss": 3.7579,
"step": 8500
},
{
"epoch": 2.4891969017529556,
"grad_norm": 0.31675657629966736,
"learning_rate": 0.0005704666472473055,
"loss": 3.7515,
"step": 8550
},
{
"epoch": 2.503756333352746,
"grad_norm": 0.30652034282684326,
"learning_rate": 0.0005702918729973784,
"loss": 3.7504,
"step": 8600
},
{
"epoch": 2.5183157649525363,
"grad_norm": 0.30155831575393677,
"learning_rate": 0.0005701170987474512,
"loss": 3.755,
"step": 8650
},
{
"epoch": 2.5328751965523266,
"grad_norm": 0.2939681112766266,
"learning_rate": 0.0005699423244975239,
"loss": 3.7377,
"step": 8700
},
{
"epoch": 2.547434628152117,
"grad_norm": 0.31261202692985535,
"learning_rate": 0.0005697675502475968,
"loss": 3.7476,
"step": 8750
},
{
"epoch": 2.5619940597519073,
"grad_norm": 0.31194260716438293,
"learning_rate": 0.0005695927759976696,
"loss": 3.7342,
"step": 8800
},
{
"epoch": 2.5765534913516976,
"grad_norm": 0.3196016550064087,
"learning_rate": 0.0005694180017477425,
"loss": 3.7295,
"step": 8850
},
{
"epoch": 2.591112922951488,
"grad_norm": 0.30109933018684387,
"learning_rate": 0.0005692432274978153,
"loss": 3.7498,
"step": 8900
},
{
"epoch": 2.6056723545512783,
"grad_norm": 0.33264702558517456,
"learning_rate": 0.000569068453247888,
"loss": 3.7422,
"step": 8950
},
{
"epoch": 2.6202317861510687,
"grad_norm": 0.30177509784698486,
"learning_rate": 0.0005688936789979609,
"loss": 3.7507,
"step": 9000
},
{
"epoch": 2.6202317861510687,
"eval_accuracy": 0.34679883399201816,
"eval_loss": 3.7572646141052246,
"eval_runtime": 54.2719,
"eval_samples_per_second": 306.66,
"eval_steps_per_second": 19.181,
"step": 9000
},
{
"epoch": 2.634791217750859,
"grad_norm": 0.31919026374816895,
"learning_rate": 0.0005687189047480337,
"loss": 3.7458,
"step": 9050
},
{
"epoch": 2.6493506493506493,
"grad_norm": 0.3107492923736572,
"learning_rate": 0.0005685441304981066,
"loss": 3.742,
"step": 9100
},
{
"epoch": 2.6639100809504397,
"grad_norm": 0.3181777596473694,
"learning_rate": 0.0005683693562481794,
"loss": 3.7316,
"step": 9150
},
{
"epoch": 2.67846951255023,
"grad_norm": 0.30232176184654236,
"learning_rate": 0.0005681945819982522,
"loss": 3.7382,
"step": 9200
},
{
"epoch": 2.6930289441500204,
"grad_norm": 0.3122684359550476,
"learning_rate": 0.000568019807748325,
"loss": 3.7305,
"step": 9250
},
{
"epoch": 2.7075883757498107,
"grad_norm": 0.31523433327674866,
"learning_rate": 0.0005678450334983978,
"loss": 3.7453,
"step": 9300
},
{
"epoch": 2.722147807349601,
"grad_norm": 0.2995758652687073,
"learning_rate": 0.0005676702592484707,
"loss": 3.7388,
"step": 9350
},
{
"epoch": 2.7367072389493914,
"grad_norm": 0.29348045587539673,
"learning_rate": 0.0005674954849985435,
"loss": 3.7336,
"step": 9400
},
{
"epoch": 2.7512666705491817,
"grad_norm": 0.3128356337547302,
"learning_rate": 0.0005673207107486163,
"loss": 3.7327,
"step": 9450
},
{
"epoch": 2.765826102148972,
"grad_norm": 0.3079585134983063,
"learning_rate": 0.0005671459364986892,
"loss": 3.7333,
"step": 9500
},
{
"epoch": 2.7803855337487624,
"grad_norm": 0.3393835127353668,
"learning_rate": 0.0005669711622487619,
"loss": 3.7363,
"step": 9550
},
{
"epoch": 2.7949449653485527,
"grad_norm": 0.31067395210266113,
"learning_rate": 0.0005667963879988348,
"loss": 3.7345,
"step": 9600
},
{
"epoch": 2.809504396948343,
"grad_norm": 0.310585081577301,
"learning_rate": 0.0005666216137489076,
"loss": 3.7407,
"step": 9650
},
{
"epoch": 2.8240638285481334,
"grad_norm": 0.30752629041671753,
"learning_rate": 0.0005664468394989804,
"loss": 3.7323,
"step": 9700
},
{
"epoch": 2.8386232601479238,
"grad_norm": 0.3025956451892853,
"learning_rate": 0.0005662720652490533,
"loss": 3.74,
"step": 9750
},
{
"epoch": 2.853182691747714,
"grad_norm": 0.3014912009239197,
"learning_rate": 0.000566097290999126,
"loss": 3.7323,
"step": 9800
},
{
"epoch": 2.8677421233475044,
"grad_norm": 0.3069649934768677,
"learning_rate": 0.0005659225167491988,
"loss": 3.7426,
"step": 9850
},
{
"epoch": 2.882301554947295,
"grad_norm": 0.3200516104698181,
"learning_rate": 0.0005657477424992717,
"loss": 3.7251,
"step": 9900
},
{
"epoch": 2.896860986547085,
"grad_norm": 0.29957816004753113,
"learning_rate": 0.0005655729682493445,
"loss": 3.739,
"step": 9950
},
{
"epoch": 2.9114204181468755,
"grad_norm": 0.2948393225669861,
"learning_rate": 0.0005653981939994174,
"loss": 3.731,
"step": 10000
},
{
"epoch": 2.9114204181468755,
"eval_accuracy": 0.3490493878998981,
"eval_loss": 3.7306628227233887,
"eval_runtime": 54.2596,
"eval_samples_per_second": 306.729,
"eval_steps_per_second": 19.186,
"step": 10000
},
{
"epoch": 2.925979849746666,
"grad_norm": 0.29376843571662903,
"learning_rate": 0.0005652234197494902,
"loss": 3.7304,
"step": 10050
},
{
"epoch": 2.940539281346456,
"grad_norm": 0.30596283078193665,
"learning_rate": 0.0005650486454995629,
"loss": 3.7211,
"step": 10100
},
{
"epoch": 2.9550987129462465,
"grad_norm": 0.29923945665359497,
"learning_rate": 0.0005648738712496358,
"loss": 3.7182,
"step": 10150
},
{
"epoch": 2.969658144546037,
"grad_norm": 0.2963240444660187,
"learning_rate": 0.0005646990969997086,
"loss": 3.7241,
"step": 10200
},
{
"epoch": 2.984217576145827,
"grad_norm": 0.31028681993484497,
"learning_rate": 0.0005645243227497815,
"loss": 3.7338,
"step": 10250
},
{
"epoch": 2.9987770077456175,
"grad_norm": 0.33405447006225586,
"learning_rate": 0.0005643495484998543,
"loss": 3.7299,
"step": 10300
},
{
"epoch": 3.0131034884398114,
"grad_norm": 0.2879531979560852,
"learning_rate": 0.000564174774249927,
"loss": 3.6315,
"step": 10350
},
{
"epoch": 3.0276629200396017,
"grad_norm": 0.28263619542121887,
"learning_rate": 0.0005639999999999999,
"loss": 3.6257,
"step": 10400
},
{
"epoch": 3.042222351639392,
"grad_norm": 0.31342795491218567,
"learning_rate": 0.0005638252257500727,
"loss": 3.6218,
"step": 10450
},
{
"epoch": 3.0567817832391824,
"grad_norm": 0.2994714081287384,
"learning_rate": 0.0005636504515001456,
"loss": 3.6184,
"step": 10500
},
{
"epoch": 3.0713412148389727,
"grad_norm": 0.3202952444553375,
"learning_rate": 0.0005634756772502185,
"loss": 3.613,
"step": 10550
},
{
"epoch": 3.085900646438763,
"grad_norm": 0.3065634071826935,
"learning_rate": 0.0005633009030002913,
"loss": 3.6314,
"step": 10600
},
{
"epoch": 3.1004600780385534,
"grad_norm": 0.29666420817375183,
"learning_rate": 0.000563126128750364,
"loss": 3.629,
"step": 10650
},
{
"epoch": 3.1150195096383437,
"grad_norm": 0.3102670907974243,
"learning_rate": 0.0005629513545004369,
"loss": 3.6263,
"step": 10700
},
{
"epoch": 3.129578941238134,
"grad_norm": 0.2930033206939697,
"learning_rate": 0.0005627765802505097,
"loss": 3.6311,
"step": 10750
},
{
"epoch": 3.1441383728379244,
"grad_norm": 0.3130742609500885,
"learning_rate": 0.0005626018060005826,
"loss": 3.6318,
"step": 10800
},
{
"epoch": 3.1586978044377148,
"grad_norm": 0.311313271522522,
"learning_rate": 0.0005624270317506554,
"loss": 3.6366,
"step": 10850
},
{
"epoch": 3.173257236037505,
"grad_norm": 0.32204383611679077,
"learning_rate": 0.0005622522575007282,
"loss": 3.6384,
"step": 10900
},
{
"epoch": 3.1878166676372954,
"grad_norm": 0.3110329508781433,
"learning_rate": 0.000562077483250801,
"loss": 3.6283,
"step": 10950
},
{
"epoch": 3.2023760992370858,
"grad_norm": 0.3356837034225464,
"learning_rate": 0.0005619027090008738,
"loss": 3.6279,
"step": 11000
},
{
"epoch": 3.2023760992370858,
"eval_accuracy": 0.3507460045319148,
"eval_loss": 3.720451593399048,
"eval_runtime": 54.2329,
"eval_samples_per_second": 306.88,
"eval_steps_per_second": 19.195,
"step": 11000
},
{
"epoch": 3.216935530836876,
"grad_norm": 0.3050292134284973,
"learning_rate": 0.0005617279347509467,
"loss": 3.6233,
"step": 11050
},
{
"epoch": 3.2314949624366665,
"grad_norm": 0.30432161688804626,
"learning_rate": 0.0005615531605010195,
"loss": 3.6161,
"step": 11100
},
{
"epoch": 3.246054394036457,
"grad_norm": 0.30849170684814453,
"learning_rate": 0.0005613783862510923,
"loss": 3.6405,
"step": 11150
},
{
"epoch": 3.260613825636247,
"grad_norm": 0.2979888617992401,
"learning_rate": 0.0005612036120011652,
"loss": 3.6458,
"step": 11200
},
{
"epoch": 3.2751732572360375,
"grad_norm": 0.3145684599876404,
"learning_rate": 0.0005610288377512379,
"loss": 3.653,
"step": 11250
},
{
"epoch": 3.289732688835828,
"grad_norm": 0.3270987868309021,
"learning_rate": 0.0005608540635013107,
"loss": 3.6535,
"step": 11300
},
{
"epoch": 3.304292120435618,
"grad_norm": 0.3080297112464905,
"learning_rate": 0.0005606792892513836,
"loss": 3.6421,
"step": 11350
},
{
"epoch": 3.3188515520354085,
"grad_norm": 0.3096049427986145,
"learning_rate": 0.0005605045150014564,
"loss": 3.6457,
"step": 11400
},
{
"epoch": 3.333410983635199,
"grad_norm": 0.3183659315109253,
"learning_rate": 0.0005603297407515293,
"loss": 3.6523,
"step": 11450
},
{
"epoch": 3.347970415234989,
"grad_norm": 0.29702067375183105,
"learning_rate": 0.000560154966501602,
"loss": 3.6438,
"step": 11500
},
{
"epoch": 3.3625298468347795,
"grad_norm": 0.3221289813518524,
"learning_rate": 0.0005599801922516748,
"loss": 3.6431,
"step": 11550
},
{
"epoch": 3.37708927843457,
"grad_norm": 0.30078771710395813,
"learning_rate": 0.0005598054180017477,
"loss": 3.643,
"step": 11600
},
{
"epoch": 3.39164871003436,
"grad_norm": 0.2992677390575409,
"learning_rate": 0.0005596306437518205,
"loss": 3.6412,
"step": 11650
},
{
"epoch": 3.4062081416341505,
"grad_norm": 0.2946924567222595,
"learning_rate": 0.0005594558695018934,
"loss": 3.6514,
"step": 11700
},
{
"epoch": 3.420767573233941,
"grad_norm": 0.3194354772567749,
"learning_rate": 0.0005592810952519662,
"loss": 3.6428,
"step": 11750
},
{
"epoch": 3.435327004833731,
"grad_norm": 0.30356186628341675,
"learning_rate": 0.0005591063210020389,
"loss": 3.644,
"step": 11800
},
{
"epoch": 3.4498864364335216,
"grad_norm": 0.3009282350540161,
"learning_rate": 0.0005589315467521118,
"loss": 3.6435,
"step": 11850
},
{
"epoch": 3.464445868033312,
"grad_norm": 0.3127005994319916,
"learning_rate": 0.0005587567725021846,
"loss": 3.6493,
"step": 11900
},
{
"epoch": 3.4790052996331022,
"grad_norm": 0.3001411557197571,
"learning_rate": 0.0005585819982522575,
"loss": 3.643,
"step": 11950
},
{
"epoch": 3.4935647312328926,
"grad_norm": 0.30470672249794006,
"learning_rate": 0.0005584072240023303,
"loss": 3.6457,
"step": 12000
},
{
"epoch": 3.4935647312328926,
"eval_accuracy": 0.35282688501821313,
"eval_loss": 3.7005693912506104,
"eval_runtime": 54.1621,
"eval_samples_per_second": 307.281,
"eval_steps_per_second": 19.22,
"step": 12000
},
{
"epoch": 3.508124162832683,
"grad_norm": 0.3177192211151123,
"learning_rate": 0.000558232449752403,
"loss": 3.649,
"step": 12050
},
{
"epoch": 3.5226835944324733,
"grad_norm": 0.28869521617889404,
"learning_rate": 0.0005580576755024759,
"loss": 3.6462,
"step": 12100
},
{
"epoch": 3.5372430260322636,
"grad_norm": 0.2869119346141815,
"learning_rate": 0.0005578829012525487,
"loss": 3.6557,
"step": 12150
},
{
"epoch": 3.551802457632054,
"grad_norm": 0.2948986887931824,
"learning_rate": 0.0005577081270026216,
"loss": 3.6402,
"step": 12200
},
{
"epoch": 3.5663618892318443,
"grad_norm": 0.3073137402534485,
"learning_rate": 0.0005575333527526944,
"loss": 3.6419,
"step": 12250
},
{
"epoch": 3.5809213208316346,
"grad_norm": 0.2939525246620178,
"learning_rate": 0.0005573585785027672,
"loss": 3.6478,
"step": 12300
},
{
"epoch": 3.595480752431425,
"grad_norm": 0.30777987837791443,
"learning_rate": 0.00055718380425284,
"loss": 3.6392,
"step": 12350
},
{
"epoch": 3.6100401840312153,
"grad_norm": 0.2951711118221283,
"learning_rate": 0.0005570090300029128,
"loss": 3.6404,
"step": 12400
},
{
"epoch": 3.6245996156310056,
"grad_norm": 0.3050254285335541,
"learning_rate": 0.0005568342557529856,
"loss": 3.6431,
"step": 12450
},
{
"epoch": 3.639159047230796,
"grad_norm": 0.3149774670600891,
"learning_rate": 0.0005566594815030585,
"loss": 3.6474,
"step": 12500
},
{
"epoch": 3.6537184788305863,
"grad_norm": 0.29318898916244507,
"learning_rate": 0.0005564847072531313,
"loss": 3.6583,
"step": 12550
},
{
"epoch": 3.6682779104303767,
"grad_norm": 0.3082406222820282,
"learning_rate": 0.0005563099330032042,
"loss": 3.648,
"step": 12600
},
{
"epoch": 3.682837342030167,
"grad_norm": 0.30790096521377563,
"learning_rate": 0.0005561351587532769,
"loss": 3.6514,
"step": 12650
},
{
"epoch": 3.6973967736299573,
"grad_norm": 0.2913956046104431,
"learning_rate": 0.0005559603845033497,
"loss": 3.6446,
"step": 12700
},
{
"epoch": 3.7119562052297477,
"grad_norm": 0.2990446090698242,
"learning_rate": 0.0005557856102534226,
"loss": 3.648,
"step": 12750
},
{
"epoch": 3.726515636829538,
"grad_norm": 0.2935076057910919,
"learning_rate": 0.0005556108360034954,
"loss": 3.6476,
"step": 12800
},
{
"epoch": 3.7410750684293284,
"grad_norm": 0.30372142791748047,
"learning_rate": 0.0005554360617535683,
"loss": 3.6439,
"step": 12850
},
{
"epoch": 3.755634500029119,
"grad_norm": 0.2952456772327423,
"learning_rate": 0.000555261287503641,
"loss": 3.6436,
"step": 12900
},
{
"epoch": 3.770193931628909,
"grad_norm": 0.30808019638061523,
"learning_rate": 0.0005550865132537138,
"loss": 3.6507,
"step": 12950
},
{
"epoch": 3.7847533632287,
"grad_norm": 0.3047133684158325,
"learning_rate": 0.0005549117390037867,
"loss": 3.6444,
"step": 13000
},
{
"epoch": 3.7847533632287,
"eval_accuracy": 0.3544879913430104,
"eval_loss": 3.6809916496276855,
"eval_runtime": 54.2962,
"eval_samples_per_second": 306.522,
"eval_steps_per_second": 19.173,
"step": 13000
},
{
"epoch": 3.7993127948284897,
"grad_norm": 0.29825451970100403,
"learning_rate": 0.0005547369647538596,
"loss": 3.6342,
"step": 13050
},
{
"epoch": 3.8138722264282805,
"grad_norm": 0.303677499294281,
"learning_rate": 0.0005545621905039324,
"loss": 3.6411,
"step": 13100
},
{
"epoch": 3.8284316580280704,
"grad_norm": 0.29525500535964966,
"learning_rate": 0.0005543874162540053,
"loss": 3.6517,
"step": 13150
},
{
"epoch": 3.842991089627861,
"grad_norm": 0.28726667165756226,
"learning_rate": 0.000554212642004078,
"loss": 3.6351,
"step": 13200
},
{
"epoch": 3.857550521227651,
"grad_norm": 0.29034554958343506,
"learning_rate": 0.0005540378677541508,
"loss": 3.6442,
"step": 13250
},
{
"epoch": 3.872109952827442,
"grad_norm": 0.30698099732398987,
"learning_rate": 0.0005538630935042237,
"loss": 3.6443,
"step": 13300
},
{
"epoch": 3.8866693844272318,
"grad_norm": 0.31629928946495056,
"learning_rate": 0.0005536883192542965,
"loss": 3.6473,
"step": 13350
},
{
"epoch": 3.9012288160270225,
"grad_norm": 0.2991327941417694,
"learning_rate": 0.0005535135450043694,
"loss": 3.6418,
"step": 13400
},
{
"epoch": 3.9157882476268124,
"grad_norm": 0.31006455421447754,
"learning_rate": 0.0005533387707544422,
"loss": 3.6389,
"step": 13450
},
{
"epoch": 3.930347679226603,
"grad_norm": 0.3237093985080719,
"learning_rate": 0.0005531639965045149,
"loss": 3.6389,
"step": 13500
},
{
"epoch": 3.944907110826393,
"grad_norm": 0.2965652346611023,
"learning_rate": 0.0005529892222545878,
"loss": 3.638,
"step": 13550
},
{
"epoch": 3.959466542426184,
"grad_norm": 0.311987042427063,
"learning_rate": 0.0005528144480046606,
"loss": 3.643,
"step": 13600
},
{
"epoch": 3.974025974025974,
"grad_norm": 0.3185499310493469,
"learning_rate": 0.0005526396737547335,
"loss": 3.6513,
"step": 13650
},
{
"epoch": 3.9885854056257646,
"grad_norm": 0.30643561482429504,
"learning_rate": 0.0005524648995048063,
"loss": 3.6323,
"step": 13700
},
{
"epoch": 4.002911886319958,
"grad_norm": 0.3289678692817688,
"learning_rate": 0.000552290125254879,
"loss": 3.622,
"step": 13750
},
{
"epoch": 4.017471317919749,
"grad_norm": 0.2990996539592743,
"learning_rate": 0.0005521153510049519,
"loss": 3.5201,
"step": 13800
},
{
"epoch": 4.032030749519539,
"grad_norm": 0.3066459596157074,
"learning_rate": 0.0005519405767550247,
"loss": 3.5315,
"step": 13850
},
{
"epoch": 4.046590181119329,
"grad_norm": 0.30182674527168274,
"learning_rate": 0.0005517658025050975,
"loss": 3.539,
"step": 13900
},
{
"epoch": 4.061149612719119,
"grad_norm": 0.32139694690704346,
"learning_rate": 0.0005515910282551704,
"loss": 3.5314,
"step": 13950
},
{
"epoch": 4.07570904431891,
"grad_norm": 0.3230557441711426,
"learning_rate": 0.0005514162540052432,
"loss": 3.5475,
"step": 14000
},
{
"epoch": 4.07570904431891,
"eval_accuracy": 0.35576130629956376,
"eval_loss": 3.6757664680480957,
"eval_runtime": 54.2593,
"eval_samples_per_second": 306.731,
"eval_steps_per_second": 19.186,
"step": 14000
},
{
"epoch": 4.0902684759187,
"grad_norm": 0.33777207136154175,
"learning_rate": 0.000551241479755316,
"loss": 3.5486,
"step": 14050
},
{
"epoch": 4.104827907518491,
"grad_norm": 0.31867823004722595,
"learning_rate": 0.0005510667055053888,
"loss": 3.5385,
"step": 14100
},
{
"epoch": 4.119387339118281,
"grad_norm": 0.3000119924545288,
"learning_rate": 0.0005508919312554616,
"loss": 3.5491,
"step": 14150
},
{
"epoch": 4.1339467707180715,
"grad_norm": 0.3066224753856659,
"learning_rate": 0.0005507171570055345,
"loss": 3.5543,
"step": 14200
},
{
"epoch": 4.148506202317861,
"grad_norm": 0.31685900688171387,
"learning_rate": 0.0005505423827556073,
"loss": 3.5482,
"step": 14250
},
{
"epoch": 4.163065633917652,
"grad_norm": 0.34130823612213135,
"learning_rate": 0.0005503676085056802,
"loss": 3.5526,
"step": 14300
},
{
"epoch": 4.177625065517442,
"grad_norm": 0.3072553277015686,
"learning_rate": 0.0005501928342557529,
"loss": 3.5638,
"step": 14350
},
{
"epoch": 4.192184497117233,
"grad_norm": 0.3119163513183594,
"learning_rate": 0.0005500180600058257,
"loss": 3.5611,
"step": 14400
},
{
"epoch": 4.206743928717023,
"grad_norm": 0.3274585008621216,
"learning_rate": 0.0005498432857558986,
"loss": 3.5548,
"step": 14450
},
{
"epoch": 4.2213033603168135,
"grad_norm": 0.29814136028289795,
"learning_rate": 0.0005496685115059714,
"loss": 3.5676,
"step": 14500
},
{
"epoch": 4.235862791916603,
"grad_norm": 0.3296707570552826,
"learning_rate": 0.0005494937372560443,
"loss": 3.5638,
"step": 14550
},
{
"epoch": 4.250422223516394,
"grad_norm": 0.30175575613975525,
"learning_rate": 0.000549318963006117,
"loss": 3.5672,
"step": 14600
},
{
"epoch": 4.264981655116184,
"grad_norm": 0.3530194163322449,
"learning_rate": 0.0005491441887561898,
"loss": 3.5689,
"step": 14650
},
{
"epoch": 4.279541086715975,
"grad_norm": 0.31514260172843933,
"learning_rate": 0.0005489694145062627,
"loss": 3.5576,
"step": 14700
},
{
"epoch": 4.294100518315765,
"grad_norm": 0.31697317957878113,
"learning_rate": 0.0005487946402563355,
"loss": 3.5512,
"step": 14750
},
{
"epoch": 4.308659949915556,
"grad_norm": 0.3378666937351227,
"learning_rate": 0.0005486198660064084,
"loss": 3.5777,
"step": 14800
},
{
"epoch": 4.3232193815153455,
"grad_norm": 0.32078737020492554,
"learning_rate": 0.0005484450917564812,
"loss": 3.5714,
"step": 14850
},
{
"epoch": 4.337778813115136,
"grad_norm": 0.3216348886489868,
"learning_rate": 0.0005482703175065539,
"loss": 3.5704,
"step": 14900
},
{
"epoch": 4.352338244714926,
"grad_norm": 0.3143816590309143,
"learning_rate": 0.0005480955432566268,
"loss": 3.5698,
"step": 14950
},
{
"epoch": 4.366897676314717,
"grad_norm": 0.29025039076805115,
"learning_rate": 0.0005479207690066996,
"loss": 3.5642,
"step": 15000
},
{
"epoch": 4.366897676314717,
"eval_accuracy": 0.3568779996361957,
"eval_loss": 3.6625804901123047,
"eval_runtime": 54.3076,
"eval_samples_per_second": 306.458,
"eval_steps_per_second": 19.169,
"step": 15000
},
{
"epoch": 4.381457107914507,
"grad_norm": 0.30588728189468384,
"learning_rate": 0.0005477459947567725,
"loss": 3.5717,
"step": 15050
},
{
"epoch": 4.396016539514298,
"grad_norm": 0.31308284401893616,
"learning_rate": 0.0005475712205068453,
"loss": 3.581,
"step": 15100
},
{
"epoch": 4.4105759711140875,
"grad_norm": 0.3067929446697235,
"learning_rate": 0.000547396446256918,
"loss": 3.5683,
"step": 15150
},
{
"epoch": 4.425135402713878,
"grad_norm": 0.3118480443954468,
"learning_rate": 0.0005472216720069909,
"loss": 3.5769,
"step": 15200
},
{
"epoch": 4.439694834313668,
"grad_norm": 0.3015614151954651,
"learning_rate": 0.0005470468977570637,
"loss": 3.5702,
"step": 15250
},
{
"epoch": 4.454254265913459,
"grad_norm": 0.29570505023002625,
"learning_rate": 0.0005468721235071365,
"loss": 3.5623,
"step": 15300
},
{
"epoch": 4.468813697513249,
"grad_norm": 0.3057698905467987,
"learning_rate": 0.0005466973492572094,
"loss": 3.5715,
"step": 15350
},
{
"epoch": 4.48337312911304,
"grad_norm": 0.3069203197956085,
"learning_rate": 0.0005465225750072822,
"loss": 3.5627,
"step": 15400
},
{
"epoch": 4.4979325607128295,
"grad_norm": 0.30680912733078003,
"learning_rate": 0.000546347800757355,
"loss": 3.569,
"step": 15450
},
{
"epoch": 4.51249199231262,
"grad_norm": 0.3536013066768646,
"learning_rate": 0.0005461730265074279,
"loss": 3.5698,
"step": 15500
},
{
"epoch": 4.52705142391241,
"grad_norm": 0.2927176356315613,
"learning_rate": 0.0005459982522575007,
"loss": 3.5731,
"step": 15550
},
{
"epoch": 4.541610855512201,
"grad_norm": 0.2923072576522827,
"learning_rate": 0.0005458234780075735,
"loss": 3.5775,
"step": 15600
},
{
"epoch": 4.556170287111991,
"grad_norm": 0.3102036118507385,
"learning_rate": 0.0005456487037576464,
"loss": 3.5778,
"step": 15650
},
{
"epoch": 4.570729718711782,
"grad_norm": 0.31466609239578247,
"learning_rate": 0.0005454739295077192,
"loss": 3.5794,
"step": 15700
},
{
"epoch": 4.585289150311572,
"grad_norm": 0.3035258650779724,
"learning_rate": 0.000545299155257792,
"loss": 3.5683,
"step": 15750
},
{
"epoch": 4.599848581911362,
"grad_norm": 0.3166468143463135,
"learning_rate": 0.0005451243810078648,
"loss": 3.5833,
"step": 15800
},
{
"epoch": 4.614408013511152,
"grad_norm": 0.28802427649497986,
"learning_rate": 0.0005449496067579376,
"loss": 3.5812,
"step": 15850
},
{
"epoch": 4.628967445110943,
"grad_norm": 0.32211169600486755,
"learning_rate": 0.0005447748325080105,
"loss": 3.582,
"step": 15900
},
{
"epoch": 4.643526876710733,
"grad_norm": 0.32095086574554443,
"learning_rate": 0.0005446000582580833,
"loss": 3.5819,
"step": 15950
},
{
"epoch": 4.658086308310524,
"grad_norm": 0.2993008494377136,
"learning_rate": 0.0005444252840081562,
"loss": 3.5793,
"step": 16000
},
{
"epoch": 4.658086308310524,
"eval_accuracy": 0.35770002797318573,
"eval_loss": 3.65142560005188,
"eval_runtime": 54.1813,
"eval_samples_per_second": 307.172,
"eval_steps_per_second": 19.213,
"step": 16000
},
{
"epoch": 4.672645739910314,
"grad_norm": 0.2886819839477539,
"learning_rate": 0.0005442505097582289,
"loss": 3.5858,
"step": 16050
},
{
"epoch": 4.687205171510104,
"grad_norm": 0.30564039945602417,
"learning_rate": 0.0005440757355083017,
"loss": 3.5699,
"step": 16100
},
{
"epoch": 4.701764603109894,
"grad_norm": 0.3058094084262848,
"learning_rate": 0.0005439009612583746,
"loss": 3.5668,
"step": 16150
},
{
"epoch": 4.716324034709685,
"grad_norm": 0.33634814620018005,
"learning_rate": 0.0005437261870084474,
"loss": 3.587,
"step": 16200
},
{
"epoch": 4.730883466309475,
"grad_norm": 0.29354017972946167,
"learning_rate": 0.0005435514127585203,
"loss": 3.5741,
"step": 16250
},
{
"epoch": 4.745442897909266,
"grad_norm": 0.3387173116207123,
"learning_rate": 0.000543376638508593,
"loss": 3.5868,
"step": 16300
},
{
"epoch": 4.760002329509056,
"grad_norm": 0.3297916650772095,
"learning_rate": 0.0005432018642586658,
"loss": 3.5881,
"step": 16350
},
{
"epoch": 4.7745617611088464,
"grad_norm": 0.31599003076553345,
"learning_rate": 0.0005430270900087387,
"loss": 3.5791,
"step": 16400
},
{
"epoch": 4.789121192708636,
"grad_norm": 0.30269932746887207,
"learning_rate": 0.0005428523157588115,
"loss": 3.5765,
"step": 16450
},
{
"epoch": 4.803680624308427,
"grad_norm": 0.3012683093547821,
"learning_rate": 0.0005426775415088843,
"loss": 3.5923,
"step": 16500
},
{
"epoch": 4.818240055908217,
"grad_norm": 0.32081395387649536,
"learning_rate": 0.0005425027672589572,
"loss": 3.5764,
"step": 16550
},
{
"epoch": 4.832799487508008,
"grad_norm": 0.3058837056159973,
"learning_rate": 0.0005423279930090299,
"loss": 3.5786,
"step": 16600
},
{
"epoch": 4.847358919107798,
"grad_norm": 0.2925353944301605,
"learning_rate": 0.0005421532187591028,
"loss": 3.5914,
"step": 16650
},
{
"epoch": 4.8619183507075885,
"grad_norm": 0.3263484537601471,
"learning_rate": 0.0005419784445091756,
"loss": 3.5866,
"step": 16700
},
{
"epoch": 4.876477782307378,
"grad_norm": 0.28891900181770325,
"learning_rate": 0.0005418036702592484,
"loss": 3.5627,
"step": 16750
},
{
"epoch": 4.891037213907169,
"grad_norm": 0.2890537977218628,
"learning_rate": 0.0005416288960093213,
"loss": 3.583,
"step": 16800
},
{
"epoch": 4.905596645506959,
"grad_norm": 0.2902555465698242,
"learning_rate": 0.000541454121759394,
"loss": 3.5861,
"step": 16850
},
{
"epoch": 4.92015607710675,
"grad_norm": 0.30477380752563477,
"learning_rate": 0.0005412793475094669,
"loss": 3.5722,
"step": 16900
},
{
"epoch": 4.93471550870654,
"grad_norm": 0.2957979142665863,
"learning_rate": 0.0005411045732595397,
"loss": 3.5871,
"step": 16950
},
{
"epoch": 4.9492749403063305,
"grad_norm": 0.3154853880405426,
"learning_rate": 0.0005409297990096125,
"loss": 3.5751,
"step": 17000
},
{
"epoch": 4.9492749403063305,
"eval_accuracy": 0.35962369892056895,
"eval_loss": 3.6343870162963867,
"eval_runtime": 54.3437,
"eval_samples_per_second": 306.255,
"eval_steps_per_second": 19.156,
"step": 17000
},
{
"epoch": 4.96383437190612,
"grad_norm": 0.3399759531021118,
"learning_rate": 0.0005407550247596854,
"loss": 3.578,
"step": 17050
},
{
"epoch": 4.978393803505911,
"grad_norm": 0.30684736371040344,
"learning_rate": 0.0005405802505097582,
"loss": 3.5717,
"step": 17100
},
{
"epoch": 4.992953235105701,
"grad_norm": 0.30768290162086487,
"learning_rate": 0.000540405476259831,
"loss": 3.5805,
"step": 17150
},
{
"epoch": 5.007279715799895,
"grad_norm": 0.30980637669563293,
"learning_rate": 0.0005402307020099038,
"loss": 3.5216,
"step": 17200
},
{
"epoch": 5.021839147399685,
"grad_norm": 0.3266715705394745,
"learning_rate": 0.0005400559277599766,
"loss": 3.4649,
"step": 17250
},
{
"epoch": 5.036398578999476,
"grad_norm": 0.3132665455341339,
"learning_rate": 0.0005398811535100495,
"loss": 3.461,
"step": 17300
},
{
"epoch": 5.050958010599266,
"grad_norm": 0.3107033371925354,
"learning_rate": 0.0005397063792601223,
"loss": 3.4781,
"step": 17350
},
{
"epoch": 5.065517442199057,
"grad_norm": 0.29676178097724915,
"learning_rate": 0.0005395316050101951,
"loss": 3.475,
"step": 17400
},
{
"epoch": 5.080076873798847,
"grad_norm": 0.32961541414260864,
"learning_rate": 0.0005393568307602679,
"loss": 3.4675,
"step": 17450
},
{
"epoch": 5.094636305398637,
"grad_norm": 0.31969591975212097,
"learning_rate": 0.0005391820565103407,
"loss": 3.4834,
"step": 17500
},
{
"epoch": 5.109195736998427,
"grad_norm": 0.30894723534584045,
"learning_rate": 0.0005390072822604136,
"loss": 3.4935,
"step": 17550
},
{
"epoch": 5.123755168598218,
"grad_norm": 0.3231548070907593,
"learning_rate": 0.0005388325080104864,
"loss": 3.481,
"step": 17600
},
{
"epoch": 5.138314600198008,
"grad_norm": 0.30943936109542847,
"learning_rate": 0.0005386577337605593,
"loss": 3.4973,
"step": 17650
},
{
"epoch": 5.152874031797799,
"grad_norm": 0.30249306559562683,
"learning_rate": 0.000538482959510632,
"loss": 3.4798,
"step": 17700
},
{
"epoch": 5.167433463397589,
"grad_norm": 0.306157648563385,
"learning_rate": 0.0005383081852607048,
"loss": 3.4981,
"step": 17750
},
{
"epoch": 5.1819928949973795,
"grad_norm": 0.334652304649353,
"learning_rate": 0.0005381334110107777,
"loss": 3.4941,
"step": 17800
},
{
"epoch": 5.196552326597169,
"grad_norm": 0.3305426836013794,
"learning_rate": 0.0005379586367608505,
"loss": 3.505,
"step": 17850
},
{
"epoch": 5.21111175819696,
"grad_norm": 0.32884451746940613,
"learning_rate": 0.0005377838625109233,
"loss": 3.5129,
"step": 17900
},
{
"epoch": 5.22567118979675,
"grad_norm": 0.2979142665863037,
"learning_rate": 0.0005376090882609961,
"loss": 3.5081,
"step": 17950
},
{
"epoch": 5.240230621396541,
"grad_norm": 0.2956278920173645,
"learning_rate": 0.0005374343140110689,
"loss": 3.5111,
"step": 18000
},
{
"epoch": 5.240230621396541,
"eval_accuracy": 0.35995657865480135,
"eval_loss": 3.6383495330810547,
"eval_runtime": 54.1651,
"eval_samples_per_second": 307.264,
"eval_steps_per_second": 19.219,
"step": 18000
},
{
"epoch": 5.254790052996331,
"grad_norm": 0.32184895873069763,
"learning_rate": 0.0005372595397611418,
"loss": 3.5039,
"step": 18050
},
{
"epoch": 5.2693494845961215,
"grad_norm": 0.3286205232143402,
"learning_rate": 0.0005370847655112147,
"loss": 3.5081,
"step": 18100
},
{
"epoch": 5.283908916195911,
"grad_norm": 0.3119205832481384,
"learning_rate": 0.0005369099912612875,
"loss": 3.5086,
"step": 18150
},
{
"epoch": 5.298468347795702,
"grad_norm": 0.3186841905117035,
"learning_rate": 0.0005367352170113603,
"loss": 3.5062,
"step": 18200
},
{
"epoch": 5.313027779395492,
"grad_norm": 0.3225612938404083,
"learning_rate": 0.0005365604427614331,
"loss": 3.507,
"step": 18250
},
{
"epoch": 5.327587210995283,
"grad_norm": 0.31384894251823425,
"learning_rate": 0.0005363856685115059,
"loss": 3.5197,
"step": 18300
},
{
"epoch": 5.342146642595073,
"grad_norm": 0.3117847740650177,
"learning_rate": 0.0005362108942615788,
"loss": 3.5111,
"step": 18350
},
{
"epoch": 5.3567060741948636,
"grad_norm": 0.32050588726997375,
"learning_rate": 0.0005360361200116516,
"loss": 3.5116,
"step": 18400
},
{
"epoch": 5.3712655057946534,
"grad_norm": 0.31556278467178345,
"learning_rate": 0.0005358613457617244,
"loss": 3.5275,
"step": 18450
},
{
"epoch": 5.385824937394444,
"grad_norm": 0.29756829142570496,
"learning_rate": 0.0005356865715117973,
"loss": 3.5141,
"step": 18500
},
{
"epoch": 5.400384368994234,
"grad_norm": 0.344346821308136,
"learning_rate": 0.00053551179726187,
"loss": 3.5257,
"step": 18550
},
{
"epoch": 5.414943800594025,
"grad_norm": 0.3157712519168854,
"learning_rate": 0.0005353370230119429,
"loss": 3.5109,
"step": 18600
},
{
"epoch": 5.429503232193815,
"grad_norm": 0.3057422339916229,
"learning_rate": 0.0005351622487620157,
"loss": 3.5194,
"step": 18650
},
{
"epoch": 5.444062663793606,
"grad_norm": 0.3119611144065857,
"learning_rate": 0.0005349874745120885,
"loss": 3.5164,
"step": 18700
},
{
"epoch": 5.4586220953933955,
"grad_norm": 0.3102344274520874,
"learning_rate": 0.0005348127002621614,
"loss": 3.5166,
"step": 18750
},
{
"epoch": 5.473181526993186,
"grad_norm": 0.30929329991340637,
"learning_rate": 0.0005346379260122341,
"loss": 3.5222,
"step": 18800
},
{
"epoch": 5.487740958592976,
"grad_norm": 0.3128523528575897,
"learning_rate": 0.000534463151762307,
"loss": 3.5241,
"step": 18850
},
{
"epoch": 5.502300390192767,
"grad_norm": 0.3076431155204773,
"learning_rate": 0.0005342883775123798,
"loss": 3.5398,
"step": 18900
},
{
"epoch": 5.516859821792557,
"grad_norm": 0.3266940116882324,
"learning_rate": 0.0005341136032624526,
"loss": 3.5278,
"step": 18950
},
{
"epoch": 5.531419253392348,
"grad_norm": 0.3071880638599396,
"learning_rate": 0.0005339388290125255,
"loss": 3.5112,
"step": 19000
},
{
"epoch": 5.531419253392348,
"eval_accuracy": 0.3604157433888803,
"eval_loss": 3.6261556148529053,
"eval_runtime": 54.4593,
"eval_samples_per_second": 305.604,
"eval_steps_per_second": 19.115,
"step": 19000
},
{
"epoch": 5.5459786849921375,
"grad_norm": 0.30749401450157166,
"learning_rate": 0.0005337640547625983,
"loss": 3.5273,
"step": 19050
},
{
"epoch": 5.560538116591928,
"grad_norm": 0.3211112916469574,
"learning_rate": 0.000533589280512671,
"loss": 3.5192,
"step": 19100
},
{
"epoch": 5.575097548191718,
"grad_norm": 0.31711339950561523,
"learning_rate": 0.0005334145062627439,
"loss": 3.5173,
"step": 19150
},
{
"epoch": 5.589656979791509,
"grad_norm": 0.321505606174469,
"learning_rate": 0.0005332397320128167,
"loss": 3.55,
"step": 19200
},
{
"epoch": 5.604216411391299,
"grad_norm": 0.31558939814567566,
"learning_rate": 0.0005330649577628896,
"loss": 3.5341,
"step": 19250
},
{
"epoch": 5.61877584299109,
"grad_norm": 0.31166985630989075,
"learning_rate": 0.0005328901835129624,
"loss": 3.5368,
"step": 19300
},
{
"epoch": 5.6333352745908805,
"grad_norm": 0.3260209560394287,
"learning_rate": 0.0005327154092630351,
"loss": 3.5323,
"step": 19350
},
{
"epoch": 5.64789470619067,
"grad_norm": 0.31407615542411804,
"learning_rate": 0.000532540635013108,
"loss": 3.5209,
"step": 19400
},
{
"epoch": 5.66245413779046,
"grad_norm": 0.32734420895576477,
"learning_rate": 0.0005323658607631808,
"loss": 3.5266,
"step": 19450
},
{
"epoch": 5.677013569390251,
"grad_norm": 0.3149868845939636,
"learning_rate": 0.0005321910865132537,
"loss": 3.5234,
"step": 19500
},
{
"epoch": 5.691573000990042,
"grad_norm": 0.3325765132904053,
"learning_rate": 0.0005320163122633265,
"loss": 3.5196,
"step": 19550
},
{
"epoch": 5.706132432589832,
"grad_norm": 0.2984762191772461,
"learning_rate": 0.0005318415380133993,
"loss": 3.5319,
"step": 19600
},
{
"epoch": 5.720691864189622,
"grad_norm": 0.3270318806171417,
"learning_rate": 0.0005316667637634721,
"loss": 3.5425,
"step": 19650
},
{
"epoch": 5.735251295789412,
"grad_norm": 0.32038673758506775,
"learning_rate": 0.0005314919895135449,
"loss": 3.533,
"step": 19700
},
{
"epoch": 5.749810727389203,
"grad_norm": 0.3111437261104584,
"learning_rate": 0.0005313172152636178,
"loss": 3.5343,
"step": 19750
},
{
"epoch": 5.764370158988993,
"grad_norm": 0.308755099773407,
"learning_rate": 0.0005311424410136906,
"loss": 3.5286,
"step": 19800
},
{
"epoch": 5.778929590588783,
"grad_norm": 0.3227141499519348,
"learning_rate": 0.0005309676667637634,
"loss": 3.527,
"step": 19850
},
{
"epoch": 5.793489022188574,
"grad_norm": 0.31226617097854614,
"learning_rate": 0.0005307928925138363,
"loss": 3.5389,
"step": 19900
},
{
"epoch": 5.8080484537883645,
"grad_norm": 0.2969604730606079,
"learning_rate": 0.000530618118263909,
"loss": 3.5176,
"step": 19950
},
{
"epoch": 5.822607885388154,
"grad_norm": 0.34588631987571716,
"learning_rate": 0.0005304433440139819,
"loss": 3.5203,
"step": 20000
},
{
"epoch": 5.822607885388154,
"eval_accuracy": 0.36184932506311607,
"eval_loss": 3.6176443099975586,
"eval_runtime": 54.4221,
"eval_samples_per_second": 305.814,
"eval_steps_per_second": 19.128,
"step": 20000
},
{
"epoch": 5.837167316987944,
"grad_norm": 0.31692346930503845,
"learning_rate": 0.0005302685697640547,
"loss": 3.5295,
"step": 20050
},
{
"epoch": 5.851726748587735,
"grad_norm": 0.3064531683921814,
"learning_rate": 0.0005300937955141275,
"loss": 3.5287,
"step": 20100
},
{
"epoch": 5.866286180187526,
"grad_norm": 0.31022435426712036,
"learning_rate": 0.0005299190212642004,
"loss": 3.5314,
"step": 20150
},
{
"epoch": 5.880845611787316,
"grad_norm": 0.3072813153266907,
"learning_rate": 0.0005297442470142731,
"loss": 3.5338,
"step": 20200
},
{
"epoch": 5.895405043387106,
"grad_norm": 0.2999200224876404,
"learning_rate": 0.000529569472764346,
"loss": 3.5218,
"step": 20250
},
{
"epoch": 5.9099644749868965,
"grad_norm": 0.31100237369537354,
"learning_rate": 0.0005293946985144188,
"loss": 3.5419,
"step": 20300
},
{
"epoch": 5.924523906586687,
"grad_norm": 0.2922367751598358,
"learning_rate": 0.0005292199242644916,
"loss": 3.5429,
"step": 20350
},
{
"epoch": 5.939083338186477,
"grad_norm": 0.3141598701477051,
"learning_rate": 0.0005290451500145645,
"loss": 3.5185,
"step": 20400
},
{
"epoch": 5.953642769786267,
"grad_norm": 0.33754071593284607,
"learning_rate": 0.0005288703757646373,
"loss": 3.5203,
"step": 20450
},
{
"epoch": 5.968202201386058,
"grad_norm": 0.3237624168395996,
"learning_rate": 0.00052869560151471,
"loss": 3.5362,
"step": 20500
},
{
"epoch": 5.982761632985849,
"grad_norm": 0.2930886447429657,
"learning_rate": 0.0005285208272647829,
"loss": 3.5243,
"step": 20550
},
{
"epoch": 5.9973210645856385,
"grad_norm": 0.3187922537326813,
"learning_rate": 0.0005283460530148558,
"loss": 3.5354,
"step": 20600
},
{
"epoch": 6.011647545279832,
"grad_norm": 0.3651193380355835,
"learning_rate": 0.0005281712787649286,
"loss": 3.4451,
"step": 20650
},
{
"epoch": 6.026206976879623,
"grad_norm": 0.3150573968887329,
"learning_rate": 0.0005279965045150015,
"loss": 3.4055,
"step": 20700
},
{
"epoch": 6.040766408479413,
"grad_norm": 0.32729947566986084,
"learning_rate": 0.0005278217302650743,
"loss": 3.423,
"step": 20750
},
{
"epoch": 6.055325840079203,
"grad_norm": 0.31897303462028503,
"learning_rate": 0.000527646956015147,
"loss": 3.4237,
"step": 20800
},
{
"epoch": 6.069885271678993,
"grad_norm": 0.3033381700515747,
"learning_rate": 0.0005274721817652199,
"loss": 3.4244,
"step": 20850
},
{
"epoch": 6.084444703278784,
"grad_norm": 0.33832624554634094,
"learning_rate": 0.0005272974075152927,
"loss": 3.4368,
"step": 20900
},
{
"epoch": 6.099004134878574,
"grad_norm": 0.3380010426044464,
"learning_rate": 0.0005271226332653656,
"loss": 3.4396,
"step": 20950
},
{
"epoch": 6.113563566478365,
"grad_norm": 0.32484617829322815,
"learning_rate": 0.0005269478590154384,
"loss": 3.4341,
"step": 21000
},
{
"epoch": 6.113563566478365,
"eval_accuracy": 0.36267782050903674,
"eval_loss": 3.6185672283172607,
"eval_runtime": 53.9496,
"eval_samples_per_second": 308.491,
"eval_steps_per_second": 19.296,
"step": 21000
},
{
"epoch": 6.128122998078155,
"grad_norm": 0.32085222005844116,
"learning_rate": 0.0005267730847655111,
"loss": 3.4444,
"step": 21050
},
{
"epoch": 6.142682429677945,
"grad_norm": 0.31938812136650085,
"learning_rate": 0.000526598310515584,
"loss": 3.4584,
"step": 21100
},
{
"epoch": 6.157241861277735,
"grad_norm": 0.3215405344963074,
"learning_rate": 0.0005264235362656568,
"loss": 3.4451,
"step": 21150
},
{
"epoch": 6.171801292877526,
"grad_norm": 0.30794188380241394,
"learning_rate": 0.0005262487620157297,
"loss": 3.4428,
"step": 21200
},
{
"epoch": 6.186360724477316,
"grad_norm": 0.3180452585220337,
"learning_rate": 0.0005260739877658025,
"loss": 3.4677,
"step": 21250
},
{
"epoch": 6.200920156077107,
"grad_norm": 0.31727978587150574,
"learning_rate": 0.0005258992135158753,
"loss": 3.4612,
"step": 21300
},
{
"epoch": 6.215479587676897,
"grad_norm": 0.3364965617656708,
"learning_rate": 0.0005257244392659481,
"loss": 3.4563,
"step": 21350
},
{
"epoch": 6.2300390192766875,
"grad_norm": 0.3123633861541748,
"learning_rate": 0.0005255496650160209,
"loss": 3.463,
"step": 21400
},
{
"epoch": 6.244598450876477,
"grad_norm": 0.3345796763896942,
"learning_rate": 0.0005253748907660938,
"loss": 3.4686,
"step": 21450
},
{
"epoch": 6.259157882476268,
"grad_norm": 0.32419443130493164,
"learning_rate": 0.0005252001165161666,
"loss": 3.4628,
"step": 21500
},
{
"epoch": 6.273717314076059,
"grad_norm": 0.34241312742233276,
"learning_rate": 0.0005250253422662394,
"loss": 3.4714,
"step": 21550
},
{
"epoch": 6.288276745675849,
"grad_norm": 0.3336371183395386,
"learning_rate": 0.0005248505680163123,
"loss": 3.4706,
"step": 21600
},
{
"epoch": 6.302836177275639,
"grad_norm": 0.3192159831523895,
"learning_rate": 0.000524675793766385,
"loss": 3.4659,
"step": 21650
},
{
"epoch": 6.3173956088754295,
"grad_norm": 0.3582025170326233,
"learning_rate": 0.0005245010195164579,
"loss": 3.4717,
"step": 21700
},
{
"epoch": 6.33195504047522,
"grad_norm": 0.31705984473228455,
"learning_rate": 0.0005243262452665307,
"loss": 3.4633,
"step": 21750
},
{
"epoch": 6.34651447207501,
"grad_norm": 0.32373446226119995,
"learning_rate": 0.0005241514710166035,
"loss": 3.4706,
"step": 21800
},
{
"epoch": 6.3610739036748,
"grad_norm": 0.3116013705730438,
"learning_rate": 0.0005239766967666764,
"loss": 3.4741,
"step": 21850
},
{
"epoch": 6.375633335274591,
"grad_norm": 0.3358467221260071,
"learning_rate": 0.0005238019225167491,
"loss": 3.4731,
"step": 21900
},
{
"epoch": 6.390192766874382,
"grad_norm": 0.30520445108413696,
"learning_rate": 0.0005236271482668219,
"loss": 3.4689,
"step": 21950
},
{
"epoch": 6.4047521984741715,
"grad_norm": 0.3098299503326416,
"learning_rate": 0.0005234523740168948,
"loss": 3.478,
"step": 22000
},
{
"epoch": 6.4047521984741715,
"eval_accuracy": 0.3628752437071209,
"eval_loss": 3.6096370220184326,
"eval_runtime": 54.1283,
"eval_samples_per_second": 307.473,
"eval_steps_per_second": 19.232,
"step": 22000
},
{
"epoch": 6.419311630073962,
"grad_norm": 0.299277126789093,
"learning_rate": 0.0005232775997669676,
"loss": 3.4772,
"step": 22050
},
{
"epoch": 6.433871061673752,
"grad_norm": 0.33022385835647583,
"learning_rate": 0.0005231028255170405,
"loss": 3.4784,
"step": 22100
},
{
"epoch": 6.448430493273543,
"grad_norm": 0.3448750376701355,
"learning_rate": 0.0005229280512671133,
"loss": 3.4862,
"step": 22150
},
{
"epoch": 6.462989924873333,
"grad_norm": 0.31978732347488403,
"learning_rate": 0.000522753277017186,
"loss": 3.4777,
"step": 22200
},
{
"epoch": 6.477549356473124,
"grad_norm": 0.3098117709159851,
"learning_rate": 0.0005225785027672589,
"loss": 3.4775,
"step": 22250
},
{
"epoch": 6.492108788072914,
"grad_norm": 0.3060181140899658,
"learning_rate": 0.0005224037285173317,
"loss": 3.4832,
"step": 22300
},
{
"epoch": 6.506668219672704,
"grad_norm": 0.3079688549041748,
"learning_rate": 0.0005222289542674046,
"loss": 3.4789,
"step": 22350
},
{
"epoch": 6.521227651272494,
"grad_norm": 0.3160729706287384,
"learning_rate": 0.0005220541800174774,
"loss": 3.4833,
"step": 22400
},
{
"epoch": 6.535787082872285,
"grad_norm": 0.32327190041542053,
"learning_rate": 0.0005218794057675501,
"loss": 3.4703,
"step": 22450
},
{
"epoch": 6.550346514472075,
"grad_norm": 0.3138171136379242,
"learning_rate": 0.000521704631517623,
"loss": 3.4716,
"step": 22500
},
{
"epoch": 6.564905946071866,
"grad_norm": 0.3231295049190521,
"learning_rate": 0.0005215298572676958,
"loss": 3.4791,
"step": 22550
},
{
"epoch": 6.579465377671656,
"grad_norm": 0.3182159662246704,
"learning_rate": 0.0005213550830177687,
"loss": 3.4915,
"step": 22600
},
{
"epoch": 6.594024809271446,
"grad_norm": 0.29927167296409607,
"learning_rate": 0.0005211803087678415,
"loss": 3.4777,
"step": 22650
},
{
"epoch": 6.608584240871236,
"grad_norm": 0.29772549867630005,
"learning_rate": 0.0005210055345179143,
"loss": 3.4757,
"step": 22700
},
{
"epoch": 6.623143672471027,
"grad_norm": 0.3051946759223938,
"learning_rate": 0.0005208307602679871,
"loss": 3.4851,
"step": 22750
},
{
"epoch": 6.637703104070817,
"grad_norm": 0.3218457102775574,
"learning_rate": 0.0005206559860180599,
"loss": 3.4805,
"step": 22800
},
{
"epoch": 6.652262535670608,
"grad_norm": 0.3196474611759186,
"learning_rate": 0.0005204812117681328,
"loss": 3.4912,
"step": 22850
},
{
"epoch": 6.666821967270398,
"grad_norm": 0.30887487530708313,
"learning_rate": 0.0005203064375182056,
"loss": 3.4878,
"step": 22900
},
{
"epoch": 6.6813813988701884,
"grad_norm": 0.34201371669769287,
"learning_rate": 0.0005201316632682784,
"loss": 3.4869,
"step": 22950
},
{
"epoch": 6.695940830469978,
"grad_norm": 0.33082863688468933,
"learning_rate": 0.0005199568890183513,
"loss": 3.4906,
"step": 23000
},
{
"epoch": 6.695940830469978,
"eval_accuracy": 0.3635063159549574,
"eval_loss": 3.602727174758911,
"eval_runtime": 54.3369,
"eval_samples_per_second": 306.293,
"eval_steps_per_second": 19.158,
"step": 23000
},
{
"epoch": 6.710500262069769,
"grad_norm": 0.31607675552368164,
"learning_rate": 0.000519782114768424,
"loss": 3.4755,
"step": 23050
},
{
"epoch": 6.725059693669559,
"grad_norm": 0.300519198179245,
"learning_rate": 0.0005196073405184969,
"loss": 3.4849,
"step": 23100
},
{
"epoch": 6.73961912526935,
"grad_norm": 0.3010658919811249,
"learning_rate": 0.0005194325662685697,
"loss": 3.4793,
"step": 23150
},
{
"epoch": 6.75417855686914,
"grad_norm": 0.3112882077693939,
"learning_rate": 0.0005192577920186426,
"loss": 3.4941,
"step": 23200
},
{
"epoch": 6.7687379884689305,
"grad_norm": 0.32871827483177185,
"learning_rate": 0.0005190830177687154,
"loss": 3.4962,
"step": 23250
},
{
"epoch": 6.78329742006872,
"grad_norm": 0.32987311482429504,
"learning_rate": 0.0005189082435187883,
"loss": 3.5037,
"step": 23300
},
{
"epoch": 6.797856851668511,
"grad_norm": 0.3238174021244049,
"learning_rate": 0.000518733469268861,
"loss": 3.4946,
"step": 23350
},
{
"epoch": 6.812416283268301,
"grad_norm": 0.33636802434921265,
"learning_rate": 0.0005185586950189338,
"loss": 3.4939,
"step": 23400
},
{
"epoch": 6.826975714868092,
"grad_norm": 0.312847763299942,
"learning_rate": 0.0005183839207690067,
"loss": 3.4955,
"step": 23450
},
{
"epoch": 6.841535146467882,
"grad_norm": 0.3202083706855774,
"learning_rate": 0.0005182091465190795,
"loss": 3.4939,
"step": 23500
},
{
"epoch": 6.8560945780676725,
"grad_norm": 0.31817376613616943,
"learning_rate": 0.0005180343722691524,
"loss": 3.4978,
"step": 23550
},
{
"epoch": 6.870654009667462,
"grad_norm": 0.3149799406528473,
"learning_rate": 0.0005178595980192251,
"loss": 3.4968,
"step": 23600
},
{
"epoch": 6.885213441267253,
"grad_norm": 0.2956794202327728,
"learning_rate": 0.0005176848237692979,
"loss": 3.4857,
"step": 23650
},
{
"epoch": 6.899772872867043,
"grad_norm": 0.32103171944618225,
"learning_rate": 0.0005175100495193708,
"loss": 3.4962,
"step": 23700
},
{
"epoch": 6.914332304466834,
"grad_norm": 0.2945249676704407,
"learning_rate": 0.0005173352752694436,
"loss": 3.4761,
"step": 23750
},
{
"epoch": 6.928891736066624,
"grad_norm": 0.2928471863269806,
"learning_rate": 0.0005171605010195165,
"loss": 3.4972,
"step": 23800
},
{
"epoch": 6.943451167666415,
"grad_norm": 0.35107314586639404,
"learning_rate": 0.0005169857267695893,
"loss": 3.5057,
"step": 23850
},
{
"epoch": 6.9580105992662045,
"grad_norm": 0.30272382497787476,
"learning_rate": 0.000516810952519662,
"loss": 3.4895,
"step": 23900
},
{
"epoch": 6.972570030865995,
"grad_norm": 0.29927268624305725,
"learning_rate": 0.0005166361782697349,
"loss": 3.5006,
"step": 23950
},
{
"epoch": 6.987129462465785,
"grad_norm": 0.31151506304740906,
"learning_rate": 0.0005164614040198077,
"loss": 3.4966,
"step": 24000
},
{
"epoch": 6.987129462465785,
"eval_accuracy": 0.364730716051235,
"eval_loss": 3.594974994659424,
"eval_runtime": 54.1369,
"eval_samples_per_second": 307.424,
"eval_steps_per_second": 19.229,
"step": 24000
},
{
"epoch": 7.001455943159979,
"grad_norm": 0.3276313841342926,
"learning_rate": 0.0005162866297698806,
"loss": 3.4778,
"step": 24050
},
{
"epoch": 7.016015374759769,
"grad_norm": 0.33290964365005493,
"learning_rate": 0.0005161118555199534,
"loss": 3.3818,
"step": 24100
},
{
"epoch": 7.03057480635956,
"grad_norm": 0.3170606195926666,
"learning_rate": 0.0005159370812700261,
"loss": 3.3891,
"step": 24150
},
{
"epoch": 7.04513423795935,
"grad_norm": 0.3252675533294678,
"learning_rate": 0.000515762307020099,
"loss": 3.3859,
"step": 24200
},
{
"epoch": 7.059693669559141,
"grad_norm": 0.3159274756908417,
"learning_rate": 0.0005155875327701718,
"loss": 3.3887,
"step": 24250
},
{
"epoch": 7.074253101158931,
"grad_norm": 0.34390512108802795,
"learning_rate": 0.0005154127585202447,
"loss": 3.3975,
"step": 24300
},
{
"epoch": 7.0888125327587215,
"grad_norm": 0.32971522212028503,
"learning_rate": 0.0005152379842703175,
"loss": 3.3996,
"step": 24350
},
{
"epoch": 7.103371964358511,
"grad_norm": 0.306550532579422,
"learning_rate": 0.0005150632100203903,
"loss": 3.3994,
"step": 24400
},
{
"epoch": 7.117931395958302,
"grad_norm": 0.3376745581626892,
"learning_rate": 0.0005148884357704631,
"loss": 3.4013,
"step": 24450
},
{
"epoch": 7.132490827558092,
"grad_norm": 0.3302764296531677,
"learning_rate": 0.0005147136615205359,
"loss": 3.4085,
"step": 24500
},
{
"epoch": 7.147050259157883,
"grad_norm": 0.3368372321128845,
"learning_rate": 0.0005145388872706087,
"loss": 3.4101,
"step": 24550
},
{
"epoch": 7.161609690757673,
"grad_norm": 0.33424296975135803,
"learning_rate": 0.0005143641130206816,
"loss": 3.428,
"step": 24600
},
{
"epoch": 7.1761691223574635,
"grad_norm": 0.3270719647407532,
"learning_rate": 0.0005141893387707544,
"loss": 3.4132,
"step": 24650
},
{
"epoch": 7.190728553957253,
"grad_norm": 0.33818596601486206,
"learning_rate": 0.0005140145645208272,
"loss": 3.4144,
"step": 24700
},
{
"epoch": 7.205287985557044,
"grad_norm": 0.32162582874298096,
"learning_rate": 0.0005138397902709,
"loss": 3.4265,
"step": 24750
},
{
"epoch": 7.219847417156834,
"grad_norm": 0.33046919107437134,
"learning_rate": 0.0005136650160209728,
"loss": 3.4209,
"step": 24800
},
{
"epoch": 7.234406848756625,
"grad_norm": 0.31164079904556274,
"learning_rate": 0.0005134902417710457,
"loss": 3.4252,
"step": 24850
},
{
"epoch": 7.248966280356415,
"grad_norm": 0.35857391357421875,
"learning_rate": 0.0005133154675211185,
"loss": 3.4089,
"step": 24900
},
{
"epoch": 7.2635257119562056,
"grad_norm": 0.32694748044013977,
"learning_rate": 0.0005131406932711914,
"loss": 3.4282,
"step": 24950
},
{
"epoch": 7.2780851435559955,
"grad_norm": 0.3398299813270569,
"learning_rate": 0.0005129659190212641,
"loss": 3.4272,
"step": 25000
},
{
"epoch": 7.2780851435559955,
"eval_accuracy": 0.3646033727971998,
"eval_loss": 3.5990543365478516,
"eval_runtime": 54.4254,
"eval_samples_per_second": 305.794,
"eval_steps_per_second": 19.127,
"step": 25000
},
{
"epoch": 7.292644575155786,
"grad_norm": 0.3356783092021942,
"learning_rate": 0.0005127911447713369,
"loss": 3.4373,
"step": 25050
},
{
"epoch": 7.307204006755576,
"grad_norm": 0.3248707354068756,
"learning_rate": 0.0005126163705214098,
"loss": 3.4371,
"step": 25100
},
{
"epoch": 7.321763438355367,
"grad_norm": 0.31292667984962463,
"learning_rate": 0.0005124415962714826,
"loss": 3.4362,
"step": 25150
},
{
"epoch": 7.336322869955157,
"grad_norm": 0.29806479811668396,
"learning_rate": 0.0005122668220215555,
"loss": 3.4325,
"step": 25200
},
{
"epoch": 7.350882301554948,
"grad_norm": 0.33509254455566406,
"learning_rate": 0.0005120920477716282,
"loss": 3.4387,
"step": 25250
},
{
"epoch": 7.3654417331547375,
"grad_norm": 0.33612021803855896,
"learning_rate": 0.000511917273521701,
"loss": 3.4294,
"step": 25300
},
{
"epoch": 7.380001164754528,
"grad_norm": 0.3314110040664673,
"learning_rate": 0.0005117424992717739,
"loss": 3.439,
"step": 25350
},
{
"epoch": 7.394560596354318,
"grad_norm": 0.31326502561569214,
"learning_rate": 0.0005115677250218467,
"loss": 3.4409,
"step": 25400
},
{
"epoch": 7.409120027954109,
"grad_norm": 0.32877790927886963,
"learning_rate": 0.0005113929507719196,
"loss": 3.4458,
"step": 25450
},
{
"epoch": 7.423679459553899,
"grad_norm": 0.3034365475177765,
"learning_rate": 0.0005112181765219924,
"loss": 3.4455,
"step": 25500
},
{
"epoch": 7.43823889115369,
"grad_norm": 0.30074968934059143,
"learning_rate": 0.0005110434022720651,
"loss": 3.4473,
"step": 25550
},
{
"epoch": 7.4527983227534795,
"grad_norm": 0.34182247519493103,
"learning_rate": 0.000510868628022138,
"loss": 3.4443,
"step": 25600
},
{
"epoch": 7.46735775435327,
"grad_norm": 0.324805349111557,
"learning_rate": 0.0005106938537722109,
"loss": 3.4394,
"step": 25650
},
{
"epoch": 7.48191718595306,
"grad_norm": 0.327799916267395,
"learning_rate": 0.0005105190795222837,
"loss": 3.4358,
"step": 25700
},
{
"epoch": 7.496476617552851,
"grad_norm": 0.32267752289772034,
"learning_rate": 0.0005103443052723565,
"loss": 3.4471,
"step": 25750
},
{
"epoch": 7.511036049152641,
"grad_norm": 0.3113254904747009,
"learning_rate": 0.0005101695310224294,
"loss": 3.4371,
"step": 25800
},
{
"epoch": 7.525595480752432,
"grad_norm": 0.3191179633140564,
"learning_rate": 0.0005099947567725021,
"loss": 3.4375,
"step": 25850
},
{
"epoch": 7.540154912352222,
"grad_norm": 0.3289468288421631,
"learning_rate": 0.000509819982522575,
"loss": 3.4396,
"step": 25900
},
{
"epoch": 7.554714343952012,
"grad_norm": 0.3294450044631958,
"learning_rate": 0.0005096452082726478,
"loss": 3.4431,
"step": 25950
},
{
"epoch": 7.569273775551802,
"grad_norm": 0.32150015234947205,
"learning_rate": 0.0005094704340227206,
"loss": 3.4526,
"step": 26000
},
{
"epoch": 7.569273775551802,
"eval_accuracy": 0.3649583582855953,
"eval_loss": 3.5938405990600586,
"eval_runtime": 54.4229,
"eval_samples_per_second": 305.809,
"eval_steps_per_second": 19.128,
"step": 26000
},
{
"epoch": 7.583833207151593,
"grad_norm": 0.36833828687667847,
"learning_rate": 0.0005092956597727935,
"loss": 3.4566,
"step": 26050
},
{
"epoch": 7.598392638751383,
"grad_norm": 0.3328242897987366,
"learning_rate": 0.0005091208855228662,
"loss": 3.4516,
"step": 26100
},
{
"epoch": 7.612952070351174,
"grad_norm": 0.3110596239566803,
"learning_rate": 0.0005089461112729391,
"loss": 3.4443,
"step": 26150
},
{
"epoch": 7.627511501950964,
"grad_norm": 0.3015148341655731,
"learning_rate": 0.0005087713370230119,
"loss": 3.4576,
"step": 26200
},
{
"epoch": 7.642070933550754,
"grad_norm": 0.31756895780563354,
"learning_rate": 0.0005085965627730847,
"loss": 3.4537,
"step": 26250
},
{
"epoch": 7.656630365150544,
"grad_norm": 0.3184044361114502,
"learning_rate": 0.0005084217885231576,
"loss": 3.4558,
"step": 26300
},
{
"epoch": 7.671189796750335,
"grad_norm": 0.3208577036857605,
"learning_rate": 0.0005082470142732304,
"loss": 3.4679,
"step": 26350
},
{
"epoch": 7.685749228350125,
"grad_norm": 0.31280356645584106,
"learning_rate": 0.0005080722400233032,
"loss": 3.4482,
"step": 26400
},
{
"epoch": 7.700308659949916,
"grad_norm": 0.31253930926322937,
"learning_rate": 0.000507897465773376,
"loss": 3.4452,
"step": 26450
},
{
"epoch": 7.714868091549706,
"grad_norm": 0.30488118529319763,
"learning_rate": 0.0005077226915234488,
"loss": 3.4542,
"step": 26500
},
{
"epoch": 7.729427523149496,
"grad_norm": 0.3201649785041809,
"learning_rate": 0.0005075479172735217,
"loss": 3.4561,
"step": 26550
},
{
"epoch": 7.743986954749286,
"grad_norm": 0.3104819655418396,
"learning_rate": 0.0005073731430235945,
"loss": 3.4467,
"step": 26600
},
{
"epoch": 7.758546386349077,
"grad_norm": 0.3295295238494873,
"learning_rate": 0.0005071983687736674,
"loss": 3.4631,
"step": 26650
},
{
"epoch": 7.773105817948867,
"grad_norm": 0.30710911750793457,
"learning_rate": 0.0005070235945237401,
"loss": 3.4518,
"step": 26700
},
{
"epoch": 7.787665249548658,
"grad_norm": 0.33581966161727905,
"learning_rate": 0.0005068488202738129,
"loss": 3.4568,
"step": 26750
},
{
"epoch": 7.802224681148448,
"grad_norm": 0.3220713436603546,
"learning_rate": 0.0005066740460238858,
"loss": 3.451,
"step": 26800
},
{
"epoch": 7.8167841127482385,
"grad_norm": 0.3231236934661865,
"learning_rate": 0.0005064992717739586,
"loss": 3.4682,
"step": 26850
},
{
"epoch": 7.831343544348028,
"grad_norm": 0.3177933692932129,
"learning_rate": 0.0005063244975240315,
"loss": 3.4558,
"step": 26900
},
{
"epoch": 7.845902975947819,
"grad_norm": 0.3490990102291107,
"learning_rate": 0.0005061497232741042,
"loss": 3.4578,
"step": 26950
},
{
"epoch": 7.860462407547609,
"grad_norm": 0.3442172110080719,
"learning_rate": 0.000505974949024177,
"loss": 3.4506,
"step": 27000
},
{
"epoch": 7.860462407547609,
"eval_accuracy": 0.3662008662868788,
"eval_loss": 3.581078052520752,
"eval_runtime": 54.1953,
"eval_samples_per_second": 307.093,
"eval_steps_per_second": 19.208,
"step": 27000
},
{
"epoch": 7.8750218391474,
"grad_norm": 0.343916654586792,
"learning_rate": 0.0005058001747742499,
"loss": 3.4742,
"step": 27050
},
{
"epoch": 7.88958127074719,
"grad_norm": 0.31028035283088684,
"learning_rate": 0.0005056254005243227,
"loss": 3.464,
"step": 27100
},
{
"epoch": 7.9041407023469805,
"grad_norm": 0.3196263313293457,
"learning_rate": 0.0005054506262743955,
"loss": 3.458,
"step": 27150
},
{
"epoch": 7.91870013394677,
"grad_norm": 0.31997817754745483,
"learning_rate": 0.0005052758520244684,
"loss": 3.467,
"step": 27200
},
{
"epoch": 7.933259565546561,
"grad_norm": 0.3503701388835907,
"learning_rate": 0.0005051010777745411,
"loss": 3.4566,
"step": 27250
},
{
"epoch": 7.947818997146351,
"grad_norm": 0.3098277151584625,
"learning_rate": 0.000504926303524614,
"loss": 3.4636,
"step": 27300
},
{
"epoch": 7.962378428746142,
"grad_norm": 0.3302386403083801,
"learning_rate": 0.0005047515292746868,
"loss": 3.4563,
"step": 27350
},
{
"epoch": 7.976937860345932,
"grad_norm": 0.3156256079673767,
"learning_rate": 0.0005045767550247596,
"loss": 3.465,
"step": 27400
},
{
"epoch": 7.991497291945723,
"grad_norm": 0.31740131974220276,
"learning_rate": 0.0005044019807748325,
"loss": 3.4715,
"step": 27450
},
{
"epoch": 8.005823772639916,
"grad_norm": 0.3159884810447693,
"learning_rate": 0.0005042272065249052,
"loss": 3.4171,
"step": 27500
},
{
"epoch": 8.020383204239707,
"grad_norm": 0.32894912362098694,
"learning_rate": 0.0005040524322749781,
"loss": 3.3421,
"step": 27550
},
{
"epoch": 8.034942635839498,
"grad_norm": 0.3333737552165985,
"learning_rate": 0.0005038776580250509,
"loss": 3.3601,
"step": 27600
},
{
"epoch": 8.049502067439287,
"grad_norm": 0.3066194951534271,
"learning_rate": 0.0005037028837751237,
"loss": 3.3551,
"step": 27650
},
{
"epoch": 8.064061499039077,
"grad_norm": 0.3339882493019104,
"learning_rate": 0.0005035281095251966,
"loss": 3.3661,
"step": 27700
},
{
"epoch": 8.078620930638868,
"grad_norm": 0.3425856828689575,
"learning_rate": 0.0005033533352752694,
"loss": 3.3698,
"step": 27750
},
{
"epoch": 8.093180362238659,
"grad_norm": 0.3282395005226135,
"learning_rate": 0.0005031785610253422,
"loss": 3.3741,
"step": 27800
},
{
"epoch": 8.107739793838448,
"grad_norm": 0.3201969563961029,
"learning_rate": 0.000503003786775415,
"loss": 3.3759,
"step": 27850
},
{
"epoch": 8.122299225438239,
"grad_norm": 0.3361366391181946,
"learning_rate": 0.0005028290125254878,
"loss": 3.3752,
"step": 27900
},
{
"epoch": 8.13685865703803,
"grad_norm": 0.3365829288959503,
"learning_rate": 0.0005026542382755607,
"loss": 3.3637,
"step": 27950
},
{
"epoch": 8.15141808863782,
"grad_norm": 0.3227217495441437,
"learning_rate": 0.0005024794640256335,
"loss": 3.3616,
"step": 28000
},
{
"epoch": 8.15141808863782,
"eval_accuracy": 0.3656379926423114,
"eval_loss": 3.5914840698242188,
"eval_runtime": 53.9744,
"eval_samples_per_second": 308.35,
"eval_steps_per_second": 19.287,
"step": 28000
},
{
"epoch": 8.16597752023761,
"grad_norm": 0.3198586702346802,
"learning_rate": 0.0005023046897757064,
"loss": 3.3881,
"step": 28050
},
{
"epoch": 8.1805369518374,
"grad_norm": 0.35139596462249756,
"learning_rate": 0.0005021299155257791,
"loss": 3.3827,
"step": 28100
},
{
"epoch": 8.19509638343719,
"grad_norm": 0.30760928988456726,
"learning_rate": 0.000501955141275852,
"loss": 3.3961,
"step": 28150
},
{
"epoch": 8.209655815036982,
"grad_norm": 0.31556805968284607,
"learning_rate": 0.0005017803670259248,
"loss": 3.3849,
"step": 28200
},
{
"epoch": 8.22421524663677,
"grad_norm": 0.34970980882644653,
"learning_rate": 0.0005016055927759977,
"loss": 3.3894,
"step": 28250
},
{
"epoch": 8.238774678236561,
"grad_norm": 0.3420124650001526,
"learning_rate": 0.0005014308185260705,
"loss": 3.4022,
"step": 28300
},
{
"epoch": 8.253334109836352,
"grad_norm": 0.3457973003387451,
"learning_rate": 0.0005012560442761432,
"loss": 3.3901,
"step": 28350
},
{
"epoch": 8.267893541436143,
"grad_norm": 0.32130882143974304,
"learning_rate": 0.0005010812700262161,
"loss": 3.3881,
"step": 28400
},
{
"epoch": 8.282452973035932,
"grad_norm": 0.31469210982322693,
"learning_rate": 0.0005009064957762889,
"loss": 3.4068,
"step": 28450
},
{
"epoch": 8.297012404635723,
"grad_norm": 0.3118443787097931,
"learning_rate": 0.0005007317215263618,
"loss": 3.4032,
"step": 28500
},
{
"epoch": 8.311571836235514,
"grad_norm": 0.3427928686141968,
"learning_rate": 0.0005005569472764346,
"loss": 3.4035,
"step": 28550
},
{
"epoch": 8.326131267835304,
"grad_norm": 0.3363751173019409,
"learning_rate": 0.0005003821730265074,
"loss": 3.4096,
"step": 28600
},
{
"epoch": 8.340690699435093,
"grad_norm": 0.30898264050483704,
"learning_rate": 0.0005002073987765802,
"loss": 3.4041,
"step": 28650
},
{
"epoch": 8.355250131034884,
"grad_norm": 0.3334648907184601,
"learning_rate": 0.000500032624526653,
"loss": 3.4122,
"step": 28700
},
{
"epoch": 8.369809562634675,
"grad_norm": 0.3632184565067291,
"learning_rate": 0.0004998578502767259,
"loss": 3.3975,
"step": 28750
},
{
"epoch": 8.384368994234466,
"grad_norm": 0.3695220649242401,
"learning_rate": 0.0004996830760267987,
"loss": 3.4077,
"step": 28800
},
{
"epoch": 8.398928425834255,
"grad_norm": 0.3448977470397949,
"learning_rate": 0.0004995083017768715,
"loss": 3.4034,
"step": 28850
},
{
"epoch": 8.413487857434045,
"grad_norm": 0.3269871771335602,
"learning_rate": 0.0004993335275269444,
"loss": 3.4123,
"step": 28900
},
{
"epoch": 8.428047289033836,
"grad_norm": 0.34690749645233154,
"learning_rate": 0.0004991587532770171,
"loss": 3.4035,
"step": 28950
},
{
"epoch": 8.442606720633627,
"grad_norm": 0.31924426555633545,
"learning_rate": 0.00049898397902709,
"loss": 3.4033,
"step": 29000
},
{
"epoch": 8.442606720633627,
"eval_accuracy": 0.36657313659368906,
"eval_loss": 3.5842514038085938,
"eval_runtime": 54.0313,
"eval_samples_per_second": 308.025,
"eval_steps_per_second": 19.267,
"step": 29000
},
{
"epoch": 8.457166152233416,
"grad_norm": 0.32659900188446045,
"learning_rate": 0.0004988092047771628,
"loss": 3.4083,
"step": 29050
},
{
"epoch": 8.471725583833207,
"grad_norm": 0.32884863018989563,
"learning_rate": 0.0004986344305272356,
"loss": 3.4058,
"step": 29100
},
{
"epoch": 8.486285015432998,
"grad_norm": 0.32945168018341064,
"learning_rate": 0.0004984596562773085,
"loss": 3.4181,
"step": 29150
},
{
"epoch": 8.500844447032788,
"grad_norm": 0.3260224163532257,
"learning_rate": 0.0004982848820273812,
"loss": 3.4164,
"step": 29200
},
{
"epoch": 8.515403878632577,
"grad_norm": 0.30971044301986694,
"learning_rate": 0.0004981101077774541,
"loss": 3.4222,
"step": 29250
},
{
"epoch": 8.529963310232368,
"grad_norm": 0.3370177447795868,
"learning_rate": 0.0004979353335275269,
"loss": 3.4189,
"step": 29300
},
{
"epoch": 8.544522741832159,
"grad_norm": 0.3051028847694397,
"learning_rate": 0.0004977605592775997,
"loss": 3.4381,
"step": 29350
},
{
"epoch": 8.55908217343195,
"grad_norm": 0.31183165311813354,
"learning_rate": 0.0004975857850276726,
"loss": 3.4182,
"step": 29400
},
{
"epoch": 8.573641605031739,
"grad_norm": 0.35490667819976807,
"learning_rate": 0.0004974110107777454,
"loss": 3.4251,
"step": 29450
},
{
"epoch": 8.58820103663153,
"grad_norm": 0.32052773237228394,
"learning_rate": 0.0004972362365278182,
"loss": 3.4291,
"step": 29500
},
{
"epoch": 8.60276046823132,
"grad_norm": 0.3103652596473694,
"learning_rate": 0.000497061462277891,
"loss": 3.4086,
"step": 29550
},
{
"epoch": 8.617319899831111,
"grad_norm": 0.31185802817344666,
"learning_rate": 0.0004968866880279638,
"loss": 3.4274,
"step": 29600
},
{
"epoch": 8.6318793314309,
"grad_norm": 0.33494362235069275,
"learning_rate": 0.0004967119137780367,
"loss": 3.4227,
"step": 29650
},
{
"epoch": 8.646438763030691,
"grad_norm": 0.3317851424217224,
"learning_rate": 0.0004965371395281095,
"loss": 3.414,
"step": 29700
},
{
"epoch": 8.660998194630482,
"grad_norm": 0.3123399317264557,
"learning_rate": 0.0004963623652781822,
"loss": 3.4224,
"step": 29750
},
{
"epoch": 8.675557626230272,
"grad_norm": 0.3217580020427704,
"learning_rate": 0.0004961875910282551,
"loss": 3.417,
"step": 29800
},
{
"epoch": 8.690117057830061,
"grad_norm": 0.320300430059433,
"learning_rate": 0.0004960128167783279,
"loss": 3.4167,
"step": 29850
},
{
"epoch": 8.704676489429852,
"grad_norm": 0.3256986141204834,
"learning_rate": 0.0004958380425284008,
"loss": 3.439,
"step": 29900
},
{
"epoch": 8.719235921029643,
"grad_norm": 0.3188437819480896,
"learning_rate": 0.0004956632682784736,
"loss": 3.4193,
"step": 29950
},
{
"epoch": 8.733795352629434,
"grad_norm": 0.33221110701560974,
"learning_rate": 0.0004954884940285464,
"loss": 3.4251,
"step": 30000
},
{
"epoch": 8.733795352629434,
"eval_accuracy": 0.36723513338059416,
"eval_loss": 3.573622941970825,
"eval_runtime": 54.2175,
"eval_samples_per_second": 306.967,
"eval_steps_per_second": 19.2,
"step": 30000
},
{
"epoch": 8.748354784229225,
"grad_norm": 0.34907636046409607,
"learning_rate": 0.0004953137197786192,
"loss": 3.4265,
"step": 30050
},
{
"epoch": 8.762914215829014,
"grad_norm": 0.31726714968681335,
"learning_rate": 0.000495138945528692,
"loss": 3.4318,
"step": 30100
},
{
"epoch": 8.777473647428804,
"grad_norm": 0.3318343758583069,
"learning_rate": 0.0004949641712787649,
"loss": 3.4329,
"step": 30150
},
{
"epoch": 8.792033079028595,
"grad_norm": 0.32355913519859314,
"learning_rate": 0.0004947893970288377,
"loss": 3.4301,
"step": 30200
},
{
"epoch": 8.806592510628384,
"grad_norm": 0.3216176927089691,
"learning_rate": 0.0004946146227789105,
"loss": 3.4259,
"step": 30250
},
{
"epoch": 8.821151942228175,
"grad_norm": 0.3203097879886627,
"learning_rate": 0.0004944398485289834,
"loss": 3.4274,
"step": 30300
},
{
"epoch": 8.835711373827966,
"grad_norm": 0.3079201579093933,
"learning_rate": 0.0004942650742790561,
"loss": 3.4354,
"step": 30350
},
{
"epoch": 8.850270805427757,
"grad_norm": 0.3074656128883362,
"learning_rate": 0.000494090300029129,
"loss": 3.4371,
"step": 30400
},
{
"epoch": 8.864830237027547,
"grad_norm": 0.34187525510787964,
"learning_rate": 0.0004939155257792018,
"loss": 3.442,
"step": 30450
},
{
"epoch": 8.879389668627336,
"grad_norm": 0.33804062008857727,
"learning_rate": 0.0004937407515292746,
"loss": 3.4379,
"step": 30500
},
{
"epoch": 8.893949100227127,
"grad_norm": 0.3283509314060211,
"learning_rate": 0.0004935659772793475,
"loss": 3.4292,
"step": 30550
},
{
"epoch": 8.908508531826918,
"grad_norm": 0.3360782563686371,
"learning_rate": 0.0004933912030294202,
"loss": 3.43,
"step": 30600
},
{
"epoch": 8.923067963426707,
"grad_norm": 0.32649967074394226,
"learning_rate": 0.0004932164287794931,
"loss": 3.4253,
"step": 30650
},
{
"epoch": 8.937627395026498,
"grad_norm": 0.31838634610176086,
"learning_rate": 0.000493041654529566,
"loss": 3.4488,
"step": 30700
},
{
"epoch": 8.952186826626289,
"grad_norm": 0.3074035346508026,
"learning_rate": 0.0004928668802796388,
"loss": 3.4389,
"step": 30750
},
{
"epoch": 8.96674625822608,
"grad_norm": 0.34510338306427,
"learning_rate": 0.0004926921060297116,
"loss": 3.4359,
"step": 30800
},
{
"epoch": 8.98130568982587,
"grad_norm": 0.32787054777145386,
"learning_rate": 0.0004925173317797845,
"loss": 3.438,
"step": 30850
},
{
"epoch": 8.995865121425659,
"grad_norm": 0.32717031240463257,
"learning_rate": 0.0004923425575298572,
"loss": 3.4371,
"step": 30900
},
{
"epoch": 9.010191602119853,
"grad_norm": 0.3534397780895233,
"learning_rate": 0.0004921677832799301,
"loss": 3.366,
"step": 30950
},
{
"epoch": 9.024751033719644,
"grad_norm": 0.32879695296287537,
"learning_rate": 0.0004919930090300029,
"loss": 3.3199,
"step": 31000
},
{
"epoch": 9.024751033719644,
"eval_accuracy": 0.3674407874445901,
"eval_loss": 3.5794942378997803,
"eval_runtime": 53.9638,
"eval_samples_per_second": 308.41,
"eval_steps_per_second": 19.291,
"step": 31000
},
{
"epoch": 9.039310465319433,
"grad_norm": 0.32832634449005127,
"learning_rate": 0.0004918182347800757,
"loss": 3.3192,
"step": 31050
},
{
"epoch": 9.053869896919224,
"grad_norm": 0.3287530839443207,
"learning_rate": 0.0004916434605301486,
"loss": 3.3388,
"step": 31100
},
{
"epoch": 9.068429328519015,
"grad_norm": 0.32364922761917114,
"learning_rate": 0.0004914686862802213,
"loss": 3.3306,
"step": 31150
},
{
"epoch": 9.082988760118806,
"grad_norm": 0.343106210231781,
"learning_rate": 0.0004912939120302941,
"loss": 3.33,
"step": 31200
},
{
"epoch": 9.097548191718595,
"grad_norm": 0.31469976902008057,
"learning_rate": 0.000491119137780367,
"loss": 3.3493,
"step": 31250
},
{
"epoch": 9.112107623318385,
"grad_norm": 0.36989399790763855,
"learning_rate": 0.0004909443635304398,
"loss": 3.3455,
"step": 31300
},
{
"epoch": 9.126667054918176,
"grad_norm": 0.3417125344276428,
"learning_rate": 0.0004907695892805127,
"loss": 3.3545,
"step": 31350
},
{
"epoch": 9.141226486517967,
"grad_norm": 0.3421690762042999,
"learning_rate": 0.0004905948150305855,
"loss": 3.3503,
"step": 31400
},
{
"epoch": 9.155785918117756,
"grad_norm": 0.34142985939979553,
"learning_rate": 0.0004904200407806582,
"loss": 3.3663,
"step": 31450
},
{
"epoch": 9.170345349717547,
"grad_norm": 0.3186779320240021,
"learning_rate": 0.0004902452665307311,
"loss": 3.342,
"step": 31500
},
{
"epoch": 9.184904781317337,
"grad_norm": 0.35970360040664673,
"learning_rate": 0.0004900704922808039,
"loss": 3.3642,
"step": 31550
},
{
"epoch": 9.199464212917128,
"grad_norm": 0.33403223752975464,
"learning_rate": 0.0004898957180308768,
"loss": 3.3581,
"step": 31600
},
{
"epoch": 9.214023644516917,
"grad_norm": 0.3198859691619873,
"learning_rate": 0.0004897209437809496,
"loss": 3.3645,
"step": 31650
},
{
"epoch": 9.228583076116708,
"grad_norm": 0.3827151656150818,
"learning_rate": 0.0004895461695310223,
"loss": 3.3694,
"step": 31700
},
{
"epoch": 9.243142507716499,
"grad_norm": 0.34455496072769165,
"learning_rate": 0.0004893713952810952,
"loss": 3.3653,
"step": 31750
},
{
"epoch": 9.25770193931629,
"grad_norm": 0.3326873183250427,
"learning_rate": 0.000489196621031168,
"loss": 3.3595,
"step": 31800
},
{
"epoch": 9.272261370916079,
"grad_norm": 0.32842305302619934,
"learning_rate": 0.0004890218467812409,
"loss": 3.3899,
"step": 31850
},
{
"epoch": 9.28682080251587,
"grad_norm": 0.32804909348487854,
"learning_rate": 0.0004888470725313137,
"loss": 3.3667,
"step": 31900
},
{
"epoch": 9.30138023411566,
"grad_norm": 0.3439745604991913,
"learning_rate": 0.0004886722982813865,
"loss": 3.3901,
"step": 31950
},
{
"epoch": 9.315939665715451,
"grad_norm": 0.3488461375236511,
"learning_rate": 0.0004884975240314593,
"loss": 3.3801,
"step": 32000
},
{
"epoch": 9.315939665715451,
"eval_accuracy": 0.3670910932271379,
"eval_loss": 3.579740047454834,
"eval_runtime": 53.9933,
"eval_samples_per_second": 308.242,
"eval_steps_per_second": 19.28,
"step": 32000
},
{
"epoch": 9.33049909731524,
"grad_norm": 0.3165011703968048,
"learning_rate": 0.0004883227497815321,
"loss": 3.3729,
"step": 32050
},
{
"epoch": 9.34505852891503,
"grad_norm": 0.32998713850975037,
"learning_rate": 0.00048814797553160496,
"loss": 3.3726,
"step": 32100
},
{
"epoch": 9.359617960514822,
"grad_norm": 0.3431642949581146,
"learning_rate": 0.0004879732012816778,
"loss": 3.3722,
"step": 32150
},
{
"epoch": 9.374177392114612,
"grad_norm": 0.3487151265144348,
"learning_rate": 0.0004877984270317506,
"loss": 3.3829,
"step": 32200
},
{
"epoch": 9.388736823714403,
"grad_norm": 0.3365775942802429,
"learning_rate": 0.0004876236527818234,
"loss": 3.3844,
"step": 32250
},
{
"epoch": 9.403296255314192,
"grad_norm": 0.32179173827171326,
"learning_rate": 0.00048744887853189624,
"loss": 3.3815,
"step": 32300
},
{
"epoch": 9.417855686913983,
"grad_norm": 0.3473745584487915,
"learning_rate": 0.00048727410428196907,
"loss": 3.3759,
"step": 32350
},
{
"epoch": 9.432415118513774,
"grad_norm": 0.31671932339668274,
"learning_rate": 0.0004870993300320419,
"loss": 3.386,
"step": 32400
},
{
"epoch": 9.446974550113563,
"grad_norm": 0.31829264760017395,
"learning_rate": 0.00048692455578211474,
"loss": 3.3817,
"step": 32450
},
{
"epoch": 9.461533981713353,
"grad_norm": 0.3237382769584656,
"learning_rate": 0.0004867497815321875,
"loss": 3.3909,
"step": 32500
},
{
"epoch": 9.476093413313144,
"grad_norm": 0.31413301825523376,
"learning_rate": 0.00048657500728226035,
"loss": 3.3873,
"step": 32550
},
{
"epoch": 9.490652844912935,
"grad_norm": 0.3735544681549072,
"learning_rate": 0.0004864002330323332,
"loss": 3.3888,
"step": 32600
},
{
"epoch": 9.505212276512726,
"grad_norm": 0.33301472663879395,
"learning_rate": 0.000486225458782406,
"loss": 3.388,
"step": 32650
},
{
"epoch": 9.519771708112515,
"grad_norm": 0.34808629751205444,
"learning_rate": 0.0004860506845324788,
"loss": 3.379,
"step": 32700
},
{
"epoch": 9.534331139712306,
"grad_norm": 0.3324873447418213,
"learning_rate": 0.0004858759102825516,
"loss": 3.3959,
"step": 32750
},
{
"epoch": 9.548890571312096,
"grad_norm": 0.34592559933662415,
"learning_rate": 0.00048570113603262446,
"loss": 3.3929,
"step": 32800
},
{
"epoch": 9.563450002911885,
"grad_norm": 0.35991552472114563,
"learning_rate": 0.0004855263617826973,
"loss": 3.3922,
"step": 32850
},
{
"epoch": 9.578009434511676,
"grad_norm": 0.3375621438026428,
"learning_rate": 0.0004853515875327701,
"loss": 3.3983,
"step": 32900
},
{
"epoch": 9.592568866111467,
"grad_norm": 0.34628742933273315,
"learning_rate": 0.0004851768132828429,
"loss": 3.391,
"step": 32950
},
{
"epoch": 9.607128297711258,
"grad_norm": 0.3223342001438141,
"learning_rate": 0.00048500203903291574,
"loss": 3.3891,
"step": 33000
},
{
"epoch": 9.607128297711258,
"eval_accuracy": 0.3680227096645534,
"eval_loss": 3.569427251815796,
"eval_runtime": 54.1344,
"eval_samples_per_second": 307.438,
"eval_steps_per_second": 19.23,
"step": 33000
},
{
"epoch": 9.621687729311049,
"grad_norm": 0.3431673049926758,
"learning_rate": 0.00048482726478298857,
"loss": 3.3903,
"step": 33050
},
{
"epoch": 9.636247160910838,
"grad_norm": 0.3508543372154236,
"learning_rate": 0.0004846524905330614,
"loss": 3.3924,
"step": 33100
},
{
"epoch": 9.650806592510628,
"grad_norm": 0.329703688621521,
"learning_rate": 0.00048447771628313424,
"loss": 3.3948,
"step": 33150
},
{
"epoch": 9.66536602411042,
"grad_norm": 0.33297428488731384,
"learning_rate": 0.0004843029420332071,
"loss": 3.3903,
"step": 33200
},
{
"epoch": 9.67992545571021,
"grad_norm": 0.32107412815093994,
"learning_rate": 0.0004841281677832799,
"loss": 3.4097,
"step": 33250
},
{
"epoch": 9.694484887309999,
"grad_norm": 0.35037678480148315,
"learning_rate": 0.00048395339353335273,
"loss": 3.3981,
"step": 33300
},
{
"epoch": 9.70904431890979,
"grad_norm": 0.3188696801662445,
"learning_rate": 0.00048377861928342557,
"loss": 3.4078,
"step": 33350
},
{
"epoch": 9.72360375050958,
"grad_norm": 0.3539755046367645,
"learning_rate": 0.0004836038450334984,
"loss": 3.4027,
"step": 33400
},
{
"epoch": 9.738163182109371,
"grad_norm": 0.35060805082321167,
"learning_rate": 0.0004834290707835712,
"loss": 3.3889,
"step": 33450
},
{
"epoch": 9.75272261370916,
"grad_norm": 0.3305850327014923,
"learning_rate": 0.000483254296533644,
"loss": 3.4183,
"step": 33500
},
{
"epoch": 9.767282045308951,
"grad_norm": 0.3297503590583801,
"learning_rate": 0.00048307952228371685,
"loss": 3.4122,
"step": 33550
},
{
"epoch": 9.781841476908742,
"grad_norm": 0.3465321362018585,
"learning_rate": 0.0004829047480337897,
"loss": 3.4138,
"step": 33600
},
{
"epoch": 9.796400908508533,
"grad_norm": 0.3402300775051117,
"learning_rate": 0.0004827299737838625,
"loss": 3.4017,
"step": 33650
},
{
"epoch": 9.810960340108322,
"grad_norm": 0.3542657196521759,
"learning_rate": 0.0004825551995339353,
"loss": 3.4158,
"step": 33700
},
{
"epoch": 9.825519771708112,
"grad_norm": 0.32605618238449097,
"learning_rate": 0.0004823804252840081,
"loss": 3.4154,
"step": 33750
},
{
"epoch": 9.840079203307903,
"grad_norm": 0.37203866243362427,
"learning_rate": 0.00048220565103408096,
"loss": 3.4068,
"step": 33800
},
{
"epoch": 9.854638634907694,
"grad_norm": 0.3198862373828888,
"learning_rate": 0.0004820308767841538,
"loss": 3.4161,
"step": 33850
},
{
"epoch": 9.869198066507483,
"grad_norm": 0.316527783870697,
"learning_rate": 0.0004818561025342266,
"loss": 3.4128,
"step": 33900
},
{
"epoch": 9.883757498107274,
"grad_norm": 0.32887765765190125,
"learning_rate": 0.0004816813282842994,
"loss": 3.4107,
"step": 33950
},
{
"epoch": 9.898316929707065,
"grad_norm": 0.3417325019836426,
"learning_rate": 0.00048150655403437223,
"loss": 3.4101,
"step": 34000
},
{
"epoch": 9.898316929707065,
"eval_accuracy": 0.36854583998514684,
"eval_loss": 3.5642430782318115,
"eval_runtime": 54.1838,
"eval_samples_per_second": 307.158,
"eval_steps_per_second": 19.212,
"step": 34000
},
{
"epoch": 9.912876361306855,
"grad_norm": 0.33323341608047485,
"learning_rate": 0.00048133177978444507,
"loss": 3.4087,
"step": 34050
},
{
"epoch": 9.927435792906644,
"grad_norm": 0.32190296053886414,
"learning_rate": 0.0004811570055345179,
"loss": 3.4097,
"step": 34100
},
{
"epoch": 9.941995224506435,
"grad_norm": 0.33463558554649353,
"learning_rate": 0.0004809822312845907,
"loss": 3.413,
"step": 34150
},
{
"epoch": 9.956554656106226,
"grad_norm": 0.35033613443374634,
"learning_rate": 0.0004808074570346635,
"loss": 3.4105,
"step": 34200
},
{
"epoch": 9.971114087706017,
"grad_norm": 0.3425426185131073,
"learning_rate": 0.00048063268278473634,
"loss": 3.4089,
"step": 34250
},
{
"epoch": 9.985673519305806,
"grad_norm": 0.33646196126937866,
"learning_rate": 0.0004804579085348092,
"loss": 3.4147,
"step": 34300
},
{
"epoch": 10.0,
"grad_norm": 0.8161603808403015,
"learning_rate": 0.000480283134284882,
"loss": 3.428,
"step": 34350
},
{
"epoch": 10.01455943159979,
"grad_norm": 0.34390079975128174,
"learning_rate": 0.0004801083600349548,
"loss": 3.3054,
"step": 34400
},
{
"epoch": 10.029118863199582,
"grad_norm": 0.3613048493862152,
"learning_rate": 0.0004799335857850276,
"loss": 3.2991,
"step": 34450
},
{
"epoch": 10.04367829479937,
"grad_norm": 0.3359822928905487,
"learning_rate": 0.00047975881153510046,
"loss": 3.2928,
"step": 34500
},
{
"epoch": 10.058237726399161,
"grad_norm": 0.32760336995124817,
"learning_rate": 0.0004795840372851733,
"loss": 3.3078,
"step": 34550
},
{
"epoch": 10.072797157998952,
"grad_norm": 0.333717942237854,
"learning_rate": 0.00047940926303524607,
"loss": 3.3163,
"step": 34600
},
{
"epoch": 10.087356589598743,
"grad_norm": 0.33068737387657166,
"learning_rate": 0.0004792344887853189,
"loss": 3.3126,
"step": 34650
},
{
"epoch": 10.101916021198532,
"grad_norm": 0.3351806402206421,
"learning_rate": 0.00047905971453539173,
"loss": 3.3233,
"step": 34700
},
{
"epoch": 10.116475452798323,
"grad_norm": 0.378257691860199,
"learning_rate": 0.00047888494028546457,
"loss": 3.3302,
"step": 34750
},
{
"epoch": 10.131034884398114,
"grad_norm": 0.3484111428260803,
"learning_rate": 0.0004787101660355374,
"loss": 3.3303,
"step": 34800
},
{
"epoch": 10.145594315997904,
"grad_norm": 0.3338722884654999,
"learning_rate": 0.0004785353917856102,
"loss": 3.3367,
"step": 34850
},
{
"epoch": 10.160153747597693,
"grad_norm": 0.33523687720298767,
"learning_rate": 0.000478360617535683,
"loss": 3.3373,
"step": 34900
},
{
"epoch": 10.174713179197484,
"grad_norm": 0.3656017780303955,
"learning_rate": 0.00047818584328575584,
"loss": 3.3416,
"step": 34950
},
{
"epoch": 10.189272610797275,
"grad_norm": 0.3645973801612854,
"learning_rate": 0.0004780110690358287,
"loss": 3.3363,
"step": 35000
},
{
"epoch": 10.189272610797275,
"eval_accuracy": 0.36812065696890367,
"eval_loss": 3.574761152267456,
"eval_runtime": 53.8933,
"eval_samples_per_second": 308.814,
"eval_steps_per_second": 19.316,
"step": 35000
},
{
"epoch": 10.203832042397066,
"grad_norm": 0.33940547704696655,
"learning_rate": 0.0004778362947859015,
"loss": 3.3454,
"step": 35050
},
{
"epoch": 10.218391473996855,
"grad_norm": 0.33723345398902893,
"learning_rate": 0.0004776615205359743,
"loss": 3.3449,
"step": 35100
},
{
"epoch": 10.232950905596645,
"grad_norm": 0.3461247980594635,
"learning_rate": 0.0004774867462860471,
"loss": 3.3466,
"step": 35150
},
{
"epoch": 10.247510337196436,
"grad_norm": 0.3415777385234833,
"learning_rate": 0.00047731197203611995,
"loss": 3.3445,
"step": 35200
},
{
"epoch": 10.262069768796227,
"grad_norm": 0.3251374363899231,
"learning_rate": 0.0004771371977861928,
"loss": 3.338,
"step": 35250
},
{
"epoch": 10.276629200396016,
"grad_norm": 0.3444693684577942,
"learning_rate": 0.00047696242353626557,
"loss": 3.3484,
"step": 35300
},
{
"epoch": 10.291188631995807,
"grad_norm": 0.3499116897583008,
"learning_rate": 0.0004767876492863384,
"loss": 3.3543,
"step": 35350
},
{
"epoch": 10.305748063595598,
"grad_norm": 0.3512921631336212,
"learning_rate": 0.00047661287503641123,
"loss": 3.3542,
"step": 35400
},
{
"epoch": 10.320307495195388,
"grad_norm": 0.33651211857795715,
"learning_rate": 0.00047643810078648407,
"loss": 3.3529,
"step": 35450
},
{
"epoch": 10.334866926795177,
"grad_norm": 0.35819199681282043,
"learning_rate": 0.0004762633265365569,
"loss": 3.3563,
"step": 35500
},
{
"epoch": 10.349426358394968,
"grad_norm": 0.33055511116981506,
"learning_rate": 0.0004760885522866297,
"loss": 3.3583,
"step": 35550
},
{
"epoch": 10.363985789994759,
"grad_norm": 0.3145069181919098,
"learning_rate": 0.0004759137780367025,
"loss": 3.3484,
"step": 35600
},
{
"epoch": 10.37854522159455,
"grad_norm": 0.3309759497642517,
"learning_rate": 0.00047573900378677534,
"loss": 3.3466,
"step": 35650
},
{
"epoch": 10.393104653194339,
"grad_norm": 0.3606186509132385,
"learning_rate": 0.00047556422953684823,
"loss": 3.3622,
"step": 35700
},
{
"epoch": 10.40766408479413,
"grad_norm": 0.3602750897407532,
"learning_rate": 0.00047538945528692106,
"loss": 3.3511,
"step": 35750
},
{
"epoch": 10.42222351639392,
"grad_norm": 0.3379049003124237,
"learning_rate": 0.0004752146810369939,
"loss": 3.3676,
"step": 35800
},
{
"epoch": 10.436782947993711,
"grad_norm": 0.3288438320159912,
"learning_rate": 0.0004750399067870667,
"loss": 3.3672,
"step": 35850
},
{
"epoch": 10.4513423795935,
"grad_norm": 0.3229992985725403,
"learning_rate": 0.0004748651325371395,
"loss": 3.3697,
"step": 35900
},
{
"epoch": 10.46590181119329,
"grad_norm": 0.3085167407989502,
"learning_rate": 0.00047469035828721234,
"loss": 3.3625,
"step": 35950
},
{
"epoch": 10.480461242793082,
"grad_norm": 0.3422580361366272,
"learning_rate": 0.0004745155840372852,
"loss": 3.3778,
"step": 36000
},
{
"epoch": 10.480461242793082,
"eval_accuracy": 0.36864954889563534,
"eval_loss": 3.5684714317321777,
"eval_runtime": 53.892,
"eval_samples_per_second": 308.821,
"eval_steps_per_second": 19.316,
"step": 36000
},
{
"epoch": 10.495020674392872,
"grad_norm": 0.34788641333580017,
"learning_rate": 0.00047434080978735795,
"loss": 3.371,
"step": 36050
},
{
"epoch": 10.509580105992661,
"grad_norm": 0.3174489140510559,
"learning_rate": 0.0004741660355374308,
"loss": 3.3691,
"step": 36100
},
{
"epoch": 10.524139537592452,
"grad_norm": 0.3501240313053131,
"learning_rate": 0.0004739912612875036,
"loss": 3.367,
"step": 36150
},
{
"epoch": 10.538698969192243,
"grad_norm": 0.3440835773944855,
"learning_rate": 0.00047381648703757645,
"loss": 3.3787,
"step": 36200
},
{
"epoch": 10.553258400792034,
"grad_norm": 0.3479546308517456,
"learning_rate": 0.0004736417127876493,
"loss": 3.3656,
"step": 36250
},
{
"epoch": 10.567817832391823,
"grad_norm": 0.3386428654193878,
"learning_rate": 0.00047346693853772206,
"loss": 3.3647,
"step": 36300
},
{
"epoch": 10.582377263991614,
"grad_norm": 0.3308039903640747,
"learning_rate": 0.0004732921642877949,
"loss": 3.3657,
"step": 36350
},
{
"epoch": 10.596936695591404,
"grad_norm": 0.3481200635433197,
"learning_rate": 0.00047311739003786773,
"loss": 3.3783,
"step": 36400
},
{
"epoch": 10.611496127191195,
"grad_norm": 0.34154003858566284,
"learning_rate": 0.00047294261578794056,
"loss": 3.3755,
"step": 36450
},
{
"epoch": 10.626055558790984,
"grad_norm": 0.33981460332870483,
"learning_rate": 0.0004727678415380134,
"loss": 3.3738,
"step": 36500
},
{
"epoch": 10.640614990390775,
"grad_norm": 0.34777724742889404,
"learning_rate": 0.0004725930672880862,
"loss": 3.3713,
"step": 36550
},
{
"epoch": 10.655174421990566,
"grad_norm": 0.3389792740345001,
"learning_rate": 0.000472418293038159,
"loss": 3.3861,
"step": 36600
},
{
"epoch": 10.669733853590357,
"grad_norm": 0.32127055525779724,
"learning_rate": 0.00047224351878823184,
"loss": 3.3809,
"step": 36650
},
{
"epoch": 10.684293285190146,
"grad_norm": 0.3361356854438782,
"learning_rate": 0.0004720687445383047,
"loss": 3.3902,
"step": 36700
},
{
"epoch": 10.698852716789936,
"grad_norm": 0.3294849395751953,
"learning_rate": 0.00047189397028837745,
"loss": 3.3857,
"step": 36750
},
{
"epoch": 10.713412148389727,
"grad_norm": 0.3620792031288147,
"learning_rate": 0.0004717191960384503,
"loss": 3.3883,
"step": 36800
},
{
"epoch": 10.727971579989518,
"grad_norm": 0.31317734718322754,
"learning_rate": 0.0004715444217885231,
"loss": 3.3763,
"step": 36850
},
{
"epoch": 10.742531011589307,
"grad_norm": 0.33570465445518494,
"learning_rate": 0.00047136964753859595,
"loss": 3.3725,
"step": 36900
},
{
"epoch": 10.757090443189098,
"grad_norm": 0.32414549589157104,
"learning_rate": 0.0004711948732886688,
"loss": 3.3789,
"step": 36950
},
{
"epoch": 10.771649874788888,
"grad_norm": 0.3350679874420166,
"learning_rate": 0.00047102009903874156,
"loss": 3.3797,
"step": 37000
},
{
"epoch": 10.771649874788888,
"eval_accuracy": 0.36933459210709346,
"eval_loss": 3.5581462383270264,
"eval_runtime": 53.7954,
"eval_samples_per_second": 309.376,
"eval_steps_per_second": 19.351,
"step": 37000
},
{
"epoch": 10.78620930638868,
"grad_norm": 0.36362600326538086,
"learning_rate": 0.0004708453247888144,
"loss": 3.3874,
"step": 37050
},
{
"epoch": 10.800768737988468,
"grad_norm": 0.3308012783527374,
"learning_rate": 0.00047067055053888723,
"loss": 3.3835,
"step": 37100
},
{
"epoch": 10.815328169588259,
"grad_norm": 0.35319286584854126,
"learning_rate": 0.00047049577628896006,
"loss": 3.3845,
"step": 37150
},
{
"epoch": 10.82988760118805,
"grad_norm": 0.34049636125564575,
"learning_rate": 0.0004703210020390329,
"loss": 3.3913,
"step": 37200
},
{
"epoch": 10.84444703278784,
"grad_norm": 0.32271477580070496,
"learning_rate": 0.0004701462277891057,
"loss": 3.3867,
"step": 37250
},
{
"epoch": 10.85900646438763,
"grad_norm": 0.3542429804801941,
"learning_rate": 0.0004699714535391785,
"loss": 3.3767,
"step": 37300
},
{
"epoch": 10.87356589598742,
"grad_norm": 0.3518877327442169,
"learning_rate": 0.00046979667928925134,
"loss": 3.3977,
"step": 37350
},
{
"epoch": 10.888125327587211,
"grad_norm": 0.3210870921611786,
"learning_rate": 0.0004696219050393242,
"loss": 3.385,
"step": 37400
},
{
"epoch": 10.902684759187002,
"grad_norm": 0.36343225836753845,
"learning_rate": 0.00046944713078939695,
"loss": 3.3819,
"step": 37450
},
{
"epoch": 10.917244190786791,
"grad_norm": 0.34078526496887207,
"learning_rate": 0.0004692723565394698,
"loss": 3.3817,
"step": 37500
},
{
"epoch": 10.931803622386582,
"grad_norm": 0.32085925340652466,
"learning_rate": 0.0004690975822895426,
"loss": 3.3841,
"step": 37550
},
{
"epoch": 10.946363053986373,
"grad_norm": 0.3279073238372803,
"learning_rate": 0.00046892280803961545,
"loss": 3.3845,
"step": 37600
},
{
"epoch": 10.960922485586163,
"grad_norm": 0.35262706875801086,
"learning_rate": 0.0004687480337896883,
"loss": 3.3934,
"step": 37650
},
{
"epoch": 10.975481917185952,
"grad_norm": 0.36066585779190063,
"learning_rate": 0.00046857325953976106,
"loss": 3.3778,
"step": 37700
},
{
"epoch": 10.990041348785743,
"grad_norm": 0.35543933510780334,
"learning_rate": 0.0004683984852898339,
"loss": 3.3916,
"step": 37750
},
{
"epoch": 11.004367829479937,
"grad_norm": 0.33726900815963745,
"learning_rate": 0.00046822371103990673,
"loss": 3.365,
"step": 37800
},
{
"epoch": 11.018927261079728,
"grad_norm": 0.38896262645721436,
"learning_rate": 0.00046804893678997956,
"loss": 3.2857,
"step": 37850
},
{
"epoch": 11.033486692679517,
"grad_norm": 0.32145750522613525,
"learning_rate": 0.00046787416254005234,
"loss": 3.2777,
"step": 37900
},
{
"epoch": 11.048046124279308,
"grad_norm": 0.3301202058792114,
"learning_rate": 0.0004676993882901252,
"loss": 3.2926,
"step": 37950
},
{
"epoch": 11.062605555879099,
"grad_norm": 0.3512705862522125,
"learning_rate": 0.000467524614040198,
"loss": 3.2924,
"step": 38000
},
{
"epoch": 11.062605555879099,
"eval_accuracy": 0.369106126786142,
"eval_loss": 3.569038152694702,
"eval_runtime": 54.0656,
"eval_samples_per_second": 307.83,
"eval_steps_per_second": 19.254,
"step": 38000
},
{
"epoch": 11.07716498747889,
"grad_norm": 0.35407987236976624,
"learning_rate": 0.00046734983979027084,
"loss": 3.2867,
"step": 38050
},
{
"epoch": 11.091724419078679,
"grad_norm": 0.33891621232032776,
"learning_rate": 0.00046717506554034367,
"loss": 3.2913,
"step": 38100
},
{
"epoch": 11.10628385067847,
"grad_norm": 0.3495442569255829,
"learning_rate": 0.00046700029129041645,
"loss": 3.304,
"step": 38150
},
{
"epoch": 11.12084328227826,
"grad_norm": 0.33784544467926025,
"learning_rate": 0.0004668255170404893,
"loss": 3.314,
"step": 38200
},
{
"epoch": 11.135402713878051,
"grad_norm": 0.3771565556526184,
"learning_rate": 0.00046665074279056217,
"loss": 3.3101,
"step": 38250
},
{
"epoch": 11.14996214547784,
"grad_norm": 0.3389083743095398,
"learning_rate": 0.000466475968540635,
"loss": 3.3027,
"step": 38300
},
{
"epoch": 11.16452157707763,
"grad_norm": 0.34060126543045044,
"learning_rate": 0.00046630119429070784,
"loss": 3.3005,
"step": 38350
},
{
"epoch": 11.179081008677421,
"grad_norm": 0.33946189284324646,
"learning_rate": 0.00046612642004078067,
"loss": 3.3136,
"step": 38400
},
{
"epoch": 11.193640440277212,
"grad_norm": 0.33616408705711365,
"learning_rate": 0.00046595164579085345,
"loss": 3.3179,
"step": 38450
},
{
"epoch": 11.208199871877001,
"grad_norm": 0.35129034519195557,
"learning_rate": 0.0004657768715409263,
"loss": 3.3181,
"step": 38500
},
{
"epoch": 11.222759303476792,
"grad_norm": 0.37194448709487915,
"learning_rate": 0.0004656020972909991,
"loss": 3.3245,
"step": 38550
},
{
"epoch": 11.237318735076583,
"grad_norm": 0.3498443067073822,
"learning_rate": 0.00046542732304107195,
"loss": 3.3199,
"step": 38600
},
{
"epoch": 11.251878166676374,
"grad_norm": 0.33491912484169006,
"learning_rate": 0.0004652525487911447,
"loss": 3.3188,
"step": 38650
},
{
"epoch": 11.266437598276163,
"grad_norm": 0.4009822905063629,
"learning_rate": 0.00046507777454121756,
"loss": 3.3187,
"step": 38700
},
{
"epoch": 11.280997029875953,
"grad_norm": 0.3393429219722748,
"learning_rate": 0.0004649030002912904,
"loss": 3.3263,
"step": 38750
},
{
"epoch": 11.295556461475744,
"grad_norm": 0.345793217420578,
"learning_rate": 0.0004647282260413632,
"loss": 3.3232,
"step": 38800
},
{
"epoch": 11.310115893075535,
"grad_norm": 0.3599303066730499,
"learning_rate": 0.00046455345179143606,
"loss": 3.3334,
"step": 38850
},
{
"epoch": 11.324675324675324,
"grad_norm": 0.32492297887802124,
"learning_rate": 0.00046437867754150884,
"loss": 3.3376,
"step": 38900
},
{
"epoch": 11.339234756275115,
"grad_norm": 0.3709739148616791,
"learning_rate": 0.00046420390329158167,
"loss": 3.3403,
"step": 38950
},
{
"epoch": 11.353794187874906,
"grad_norm": 0.3648548424243927,
"learning_rate": 0.0004640291290416545,
"loss": 3.3359,
"step": 39000
},
{
"epoch": 11.353794187874906,
"eval_accuracy": 0.3693122511853329,
"eval_loss": 3.5671463012695312,
"eval_runtime": 54.06,
"eval_samples_per_second": 307.861,
"eval_steps_per_second": 19.256,
"step": 39000
},
{
"epoch": 11.368353619474696,
"grad_norm": 0.3588879108428955,
"learning_rate": 0.00046385435479172734,
"loss": 3.3408,
"step": 39050
},
{
"epoch": 11.382913051074485,
"grad_norm": 0.3257652223110199,
"learning_rate": 0.00046367958054180017,
"loss": 3.3512,
"step": 39100
},
{
"epoch": 11.397472482674276,
"grad_norm": 0.3484971225261688,
"learning_rate": 0.00046350480629187295,
"loss": 3.3376,
"step": 39150
},
{
"epoch": 11.412031914274067,
"grad_norm": 0.3337056636810303,
"learning_rate": 0.0004633300320419458,
"loss": 3.3432,
"step": 39200
},
{
"epoch": 11.426591345873858,
"grad_norm": 0.3409966230392456,
"learning_rate": 0.0004631552577920186,
"loss": 3.3473,
"step": 39250
},
{
"epoch": 11.441150777473647,
"grad_norm": 0.3414597511291504,
"learning_rate": 0.00046298048354209145,
"loss": 3.3393,
"step": 39300
},
{
"epoch": 11.455710209073438,
"grad_norm": 0.35023412108421326,
"learning_rate": 0.0004628057092921642,
"loss": 3.342,
"step": 39350
},
{
"epoch": 11.470269640673228,
"grad_norm": 0.3243994414806366,
"learning_rate": 0.00046263093504223706,
"loss": 3.3399,
"step": 39400
},
{
"epoch": 11.484829072273019,
"grad_norm": 0.32542338967323303,
"learning_rate": 0.0004624561607923099,
"loss": 3.3426,
"step": 39450
},
{
"epoch": 11.499388503872808,
"grad_norm": 0.38432076573371887,
"learning_rate": 0.0004622813865423827,
"loss": 3.34,
"step": 39500
},
{
"epoch": 11.513947935472599,
"grad_norm": 0.36201798915863037,
"learning_rate": 0.00046210661229245556,
"loss": 3.3318,
"step": 39550
},
{
"epoch": 11.52850736707239,
"grad_norm": 0.3721318542957306,
"learning_rate": 0.00046193183804252834,
"loss": 3.334,
"step": 39600
},
{
"epoch": 11.54306679867218,
"grad_norm": 0.33296698331832886,
"learning_rate": 0.00046175706379260117,
"loss": 3.3538,
"step": 39650
},
{
"epoch": 11.55762623027197,
"grad_norm": 0.3993701636791229,
"learning_rate": 0.000461582289542674,
"loss": 3.3529,
"step": 39700
},
{
"epoch": 11.57218566187176,
"grad_norm": 0.3540388345718384,
"learning_rate": 0.00046140751529274684,
"loss": 3.3533,
"step": 39750
},
{
"epoch": 11.586745093471551,
"grad_norm": 0.3348828852176666,
"learning_rate": 0.00046123274104281967,
"loss": 3.3525,
"step": 39800
},
{
"epoch": 11.601304525071342,
"grad_norm": 0.35919663310050964,
"learning_rate": 0.00046105796679289245,
"loss": 3.3464,
"step": 39850
},
{
"epoch": 11.61586395667113,
"grad_norm": 0.3327691853046417,
"learning_rate": 0.0004608831925429653,
"loss": 3.3645,
"step": 39900
},
{
"epoch": 11.630423388270922,
"grad_norm": 0.35042068362236023,
"learning_rate": 0.0004607084182930381,
"loss": 3.3599,
"step": 39950
},
{
"epoch": 11.644982819870712,
"grad_norm": 0.32780370116233826,
"learning_rate": 0.00046053364404311095,
"loss": 3.3503,
"step": 40000
},
{
"epoch": 11.644982819870712,
"eval_accuracy": 0.3698941734052962,
"eval_loss": 3.5564932823181152,
"eval_runtime": 53.915,
"eval_samples_per_second": 308.69,
"eval_steps_per_second": 19.308,
"step": 40000
},
{
"epoch": 11.659542251470503,
"grad_norm": 0.358482301235199,
"learning_rate": 0.0004603588697931837,
"loss": 3.3541,
"step": 40050
},
{
"epoch": 11.674101683070292,
"grad_norm": 0.33564674854278564,
"learning_rate": 0.00046018409554325656,
"loss": 3.3658,
"step": 40100
},
{
"epoch": 11.688661114670083,
"grad_norm": 0.3993256390094757,
"learning_rate": 0.0004600093212933294,
"loss": 3.3569,
"step": 40150
},
{
"epoch": 11.703220546269874,
"grad_norm": 0.34387677907943726,
"learning_rate": 0.0004598345470434022,
"loss": 3.3745,
"step": 40200
},
{
"epoch": 11.717779977869665,
"grad_norm": 0.3534398376941681,
"learning_rate": 0.00045965977279347506,
"loss": 3.357,
"step": 40250
},
{
"epoch": 11.732339409469454,
"grad_norm": 0.35269302129745483,
"learning_rate": 0.00045948499854354784,
"loss": 3.3703,
"step": 40300
},
{
"epoch": 11.746898841069244,
"grad_norm": 0.3300093114376068,
"learning_rate": 0.00045931022429362067,
"loss": 3.3671,
"step": 40350
},
{
"epoch": 11.761458272669035,
"grad_norm": 0.33715489506721497,
"learning_rate": 0.0004591354500436935,
"loss": 3.3716,
"step": 40400
},
{
"epoch": 11.776017704268826,
"grad_norm": 0.34225428104400635,
"learning_rate": 0.00045896067579376634,
"loss": 3.3641,
"step": 40450
},
{
"epoch": 11.790577135868615,
"grad_norm": 0.3427339494228363,
"learning_rate": 0.0004587859015438391,
"loss": 3.3754,
"step": 40500
},
{
"epoch": 11.805136567468406,
"grad_norm": 0.3600717782974243,
"learning_rate": 0.00045861112729391195,
"loss": 3.3607,
"step": 40550
},
{
"epoch": 11.819695999068196,
"grad_norm": 0.34151703119277954,
"learning_rate": 0.0004584363530439848,
"loss": 3.3648,
"step": 40600
},
{
"epoch": 11.834255430667987,
"grad_norm": 0.3521574139595032,
"learning_rate": 0.0004582615787940576,
"loss": 3.3645,
"step": 40650
},
{
"epoch": 11.848814862267776,
"grad_norm": 0.3284907937049866,
"learning_rate": 0.00045808680454413045,
"loss": 3.3637,
"step": 40700
},
{
"epoch": 11.863374293867567,
"grad_norm": 0.35242322087287903,
"learning_rate": 0.00045791203029420333,
"loss": 3.3755,
"step": 40750
},
{
"epoch": 11.877933725467358,
"grad_norm": 0.33842575550079346,
"learning_rate": 0.0004577372560442761,
"loss": 3.3726,
"step": 40800
},
{
"epoch": 11.892493157067149,
"grad_norm": 0.354082316160202,
"learning_rate": 0.00045756248179434894,
"loss": 3.3701,
"step": 40850
},
{
"epoch": 11.90705258866694,
"grad_norm": 0.32441848516464233,
"learning_rate": 0.0004573877075444218,
"loss": 3.3719,
"step": 40900
},
{
"epoch": 11.921612020266728,
"grad_norm": 0.3555139899253845,
"learning_rate": 0.0004572129332944946,
"loss": 3.3542,
"step": 40950
},
{
"epoch": 11.93617145186652,
"grad_norm": 0.3518112897872925,
"learning_rate": 0.00045703815904456744,
"loss": 3.3774,
"step": 41000
},
{
"epoch": 11.93617145186652,
"eval_accuracy": 0.3705694571614589,
"eval_loss": 3.5490357875823975,
"eval_runtime": 53.9812,
"eval_samples_per_second": 308.311,
"eval_steps_per_second": 19.284,
"step": 41000
},
{
"epoch": 11.95073088346631,
"grad_norm": 0.33228781819343567,
"learning_rate": 0.0004568633847946402,
"loss": 3.3794,
"step": 41050
},
{
"epoch": 11.965290315066099,
"grad_norm": 0.3327503204345703,
"learning_rate": 0.00045668861054471306,
"loss": 3.391,
"step": 41100
},
{
"epoch": 11.97984974666589,
"grad_norm": 0.32881519198417664,
"learning_rate": 0.0004565138362947859,
"loss": 3.3747,
"step": 41150
},
{
"epoch": 11.99440917826568,
"grad_norm": 0.3413378894329071,
"learning_rate": 0.0004563390620448587,
"loss": 3.3809,
"step": 41200
},
{
"epoch": 12.008735658959875,
"grad_norm": 0.35909968614578247,
"learning_rate": 0.0004561642877949315,
"loss": 3.3022,
"step": 41250
},
{
"epoch": 12.023295090559664,
"grad_norm": 0.3564230799674988,
"learning_rate": 0.00045598951354500433,
"loss": 3.267,
"step": 41300
},
{
"epoch": 12.037854522159455,
"grad_norm": 0.3933013677597046,
"learning_rate": 0.00045581473929507717,
"loss": 3.2561,
"step": 41350
},
{
"epoch": 12.052413953759245,
"grad_norm": 0.32905763387680054,
"learning_rate": 0.00045563996504515,
"loss": 3.2752,
"step": 41400
},
{
"epoch": 12.066973385359036,
"grad_norm": 0.37047168612480164,
"learning_rate": 0.00045546519079522283,
"loss": 3.2786,
"step": 41450
},
{
"epoch": 12.081532816958825,
"grad_norm": 0.34073546528816223,
"learning_rate": 0.0004552904165452956,
"loss": 3.2767,
"step": 41500
},
{
"epoch": 12.096092248558616,
"grad_norm": 0.3816109597682953,
"learning_rate": 0.00045511564229536844,
"loss": 3.2799,
"step": 41550
},
{
"epoch": 12.110651680158407,
"grad_norm": 0.3862399160861969,
"learning_rate": 0.0004549408680454413,
"loss": 3.2798,
"step": 41600
},
{
"epoch": 12.125211111758198,
"grad_norm": 0.352953165769577,
"learning_rate": 0.0004547660937955141,
"loss": 3.2707,
"step": 41650
},
{
"epoch": 12.139770543357987,
"grad_norm": 0.3749876320362091,
"learning_rate": 0.00045459131954558694,
"loss": 3.2888,
"step": 41700
},
{
"epoch": 12.154329974957777,
"grad_norm": 0.3750421106815338,
"learning_rate": 0.0004544165452956597,
"loss": 3.2932,
"step": 41750
},
{
"epoch": 12.168889406557568,
"grad_norm": 0.32412317395210266,
"learning_rate": 0.00045424177104573255,
"loss": 3.289,
"step": 41800
},
{
"epoch": 12.183448838157359,
"grad_norm": 0.3581444025039673,
"learning_rate": 0.0004540669967958054,
"loss": 3.2892,
"step": 41850
},
{
"epoch": 12.198008269757148,
"grad_norm": 0.3392166495323181,
"learning_rate": 0.0004538922225458782,
"loss": 3.2978,
"step": 41900
},
{
"epoch": 12.212567701356939,
"grad_norm": 0.36249926686286926,
"learning_rate": 0.000453717448295951,
"loss": 3.3048,
"step": 41950
},
{
"epoch": 12.22712713295673,
"grad_norm": 0.32953113317489624,
"learning_rate": 0.00045354267404602383,
"loss": 3.3027,
"step": 42000
},
{
"epoch": 12.22712713295673,
"eval_accuracy": 0.3696067986011761,
"eval_loss": 3.5610363483428955,
"eval_runtime": 53.9811,
"eval_samples_per_second": 308.312,
"eval_steps_per_second": 19.285,
"step": 42000
},
{
"epoch": 12.24168656455652,
"grad_norm": 0.3559216260910034,
"learning_rate": 0.00045336789979609667,
"loss": 3.3001,
"step": 42050
},
{
"epoch": 12.25624599615631,
"grad_norm": 0.3325844705104828,
"learning_rate": 0.0004531931255461695,
"loss": 3.2938,
"step": 42100
},
{
"epoch": 12.2708054277561,
"grad_norm": 0.3980172574520111,
"learning_rate": 0.00045301835129624233,
"loss": 3.3119,
"step": 42150
},
{
"epoch": 12.28536485935589,
"grad_norm": 0.32405316829681396,
"learning_rate": 0.0004528435770463151,
"loss": 3.3034,
"step": 42200
},
{
"epoch": 12.299924290955682,
"grad_norm": 0.34660008549690247,
"learning_rate": 0.00045266880279638794,
"loss": 3.3109,
"step": 42250
},
{
"epoch": 12.31448372255547,
"grad_norm": 0.3319573998451233,
"learning_rate": 0.0004524940285464608,
"loss": 3.3025,
"step": 42300
},
{
"epoch": 12.329043154155261,
"grad_norm": 0.38782092928886414,
"learning_rate": 0.0004523192542965336,
"loss": 3.3051,
"step": 42350
},
{
"epoch": 12.343602585755052,
"grad_norm": 0.3703951835632324,
"learning_rate": 0.00045214448004660644,
"loss": 3.3112,
"step": 42400
},
{
"epoch": 12.358162017354843,
"grad_norm": 0.3678281903266907,
"learning_rate": 0.0004519697057966792,
"loss": 3.3175,
"step": 42450
},
{
"epoch": 12.372721448954632,
"grad_norm": 0.34847941994667053,
"learning_rate": 0.00045179493154675205,
"loss": 3.3125,
"step": 42500
},
{
"epoch": 12.387280880554423,
"grad_norm": 0.39477667212486267,
"learning_rate": 0.0004516201572968249,
"loss": 3.323,
"step": 42550
},
{
"epoch": 12.401840312154214,
"grad_norm": 0.3333967328071594,
"learning_rate": 0.0004514453830468977,
"loss": 3.3301,
"step": 42600
},
{
"epoch": 12.416399743754004,
"grad_norm": 0.3731665015220642,
"learning_rate": 0.0004512706087969705,
"loss": 3.3191,
"step": 42650
},
{
"epoch": 12.430959175353793,
"grad_norm": 0.3655344247817993,
"learning_rate": 0.00045109583454704333,
"loss": 3.3232,
"step": 42700
},
{
"epoch": 12.445518606953584,
"grad_norm": 0.36312124133110046,
"learning_rate": 0.00045092106029711616,
"loss": 3.3276,
"step": 42750
},
{
"epoch": 12.460078038553375,
"grad_norm": 0.3322156071662903,
"learning_rate": 0.000450746286047189,
"loss": 3.3234,
"step": 42800
},
{
"epoch": 12.474637470153166,
"grad_norm": 0.35798975825309753,
"learning_rate": 0.00045057151179726183,
"loss": 3.3379,
"step": 42850
},
{
"epoch": 12.489196901752955,
"grad_norm": 0.34931838512420654,
"learning_rate": 0.0004503967375473346,
"loss": 3.3324,
"step": 42900
},
{
"epoch": 12.503756333352746,
"grad_norm": 0.3425159752368927,
"learning_rate": 0.00045022196329740744,
"loss": 3.3265,
"step": 42950
},
{
"epoch": 12.518315764952536,
"grad_norm": 0.3326928913593292,
"learning_rate": 0.0004500471890474803,
"loss": 3.3386,
"step": 43000
},
{
"epoch": 12.518315764952536,
"eval_accuracy": 0.37016050070944184,
"eval_loss": 3.5549118518829346,
"eval_runtime": 53.8689,
"eval_samples_per_second": 308.954,
"eval_steps_per_second": 19.325,
"step": 43000
},
{
"epoch": 12.532875196552327,
"grad_norm": 0.39922279119491577,
"learning_rate": 0.0004498724147975531,
"loss": 3.3232,
"step": 43050
},
{
"epoch": 12.547434628152118,
"grad_norm": 0.3702709376811981,
"learning_rate": 0.00044969764054762594,
"loss": 3.3329,
"step": 43100
},
{
"epoch": 12.561994059751907,
"grad_norm": 0.33652347326278687,
"learning_rate": 0.0004495228662976987,
"loss": 3.3552,
"step": 43150
},
{
"epoch": 12.576553491351698,
"grad_norm": 0.35900095105171204,
"learning_rate": 0.00044934809204777155,
"loss": 3.3367,
"step": 43200
},
{
"epoch": 12.591112922951488,
"grad_norm": 0.3671649694442749,
"learning_rate": 0.0004491733177978444,
"loss": 3.3357,
"step": 43250
},
{
"epoch": 12.605672354551277,
"grad_norm": 0.3420347273349762,
"learning_rate": 0.0004489985435479173,
"loss": 3.3338,
"step": 43300
},
{
"epoch": 12.620231786151068,
"grad_norm": 0.3353428244590759,
"learning_rate": 0.0004488237692979901,
"loss": 3.3421,
"step": 43350
},
{
"epoch": 12.634791217750859,
"grad_norm": 0.3602941036224365,
"learning_rate": 0.0004486489950480629,
"loss": 3.3351,
"step": 43400
},
{
"epoch": 12.64935064935065,
"grad_norm": 0.3476759195327759,
"learning_rate": 0.0004484742207981357,
"loss": 3.3393,
"step": 43450
},
{
"epoch": 12.66391008095044,
"grad_norm": 0.3405091464519501,
"learning_rate": 0.00044829944654820855,
"loss": 3.3518,
"step": 43500
},
{
"epoch": 12.67846951255023,
"grad_norm": 0.36805057525634766,
"learning_rate": 0.0004481246722982814,
"loss": 3.3508,
"step": 43550
},
{
"epoch": 12.69302894415002,
"grad_norm": 0.3548225164413452,
"learning_rate": 0.0004479498980483542,
"loss": 3.3468,
"step": 43600
},
{
"epoch": 12.707588375749811,
"grad_norm": 0.3556191325187683,
"learning_rate": 0.000447775123798427,
"loss": 3.342,
"step": 43650
},
{
"epoch": 12.7221478073496,
"grad_norm": 0.3459774851799011,
"learning_rate": 0.00044760034954849983,
"loss": 3.3501,
"step": 43700
},
{
"epoch": 12.736707238949391,
"grad_norm": 0.32603490352630615,
"learning_rate": 0.00044742557529857266,
"loss": 3.3453,
"step": 43750
},
{
"epoch": 12.751266670549182,
"grad_norm": 0.3487292230129242,
"learning_rate": 0.0004472508010486455,
"loss": 3.34,
"step": 43800
},
{
"epoch": 12.765826102148973,
"grad_norm": 0.34649938344955444,
"learning_rate": 0.0004470760267987183,
"loss": 3.3549,
"step": 43850
},
{
"epoch": 12.780385533748763,
"grad_norm": 0.33671310544013977,
"learning_rate": 0.0004469012525487911,
"loss": 3.3551,
"step": 43900
},
{
"epoch": 12.794944965348552,
"grad_norm": 0.3770430088043213,
"learning_rate": 0.00044672647829886394,
"loss": 3.3422,
"step": 43950
},
{
"epoch": 12.809504396948343,
"grad_norm": 0.3594439625740051,
"learning_rate": 0.00044655170404893677,
"loss": 3.3472,
"step": 44000
},
{
"epoch": 12.809504396948343,
"eval_accuracy": 0.37095325068054563,
"eval_loss": 3.5484530925750732,
"eval_runtime": 53.9621,
"eval_samples_per_second": 308.42,
"eval_steps_per_second": 19.291,
"step": 44000
},
{
"epoch": 12.824063828548134,
"grad_norm": 0.3437642455101013,
"learning_rate": 0.0004463769297990096,
"loss": 3.355,
"step": 44050
},
{
"epoch": 12.838623260147925,
"grad_norm": 0.35489070415496826,
"learning_rate": 0.0004462021555490824,
"loss": 3.3439,
"step": 44100
},
{
"epoch": 12.853182691747714,
"grad_norm": 0.36889269948005676,
"learning_rate": 0.0004460273812991552,
"loss": 3.3646,
"step": 44150
},
{
"epoch": 12.867742123347504,
"grad_norm": 0.3426932692527771,
"learning_rate": 0.00044585260704922805,
"loss": 3.3556,
"step": 44200
},
{
"epoch": 12.882301554947295,
"grad_norm": 0.32859864830970764,
"learning_rate": 0.0004456778327993009,
"loss": 3.3724,
"step": 44250
},
{
"epoch": 12.896860986547086,
"grad_norm": 0.3485097289085388,
"learning_rate": 0.0004455030585493737,
"loss": 3.3643,
"step": 44300
},
{
"epoch": 12.911420418146875,
"grad_norm": 0.3588804006576538,
"learning_rate": 0.0004453282842994465,
"loss": 3.3393,
"step": 44350
},
{
"epoch": 12.925979849746666,
"grad_norm": 0.33998623490333557,
"learning_rate": 0.00044515351004951933,
"loss": 3.3432,
"step": 44400
},
{
"epoch": 12.940539281346457,
"grad_norm": 0.35905739665031433,
"learning_rate": 0.00044497873579959216,
"loss": 3.3529,
"step": 44450
},
{
"epoch": 12.955098712946247,
"grad_norm": 0.3558342456817627,
"learning_rate": 0.000444803961549665,
"loss": 3.354,
"step": 44500
},
{
"epoch": 12.969658144546036,
"grad_norm": 0.3453124761581421,
"learning_rate": 0.0004446291872997378,
"loss": 3.3444,
"step": 44550
},
{
"epoch": 12.984217576145827,
"grad_norm": 0.3470025360584259,
"learning_rate": 0.0004444544130498106,
"loss": 3.3561,
"step": 44600
},
{
"epoch": 12.998777007745618,
"grad_norm": 0.3806982636451721,
"learning_rate": 0.00044427963879988344,
"loss": 3.3558,
"step": 44650
},
{
"epoch": 13.01310348843981,
"grad_norm": 0.40065309405326843,
"learning_rate": 0.00044410486454995627,
"loss": 3.2516,
"step": 44700
},
{
"epoch": 13.027662920039601,
"grad_norm": 0.3657712936401367,
"learning_rate": 0.0004439300903000291,
"loss": 3.2454,
"step": 44750
},
{
"epoch": 13.042222351639392,
"grad_norm": 0.3365810513496399,
"learning_rate": 0.0004437553160501019,
"loss": 3.2503,
"step": 44800
},
{
"epoch": 13.056781783239183,
"grad_norm": 0.36263352632522583,
"learning_rate": 0.0004435805418001747,
"loss": 3.2604,
"step": 44850
},
{
"epoch": 13.071341214838972,
"grad_norm": 0.349775105714798,
"learning_rate": 0.00044340576755024755,
"loss": 3.2649,
"step": 44900
},
{
"epoch": 13.085900646438763,
"grad_norm": 0.34163522720336914,
"learning_rate": 0.0004432309933003204,
"loss": 3.2584,
"step": 44950
},
{
"epoch": 13.100460078038553,
"grad_norm": 0.34596383571624756,
"learning_rate": 0.0004430562190503932,
"loss": 3.2543,
"step": 45000
},
{
"epoch": 13.100460078038553,
"eval_accuracy": 0.37036556685444405,
"eval_loss": 3.5629189014434814,
"eval_runtime": 53.9034,
"eval_samples_per_second": 308.756,
"eval_steps_per_second": 19.312,
"step": 45000
},
{
"epoch": 13.115019509638344,
"grad_norm": 0.34392884373664856,
"learning_rate": 0.000442881444800466,
"loss": 3.2718,
"step": 45050
},
{
"epoch": 13.129578941238133,
"grad_norm": 0.3442322015762329,
"learning_rate": 0.00044270667055053883,
"loss": 3.2637,
"step": 45100
},
{
"epoch": 13.144138372837924,
"grad_norm": 0.3357892632484436,
"learning_rate": 0.00044253189630061166,
"loss": 3.2682,
"step": 45150
},
{
"epoch": 13.158697804437715,
"grad_norm": 0.34123629331588745,
"learning_rate": 0.0004423571220506845,
"loss": 3.2682,
"step": 45200
},
{
"epoch": 13.173257236037506,
"grad_norm": 0.369365930557251,
"learning_rate": 0.00044218234780075727,
"loss": 3.2775,
"step": 45250
},
{
"epoch": 13.187816667637296,
"grad_norm": 0.35826003551483154,
"learning_rate": 0.0004420075735508301,
"loss": 3.2802,
"step": 45300
},
{
"epoch": 13.202376099237085,
"grad_norm": 0.3576175570487976,
"learning_rate": 0.00044183279930090294,
"loss": 3.2813,
"step": 45350
},
{
"epoch": 13.216935530836876,
"grad_norm": 0.36184608936309814,
"learning_rate": 0.00044165802505097577,
"loss": 3.2794,
"step": 45400
},
{
"epoch": 13.231494962436667,
"grad_norm": 0.34625130891799927,
"learning_rate": 0.0004414832508010486,
"loss": 3.2818,
"step": 45450
},
{
"epoch": 13.246054394036458,
"grad_norm": 0.35068416595458984,
"learning_rate": 0.0004413084765511214,
"loss": 3.2791,
"step": 45500
},
{
"epoch": 13.260613825636247,
"grad_norm": 0.37707656621932983,
"learning_rate": 0.0004411337023011942,
"loss": 3.2962,
"step": 45550
},
{
"epoch": 13.275173257236037,
"grad_norm": 0.3505012094974518,
"learning_rate": 0.00044095892805126705,
"loss": 3.2936,
"step": 45600
},
{
"epoch": 13.289732688835828,
"grad_norm": 0.38045358657836914,
"learning_rate": 0.0004407841538013399,
"loss": 3.2977,
"step": 45650
},
{
"epoch": 13.304292120435619,
"grad_norm": 0.3746790289878845,
"learning_rate": 0.0004406093795514127,
"loss": 3.3078,
"step": 45700
},
{
"epoch": 13.318851552035408,
"grad_norm": 0.3702433705329895,
"learning_rate": 0.0004404346053014855,
"loss": 3.2968,
"step": 45750
},
{
"epoch": 13.333410983635199,
"grad_norm": 0.36914801597595215,
"learning_rate": 0.0004402598310515584,
"loss": 3.2999,
"step": 45800
},
{
"epoch": 13.34797041523499,
"grad_norm": 0.38449791073799133,
"learning_rate": 0.0004400850568016312,
"loss": 3.306,
"step": 45850
},
{
"epoch": 13.36252984683478,
"grad_norm": 0.3450029790401459,
"learning_rate": 0.00043991028255170405,
"loss": 3.2906,
"step": 45900
},
{
"epoch": 13.37708927843457,
"grad_norm": 0.3418305516242981,
"learning_rate": 0.0004397355083017769,
"loss": 3.3096,
"step": 45950
},
{
"epoch": 13.39164871003436,
"grad_norm": 0.3726021945476532,
"learning_rate": 0.00043956073405184966,
"loss": 3.2975,
"step": 46000
},
{
"epoch": 13.39164871003436,
"eval_accuracy": 0.3704893825945171,
"eval_loss": 3.562157392501831,
"eval_runtime": 54.1441,
"eval_samples_per_second": 307.383,
"eval_steps_per_second": 19.226,
"step": 46000
},
{
"epoch": 13.406208141634151,
"grad_norm": 0.38728371262550354,
"learning_rate": 0.0004393859598019225,
"loss": 3.3054,
"step": 46050
},
{
"epoch": 13.420767573233942,
"grad_norm": 0.33851444721221924,
"learning_rate": 0.0004392111855519953,
"loss": 3.3035,
"step": 46100
},
{
"epoch": 13.43532700483373,
"grad_norm": 0.3990343511104584,
"learning_rate": 0.00043903641130206816,
"loss": 3.3157,
"step": 46150
},
{
"epoch": 13.449886436433522,
"grad_norm": 0.33781442046165466,
"learning_rate": 0.000438861637052141,
"loss": 3.3149,
"step": 46200
},
{
"epoch": 13.464445868033312,
"grad_norm": 0.3608773350715637,
"learning_rate": 0.00043868686280221377,
"loss": 3.3063,
"step": 46250
},
{
"epoch": 13.479005299633103,
"grad_norm": 0.35064592957496643,
"learning_rate": 0.0004385120885522866,
"loss": 3.3086,
"step": 46300
},
{
"epoch": 13.493564731232892,
"grad_norm": 0.36569035053253174,
"learning_rate": 0.00043833731430235944,
"loss": 3.3099,
"step": 46350
},
{
"epoch": 13.508124162832683,
"grad_norm": 0.3421012759208679,
"learning_rate": 0.00043816254005243227,
"loss": 3.3263,
"step": 46400
},
{
"epoch": 13.522683594432474,
"grad_norm": 0.3369988203048706,
"learning_rate": 0.0004379877658025051,
"loss": 3.3287,
"step": 46450
},
{
"epoch": 13.537243026032264,
"grad_norm": 0.36069077253341675,
"learning_rate": 0.0004378129915525779,
"loss": 3.3208,
"step": 46500
},
{
"epoch": 13.551802457632053,
"grad_norm": 0.3256238102912903,
"learning_rate": 0.0004376382173026507,
"loss": 3.3102,
"step": 46550
},
{
"epoch": 13.566361889231844,
"grad_norm": 0.3456880748271942,
"learning_rate": 0.00043746344305272355,
"loss": 3.3189,
"step": 46600
},
{
"epoch": 13.580921320831635,
"grad_norm": 0.3881891071796417,
"learning_rate": 0.0004372886688027964,
"loss": 3.3217,
"step": 46650
},
{
"epoch": 13.595480752431426,
"grad_norm": 0.34957295656204224,
"learning_rate": 0.00043711389455286916,
"loss": 3.3285,
"step": 46700
},
{
"epoch": 13.610040184031215,
"grad_norm": 0.34814003109931946,
"learning_rate": 0.000436939120302942,
"loss": 3.3207,
"step": 46750
},
{
"epoch": 13.624599615631006,
"grad_norm": 0.32534271478652954,
"learning_rate": 0.0004367643460530148,
"loss": 3.3224,
"step": 46800
},
{
"epoch": 13.639159047230796,
"grad_norm": 0.36359184980392456,
"learning_rate": 0.00043658957180308766,
"loss": 3.3316,
"step": 46850
},
{
"epoch": 13.653718478830587,
"grad_norm": 0.3385885953903198,
"learning_rate": 0.0004364147975531605,
"loss": 3.3246,
"step": 46900
},
{
"epoch": 13.668277910430376,
"grad_norm": 0.3716272711753845,
"learning_rate": 0.00043624002330323327,
"loss": 3.3287,
"step": 46950
},
{
"epoch": 13.682837342030167,
"grad_norm": 0.3404206931591034,
"learning_rate": 0.0004360652490533061,
"loss": 3.3194,
"step": 47000
},
{
"epoch": 13.682837342030167,
"eval_accuracy": 0.3711830094232832,
"eval_loss": 3.5483055114746094,
"eval_runtime": 53.9157,
"eval_samples_per_second": 308.685,
"eval_steps_per_second": 19.308,
"step": 47000
},
{
"epoch": 13.697396773629958,
"grad_norm": 0.32848721742630005,
"learning_rate": 0.00043589047480337893,
"loss": 3.3275,
"step": 47050
},
{
"epoch": 13.711956205229749,
"grad_norm": 0.37124887108802795,
"learning_rate": 0.00043571570055345177,
"loss": 3.3219,
"step": 47100
},
{
"epoch": 13.726515636829538,
"grad_norm": 0.35257938504219055,
"learning_rate": 0.00043554092630352455,
"loss": 3.3272,
"step": 47150
},
{
"epoch": 13.741075068429328,
"grad_norm": 0.34476980566978455,
"learning_rate": 0.0004353661520535974,
"loss": 3.3159,
"step": 47200
},
{
"epoch": 13.75563450002912,
"grad_norm": 0.3280114233493805,
"learning_rate": 0.0004351913778036702,
"loss": 3.3301,
"step": 47250
},
{
"epoch": 13.77019393162891,
"grad_norm": 0.34827882051467896,
"learning_rate": 0.00043501660355374305,
"loss": 3.3353,
"step": 47300
},
{
"epoch": 13.784753363228699,
"grad_norm": 0.34743741154670715,
"learning_rate": 0.0004348418293038159,
"loss": 3.3392,
"step": 47350
},
{
"epoch": 13.79931279482849,
"grad_norm": 0.3636046051979065,
"learning_rate": 0.00043466705505388866,
"loss": 3.3299,
"step": 47400
},
{
"epoch": 13.81387222642828,
"grad_norm": 0.3252928555011749,
"learning_rate": 0.0004344922808039615,
"loss": 3.3242,
"step": 47450
},
{
"epoch": 13.828431658028071,
"grad_norm": 0.3536205291748047,
"learning_rate": 0.0004343175065540343,
"loss": 3.3361,
"step": 47500
},
{
"epoch": 13.84299108962786,
"grad_norm": 0.3566448390483856,
"learning_rate": 0.00043414273230410716,
"loss": 3.3314,
"step": 47550
},
{
"epoch": 13.857550521227651,
"grad_norm": 0.35187003016471863,
"learning_rate": 0.00043396795805418,
"loss": 3.3405,
"step": 47600
},
{
"epoch": 13.872109952827442,
"grad_norm": 0.3437885642051697,
"learning_rate": 0.00043379318380425277,
"loss": 3.3419,
"step": 47650
},
{
"epoch": 13.886669384427233,
"grad_norm": 0.3523525595664978,
"learning_rate": 0.0004336184095543256,
"loss": 3.3432,
"step": 47700
},
{
"epoch": 13.901228816027022,
"grad_norm": 0.3286518454551697,
"learning_rate": 0.00043344363530439843,
"loss": 3.3352,
"step": 47750
},
{
"epoch": 13.915788247626812,
"grad_norm": 0.3584185540676117,
"learning_rate": 0.00043326886105447127,
"loss": 3.3352,
"step": 47800
},
{
"epoch": 13.930347679226603,
"grad_norm": 0.3556600511074066,
"learning_rate": 0.00043309408680454405,
"loss": 3.3239,
"step": 47850
},
{
"epoch": 13.944907110826394,
"grad_norm": 0.3555219769477844,
"learning_rate": 0.0004329193125546169,
"loss": 3.3366,
"step": 47900
},
{
"epoch": 13.959466542426183,
"grad_norm": 0.33693447709083557,
"learning_rate": 0.0004327445383046897,
"loss": 3.3353,
"step": 47950
},
{
"epoch": 13.974025974025974,
"grad_norm": 0.32497769594192505,
"learning_rate": 0.00043256976405476255,
"loss": 3.3425,
"step": 48000
},
{
"epoch": 13.974025974025974,
"eval_accuracy": 0.37171954429693294,
"eval_loss": 3.5404067039489746,
"eval_runtime": 53.9107,
"eval_samples_per_second": 308.714,
"eval_steps_per_second": 19.31,
"step": 48000
},
{
"epoch": 13.988585405625765,
"grad_norm": 0.35186105966567993,
"learning_rate": 0.0004323949898048354,
"loss": 3.3426,
"step": 48050
},
{
"epoch": 14.002911886319959,
"grad_norm": 0.36423787474632263,
"learning_rate": 0.00043222021555490816,
"loss": 3.3166,
"step": 48100
},
{
"epoch": 14.017471317919748,
"grad_norm": 0.32975277304649353,
"learning_rate": 0.000432045441304981,
"loss": 3.2352,
"step": 48150
},
{
"epoch": 14.032030749519539,
"grad_norm": 0.3414633572101593,
"learning_rate": 0.0004318706670550538,
"loss": 3.2437,
"step": 48200
},
{
"epoch": 14.04659018111933,
"grad_norm": 0.3397580087184906,
"learning_rate": 0.00043169589280512666,
"loss": 3.2371,
"step": 48250
},
{
"epoch": 14.06114961271912,
"grad_norm": 0.37154221534729004,
"learning_rate": 0.0004315211185551995,
"loss": 3.2452,
"step": 48300
},
{
"epoch": 14.07570904431891,
"grad_norm": 0.37078914046287537,
"learning_rate": 0.0004313463443052724,
"loss": 3.2346,
"step": 48350
},
{
"epoch": 14.0902684759187,
"grad_norm": 0.36830246448516846,
"learning_rate": 0.00043117157005534515,
"loss": 3.2361,
"step": 48400
},
{
"epoch": 14.10482790751849,
"grad_norm": 0.33482542634010315,
"learning_rate": 0.000430996795805418,
"loss": 3.2619,
"step": 48450
},
{
"epoch": 14.119387339118282,
"grad_norm": 0.36272552609443665,
"learning_rate": 0.0004308220215554908,
"loss": 3.2486,
"step": 48500
},
{
"epoch": 14.13394677071807,
"grad_norm": 0.3442781865596771,
"learning_rate": 0.00043064724730556365,
"loss": 3.2551,
"step": 48550
},
{
"epoch": 14.148506202317861,
"grad_norm": 0.37955281138420105,
"learning_rate": 0.00043047247305563643,
"loss": 3.2665,
"step": 48600
},
{
"epoch": 14.163065633917652,
"grad_norm": 0.3380308449268341,
"learning_rate": 0.00043029769880570927,
"loss": 3.255,
"step": 48650
},
{
"epoch": 14.177625065517443,
"grad_norm": 0.36957699060440063,
"learning_rate": 0.0004301229245557821,
"loss": 3.2574,
"step": 48700
},
{
"epoch": 14.192184497117232,
"grad_norm": 0.3417280614376068,
"learning_rate": 0.00042994815030585493,
"loss": 3.2674,
"step": 48750
},
{
"epoch": 14.206743928717023,
"grad_norm": 0.3839270770549774,
"learning_rate": 0.00042977337605592776,
"loss": 3.2759,
"step": 48800
},
{
"epoch": 14.221303360316814,
"grad_norm": 0.3585422933101654,
"learning_rate": 0.00042959860180600054,
"loss": 3.2727,
"step": 48850
},
{
"epoch": 14.235862791916604,
"grad_norm": 0.33868226408958435,
"learning_rate": 0.0004294238275560734,
"loss": 3.2727,
"step": 48900
},
{
"epoch": 14.250422223516393,
"grad_norm": 0.3586726188659668,
"learning_rate": 0.0004292490533061462,
"loss": 3.2764,
"step": 48950
},
{
"epoch": 14.264981655116184,
"grad_norm": 0.3632107973098755,
"learning_rate": 0.00042907427905621904,
"loss": 3.2744,
"step": 49000
},
{
"epoch": 14.264981655116184,
"eval_accuracy": 0.37127966330584733,
"eval_loss": 3.5580098628997803,
"eval_runtime": 54.0909,
"eval_samples_per_second": 307.686,
"eval_steps_per_second": 19.245,
"step": 49000
},
{
"epoch": 14.279541086715975,
"grad_norm": 0.36595603823661804,
"learning_rate": 0.0004288995048062919,
"loss": 3.2797,
"step": 49050
},
{
"epoch": 14.294100518315766,
"grad_norm": 0.3803919553756714,
"learning_rate": 0.00042872473055636465,
"loss": 3.274,
"step": 49100
},
{
"epoch": 14.308659949915555,
"grad_norm": 0.36094287037849426,
"learning_rate": 0.0004285499563064375,
"loss": 3.2691,
"step": 49150
},
{
"epoch": 14.323219381515345,
"grad_norm": 0.3780951201915741,
"learning_rate": 0.0004283751820565103,
"loss": 3.2754,
"step": 49200
},
{
"epoch": 14.337778813115136,
"grad_norm": 0.38594871759414673,
"learning_rate": 0.00042820040780658315,
"loss": 3.2913,
"step": 49250
},
{
"epoch": 14.352338244714927,
"grad_norm": 0.362032026052475,
"learning_rate": 0.00042802563355665593,
"loss": 3.2821,
"step": 49300
},
{
"epoch": 14.366897676314716,
"grad_norm": 0.3559892177581787,
"learning_rate": 0.00042785085930672876,
"loss": 3.2865,
"step": 49350
},
{
"epoch": 14.381457107914507,
"grad_norm": 0.3505326211452484,
"learning_rate": 0.0004276760850568016,
"loss": 3.2994,
"step": 49400
},
{
"epoch": 14.396016539514298,
"grad_norm": 0.3532223403453827,
"learning_rate": 0.00042750131080687443,
"loss": 3.279,
"step": 49450
},
{
"epoch": 14.410575971114088,
"grad_norm": 0.3655863404273987,
"learning_rate": 0.00042732653655694726,
"loss": 3.284,
"step": 49500
},
{
"epoch": 14.425135402713877,
"grad_norm": 0.3348618745803833,
"learning_rate": 0.00042715176230702004,
"loss": 3.297,
"step": 49550
},
{
"epoch": 14.439694834313668,
"grad_norm": 0.3709411919116974,
"learning_rate": 0.0004269769880570929,
"loss": 3.3011,
"step": 49600
},
{
"epoch": 14.454254265913459,
"grad_norm": 0.3769262433052063,
"learning_rate": 0.0004268022138071657,
"loss": 3.2907,
"step": 49650
},
{
"epoch": 14.46881369751325,
"grad_norm": 0.3713088631629944,
"learning_rate": 0.00042662743955723854,
"loss": 3.2944,
"step": 49700
},
{
"epoch": 14.483373129113039,
"grad_norm": 0.3499068319797516,
"learning_rate": 0.0004264526653073114,
"loss": 3.2959,
"step": 49750
},
{
"epoch": 14.49793256071283,
"grad_norm": 0.3687066435813904,
"learning_rate": 0.00042627789105738415,
"loss": 3.2915,
"step": 49800
},
{
"epoch": 14.51249199231262,
"grad_norm": 0.38556233048439026,
"learning_rate": 0.000426103116807457,
"loss": 3.2961,
"step": 49850
},
{
"epoch": 14.527051423912411,
"grad_norm": 0.3844015896320343,
"learning_rate": 0.0004259283425575298,
"loss": 3.2934,
"step": 49900
},
{
"epoch": 14.5416108555122,
"grad_norm": 0.36595389246940613,
"learning_rate": 0.00042575356830760265,
"loss": 3.3006,
"step": 49950
},
{
"epoch": 14.556170287111991,
"grad_norm": 0.3643812835216522,
"learning_rate": 0.00042557879405767543,
"loss": 3.3138,
"step": 50000
},
{
"epoch": 14.556170287111991,
"eval_accuracy": 0.37170237706231696,
"eval_loss": 3.5477921962738037,
"eval_runtime": 53.7554,
"eval_samples_per_second": 309.606,
"eval_steps_per_second": 19.366,
"step": 50000
},
{
"epoch": 14.570729718711782,
"grad_norm": 0.3703961968421936,
"learning_rate": 0.00042540401980774826,
"loss": 3.2976,
"step": 50050
},
{
"epoch": 14.585289150311572,
"grad_norm": 0.34066158533096313,
"learning_rate": 0.0004252292455578211,
"loss": 3.3139,
"step": 50100
},
{
"epoch": 14.599848581911361,
"grad_norm": 0.3463113009929657,
"learning_rate": 0.00042505447130789393,
"loss": 3.3023,
"step": 50150
},
{
"epoch": 14.614408013511152,
"grad_norm": 0.3557461202144623,
"learning_rate": 0.00042487969705796676,
"loss": 3.2978,
"step": 50200
},
{
"epoch": 14.628967445110943,
"grad_norm": 0.36186400055885315,
"learning_rate": 0.00042470492280803954,
"loss": 3.3095,
"step": 50250
},
{
"epoch": 14.643526876710734,
"grad_norm": 0.33865877985954285,
"learning_rate": 0.0004245301485581124,
"loss": 3.3047,
"step": 50300
},
{
"epoch": 14.658086308310523,
"grad_norm": 0.331206351518631,
"learning_rate": 0.0004243553743081852,
"loss": 3.3206,
"step": 50350
},
{
"epoch": 14.672645739910314,
"grad_norm": 0.3369050621986389,
"learning_rate": 0.00042418060005825804,
"loss": 3.3068,
"step": 50400
},
{
"epoch": 14.687205171510104,
"grad_norm": 0.35863232612609863,
"learning_rate": 0.0004240058258083308,
"loss": 3.3162,
"step": 50450
},
{
"epoch": 14.701764603109895,
"grad_norm": 0.3646875023841858,
"learning_rate": 0.00042383105155840365,
"loss": 3.3166,
"step": 50500
},
{
"epoch": 14.716324034709684,
"grad_norm": 0.35427382588386536,
"learning_rate": 0.0004236562773084765,
"loss": 3.3133,
"step": 50550
},
{
"epoch": 14.730883466309475,
"grad_norm": 0.3502483665943146,
"learning_rate": 0.0004234815030585493,
"loss": 3.3116,
"step": 50600
},
{
"epoch": 14.745442897909266,
"grad_norm": 0.34835225343704224,
"learning_rate": 0.00042330672880862215,
"loss": 3.32,
"step": 50650
},
{
"epoch": 14.760002329509057,
"grad_norm": 0.35687556862831116,
"learning_rate": 0.00042313195455869493,
"loss": 3.3089,
"step": 50700
},
{
"epoch": 14.774561761108846,
"grad_norm": 0.3292189836502075,
"learning_rate": 0.00042295718030876776,
"loss": 3.3108,
"step": 50750
},
{
"epoch": 14.789121192708636,
"grad_norm": 0.3459155559539795,
"learning_rate": 0.0004227824060588406,
"loss": 3.3135,
"step": 50800
},
{
"epoch": 14.803680624308427,
"grad_norm": 0.3650501072406769,
"learning_rate": 0.0004226076318089135,
"loss": 3.3255,
"step": 50850
},
{
"epoch": 14.818240055908218,
"grad_norm": 0.3758101463317871,
"learning_rate": 0.0004224328575589863,
"loss": 3.3176,
"step": 50900
},
{
"epoch": 14.832799487508007,
"grad_norm": 0.39660805463790894,
"learning_rate": 0.00042225808330905915,
"loss": 3.325,
"step": 50950
},
{
"epoch": 14.847358919107798,
"grad_norm": 0.34520870447158813,
"learning_rate": 0.00042208330905913193,
"loss": 3.32,
"step": 51000
},
{
"epoch": 14.847358919107798,
"eval_accuracy": 0.37202161707589554,
"eval_loss": 3.540180206298828,
"eval_runtime": 54.0283,
"eval_samples_per_second": 308.042,
"eval_steps_per_second": 19.268,
"step": 51000
},
{
"epoch": 14.861918350707588,
"grad_norm": 0.3515894412994385,
"learning_rate": 0.00042190853480920476,
"loss": 3.3174,
"step": 51050
},
{
"epoch": 14.87647778230738,
"grad_norm": 0.3637034296989441,
"learning_rate": 0.0004217337605592776,
"loss": 3.335,
"step": 51100
},
{
"epoch": 14.891037213907168,
"grad_norm": 0.3399968445301056,
"learning_rate": 0.00042155898630935043,
"loss": 3.3259,
"step": 51150
},
{
"epoch": 14.905596645506959,
"grad_norm": 0.3715548515319824,
"learning_rate": 0.0004213842120594232,
"loss": 3.3153,
"step": 51200
},
{
"epoch": 14.92015607710675,
"grad_norm": 0.3517013192176819,
"learning_rate": 0.00042120943780949604,
"loss": 3.3186,
"step": 51250
},
{
"epoch": 14.93471550870654,
"grad_norm": 0.3491024971008301,
"learning_rate": 0.00042103466355956887,
"loss": 3.3276,
"step": 51300
},
{
"epoch": 14.94927494030633,
"grad_norm": 0.349324107170105,
"learning_rate": 0.0004208598893096417,
"loss": 3.3208,
"step": 51350
},
{
"epoch": 14.96383437190612,
"grad_norm": 0.3804023265838623,
"learning_rate": 0.00042068511505971454,
"loss": 3.3212,
"step": 51400
},
{
"epoch": 14.978393803505911,
"grad_norm": 0.3643926978111267,
"learning_rate": 0.0004205103408097873,
"loss": 3.3238,
"step": 51450
},
{
"epoch": 14.992953235105702,
"grad_norm": 0.3704342544078827,
"learning_rate": 0.00042033556655986015,
"loss": 3.3282,
"step": 51500
},
{
"epoch": 15.007279715799895,
"grad_norm": 0.375531405210495,
"learning_rate": 0.000420160792309933,
"loss": 3.2593,
"step": 51550
},
{
"epoch": 15.021839147399685,
"grad_norm": 0.3492209315299988,
"learning_rate": 0.0004199860180600058,
"loss": 3.2057,
"step": 51600
},
{
"epoch": 15.036398578999476,
"grad_norm": 0.39182865619659424,
"learning_rate": 0.00041981124381007865,
"loss": 3.2147,
"step": 51650
},
{
"epoch": 15.050958010599267,
"grad_norm": 0.3726196885108948,
"learning_rate": 0.00041963646956015143,
"loss": 3.2193,
"step": 51700
},
{
"epoch": 15.065517442199056,
"grad_norm": 0.36583876609802246,
"learning_rate": 0.00041946169531022426,
"loss": 3.2143,
"step": 51750
},
{
"epoch": 15.080076873798847,
"grad_norm": 0.35128819942474365,
"learning_rate": 0.0004192869210602971,
"loss": 3.2313,
"step": 51800
},
{
"epoch": 15.094636305398637,
"grad_norm": 0.3431715667247772,
"learning_rate": 0.0004191121468103699,
"loss": 3.2344,
"step": 51850
},
{
"epoch": 15.109195736998428,
"grad_norm": 0.360064834356308,
"learning_rate": 0.0004189373725604427,
"loss": 3.241,
"step": 51900
},
{
"epoch": 15.123755168598217,
"grad_norm": 0.3761180341243744,
"learning_rate": 0.00041876259831051554,
"loss": 3.2406,
"step": 51950
},
{
"epoch": 15.138314600198008,
"grad_norm": 0.3627549409866333,
"learning_rate": 0.00041858782406058837,
"loss": 3.2496,
"step": 52000
},
{
"epoch": 15.138314600198008,
"eval_accuracy": 0.3714227627889137,
"eval_loss": 3.557051420211792,
"eval_runtime": 54.0838,
"eval_samples_per_second": 307.726,
"eval_steps_per_second": 19.248,
"step": 52000
},
{
"epoch": 15.152874031797799,
"grad_norm": 0.33691975474357605,
"learning_rate": 0.0004184130498106612,
"loss": 3.2433,
"step": 52050
},
{
"epoch": 15.16743346339759,
"grad_norm": 0.33604761958122253,
"learning_rate": 0.00041823827556073404,
"loss": 3.2478,
"step": 52100
},
{
"epoch": 15.181992894997379,
"grad_norm": 0.34679973125457764,
"learning_rate": 0.0004180635013108068,
"loss": 3.2522,
"step": 52150
},
{
"epoch": 15.19655232659717,
"grad_norm": 0.3581389784812927,
"learning_rate": 0.00041788872706087965,
"loss": 3.2547,
"step": 52200
},
{
"epoch": 15.21111175819696,
"grad_norm": 0.33653566241264343,
"learning_rate": 0.0004177139528109525,
"loss": 3.261,
"step": 52250
},
{
"epoch": 15.225671189796751,
"grad_norm": 0.36339202523231506,
"learning_rate": 0.0004175391785610253,
"loss": 3.2535,
"step": 52300
},
{
"epoch": 15.24023062139654,
"grad_norm": 0.3384561240673065,
"learning_rate": 0.00041736440431109815,
"loss": 3.2516,
"step": 52350
},
{
"epoch": 15.25479005299633,
"grad_norm": 0.34658774733543396,
"learning_rate": 0.0004171896300611709,
"loss": 3.2719,
"step": 52400
},
{
"epoch": 15.269349484596122,
"grad_norm": 0.3711901009082794,
"learning_rate": 0.00041701485581124376,
"loss": 3.2684,
"step": 52450
},
{
"epoch": 15.283908916195912,
"grad_norm": 0.3736010193824768,
"learning_rate": 0.0004168400815613166,
"loss": 3.2595,
"step": 52500
},
{
"epoch": 15.298468347795701,
"grad_norm": 0.3565717041492462,
"learning_rate": 0.0004166653073113894,
"loss": 3.2614,
"step": 52550
},
{
"epoch": 15.313027779395492,
"grad_norm": 0.3514735698699951,
"learning_rate": 0.0004164905330614622,
"loss": 3.2719,
"step": 52600
},
{
"epoch": 15.327587210995283,
"grad_norm": 0.3635771870613098,
"learning_rate": 0.00041631575881153504,
"loss": 3.2727,
"step": 52650
},
{
"epoch": 15.342146642595074,
"grad_norm": 0.36101034283638,
"learning_rate": 0.00041614098456160787,
"loss": 3.2739,
"step": 52700
},
{
"epoch": 15.356706074194863,
"grad_norm": 0.3623729646205902,
"learning_rate": 0.0004159662103116807,
"loss": 3.2742,
"step": 52750
},
{
"epoch": 15.371265505794653,
"grad_norm": 0.3742918074131012,
"learning_rate": 0.00041579143606175354,
"loss": 3.2738,
"step": 52800
},
{
"epoch": 15.385824937394444,
"grad_norm": 0.3470838963985443,
"learning_rate": 0.0004156166618118263,
"loss": 3.2805,
"step": 52850
},
{
"epoch": 15.400384368994235,
"grad_norm": 0.3733544647693634,
"learning_rate": 0.00041544188756189915,
"loss": 3.2814,
"step": 52900
},
{
"epoch": 15.414943800594024,
"grad_norm": 0.3658877909183502,
"learning_rate": 0.000415267113311972,
"loss": 3.2687,
"step": 52950
},
{
"epoch": 15.429503232193815,
"grad_norm": 0.36594730615615845,
"learning_rate": 0.0004150923390620448,
"loss": 3.2803,
"step": 53000
},
{
"epoch": 15.429503232193815,
"eval_accuracy": 0.3717013188081283,
"eval_loss": 3.549906015396118,
"eval_runtime": 53.9513,
"eval_samples_per_second": 308.482,
"eval_steps_per_second": 19.295,
"step": 53000
},
{
"epoch": 15.444062663793606,
"grad_norm": 0.372753381729126,
"learning_rate": 0.0004149175648121176,
"loss": 3.2817,
"step": 53050
},
{
"epoch": 15.458622095393396,
"grad_norm": 0.3828512132167816,
"learning_rate": 0.0004147427905621904,
"loss": 3.2858,
"step": 53100
},
{
"epoch": 15.473181526993185,
"grad_norm": 0.3578071594238281,
"learning_rate": 0.00041456801631226326,
"loss": 3.2727,
"step": 53150
},
{
"epoch": 15.487740958592976,
"grad_norm": 0.39373624324798584,
"learning_rate": 0.0004143932420623361,
"loss": 3.2806,
"step": 53200
},
{
"epoch": 15.502300390192767,
"grad_norm": 0.38140150904655457,
"learning_rate": 0.0004142184678124089,
"loss": 3.2895,
"step": 53250
},
{
"epoch": 15.516859821792558,
"grad_norm": 0.3811562955379486,
"learning_rate": 0.0004140436935624817,
"loss": 3.2965,
"step": 53300
},
{
"epoch": 15.531419253392347,
"grad_norm": 0.3750387132167816,
"learning_rate": 0.00041386891931255454,
"loss": 3.2872,
"step": 53350
},
{
"epoch": 15.545978684992138,
"grad_norm": 0.4013814628124237,
"learning_rate": 0.0004136941450626274,
"loss": 3.2822,
"step": 53400
},
{
"epoch": 15.560538116591928,
"grad_norm": 0.3812587559223175,
"learning_rate": 0.00041351937081270026,
"loss": 3.3002,
"step": 53450
},
{
"epoch": 15.575097548191719,
"grad_norm": 0.37295323610305786,
"learning_rate": 0.0004133445965627731,
"loss": 3.2983,
"step": 53500
},
{
"epoch": 15.58965697979151,
"grad_norm": 0.3801644742488861,
"learning_rate": 0.0004131698223128459,
"loss": 3.2869,
"step": 53550
},
{
"epoch": 15.604216411391299,
"grad_norm": 0.3526298999786377,
"learning_rate": 0.0004129950480629187,
"loss": 3.2995,
"step": 53600
},
{
"epoch": 15.61877584299109,
"grad_norm": 0.3763193190097809,
"learning_rate": 0.00041282027381299153,
"loss": 3.292,
"step": 53650
},
{
"epoch": 15.63333527459088,
"grad_norm": 0.3336646556854248,
"learning_rate": 0.00041264549956306437,
"loss": 3.2837,
"step": 53700
},
{
"epoch": 15.64789470619067,
"grad_norm": 0.3432423174381256,
"learning_rate": 0.0004124707253131372,
"loss": 3.298,
"step": 53750
},
{
"epoch": 15.66245413779046,
"grad_norm": 0.37490835785865784,
"learning_rate": 0.00041229595106321,
"loss": 3.3064,
"step": 53800
},
{
"epoch": 15.677013569390251,
"grad_norm": 0.355726033449173,
"learning_rate": 0.0004121211768132828,
"loss": 3.2969,
"step": 53850
},
{
"epoch": 15.691573000990042,
"grad_norm": 0.3476122319698334,
"learning_rate": 0.00041194640256335565,
"loss": 3.2986,
"step": 53900
},
{
"epoch": 15.706132432589833,
"grad_norm": 0.33737611770629883,
"learning_rate": 0.0004117716283134285,
"loss": 3.2942,
"step": 53950
},
{
"epoch": 15.720691864189622,
"grad_norm": 0.38145211338996887,
"learning_rate": 0.0004115968540635013,
"loss": 3.2919,
"step": 54000
},
{
"epoch": 15.720691864189622,
"eval_accuracy": 0.3721127445199189,
"eval_loss": 3.5404856204986572,
"eval_runtime": 53.8872,
"eval_samples_per_second": 308.849,
"eval_steps_per_second": 19.318,
"step": 54000
},
{
"epoch": 15.735251295789412,
"grad_norm": 0.42329856753349304,
"learning_rate": 0.0004114220798135741,
"loss": 3.3079,
"step": 54050
},
{
"epoch": 15.749810727389203,
"grad_norm": 0.344635546207428,
"learning_rate": 0.0004112473055636469,
"loss": 3.3045,
"step": 54100
},
{
"epoch": 15.764370158988992,
"grad_norm": 0.38018330931663513,
"learning_rate": 0.00041107253131371976,
"loss": 3.3042,
"step": 54150
},
{
"epoch": 15.778929590588783,
"grad_norm": 0.34092721343040466,
"learning_rate": 0.0004108977570637926,
"loss": 3.303,
"step": 54200
},
{
"epoch": 15.793489022188574,
"grad_norm": 0.3967198133468628,
"learning_rate": 0.0004107229828138654,
"loss": 3.2946,
"step": 54250
},
{
"epoch": 15.808048453788365,
"grad_norm": 0.34989359974861145,
"learning_rate": 0.0004105482085639382,
"loss": 3.2874,
"step": 54300
},
{
"epoch": 15.822607885388155,
"grad_norm": 0.3509521186351776,
"learning_rate": 0.00041037343431401103,
"loss": 3.301,
"step": 54350
},
{
"epoch": 15.837167316987944,
"grad_norm": 0.3767544627189636,
"learning_rate": 0.00041019866006408387,
"loss": 3.304,
"step": 54400
},
{
"epoch": 15.851726748587735,
"grad_norm": 0.3420964777469635,
"learning_rate": 0.0004100238858141567,
"loss": 3.3212,
"step": 54450
},
{
"epoch": 15.866286180187526,
"grad_norm": 0.35542258620262146,
"learning_rate": 0.0004098491115642295,
"loss": 3.3,
"step": 54500
},
{
"epoch": 15.880845611787315,
"grad_norm": 0.34566208720207214,
"learning_rate": 0.0004096743373143023,
"loss": 3.311,
"step": 54550
},
{
"epoch": 15.895405043387106,
"grad_norm": 0.3717935383319855,
"learning_rate": 0.00040949956306437514,
"loss": 3.3017,
"step": 54600
},
{
"epoch": 15.909964474986896,
"grad_norm": 0.35940760374069214,
"learning_rate": 0.000409324788814448,
"loss": 3.3117,
"step": 54650
},
{
"epoch": 15.924523906586687,
"grad_norm": 0.3497525155544281,
"learning_rate": 0.0004091500145645208,
"loss": 3.2981,
"step": 54700
},
{
"epoch": 15.939083338186478,
"grad_norm": 0.35200536251068115,
"learning_rate": 0.0004089752403145936,
"loss": 3.3011,
"step": 54750
},
{
"epoch": 15.953642769786267,
"grad_norm": 0.35767561197280884,
"learning_rate": 0.0004088004660646664,
"loss": 3.3174,
"step": 54800
},
{
"epoch": 15.968202201386058,
"grad_norm": 0.33962079882621765,
"learning_rate": 0.00040862569181473926,
"loss": 3.3088,
"step": 54850
},
{
"epoch": 15.982761632985849,
"grad_norm": 0.33080315589904785,
"learning_rate": 0.0004084509175648121,
"loss": 3.3117,
"step": 54900
},
{
"epoch": 15.99732106458564,
"grad_norm": 0.35157039761543274,
"learning_rate": 0.0004082761433148849,
"loss": 3.31,
"step": 54950
},
{
"epoch": 16.011647545279832,
"grad_norm": 0.35152769088745117,
"learning_rate": 0.0004081013690649577,
"loss": 3.2175,
"step": 55000
},
{
"epoch": 16.011647545279832,
"eval_accuracy": 0.37205536362613384,
"eval_loss": 3.5515835285186768,
"eval_runtime": 53.9904,
"eval_samples_per_second": 308.259,
"eval_steps_per_second": 19.281,
"step": 55000
},
{
"epoch": 16.02620697687962,
"grad_norm": 0.34217002987861633,
"learning_rate": 0.00040792659481503053,
"loss": 3.2079,
"step": 55050
},
{
"epoch": 16.040766408479413,
"grad_norm": 0.3347378075122833,
"learning_rate": 0.00040775182056510337,
"loss": 3.198,
"step": 55100
},
{
"epoch": 16.055325840079202,
"grad_norm": 0.38384637236595154,
"learning_rate": 0.0004075770463151762,
"loss": 3.2018,
"step": 55150
},
{
"epoch": 16.069885271678995,
"grad_norm": 0.3489525616168976,
"learning_rate": 0.000407402272065249,
"loss": 3.2152,
"step": 55200
},
{
"epoch": 16.084444703278784,
"grad_norm": 0.4081012010574341,
"learning_rate": 0.0004072274978153218,
"loss": 3.2085,
"step": 55250
},
{
"epoch": 16.099004134878573,
"grad_norm": 0.37120628356933594,
"learning_rate": 0.00040705272356539464,
"loss": 3.2262,
"step": 55300
},
{
"epoch": 16.113563566478366,
"grad_norm": 0.4057721793651581,
"learning_rate": 0.0004068779493154675,
"loss": 3.2285,
"step": 55350
},
{
"epoch": 16.128122998078155,
"grad_norm": 0.35924482345581055,
"learning_rate": 0.0004067031750655403,
"loss": 3.2267,
"step": 55400
},
{
"epoch": 16.142682429677944,
"grad_norm": 0.41697263717651367,
"learning_rate": 0.0004065284008156131,
"loss": 3.2411,
"step": 55450
},
{
"epoch": 16.157241861277736,
"grad_norm": 0.38057592511177063,
"learning_rate": 0.0004063536265656859,
"loss": 3.226,
"step": 55500
},
{
"epoch": 16.171801292877525,
"grad_norm": 0.35883548855781555,
"learning_rate": 0.00040617885231575876,
"loss": 3.2306,
"step": 55550
},
{
"epoch": 16.186360724477318,
"grad_norm": 0.3391270339488983,
"learning_rate": 0.0004060040780658316,
"loss": 3.2337,
"step": 55600
},
{
"epoch": 16.200920156077107,
"grad_norm": 0.3992067873477936,
"learning_rate": 0.0004058293038159044,
"loss": 3.2355,
"step": 55650
},
{
"epoch": 16.215479587676896,
"grad_norm": 0.38337230682373047,
"learning_rate": 0.0004056545295659772,
"loss": 3.2416,
"step": 55700
},
{
"epoch": 16.23003901927669,
"grad_norm": 0.3764645755290985,
"learning_rate": 0.00040547975531605003,
"loss": 3.2423,
"step": 55750
},
{
"epoch": 16.244598450876477,
"grad_norm": 0.36295512318611145,
"learning_rate": 0.00040530498106612287,
"loss": 3.2453,
"step": 55800
},
{
"epoch": 16.259157882476266,
"grad_norm": 0.3418979346752167,
"learning_rate": 0.0004051302068161957,
"loss": 3.2385,
"step": 55850
},
{
"epoch": 16.27371731407606,
"grad_norm": 0.3582913875579834,
"learning_rate": 0.0004049554325662686,
"loss": 3.2475,
"step": 55900
},
{
"epoch": 16.288276745675848,
"grad_norm": 0.37994620203971863,
"learning_rate": 0.00040478065831634136,
"loss": 3.2515,
"step": 55950
},
{
"epoch": 16.30283617727564,
"grad_norm": 0.40661951899528503,
"learning_rate": 0.0004046058840664142,
"loss": 3.2547,
"step": 56000
},
{
"epoch": 16.30283617727564,
"eval_accuracy": 0.372163305553377,
"eval_loss": 3.5515215396881104,
"eval_runtime": 53.9642,
"eval_samples_per_second": 308.408,
"eval_steps_per_second": 19.291,
"step": 56000
},
{
"epoch": 16.31739560887543,
"grad_norm": 0.3635038435459137,
"learning_rate": 0.00040443110981648703,
"loss": 3.2546,
"step": 56050
},
{
"epoch": 16.33195504047522,
"grad_norm": 0.3598199486732483,
"learning_rate": 0.00040425633556655986,
"loss": 3.2543,
"step": 56100
},
{
"epoch": 16.34651447207501,
"grad_norm": 0.38117557764053345,
"learning_rate": 0.0004040815613166327,
"loss": 3.2716,
"step": 56150
},
{
"epoch": 16.3610739036748,
"grad_norm": 0.37007659673690796,
"learning_rate": 0.0004039067870667055,
"loss": 3.261,
"step": 56200
},
{
"epoch": 16.375633335274593,
"grad_norm": 0.4195094108581543,
"learning_rate": 0.0004037320128167783,
"loss": 3.2635,
"step": 56250
},
{
"epoch": 16.39019276687438,
"grad_norm": 0.38829296827316284,
"learning_rate": 0.00040355723856685114,
"loss": 3.2553,
"step": 56300
},
{
"epoch": 16.40475219847417,
"grad_norm": 0.34940147399902344,
"learning_rate": 0.000403382464316924,
"loss": 3.2697,
"step": 56350
},
{
"epoch": 16.419311630073963,
"grad_norm": 0.39590585231781006,
"learning_rate": 0.00040320769006699675,
"loss": 3.2618,
"step": 56400
},
{
"epoch": 16.433871061673752,
"grad_norm": 0.37665021419525146,
"learning_rate": 0.0004030329158170696,
"loss": 3.2748,
"step": 56450
},
{
"epoch": 16.44843049327354,
"grad_norm": 0.36239564418792725,
"learning_rate": 0.0004028581415671424,
"loss": 3.2668,
"step": 56500
},
{
"epoch": 16.462989924873334,
"grad_norm": 0.3750430941581726,
"learning_rate": 0.00040268336731721525,
"loss": 3.2781,
"step": 56550
},
{
"epoch": 16.477549356473123,
"grad_norm": 0.3569280505180359,
"learning_rate": 0.0004025085930672881,
"loss": 3.2796,
"step": 56600
},
{
"epoch": 16.492108788072915,
"grad_norm": 0.37821418046951294,
"learning_rate": 0.00040233381881736086,
"loss": 3.2652,
"step": 56650
},
{
"epoch": 16.506668219672704,
"grad_norm": 0.36736053228378296,
"learning_rate": 0.0004021590445674337,
"loss": 3.2769,
"step": 56700
},
{
"epoch": 16.521227651272493,
"grad_norm": 0.3546053171157837,
"learning_rate": 0.00040198427031750653,
"loss": 3.2711,
"step": 56750
},
{
"epoch": 16.535787082872286,
"grad_norm": 0.35950738191604614,
"learning_rate": 0.00040180949606757936,
"loss": 3.2752,
"step": 56800
},
{
"epoch": 16.550346514472075,
"grad_norm": 0.35586223006248474,
"learning_rate": 0.0004016347218176522,
"loss": 3.2702,
"step": 56850
},
{
"epoch": 16.564905946071864,
"grad_norm": 0.3526204228401184,
"learning_rate": 0.000401459947567725,
"loss": 3.2772,
"step": 56900
},
{
"epoch": 16.579465377671657,
"grad_norm": 0.36836734414100647,
"learning_rate": 0.0004012851733177978,
"loss": 3.2878,
"step": 56950
},
{
"epoch": 16.594024809271446,
"grad_norm": 0.36949554085731506,
"learning_rate": 0.00040111039906787064,
"loss": 3.2692,
"step": 57000
},
{
"epoch": 16.594024809271446,
"eval_accuracy": 0.37260836023160715,
"eval_loss": 3.5415875911712646,
"eval_runtime": 53.9908,
"eval_samples_per_second": 308.256,
"eval_steps_per_second": 19.281,
"step": 57000
},
{
"epoch": 16.608584240871238,
"grad_norm": 0.3561803698539734,
"learning_rate": 0.0004009356248179435,
"loss": 3.2791,
"step": 57050
},
{
"epoch": 16.623143672471027,
"grad_norm": 0.3576090335845947,
"learning_rate": 0.00040076085056801625,
"loss": 3.2882,
"step": 57100
},
{
"epoch": 16.637703104070816,
"grad_norm": 0.3569983243942261,
"learning_rate": 0.0004005860763180891,
"loss": 3.2774,
"step": 57150
},
{
"epoch": 16.65226253567061,
"grad_norm": 0.3748653531074524,
"learning_rate": 0.0004004113020681619,
"loss": 3.2876,
"step": 57200
},
{
"epoch": 16.666821967270398,
"grad_norm": 0.371491402387619,
"learning_rate": 0.00040023652781823475,
"loss": 3.2824,
"step": 57250
},
{
"epoch": 16.681381398870187,
"grad_norm": 0.37606826424598694,
"learning_rate": 0.0004000617535683076,
"loss": 3.2731,
"step": 57300
},
{
"epoch": 16.69594083046998,
"grad_norm": 0.3529285192489624,
"learning_rate": 0.00039988697931838036,
"loss": 3.2925,
"step": 57350
},
{
"epoch": 16.71050026206977,
"grad_norm": 0.42727458477020264,
"learning_rate": 0.0003997122050684532,
"loss": 3.2851,
"step": 57400
},
{
"epoch": 16.72505969366956,
"grad_norm": 0.3473684787750244,
"learning_rate": 0.00039953743081852603,
"loss": 3.278,
"step": 57450
},
{
"epoch": 16.73961912526935,
"grad_norm": 0.40747615694999695,
"learning_rate": 0.00039936265656859886,
"loss": 3.2638,
"step": 57500
},
{
"epoch": 16.75417855686914,
"grad_norm": 0.3650287687778473,
"learning_rate": 0.0003991878823186717,
"loss": 3.2862,
"step": 57550
},
{
"epoch": 16.76873798846893,
"grad_norm": 0.361441045999527,
"learning_rate": 0.0003990131080687445,
"loss": 3.2847,
"step": 57600
},
{
"epoch": 16.78329742006872,
"grad_norm": 0.3760271668434143,
"learning_rate": 0.0003988383338188173,
"loss": 3.2954,
"step": 57650
},
{
"epoch": 16.79785685166851,
"grad_norm": 0.4076845645904541,
"learning_rate": 0.00039866355956889014,
"loss": 3.2954,
"step": 57700
},
{
"epoch": 16.812416283268302,
"grad_norm": 0.353763222694397,
"learning_rate": 0.000398488785318963,
"loss": 3.2893,
"step": 57750
},
{
"epoch": 16.82697571486809,
"grad_norm": 0.3695422112941742,
"learning_rate": 0.00039831401106903575,
"loss": 3.2879,
"step": 57800
},
{
"epoch": 16.841535146467884,
"grad_norm": 0.3703312575817108,
"learning_rate": 0.0003981392368191086,
"loss": 3.2899,
"step": 57850
},
{
"epoch": 16.856094578067673,
"grad_norm": 0.3426973819732666,
"learning_rate": 0.0003979644625691814,
"loss": 3.297,
"step": 57900
},
{
"epoch": 16.87065400966746,
"grad_norm": 0.36928045749664307,
"learning_rate": 0.00039778968831925425,
"loss": 3.2958,
"step": 57950
},
{
"epoch": 16.885213441267254,
"grad_norm": 0.35659393668174744,
"learning_rate": 0.0003976149140693271,
"loss": 3.2916,
"step": 58000
},
{
"epoch": 16.885213441267254,
"eval_accuracy": 0.37303742351320873,
"eval_loss": 3.535076856613159,
"eval_runtime": 53.7788,
"eval_samples_per_second": 309.471,
"eval_steps_per_second": 19.357,
"step": 58000
},
{
"epoch": 16.899772872867043,
"grad_norm": 0.3856133818626404,
"learning_rate": 0.00039744013981939986,
"loss": 3.2993,
"step": 58050
},
{
"epoch": 16.914332304466832,
"grad_norm": 0.3538554012775421,
"learning_rate": 0.0003972653655694727,
"loss": 3.2902,
"step": 58100
},
{
"epoch": 16.928891736066625,
"grad_norm": 0.36010587215423584,
"learning_rate": 0.00039709059131954553,
"loss": 3.3029,
"step": 58150
},
{
"epoch": 16.943451167666414,
"grad_norm": 0.3581371605396271,
"learning_rate": 0.00039691581706961836,
"loss": 3.3035,
"step": 58200
},
{
"epoch": 16.958010599266206,
"grad_norm": 0.3781713843345642,
"learning_rate": 0.0003967410428196912,
"loss": 3.2984,
"step": 58250
},
{
"epoch": 16.972570030865995,
"grad_norm": 0.36059990525245667,
"learning_rate": 0.000396566268569764,
"loss": 3.3088,
"step": 58300
},
{
"epoch": 16.987129462465784,
"grad_norm": 0.34163540601730347,
"learning_rate": 0.0003963914943198368,
"loss": 3.3084,
"step": 58350
},
{
"epoch": 17.00145594315998,
"grad_norm": 0.41299328207969666,
"learning_rate": 0.00039621672006990964,
"loss": 3.281,
"step": 58400
},
{
"epoch": 17.01601537475977,
"grad_norm": 0.4098125696182251,
"learning_rate": 0.0003960419458199825,
"loss": 3.2017,
"step": 58450
},
{
"epoch": 17.03057480635956,
"grad_norm": 0.3501276969909668,
"learning_rate": 0.00039586717157005536,
"loss": 3.1889,
"step": 58500
},
{
"epoch": 17.04513423795935,
"grad_norm": 0.3579663336277008,
"learning_rate": 0.00039569239732012814,
"loss": 3.2048,
"step": 58550
},
{
"epoch": 17.05969366955914,
"grad_norm": 0.3546302616596222,
"learning_rate": 0.00039551762307020097,
"loss": 3.1901,
"step": 58600
},
{
"epoch": 17.07425310115893,
"grad_norm": 0.39157140254974365,
"learning_rate": 0.0003953428488202738,
"loss": 3.2057,
"step": 58650
},
{
"epoch": 17.08881253275872,
"grad_norm": 0.3777744174003601,
"learning_rate": 0.00039516807457034664,
"loss": 3.2057,
"step": 58700
},
{
"epoch": 17.103371964358512,
"grad_norm": 0.3601837158203125,
"learning_rate": 0.00039499330032041947,
"loss": 3.1967,
"step": 58750
},
{
"epoch": 17.1179313959583,
"grad_norm": 0.37118658423423767,
"learning_rate": 0.00039481852607049225,
"loss": 3.2013,
"step": 58800
},
{
"epoch": 17.132490827558094,
"grad_norm": 0.35028761625289917,
"learning_rate": 0.0003946437518205651,
"loss": 3.2105,
"step": 58850
},
{
"epoch": 17.147050259157883,
"grad_norm": 0.40827909111976624,
"learning_rate": 0.0003944689775706379,
"loss": 3.2346,
"step": 58900
},
{
"epoch": 17.161609690757672,
"grad_norm": 0.37568482756614685,
"learning_rate": 0.00039429420332071075,
"loss": 3.2186,
"step": 58950
},
{
"epoch": 17.176169122357464,
"grad_norm": 0.37248700857162476,
"learning_rate": 0.0003941194290707836,
"loss": 3.2282,
"step": 59000
},
{
"epoch": 17.176169122357464,
"eval_accuracy": 0.3720623010702595,
"eval_loss": 3.553311824798584,
"eval_runtime": 54.0404,
"eval_samples_per_second": 307.973,
"eval_steps_per_second": 19.263,
"step": 59000
},
{
"epoch": 17.190728553957253,
"grad_norm": 0.3794865608215332,
"learning_rate": 0.00039394465482085636,
"loss": 3.2065,
"step": 59050
},
{
"epoch": 17.205287985557042,
"grad_norm": 0.38951265811920166,
"learning_rate": 0.0003937698805709292,
"loss": 3.2422,
"step": 59100
},
{
"epoch": 17.219847417156835,
"grad_norm": 0.3871360123157501,
"learning_rate": 0.000393595106321002,
"loss": 3.2231,
"step": 59150
},
{
"epoch": 17.234406848756624,
"grad_norm": 0.36482304334640503,
"learning_rate": 0.00039342033207107486,
"loss": 3.2388,
"step": 59200
},
{
"epoch": 17.248966280356417,
"grad_norm": 0.3711540699005127,
"learning_rate": 0.00039324555782114764,
"loss": 3.2294,
"step": 59250
},
{
"epoch": 17.263525711956206,
"grad_norm": 0.3786524534225464,
"learning_rate": 0.00039307078357122047,
"loss": 3.2428,
"step": 59300
},
{
"epoch": 17.278085143555995,
"grad_norm": 0.39472588896751404,
"learning_rate": 0.0003928960093212933,
"loss": 3.2392,
"step": 59350
},
{
"epoch": 17.292644575155787,
"grad_norm": 0.38515424728393555,
"learning_rate": 0.00039272123507136614,
"loss": 3.2422,
"step": 59400
},
{
"epoch": 17.307204006755576,
"grad_norm": 0.3816242218017578,
"learning_rate": 0.00039254646082143897,
"loss": 3.2431,
"step": 59450
},
{
"epoch": 17.321763438355365,
"grad_norm": 0.3761007785797119,
"learning_rate": 0.00039237168657151175,
"loss": 3.2431,
"step": 59500
},
{
"epoch": 17.336322869955158,
"grad_norm": 0.38267335295677185,
"learning_rate": 0.0003921969123215846,
"loss": 3.256,
"step": 59550
},
{
"epoch": 17.350882301554947,
"grad_norm": 0.36106154322624207,
"learning_rate": 0.0003920221380716574,
"loss": 3.2497,
"step": 59600
},
{
"epoch": 17.36544173315474,
"grad_norm": 0.351624071598053,
"learning_rate": 0.00039184736382173025,
"loss": 3.2565,
"step": 59650
},
{
"epoch": 17.38000116475453,
"grad_norm": 0.3526884913444519,
"learning_rate": 0.000391672589571803,
"loss": 3.2546,
"step": 59700
},
{
"epoch": 17.394560596354317,
"grad_norm": 0.39185193181037903,
"learning_rate": 0.00039149781532187586,
"loss": 3.2386,
"step": 59750
},
{
"epoch": 17.40912002795411,
"grad_norm": 0.3673184812068939,
"learning_rate": 0.0003913230410719487,
"loss": 3.2497,
"step": 59800
},
{
"epoch": 17.4236794595539,
"grad_norm": 0.36868539452552795,
"learning_rate": 0.0003911482668220215,
"loss": 3.2442,
"step": 59850
},
{
"epoch": 17.438238891153688,
"grad_norm": 0.3652259111404419,
"learning_rate": 0.00039097349257209436,
"loss": 3.2523,
"step": 59900
},
{
"epoch": 17.45279832275348,
"grad_norm": 0.370699405670166,
"learning_rate": 0.00039079871832216714,
"loss": 3.26,
"step": 59950
},
{
"epoch": 17.46735775435327,
"grad_norm": 0.3466508090496063,
"learning_rate": 0.00039062394407223997,
"loss": 3.2584,
"step": 60000
},
{
"epoch": 17.46735775435327,
"eval_accuracy": 0.3726800863488384,
"eval_loss": 3.5435853004455566,
"eval_runtime": 54.016,
"eval_samples_per_second": 308.112,
"eval_steps_per_second": 19.272,
"step": 60000
},
{
"epoch": 17.481917185953062,
"grad_norm": 0.46203598380088806,
"learning_rate": 0.0003904491698223128,
"loss": 3.2626,
"step": 60050
},
{
"epoch": 17.49647661755285,
"grad_norm": 0.34256711602211,
"learning_rate": 0.00039027439557238564,
"loss": 3.2569,
"step": 60100
},
{
"epoch": 17.51103604915264,
"grad_norm": 0.3792712092399597,
"learning_rate": 0.00039009962132245847,
"loss": 3.2535,
"step": 60150
},
{
"epoch": 17.525595480752433,
"grad_norm": 0.39607110619544983,
"learning_rate": 0.00038992484707253125,
"loss": 3.257,
"step": 60200
},
{
"epoch": 17.54015491235222,
"grad_norm": 0.3784148693084717,
"learning_rate": 0.0003897500728226041,
"loss": 3.2686,
"step": 60250
},
{
"epoch": 17.55471434395201,
"grad_norm": 0.38679102063179016,
"learning_rate": 0.0003895752985726769,
"loss": 3.2681,
"step": 60300
},
{
"epoch": 17.569273775551803,
"grad_norm": 0.38070613145828247,
"learning_rate": 0.00038940052432274975,
"loss": 3.257,
"step": 60350
},
{
"epoch": 17.583833207151592,
"grad_norm": 0.363008588552475,
"learning_rate": 0.0003892257500728225,
"loss": 3.2573,
"step": 60400
},
{
"epoch": 17.598392638751385,
"grad_norm": 0.41815274953842163,
"learning_rate": 0.00038905097582289536,
"loss": 3.2649,
"step": 60450
},
{
"epoch": 17.612952070351174,
"grad_norm": 0.37823486328125,
"learning_rate": 0.0003888762015729682,
"loss": 3.2645,
"step": 60500
},
{
"epoch": 17.627511501950963,
"grad_norm": 0.38749924302101135,
"learning_rate": 0.000388701427323041,
"loss": 3.2577,
"step": 60550
},
{
"epoch": 17.642070933550755,
"grad_norm": 0.3703409731388092,
"learning_rate": 0.00038852665307311386,
"loss": 3.2729,
"step": 60600
},
{
"epoch": 17.656630365150544,
"grad_norm": 0.36761870980262756,
"learning_rate": 0.00038835187882318664,
"loss": 3.2796,
"step": 60650
},
{
"epoch": 17.671189796750333,
"grad_norm": 0.34877169132232666,
"learning_rate": 0.00038817710457325947,
"loss": 3.2743,
"step": 60700
},
{
"epoch": 17.685749228350126,
"grad_norm": 0.3577750623226166,
"learning_rate": 0.0003880023303233323,
"loss": 3.2672,
"step": 60750
},
{
"epoch": 17.700308659949915,
"grad_norm": 0.349612832069397,
"learning_rate": 0.00038782755607340514,
"loss": 3.2765,
"step": 60800
},
{
"epoch": 17.714868091549707,
"grad_norm": 0.4024096429347992,
"learning_rate": 0.00038765278182347797,
"loss": 3.281,
"step": 60850
},
{
"epoch": 17.729427523149496,
"grad_norm": 0.3460679352283478,
"learning_rate": 0.00038747800757355075,
"loss": 3.2651,
"step": 60900
},
{
"epoch": 17.743986954749285,
"grad_norm": 0.3731793165206909,
"learning_rate": 0.00038730323332362363,
"loss": 3.2789,
"step": 60950
},
{
"epoch": 17.758546386349078,
"grad_norm": 0.38457053899765015,
"learning_rate": 0.00038712845907369647,
"loss": 3.2724,
"step": 61000
},
{
"epoch": 17.758546386349078,
"eval_accuracy": 0.37315571281474097,
"eval_loss": 3.5371994972229004,
"eval_runtime": 54.3068,
"eval_samples_per_second": 306.462,
"eval_steps_per_second": 19.169,
"step": 61000
},
{
"epoch": 17.773105817948867,
"grad_norm": 0.38386011123657227,
"learning_rate": 0.0003869536848237693,
"loss": 3.2796,
"step": 61050
},
{
"epoch": 17.787665249548656,
"grad_norm": 0.4016534090042114,
"learning_rate": 0.00038677891057384213,
"loss": 3.2771,
"step": 61100
},
{
"epoch": 17.80222468114845,
"grad_norm": 0.33354341983795166,
"learning_rate": 0.0003866041363239149,
"loss": 3.2739,
"step": 61150
},
{
"epoch": 17.816784112748238,
"grad_norm": 0.38390499353408813,
"learning_rate": 0.00038642936207398774,
"loss": 3.2782,
"step": 61200
},
{
"epoch": 17.83134354434803,
"grad_norm": 0.33095383644104004,
"learning_rate": 0.0003862545878240606,
"loss": 3.2908,
"step": 61250
},
{
"epoch": 17.84590297594782,
"grad_norm": 0.359825074672699,
"learning_rate": 0.0003860798135741334,
"loss": 3.2845,
"step": 61300
},
{
"epoch": 17.860462407547608,
"grad_norm": 0.36744236946105957,
"learning_rate": 0.00038590503932420624,
"loss": 3.2842,
"step": 61350
},
{
"epoch": 17.8750218391474,
"grad_norm": 0.3676607310771942,
"learning_rate": 0.000385730265074279,
"loss": 3.2905,
"step": 61400
},
{
"epoch": 17.88958127074719,
"grad_norm": 0.39658308029174805,
"learning_rate": 0.00038555549082435186,
"loss": 3.2796,
"step": 61450
},
{
"epoch": 17.90414070234698,
"grad_norm": 0.3673575222492218,
"learning_rate": 0.0003853807165744247,
"loss": 3.2805,
"step": 61500
},
{
"epoch": 17.91870013394677,
"grad_norm": 0.3908085525035858,
"learning_rate": 0.0003852059423244975,
"loss": 3.2885,
"step": 61550
},
{
"epoch": 17.93325956554656,
"grad_norm": 0.4000939130783081,
"learning_rate": 0.00038503116807457035,
"loss": 3.2736,
"step": 61600
},
{
"epoch": 17.947818997146353,
"grad_norm": 0.3464169204235077,
"learning_rate": 0.00038485639382464313,
"loss": 3.2832,
"step": 61650
},
{
"epoch": 17.962378428746142,
"grad_norm": 0.3489089608192444,
"learning_rate": 0.00038468161957471597,
"loss": 3.2796,
"step": 61700
},
{
"epoch": 17.97693786034593,
"grad_norm": 0.4103230834007263,
"learning_rate": 0.0003845068453247888,
"loss": 3.2685,
"step": 61750
},
{
"epoch": 17.991497291945723,
"grad_norm": 0.387511283159256,
"learning_rate": 0.00038433207107486163,
"loss": 3.2981,
"step": 61800
},
{
"epoch": 18.005823772639918,
"grad_norm": 0.3746413588523865,
"learning_rate": 0.0003841572968249344,
"loss": 3.2403,
"step": 61850
},
{
"epoch": 18.020383204239707,
"grad_norm": 0.36936619877815247,
"learning_rate": 0.00038398252257500724,
"loss": 3.1804,
"step": 61900
},
{
"epoch": 18.034942635839496,
"grad_norm": 0.37005239725112915,
"learning_rate": 0.0003838077483250801,
"loss": 3.2085,
"step": 61950
},
{
"epoch": 18.04950206743929,
"grad_norm": 0.38752469420433044,
"learning_rate": 0.0003836329740751529,
"loss": 3.1868,
"step": 62000
},
{
"epoch": 18.04950206743929,
"eval_accuracy": 0.3726120053293681,
"eval_loss": 3.54896879196167,
"eval_runtime": 53.9519,
"eval_samples_per_second": 308.478,
"eval_steps_per_second": 19.295,
"step": 62000
},
{
"epoch": 18.064061499039077,
"grad_norm": 0.3948463201522827,
"learning_rate": 0.00038345819982522574,
"loss": 3.19,
"step": 62050
},
{
"epoch": 18.078620930638866,
"grad_norm": 0.38081789016723633,
"learning_rate": 0.0003832834255752985,
"loss": 3.1918,
"step": 62100
},
{
"epoch": 18.09318036223866,
"grad_norm": 0.3688044548034668,
"learning_rate": 0.00038310865132537135,
"loss": 3.2016,
"step": 62150
},
{
"epoch": 18.107739793838448,
"grad_norm": 0.3519236743450165,
"learning_rate": 0.0003829338770754442,
"loss": 3.1926,
"step": 62200
},
{
"epoch": 18.12229922543824,
"grad_norm": 0.36413848400115967,
"learning_rate": 0.000382759102825517,
"loss": 3.2035,
"step": 62250
},
{
"epoch": 18.13685865703803,
"grad_norm": 0.39578020572662354,
"learning_rate": 0.00038258432857558985,
"loss": 3.2006,
"step": 62300
},
{
"epoch": 18.15141808863782,
"grad_norm": 0.3735930323600769,
"learning_rate": 0.00038240955432566263,
"loss": 3.1978,
"step": 62350
},
{
"epoch": 18.16597752023761,
"grad_norm": 0.367960125207901,
"learning_rate": 0.00038223478007573547,
"loss": 3.2175,
"step": 62400
},
{
"epoch": 18.1805369518374,
"grad_norm": 0.3750097453594208,
"learning_rate": 0.0003820600058258083,
"loss": 3.2119,
"step": 62450
},
{
"epoch": 18.19509638343719,
"grad_norm": 0.37870654463768005,
"learning_rate": 0.00038188523157588113,
"loss": 3.2167,
"step": 62500
},
{
"epoch": 18.20965581503698,
"grad_norm": 0.4226834177970886,
"learning_rate": 0.0003817104573259539,
"loss": 3.2163,
"step": 62550
},
{
"epoch": 18.22421524663677,
"grad_norm": 0.3715706765651703,
"learning_rate": 0.00038153568307602674,
"loss": 3.2295,
"step": 62600
},
{
"epoch": 18.238774678236563,
"grad_norm": 0.35563722252845764,
"learning_rate": 0.0003813609088260996,
"loss": 3.2179,
"step": 62650
},
{
"epoch": 18.253334109836352,
"grad_norm": 0.38243478536605835,
"learning_rate": 0.0003811861345761724,
"loss": 3.2199,
"step": 62700
},
{
"epoch": 18.26789354143614,
"grad_norm": 0.368644654750824,
"learning_rate": 0.00038101136032624524,
"loss": 3.2204,
"step": 62750
},
{
"epoch": 18.282452973035934,
"grad_norm": 0.3634442687034607,
"learning_rate": 0.000380836586076318,
"loss": 3.2344,
"step": 62800
},
{
"epoch": 18.297012404635723,
"grad_norm": 0.3635788559913635,
"learning_rate": 0.00038066181182639085,
"loss": 3.2297,
"step": 62850
},
{
"epoch": 18.31157183623551,
"grad_norm": 0.4010733962059021,
"learning_rate": 0.0003804870375764637,
"loss": 3.2351,
"step": 62900
},
{
"epoch": 18.326131267835304,
"grad_norm": 0.42111244797706604,
"learning_rate": 0.0003803122633265365,
"loss": 3.2386,
"step": 62950
},
{
"epoch": 18.340690699435093,
"grad_norm": 0.4109092950820923,
"learning_rate": 0.0003801374890766093,
"loss": 3.2329,
"step": 63000
},
{
"epoch": 18.340690699435093,
"eval_accuracy": 0.3725866072288403,
"eval_loss": 3.5461974143981934,
"eval_runtime": 53.9315,
"eval_samples_per_second": 308.595,
"eval_steps_per_second": 19.302,
"step": 63000
},
{
"epoch": 18.355250131034886,
"grad_norm": 0.379860520362854,
"learning_rate": 0.00037996271482668213,
"loss": 3.2364,
"step": 63050
},
{
"epoch": 18.369809562634675,
"grad_norm": 0.39272770285606384,
"learning_rate": 0.00037978794057675497,
"loss": 3.2378,
"step": 63100
},
{
"epoch": 18.384368994234464,
"grad_norm": 0.4046265780925751,
"learning_rate": 0.0003796131663268278,
"loss": 3.2406,
"step": 63150
},
{
"epoch": 18.398928425834256,
"grad_norm": 0.3786957859992981,
"learning_rate": 0.00037943839207690063,
"loss": 3.2385,
"step": 63200
},
{
"epoch": 18.413487857434045,
"grad_norm": 0.3831402361392975,
"learning_rate": 0.0003792636178269734,
"loss": 3.2364,
"step": 63250
},
{
"epoch": 18.428047289033834,
"grad_norm": 0.4113953411579132,
"learning_rate": 0.00037908884357704624,
"loss": 3.2396,
"step": 63300
},
{
"epoch": 18.442606720633627,
"grad_norm": 0.3877120912075043,
"learning_rate": 0.0003789140693271191,
"loss": 3.2336,
"step": 63350
},
{
"epoch": 18.457166152233416,
"grad_norm": 0.3839344382286072,
"learning_rate": 0.0003787392950771919,
"loss": 3.2497,
"step": 63400
},
{
"epoch": 18.47172558383321,
"grad_norm": 0.38105466961860657,
"learning_rate": 0.00037856452082726474,
"loss": 3.2359,
"step": 63450
},
{
"epoch": 18.486285015432998,
"grad_norm": 0.3721942901611328,
"learning_rate": 0.00037838974657733763,
"loss": 3.2513,
"step": 63500
},
{
"epoch": 18.500844447032787,
"grad_norm": 0.3668787181377411,
"learning_rate": 0.0003782149723274104,
"loss": 3.242,
"step": 63550
},
{
"epoch": 18.51540387863258,
"grad_norm": 0.3777357339859009,
"learning_rate": 0.00037804019807748324,
"loss": 3.2484,
"step": 63600
},
{
"epoch": 18.529963310232368,
"grad_norm": 0.355179101228714,
"learning_rate": 0.0003778654238275561,
"loss": 3.2535,
"step": 63650
},
{
"epoch": 18.544522741832157,
"grad_norm": 0.37869659066200256,
"learning_rate": 0.0003776906495776289,
"loss": 3.2545,
"step": 63700
},
{
"epoch": 18.55908217343195,
"grad_norm": 0.3972564935684204,
"learning_rate": 0.0003775158753277017,
"loss": 3.2569,
"step": 63750
},
{
"epoch": 18.57364160503174,
"grad_norm": 0.3839690089225769,
"learning_rate": 0.0003773411010777745,
"loss": 3.2518,
"step": 63800
},
{
"epoch": 18.58820103663153,
"grad_norm": 0.36708375811576843,
"learning_rate": 0.00037716632682784735,
"loss": 3.2481,
"step": 63850
},
{
"epoch": 18.60276046823132,
"grad_norm": 0.3941199779510498,
"learning_rate": 0.0003769915525779202,
"loss": 3.2456,
"step": 63900
},
{
"epoch": 18.61731989983111,
"grad_norm": 0.3851776123046875,
"learning_rate": 0.000376816778327993,
"loss": 3.2572,
"step": 63950
},
{
"epoch": 18.631879331430902,
"grad_norm": 0.3694128096103668,
"learning_rate": 0.0003766420040780658,
"loss": 3.2426,
"step": 64000
},
{
"epoch": 18.631879331430902,
"eval_accuracy": 0.3727275902035293,
"eval_loss": 3.544032096862793,
"eval_runtime": 54.1155,
"eval_samples_per_second": 307.546,
"eval_steps_per_second": 19.237,
"step": 64000
},
{
"epoch": 18.64643876303069,
"grad_norm": 0.36158764362335205,
"learning_rate": 0.00037646722982813863,
"loss": 3.2605,
"step": 64050
},
{
"epoch": 18.66099819463048,
"grad_norm": 0.3997897803783417,
"learning_rate": 0.00037629245557821146,
"loss": 3.2588,
"step": 64100
},
{
"epoch": 18.675557626230272,
"grad_norm": 0.38344115018844604,
"learning_rate": 0.0003761176813282843,
"loss": 3.2558,
"step": 64150
},
{
"epoch": 18.69011705783006,
"grad_norm": 0.37368038296699524,
"learning_rate": 0.00037594290707835713,
"loss": 3.2568,
"step": 64200
},
{
"epoch": 18.704676489429854,
"grad_norm": 0.3679753541946411,
"learning_rate": 0.0003757681328284299,
"loss": 3.2441,
"step": 64250
},
{
"epoch": 18.719235921029643,
"grad_norm": 0.39784008264541626,
"learning_rate": 0.00037559335857850274,
"loss": 3.2673,
"step": 64300
},
{
"epoch": 18.733795352629432,
"grad_norm": 0.3740057349205017,
"learning_rate": 0.0003754185843285756,
"loss": 3.2658,
"step": 64350
},
{
"epoch": 18.748354784229225,
"grad_norm": 0.3893584907054901,
"learning_rate": 0.0003752438100786484,
"loss": 3.2733,
"step": 64400
},
{
"epoch": 18.762914215829014,
"grad_norm": 0.3613312542438507,
"learning_rate": 0.0003750690358287212,
"loss": 3.2595,
"step": 64450
},
{
"epoch": 18.777473647428806,
"grad_norm": 0.3821388781070709,
"learning_rate": 0.000374894261578794,
"loss": 3.269,
"step": 64500
},
{
"epoch": 18.792033079028595,
"grad_norm": 0.39411360025405884,
"learning_rate": 0.00037471948732886685,
"loss": 3.2613,
"step": 64550
},
{
"epoch": 18.806592510628384,
"grad_norm": 0.36718764901161194,
"learning_rate": 0.0003745447130789397,
"loss": 3.2631,
"step": 64600
},
{
"epoch": 18.821151942228177,
"grad_norm": 0.3988918364048004,
"learning_rate": 0.0003743699388290125,
"loss": 3.2724,
"step": 64650
},
{
"epoch": 18.835711373827966,
"grad_norm": 0.37188059091567993,
"learning_rate": 0.0003741951645790853,
"loss": 3.2802,
"step": 64700
},
{
"epoch": 18.850270805427755,
"grad_norm": 0.3327014446258545,
"learning_rate": 0.00037402039032915813,
"loss": 3.2575,
"step": 64750
},
{
"epoch": 18.864830237027547,
"grad_norm": 0.38189268112182617,
"learning_rate": 0.00037384561607923096,
"loss": 3.2629,
"step": 64800
},
{
"epoch": 18.879389668627336,
"grad_norm": 0.3511042594909668,
"learning_rate": 0.0003736708418293038,
"loss": 3.2602,
"step": 64850
},
{
"epoch": 18.893949100227125,
"grad_norm": 0.38207387924194336,
"learning_rate": 0.00037349606757937663,
"loss": 3.281,
"step": 64900
},
{
"epoch": 18.908508531826918,
"grad_norm": 0.3751682937145233,
"learning_rate": 0.0003733212933294494,
"loss": 3.2783,
"step": 64950
},
{
"epoch": 18.923067963426707,
"grad_norm": 0.39629825949668884,
"learning_rate": 0.00037314651907952224,
"loss": 3.2577,
"step": 65000
},
{
"epoch": 18.923067963426707,
"eval_accuracy": 0.3736138192946313,
"eval_loss": 3.5334720611572266,
"eval_runtime": 53.8337,
"eval_samples_per_second": 309.156,
"eval_steps_per_second": 19.337,
"step": 65000
},
{
"epoch": 18.9376273950265,
"grad_norm": 0.39632660150527954,
"learning_rate": 0.00037297174482959507,
"loss": 3.2616,
"step": 65050
},
{
"epoch": 18.95218682662629,
"grad_norm": 0.407411128282547,
"learning_rate": 0.0003727969705796679,
"loss": 3.2779,
"step": 65100
},
{
"epoch": 18.966746258226078,
"grad_norm": 0.3708113133907318,
"learning_rate": 0.0003726221963297407,
"loss": 3.2795,
"step": 65150
},
{
"epoch": 18.98130568982587,
"grad_norm": 0.37315139174461365,
"learning_rate": 0.0003724474220798135,
"loss": 3.2703,
"step": 65200
},
{
"epoch": 18.99586512142566,
"grad_norm": 0.3742704689502716,
"learning_rate": 0.00037227264782988635,
"loss": 3.286,
"step": 65250
},
{
"epoch": 19.010191602119853,
"grad_norm": 0.37877383828163147,
"learning_rate": 0.0003720978735799592,
"loss": 3.2045,
"step": 65300
},
{
"epoch": 19.024751033719642,
"grad_norm": 0.37160438299179077,
"learning_rate": 0.000371923099330032,
"loss": 3.1639,
"step": 65350
},
{
"epoch": 19.039310465319435,
"grad_norm": 0.3627760112285614,
"learning_rate": 0.0003717483250801048,
"loss": 3.1622,
"step": 65400
},
{
"epoch": 19.053869896919224,
"grad_norm": 0.3930268883705139,
"learning_rate": 0.00037157355083017763,
"loss": 3.1768,
"step": 65450
},
{
"epoch": 19.068429328519013,
"grad_norm": 0.4046318829059601,
"learning_rate": 0.00037139877658025046,
"loss": 3.1912,
"step": 65500
},
{
"epoch": 19.082988760118806,
"grad_norm": 0.37506386637687683,
"learning_rate": 0.0003712240023303233,
"loss": 3.1872,
"step": 65550
},
{
"epoch": 19.097548191718595,
"grad_norm": 0.3771907389163971,
"learning_rate": 0.00037104922808039607,
"loss": 3.2021,
"step": 65600
},
{
"epoch": 19.112107623318387,
"grad_norm": 0.37820398807525635,
"learning_rate": 0.0003708744538304689,
"loss": 3.1905,
"step": 65650
},
{
"epoch": 19.126667054918176,
"grad_norm": 0.3759934604167938,
"learning_rate": 0.00037069967958054174,
"loss": 3.1817,
"step": 65700
},
{
"epoch": 19.141226486517965,
"grad_norm": 0.3758086860179901,
"learning_rate": 0.00037052490533061457,
"loss": 3.2027,
"step": 65750
},
{
"epoch": 19.155785918117758,
"grad_norm": 0.3827981650829315,
"learning_rate": 0.0003703501310806874,
"loss": 3.1894,
"step": 65800
},
{
"epoch": 19.170345349717547,
"grad_norm": 0.4200972020626068,
"learning_rate": 0.0003701753568307602,
"loss": 3.1961,
"step": 65850
},
{
"epoch": 19.184904781317336,
"grad_norm": 0.40495339035987854,
"learning_rate": 0.000370000582580833,
"loss": 3.1961,
"step": 65900
},
{
"epoch": 19.19946421291713,
"grad_norm": 0.3779332935810089,
"learning_rate": 0.00036982580833090585,
"loss": 3.1914,
"step": 65950
},
{
"epoch": 19.214023644516917,
"grad_norm": 0.38305872678756714,
"learning_rate": 0.00036965103408097874,
"loss": 3.1949,
"step": 66000
},
{
"epoch": 19.214023644516917,
"eval_accuracy": 0.3729239551474248,
"eval_loss": 3.5489401817321777,
"eval_runtime": 53.9661,
"eval_samples_per_second": 308.397,
"eval_steps_per_second": 19.29,
"step": 66000
},
{
"epoch": 19.22858307611671,
"grad_norm": 0.4134466350078583,
"learning_rate": 0.00036947625983105157,
"loss": 3.2189,
"step": 66050
},
{
"epoch": 19.2431425077165,
"grad_norm": 0.35542577505111694,
"learning_rate": 0.0003693014855811244,
"loss": 3.2184,
"step": 66100
},
{
"epoch": 19.257701939316288,
"grad_norm": 0.37749409675598145,
"learning_rate": 0.0003691267113311972,
"loss": 3.2118,
"step": 66150
},
{
"epoch": 19.27226137091608,
"grad_norm": 0.36184874176979065,
"learning_rate": 0.00036895193708127,
"loss": 3.215,
"step": 66200
},
{
"epoch": 19.28682080251587,
"grad_norm": 0.3879592716693878,
"learning_rate": 0.00036877716283134285,
"loss": 3.22,
"step": 66250
},
{
"epoch": 19.30138023411566,
"grad_norm": 0.37836670875549316,
"learning_rate": 0.0003686023885814157,
"loss": 3.2235,
"step": 66300
},
{
"epoch": 19.31593966571545,
"grad_norm": 0.38774940371513367,
"learning_rate": 0.00036842761433148846,
"loss": 3.2293,
"step": 66350
},
{
"epoch": 19.33049909731524,
"grad_norm": 0.3813439905643463,
"learning_rate": 0.0003682528400815613,
"loss": 3.2192,
"step": 66400
},
{
"epoch": 19.345058528915033,
"grad_norm": 0.40040069818496704,
"learning_rate": 0.0003680780658316341,
"loss": 3.2372,
"step": 66450
},
{
"epoch": 19.35961796051482,
"grad_norm": 0.37192991375923157,
"learning_rate": 0.00036790329158170696,
"loss": 3.219,
"step": 66500
},
{
"epoch": 19.37417739211461,
"grad_norm": 0.42093369364738464,
"learning_rate": 0.0003677285173317798,
"loss": 3.2251,
"step": 66550
},
{
"epoch": 19.388736823714403,
"grad_norm": 0.37080198526382446,
"learning_rate": 0.00036755374308185257,
"loss": 3.2347,
"step": 66600
},
{
"epoch": 19.403296255314192,
"grad_norm": 0.3722926676273346,
"learning_rate": 0.0003673789688319254,
"loss": 3.2358,
"step": 66650
},
{
"epoch": 19.41785568691398,
"grad_norm": 0.388348788022995,
"learning_rate": 0.00036720419458199824,
"loss": 3.2263,
"step": 66700
},
{
"epoch": 19.432415118513774,
"grad_norm": 0.3911234140396118,
"learning_rate": 0.00036702942033207107,
"loss": 3.227,
"step": 66750
},
{
"epoch": 19.446974550113563,
"grad_norm": 0.38625290989875793,
"learning_rate": 0.0003668546460821439,
"loss": 3.2332,
"step": 66800
},
{
"epoch": 19.461533981713355,
"grad_norm": 0.35609227418899536,
"learning_rate": 0.0003666798718322167,
"loss": 3.2197,
"step": 66850
},
{
"epoch": 19.476093413313144,
"grad_norm": 0.4059786796569824,
"learning_rate": 0.0003665050975822895,
"loss": 3.2387,
"step": 66900
},
{
"epoch": 19.490652844912933,
"grad_norm": 0.40667444467544556,
"learning_rate": 0.00036633032333236235,
"loss": 3.238,
"step": 66950
},
{
"epoch": 19.505212276512726,
"grad_norm": 0.3943587839603424,
"learning_rate": 0.0003661555490824352,
"loss": 3.2359,
"step": 67000
},
{
"epoch": 19.505212276512726,
"eval_accuracy": 0.3730882197142643,
"eval_loss": 3.5423266887664795,
"eval_runtime": 55.0514,
"eval_samples_per_second": 302.318,
"eval_steps_per_second": 18.91,
"step": 67000
},
{
"epoch": 19.519771708112515,
"grad_norm": 0.4029746353626251,
"learning_rate": 0.00036598077483250796,
"loss": 3.2504,
"step": 67050
},
{
"epoch": 19.534331139712307,
"grad_norm": 0.38361015915870667,
"learning_rate": 0.0003658060005825808,
"loss": 3.241,
"step": 67100
},
{
"epoch": 19.548890571312096,
"grad_norm": 0.4034869372844696,
"learning_rate": 0.0003656312263326536,
"loss": 3.2417,
"step": 67150
},
{
"epoch": 19.563450002911885,
"grad_norm": 0.41540029644966125,
"learning_rate": 0.00036545645208272646,
"loss": 3.2375,
"step": 67200
},
{
"epoch": 19.578009434511678,
"grad_norm": 0.3546990156173706,
"learning_rate": 0.0003652816778327993,
"loss": 3.246,
"step": 67250
},
{
"epoch": 19.592568866111467,
"grad_norm": 0.39462944865226746,
"learning_rate": 0.00036510690358287207,
"loss": 3.2517,
"step": 67300
},
{
"epoch": 19.607128297711256,
"grad_norm": 0.39903032779693604,
"learning_rate": 0.0003649321293329449,
"loss": 3.2395,
"step": 67350
},
{
"epoch": 19.62168772931105,
"grad_norm": 0.3746011555194855,
"learning_rate": 0.00036475735508301774,
"loss": 3.2482,
"step": 67400
},
{
"epoch": 19.636247160910838,
"grad_norm": 0.40539586544036865,
"learning_rate": 0.00036458258083309057,
"loss": 3.2469,
"step": 67450
},
{
"epoch": 19.650806592510627,
"grad_norm": 0.378738135099411,
"learning_rate": 0.0003644078065831634,
"loss": 3.2508,
"step": 67500
},
{
"epoch": 19.66536602411042,
"grad_norm": 0.384886771440506,
"learning_rate": 0.0003642330323332362,
"loss": 3.2453,
"step": 67550
},
{
"epoch": 19.679925455710208,
"grad_norm": 0.38220494985580444,
"learning_rate": 0.000364058258083309,
"loss": 3.2378,
"step": 67600
},
{
"epoch": 19.69448488731,
"grad_norm": 0.40992140769958496,
"learning_rate": 0.00036388348383338185,
"loss": 3.247,
"step": 67650
},
{
"epoch": 19.70904431890979,
"grad_norm": 0.3588728606700897,
"learning_rate": 0.0003637087095834547,
"loss": 3.2447,
"step": 67700
},
{
"epoch": 19.72360375050958,
"grad_norm": 0.39845365285873413,
"learning_rate": 0.00036353393533352746,
"loss": 3.2365,
"step": 67750
},
{
"epoch": 19.73816318210937,
"grad_norm": 0.3711094260215759,
"learning_rate": 0.0003633591610836003,
"loss": 3.2589,
"step": 67800
},
{
"epoch": 19.75272261370916,
"grad_norm": 0.44665664434432983,
"learning_rate": 0.0003631843868336731,
"loss": 3.2394,
"step": 67850
},
{
"epoch": 19.767282045308953,
"grad_norm": 0.35950037837028503,
"learning_rate": 0.00036300961258374596,
"loss": 3.2603,
"step": 67900
},
{
"epoch": 19.781841476908742,
"grad_norm": 0.3663308024406433,
"learning_rate": 0.0003628348383338188,
"loss": 3.252,
"step": 67950
},
{
"epoch": 19.79640090850853,
"grad_norm": 0.3737200200557709,
"learning_rate": 0.00036266006408389157,
"loss": 3.2688,
"step": 68000
},
{
"epoch": 19.79640090850853,
"eval_accuracy": 0.3734972937500801,
"eval_loss": 3.5395822525024414,
"eval_runtime": 55.4217,
"eval_samples_per_second": 300.298,
"eval_steps_per_second": 18.783,
"step": 68000
},
{
"epoch": 19.810960340108323,
"grad_norm": 0.3715822398662567,
"learning_rate": 0.0003624852898339644,
"loss": 3.234,
"step": 68050
},
{
"epoch": 19.825519771708112,
"grad_norm": 0.4164450764656067,
"learning_rate": 0.00036231051558403723,
"loss": 3.249,
"step": 68100
},
{
"epoch": 19.8400792033079,
"grad_norm": 0.37426820397377014,
"learning_rate": 0.00036213574133411007,
"loss": 3.2556,
"step": 68150
},
{
"epoch": 19.854638634907694,
"grad_norm": 0.37632179260253906,
"learning_rate": 0.0003619609670841829,
"loss": 3.2623,
"step": 68200
},
{
"epoch": 19.869198066507483,
"grad_norm": 0.37010031938552856,
"learning_rate": 0.0003617861928342557,
"loss": 3.2537,
"step": 68250
},
{
"epoch": 19.883757498107276,
"grad_norm": 0.3916720747947693,
"learning_rate": 0.0003616114185843285,
"loss": 3.2451,
"step": 68300
},
{
"epoch": 19.898316929707065,
"grad_norm": 0.37939032912254333,
"learning_rate": 0.00036143664433440135,
"loss": 3.2707,
"step": 68350
},
{
"epoch": 19.912876361306854,
"grad_norm": 0.4042569100856781,
"learning_rate": 0.0003612618700844742,
"loss": 3.26,
"step": 68400
},
{
"epoch": 19.927435792906646,
"grad_norm": 0.35950231552124023,
"learning_rate": 0.00036108709583454696,
"loss": 3.2666,
"step": 68450
},
{
"epoch": 19.941995224506435,
"grad_norm": 0.388754665851593,
"learning_rate": 0.0003609123215846198,
"loss": 3.2571,
"step": 68500
},
{
"epoch": 19.956554656106224,
"grad_norm": 0.39523687958717346,
"learning_rate": 0.0003607375473346927,
"loss": 3.2525,
"step": 68550
},
{
"epoch": 19.971114087706017,
"grad_norm": 0.37882113456726074,
"learning_rate": 0.0003605627730847655,
"loss": 3.2668,
"step": 68600
},
{
"epoch": 19.985673519305806,
"grad_norm": 0.3885203003883362,
"learning_rate": 0.00036038799883483834,
"loss": 3.2625,
"step": 68650
},
{
"epoch": 20.0,
"grad_norm": 0.9490720629692078,
"learning_rate": 0.0003602132245849112,
"loss": 3.2624,
"step": 68700
},
{
"epoch": 20.01455943159979,
"grad_norm": 0.36799395084381104,
"learning_rate": 0.00036003845033498395,
"loss": 3.1537,
"step": 68750
},
{
"epoch": 20.02911886319958,
"grad_norm": 0.3699009418487549,
"learning_rate": 0.0003598636760850568,
"loss": 3.1452,
"step": 68800
},
{
"epoch": 20.04367829479937,
"grad_norm": 0.3853961229324341,
"learning_rate": 0.0003596889018351296,
"loss": 3.162,
"step": 68850
},
{
"epoch": 20.058237726399163,
"grad_norm": 0.3762233257293701,
"learning_rate": 0.00035951412758520245,
"loss": 3.163,
"step": 68900
},
{
"epoch": 20.072797157998952,
"grad_norm": 0.4281475841999054,
"learning_rate": 0.00035933935333527523,
"loss": 3.1756,
"step": 68950
},
{
"epoch": 20.08735658959874,
"grad_norm": 0.40628287196159363,
"learning_rate": 0.00035916457908534807,
"loss": 3.1636,
"step": 69000
},
{
"epoch": 20.08735658959874,
"eval_accuracy": 0.37290549449102267,
"eval_loss": 3.550813913345337,
"eval_runtime": 55.4801,
"eval_samples_per_second": 299.981,
"eval_steps_per_second": 18.763,
"step": 69000
},
{
"epoch": 20.101916021198534,
"grad_norm": 0.4060651361942291,
"learning_rate": 0.0003589898048354209,
"loss": 3.1772,
"step": 69050
},
{
"epoch": 20.116475452798323,
"grad_norm": 0.35740476846694946,
"learning_rate": 0.00035881503058549373,
"loss": 3.1712,
"step": 69100
},
{
"epoch": 20.13103488439811,
"grad_norm": 0.3880947232246399,
"learning_rate": 0.00035864025633556656,
"loss": 3.1762,
"step": 69150
},
{
"epoch": 20.145594315997904,
"grad_norm": 0.38205868005752563,
"learning_rate": 0.00035846548208563934,
"loss": 3.1846,
"step": 69200
},
{
"epoch": 20.160153747597693,
"grad_norm": 0.3950273394584656,
"learning_rate": 0.0003582907078357122,
"loss": 3.1975,
"step": 69250
},
{
"epoch": 20.174713179197486,
"grad_norm": 0.4085865020751953,
"learning_rate": 0.000358115933585785,
"loss": 3.1907,
"step": 69300
},
{
"epoch": 20.189272610797275,
"grad_norm": 0.41469231247901917,
"learning_rate": 0.00035794115933585784,
"loss": 3.1888,
"step": 69350
},
{
"epoch": 20.203832042397064,
"grad_norm": 0.40203621983528137,
"learning_rate": 0.0003577663850859307,
"loss": 3.2023,
"step": 69400
},
{
"epoch": 20.218391473996856,
"grad_norm": 0.39782029390335083,
"learning_rate": 0.00035759161083600345,
"loss": 3.1859,
"step": 69450
},
{
"epoch": 20.232950905596645,
"grad_norm": 0.396565318107605,
"learning_rate": 0.0003574168365860763,
"loss": 3.202,
"step": 69500
},
{
"epoch": 20.247510337196434,
"grad_norm": 0.36073732376098633,
"learning_rate": 0.0003572420623361491,
"loss": 3.1976,
"step": 69550
},
{
"epoch": 20.262069768796227,
"grad_norm": 0.380855530500412,
"learning_rate": 0.00035706728808622195,
"loss": 3.2032,
"step": 69600
},
{
"epoch": 20.276629200396016,
"grad_norm": 0.40812456607818604,
"learning_rate": 0.00035689251383629473,
"loss": 3.2076,
"step": 69650
},
{
"epoch": 20.29118863199581,
"grad_norm": 0.38896799087524414,
"learning_rate": 0.00035671773958636757,
"loss": 3.1939,
"step": 69700
},
{
"epoch": 20.305748063595598,
"grad_norm": 0.3849445581436157,
"learning_rate": 0.0003565429653364404,
"loss": 3.2116,
"step": 69750
},
{
"epoch": 20.320307495195387,
"grad_norm": 0.444782018661499,
"learning_rate": 0.00035636819108651323,
"loss": 3.2223,
"step": 69800
},
{
"epoch": 20.33486692679518,
"grad_norm": 0.37147748470306396,
"learning_rate": 0.00035619341683658606,
"loss": 3.2055,
"step": 69850
},
{
"epoch": 20.349426358394968,
"grad_norm": 0.3865772485733032,
"learning_rate": 0.00035601864258665884,
"loss": 3.2103,
"step": 69900
},
{
"epoch": 20.363985789994757,
"grad_norm": 0.3948829770088196,
"learning_rate": 0.0003558438683367317,
"loss": 3.2175,
"step": 69950
},
{
"epoch": 20.37854522159455,
"grad_norm": 0.3815650939941406,
"learning_rate": 0.0003556690940868045,
"loss": 3.2153,
"step": 70000
},
{
"epoch": 20.37854522159455,
"eval_accuracy": 0.37331374544024726,
"eval_loss": 3.5450870990753174,
"eval_runtime": 55.4338,
"eval_samples_per_second": 300.232,
"eval_steps_per_second": 18.779,
"step": 70000
},
{
"epoch": 20.39310465319434,
"grad_norm": 0.37321028113365173,
"learning_rate": 0.00035549431983687734,
"loss": 3.2223,
"step": 70050
},
{
"epoch": 20.40766408479413,
"grad_norm": 0.42437463998794556,
"learning_rate": 0.0003553195455869502,
"loss": 3.225,
"step": 70100
},
{
"epoch": 20.42222351639392,
"grad_norm": 0.3605141341686249,
"learning_rate": 0.00035514477133702295,
"loss": 3.222,
"step": 70150
},
{
"epoch": 20.43678294799371,
"grad_norm": 0.38323965668678284,
"learning_rate": 0.0003549699970870958,
"loss": 3.2165,
"step": 70200
},
{
"epoch": 20.451342379593502,
"grad_norm": 0.3995019197463989,
"learning_rate": 0.0003547952228371686,
"loss": 3.2169,
"step": 70250
},
{
"epoch": 20.46590181119329,
"grad_norm": 0.4000272750854492,
"learning_rate": 0.00035462044858724145,
"loss": 3.2186,
"step": 70300
},
{
"epoch": 20.48046124279308,
"grad_norm": 0.3743245005607605,
"learning_rate": 0.00035444567433731423,
"loss": 3.2285,
"step": 70350
},
{
"epoch": 20.495020674392872,
"grad_norm": 0.3767825663089752,
"learning_rate": 0.00035427090008738706,
"loss": 3.2267,
"step": 70400
},
{
"epoch": 20.50958010599266,
"grad_norm": 0.41871246695518494,
"learning_rate": 0.0003540961258374599,
"loss": 3.2377,
"step": 70450
},
{
"epoch": 20.524139537592454,
"grad_norm": 0.3861671984195709,
"learning_rate": 0.00035392135158753273,
"loss": 3.234,
"step": 70500
},
{
"epoch": 20.538698969192243,
"grad_norm": 0.3660334348678589,
"learning_rate": 0.00035374657733760556,
"loss": 3.2345,
"step": 70550
},
{
"epoch": 20.553258400792032,
"grad_norm": 0.4146880805492401,
"learning_rate": 0.00035357180308767834,
"loss": 3.2271,
"step": 70600
},
{
"epoch": 20.567817832391825,
"grad_norm": 0.4045553207397461,
"learning_rate": 0.0003533970288377512,
"loss": 3.2282,
"step": 70650
},
{
"epoch": 20.582377263991614,
"grad_norm": 0.39800694584846497,
"learning_rate": 0.000353222254587824,
"loss": 3.2371,
"step": 70700
},
{
"epoch": 20.596936695591403,
"grad_norm": 0.37746721506118774,
"learning_rate": 0.00035304748033789684,
"loss": 3.242,
"step": 70750
},
{
"epoch": 20.611496127191195,
"grad_norm": 0.37762463092803955,
"learning_rate": 0.0003528727060879697,
"loss": 3.2312,
"step": 70800
},
{
"epoch": 20.626055558790984,
"grad_norm": 0.36045747995376587,
"learning_rate": 0.00035269793183804245,
"loss": 3.2324,
"step": 70850
},
{
"epoch": 20.640614990390777,
"grad_norm": 0.40877410769462585,
"learning_rate": 0.0003525231575881153,
"loss": 3.2411,
"step": 70900
},
{
"epoch": 20.655174421990566,
"grad_norm": 0.37934237718582153,
"learning_rate": 0.0003523483833381881,
"loss": 3.2351,
"step": 70950
},
{
"epoch": 20.669733853590355,
"grad_norm": 0.379980206489563,
"learning_rate": 0.00035217360908826095,
"loss": 3.2345,
"step": 71000
},
{
"epoch": 20.669733853590355,
"eval_accuracy": 0.373392056250208,
"eval_loss": 3.540816068649292,
"eval_runtime": 55.7309,
"eval_samples_per_second": 298.631,
"eval_steps_per_second": 18.679,
"step": 71000
},
{
"epoch": 20.684293285190147,
"grad_norm": 0.41839686036109924,
"learning_rate": 0.00035199883483833384,
"loss": 3.2169,
"step": 71050
},
{
"epoch": 20.698852716789936,
"grad_norm": 0.41018539667129517,
"learning_rate": 0.0003518240605884066,
"loss": 3.2481,
"step": 71100
},
{
"epoch": 20.713412148389725,
"grad_norm": 0.39199137687683105,
"learning_rate": 0.00035164928633847945,
"loss": 3.2431,
"step": 71150
},
{
"epoch": 20.727971579989518,
"grad_norm": 0.39780452847480774,
"learning_rate": 0.0003514745120885523,
"loss": 3.2461,
"step": 71200
},
{
"epoch": 20.742531011589307,
"grad_norm": 0.3732914626598358,
"learning_rate": 0.0003512997378386251,
"loss": 3.247,
"step": 71250
},
{
"epoch": 20.7570904431891,
"grad_norm": 0.3949578106403351,
"learning_rate": 0.00035112496358869795,
"loss": 3.2449,
"step": 71300
},
{
"epoch": 20.77164987478889,
"grad_norm": 0.41203632950782776,
"learning_rate": 0.00035095018933877073,
"loss": 3.2269,
"step": 71350
},
{
"epoch": 20.786209306388677,
"grad_norm": 0.40544483065605164,
"learning_rate": 0.00035077541508884356,
"loss": 3.2396,
"step": 71400
},
{
"epoch": 20.80076873798847,
"grad_norm": 0.39241766929626465,
"learning_rate": 0.0003506006408389164,
"loss": 3.2499,
"step": 71450
},
{
"epoch": 20.81532816958826,
"grad_norm": 0.390432208776474,
"learning_rate": 0.00035042586658898923,
"loss": 3.265,
"step": 71500
},
{
"epoch": 20.829887601188048,
"grad_norm": 0.4000423550605774,
"learning_rate": 0.00035025109233906206,
"loss": 3.2445,
"step": 71550
},
{
"epoch": 20.84444703278784,
"grad_norm": 0.3772542476654053,
"learning_rate": 0.00035007631808913484,
"loss": 3.2466,
"step": 71600
},
{
"epoch": 20.85900646438763,
"grad_norm": 0.3796822130680084,
"learning_rate": 0.00034990154383920767,
"loss": 3.248,
"step": 71650
},
{
"epoch": 20.873565895987422,
"grad_norm": 0.39603352546691895,
"learning_rate": 0.0003497267695892805,
"loss": 3.2477,
"step": 71700
},
{
"epoch": 20.88812532758721,
"grad_norm": 0.4122164845466614,
"learning_rate": 0.00034955199533935334,
"loss": 3.2418,
"step": 71750
},
{
"epoch": 20.902684759187,
"grad_norm": 0.41109615564346313,
"learning_rate": 0.0003493772210894261,
"loss": 3.2463,
"step": 71800
},
{
"epoch": 20.917244190786793,
"grad_norm": 0.39169979095458984,
"learning_rate": 0.00034920244683949895,
"loss": 3.2625,
"step": 71850
},
{
"epoch": 20.93180362238658,
"grad_norm": 0.42167893052101135,
"learning_rate": 0.0003490276725895718,
"loss": 3.2399,
"step": 71900
},
{
"epoch": 20.94636305398637,
"grad_norm": 0.39163246750831604,
"learning_rate": 0.0003488528983396446,
"loss": 3.252,
"step": 71950
},
{
"epoch": 20.960922485586163,
"grad_norm": 0.41562479734420776,
"learning_rate": 0.00034867812408971745,
"loss": 3.2576,
"step": 72000
},
{
"epoch": 20.960922485586163,
"eval_accuracy": 0.3742182000201539,
"eval_loss": 3.528524875640869,
"eval_runtime": 55.4191,
"eval_samples_per_second": 300.312,
"eval_steps_per_second": 18.784,
"step": 72000
},
{
"epoch": 20.975481917185952,
"grad_norm": 0.3895370662212372,
"learning_rate": 0.00034850334983979023,
"loss": 3.2408,
"step": 72050
},
{
"epoch": 20.990041348785745,
"grad_norm": 0.38841962814331055,
"learning_rate": 0.00034832857558986306,
"loss": 3.2424,
"step": 72100
},
{
"epoch": 21.004367829479936,
"grad_norm": 0.38999074697494507,
"learning_rate": 0.0003481538013399359,
"loss": 3.2279,
"step": 72150
},
{
"epoch": 21.018927261079728,
"grad_norm": 0.3888770043849945,
"learning_rate": 0.0003479790270900087,
"loss": 3.145,
"step": 72200
},
{
"epoch": 21.033486692679517,
"grad_norm": 0.3620569109916687,
"learning_rate": 0.0003478042528400815,
"loss": 3.1389,
"step": 72250
},
{
"epoch": 21.04804612427931,
"grad_norm": 0.3902473449707031,
"learning_rate": 0.00034762947859015434,
"loss": 3.1505,
"step": 72300
},
{
"epoch": 21.0626055558791,
"grad_norm": 0.3933819830417633,
"learning_rate": 0.00034745470434022717,
"loss": 3.1558,
"step": 72350
},
{
"epoch": 21.077164987478888,
"grad_norm": 0.3986400067806244,
"learning_rate": 0.0003472799300903,
"loss": 3.1592,
"step": 72400
},
{
"epoch": 21.09172441907868,
"grad_norm": 0.3890974521636963,
"learning_rate": 0.00034710515584037284,
"loss": 3.1688,
"step": 72450
},
{
"epoch": 21.10628385067847,
"grad_norm": 0.38709503412246704,
"learning_rate": 0.0003469303815904456,
"loss": 3.1723,
"step": 72500
},
{
"epoch": 21.12084328227826,
"grad_norm": 0.4304654896259308,
"learning_rate": 0.00034675560734051845,
"loss": 3.1755,
"step": 72550
},
{
"epoch": 21.13540271387805,
"grad_norm": 0.4124937653541565,
"learning_rate": 0.0003465808330905913,
"loss": 3.1816,
"step": 72600
},
{
"epoch": 21.14996214547784,
"grad_norm": 0.39741483330726624,
"learning_rate": 0.0003464060588406641,
"loss": 3.1765,
"step": 72650
},
{
"epoch": 21.164521577077632,
"grad_norm": 0.38367438316345215,
"learning_rate": 0.00034623128459073695,
"loss": 3.1732,
"step": 72700
},
{
"epoch": 21.17908100867742,
"grad_norm": 0.38440006971359253,
"learning_rate": 0.00034605651034080973,
"loss": 3.1819,
"step": 72750
},
{
"epoch": 21.19364044027721,
"grad_norm": 0.3843246102333069,
"learning_rate": 0.00034588173609088256,
"loss": 3.1871,
"step": 72800
},
{
"epoch": 21.208199871877003,
"grad_norm": 0.407388299703598,
"learning_rate": 0.0003457069618409554,
"loss": 3.1924,
"step": 72850
},
{
"epoch": 21.222759303476792,
"grad_norm": 0.38186925649642944,
"learning_rate": 0.0003455321875910282,
"loss": 3.1898,
"step": 72900
},
{
"epoch": 21.23731873507658,
"grad_norm": 0.38251790404319763,
"learning_rate": 0.000345357413341101,
"loss": 3.1994,
"step": 72950
},
{
"epoch": 21.251878166676374,
"grad_norm": 0.43736377358436584,
"learning_rate": 0.00034518263909117384,
"loss": 3.1837,
"step": 73000
},
{
"epoch": 21.251878166676374,
"eval_accuracy": 0.3736014729957636,
"eval_loss": 3.547090530395508,
"eval_runtime": 55.385,
"eval_samples_per_second": 300.496,
"eval_steps_per_second": 18.796,
"step": 73000
},
{
"epoch": 21.266437598276163,
"grad_norm": 0.3770950138568878,
"learning_rate": 0.00034500786484124667,
"loss": 3.1809,
"step": 73050
},
{
"epoch": 21.280997029875955,
"grad_norm": 0.3902764618396759,
"learning_rate": 0.0003448330905913195,
"loss": 3.2044,
"step": 73100
},
{
"epoch": 21.295556461475744,
"grad_norm": 0.37991878390312195,
"learning_rate": 0.00034465831634139234,
"loss": 3.1995,
"step": 73150
},
{
"epoch": 21.310115893075533,
"grad_norm": 0.3790615499019623,
"learning_rate": 0.0003444835420914651,
"loss": 3.2026,
"step": 73200
},
{
"epoch": 21.324675324675326,
"grad_norm": 0.36328187584877014,
"learning_rate": 0.00034430876784153795,
"loss": 3.1957,
"step": 73250
},
{
"epoch": 21.339234756275115,
"grad_norm": 0.39685186743736267,
"learning_rate": 0.0003441339935916108,
"loss": 3.1948,
"step": 73300
},
{
"epoch": 21.353794187874904,
"grad_norm": 0.384804368019104,
"learning_rate": 0.0003439592193416836,
"loss": 3.1912,
"step": 73350
},
{
"epoch": 21.368353619474696,
"grad_norm": 0.3883644640445709,
"learning_rate": 0.00034378444509175645,
"loss": 3.2182,
"step": 73400
},
{
"epoch": 21.382913051074485,
"grad_norm": 0.40188512206077576,
"learning_rate": 0.0003436096708418292,
"loss": 3.2072,
"step": 73450
},
{
"epoch": 21.397472482674278,
"grad_norm": 0.4035356640815735,
"learning_rate": 0.00034343489659190206,
"loss": 3.1982,
"step": 73500
},
{
"epoch": 21.412031914274067,
"grad_norm": 0.37305113673210144,
"learning_rate": 0.0003432601223419749,
"loss": 3.2027,
"step": 73550
},
{
"epoch": 21.426591345873856,
"grad_norm": 0.3963967263698578,
"learning_rate": 0.0003430853480920478,
"loss": 3.2171,
"step": 73600
},
{
"epoch": 21.44115077747365,
"grad_norm": 0.4038044810295105,
"learning_rate": 0.0003429105738421206,
"loss": 3.2203,
"step": 73650
},
{
"epoch": 21.455710209073438,
"grad_norm": 0.3818981647491455,
"learning_rate": 0.0003427357995921934,
"loss": 3.2169,
"step": 73700
},
{
"epoch": 21.470269640673227,
"grad_norm": 0.37264305353164673,
"learning_rate": 0.0003425610253422662,
"loss": 3.2133,
"step": 73750
},
{
"epoch": 21.48482907227302,
"grad_norm": 0.3894643783569336,
"learning_rate": 0.00034238625109233906,
"loss": 3.2316,
"step": 73800
},
{
"epoch": 21.499388503872808,
"grad_norm": 0.4069054424762726,
"learning_rate": 0.0003422114768424119,
"loss": 3.215,
"step": 73850
},
{
"epoch": 21.5139479354726,
"grad_norm": 0.4019501507282257,
"learning_rate": 0.0003420367025924847,
"loss": 3.2111,
"step": 73900
},
{
"epoch": 21.52850736707239,
"grad_norm": 0.39605239033699036,
"learning_rate": 0.0003418619283425575,
"loss": 3.2143,
"step": 73950
},
{
"epoch": 21.54306679867218,
"grad_norm": 0.38995805382728577,
"learning_rate": 0.00034168715409263034,
"loss": 3.2214,
"step": 74000
},
{
"epoch": 21.54306679867218,
"eval_accuracy": 0.37393999675233547,
"eval_loss": 3.5405991077423096,
"eval_runtime": 55.3361,
"eval_samples_per_second": 300.762,
"eval_steps_per_second": 18.812,
"step": 74000
},
{
"epoch": 21.55762623027197,
"grad_norm": 0.3883754312992096,
"learning_rate": 0.00034151237984270317,
"loss": 3.2207,
"step": 74050
},
{
"epoch": 21.57218566187176,
"grad_norm": 0.4097517132759094,
"learning_rate": 0.000341337605592776,
"loss": 3.2087,
"step": 74100
},
{
"epoch": 21.58674509347155,
"grad_norm": 0.3909226953983307,
"learning_rate": 0.00034116283134284883,
"loss": 3.219,
"step": 74150
},
{
"epoch": 21.601304525071342,
"grad_norm": 0.41672003269195557,
"learning_rate": 0.0003409880570929216,
"loss": 3.2251,
"step": 74200
},
{
"epoch": 21.61586395667113,
"grad_norm": 0.37609753012657166,
"learning_rate": 0.00034081328284299445,
"loss": 3.2111,
"step": 74250
},
{
"epoch": 21.630423388270923,
"grad_norm": 0.39157718420028687,
"learning_rate": 0.0003406385085930673,
"loss": 3.2187,
"step": 74300
},
{
"epoch": 21.644982819870712,
"grad_norm": 0.39113712310791016,
"learning_rate": 0.0003404637343431401,
"loss": 3.2149,
"step": 74350
},
{
"epoch": 21.6595422514705,
"grad_norm": 0.40942496061325073,
"learning_rate": 0.0003402889600932129,
"loss": 3.2303,
"step": 74400
},
{
"epoch": 21.674101683070294,
"grad_norm": 0.43008992075920105,
"learning_rate": 0.0003401141858432857,
"loss": 3.2299,
"step": 74450
},
{
"epoch": 21.688661114670083,
"grad_norm": 0.4003450870513916,
"learning_rate": 0.00033993941159335856,
"loss": 3.2234,
"step": 74500
},
{
"epoch": 21.703220546269872,
"grad_norm": 0.3863767981529236,
"learning_rate": 0.0003397646373434314,
"loss": 3.2188,
"step": 74550
},
{
"epoch": 21.717779977869665,
"grad_norm": 0.37120547890663147,
"learning_rate": 0.0003395898630935042,
"loss": 3.2273,
"step": 74600
},
{
"epoch": 21.732339409469454,
"grad_norm": 0.44255581498146057,
"learning_rate": 0.000339415088843577,
"loss": 3.2166,
"step": 74650
},
{
"epoch": 21.746898841069246,
"grad_norm": 0.39260992407798767,
"learning_rate": 0.00033924031459364983,
"loss": 3.2313,
"step": 74700
},
{
"epoch": 21.761458272669035,
"grad_norm": 0.38604602217674255,
"learning_rate": 0.00033906554034372267,
"loss": 3.2316,
"step": 74750
},
{
"epoch": 21.776017704268824,
"grad_norm": 0.43335166573524475,
"learning_rate": 0.0003388907660937955,
"loss": 3.2346,
"step": 74800
},
{
"epoch": 21.790577135868617,
"grad_norm": 0.4126780331134796,
"learning_rate": 0.0003387159918438683,
"loss": 3.2284,
"step": 74850
},
{
"epoch": 21.805136567468406,
"grad_norm": 0.4032544195652008,
"learning_rate": 0.0003385412175939411,
"loss": 3.2464,
"step": 74900
},
{
"epoch": 21.819695999068195,
"grad_norm": 0.3821982741355896,
"learning_rate": 0.00033836644334401395,
"loss": 3.2471,
"step": 74950
},
{
"epoch": 21.834255430667987,
"grad_norm": 0.4045506417751312,
"learning_rate": 0.0003381916690940868,
"loss": 3.2441,
"step": 75000
},
{
"epoch": 21.834255430667987,
"eval_accuracy": 0.37437952499202487,
"eval_loss": 3.532663345336914,
"eval_runtime": 55.3218,
"eval_samples_per_second": 300.84,
"eval_steps_per_second": 18.817,
"step": 75000
},
{
"epoch": 21.848814862267776,
"grad_norm": 0.38999539613723755,
"learning_rate": 0.0003380168948441596,
"loss": 3.2378,
"step": 75050
},
{
"epoch": 21.86337429386757,
"grad_norm": 0.42581823468208313,
"learning_rate": 0.0003378421205942324,
"loss": 3.2437,
"step": 75100
},
{
"epoch": 21.877933725467358,
"grad_norm": 0.3946520984172821,
"learning_rate": 0.0003376673463443052,
"loss": 3.2464,
"step": 75150
},
{
"epoch": 21.892493157067147,
"grad_norm": 0.35369718074798584,
"learning_rate": 0.00033749257209437806,
"loss": 3.2364,
"step": 75200
},
{
"epoch": 21.90705258866694,
"grad_norm": 0.3942389488220215,
"learning_rate": 0.0003373177978444509,
"loss": 3.2421,
"step": 75250
},
{
"epoch": 21.92161202026673,
"grad_norm": 0.3681791126728058,
"learning_rate": 0.0003371430235945237,
"loss": 3.2329,
"step": 75300
},
{
"epoch": 21.93617145186652,
"grad_norm": 0.39455777406692505,
"learning_rate": 0.0003369682493445965,
"loss": 3.2446,
"step": 75350
},
{
"epoch": 21.95073088346631,
"grad_norm": 0.4163340926170349,
"learning_rate": 0.00033679347509466933,
"loss": 3.233,
"step": 75400
},
{
"epoch": 21.9652903150661,
"grad_norm": 0.3981238603591919,
"learning_rate": 0.00033661870084474217,
"loss": 3.245,
"step": 75450
},
{
"epoch": 21.97984974666589,
"grad_norm": 0.3684476315975189,
"learning_rate": 0.000336443926594815,
"loss": 3.2443,
"step": 75500
},
{
"epoch": 21.99440917826568,
"grad_norm": 0.38006895780563354,
"learning_rate": 0.0003362691523448878,
"loss": 3.2413,
"step": 75550
},
{
"epoch": 22.008735658959875,
"grad_norm": 0.40769121050834656,
"learning_rate": 0.0003360943780949606,
"loss": 3.1715,
"step": 75600
},
{
"epoch": 22.023295090559664,
"grad_norm": 0.3952407240867615,
"learning_rate": 0.00033591960384503344,
"loss": 3.1341,
"step": 75650
},
{
"epoch": 22.037854522159456,
"grad_norm": 0.3831493556499481,
"learning_rate": 0.0003357448295951063,
"loss": 3.1399,
"step": 75700
},
{
"epoch": 22.052413953759245,
"grad_norm": 0.4123232960700989,
"learning_rate": 0.0003355700553451791,
"loss": 3.1501,
"step": 75750
},
{
"epoch": 22.066973385359034,
"grad_norm": 0.3950226902961731,
"learning_rate": 0.0003353952810952519,
"loss": 3.1533,
"step": 75800
},
{
"epoch": 22.081532816958827,
"grad_norm": 0.36806586384773254,
"learning_rate": 0.0003352205068453247,
"loss": 3.1547,
"step": 75850
},
{
"epoch": 22.096092248558616,
"grad_norm": 0.37511464953422546,
"learning_rate": 0.00033504573259539756,
"loss": 3.1488,
"step": 75900
},
{
"epoch": 22.110651680158405,
"grad_norm": 0.38706332445144653,
"learning_rate": 0.0003348709583454704,
"loss": 3.163,
"step": 75950
},
{
"epoch": 22.125211111758198,
"grad_norm": 0.3739766776561737,
"learning_rate": 0.0003346961840955432,
"loss": 3.162,
"step": 76000
},
{
"epoch": 22.125211111758198,
"eval_accuracy": 0.37371035559339666,
"eval_loss": 3.5474770069122314,
"eval_runtime": 55.0086,
"eval_samples_per_second": 302.553,
"eval_steps_per_second": 18.924,
"step": 76000
},
{
"epoch": 22.139770543357987,
"grad_norm": 0.40529629588127136,
"learning_rate": 0.000334521409845616,
"loss": 3.1764,
"step": 76050
},
{
"epoch": 22.15432997495778,
"grad_norm": 0.4074355661869049,
"learning_rate": 0.0003343466355956889,
"loss": 3.1671,
"step": 76100
},
{
"epoch": 22.168889406557568,
"grad_norm": 0.39302748441696167,
"learning_rate": 0.0003341718613457617,
"loss": 3.1593,
"step": 76150
},
{
"epoch": 22.183448838157357,
"grad_norm": 0.43343594670295715,
"learning_rate": 0.00033399708709583455,
"loss": 3.1704,
"step": 76200
},
{
"epoch": 22.19800826975715,
"grad_norm": 0.459338903427124,
"learning_rate": 0.0003338223128459074,
"loss": 3.1819,
"step": 76250
},
{
"epoch": 22.21256770135694,
"grad_norm": 0.43016624450683594,
"learning_rate": 0.00033364753859598016,
"loss": 3.174,
"step": 76300
},
{
"epoch": 22.227127132956728,
"grad_norm": 0.4130750000476837,
"learning_rate": 0.000333472764346053,
"loss": 3.1781,
"step": 76350
},
{
"epoch": 22.24168656455652,
"grad_norm": 0.4306490123271942,
"learning_rate": 0.00033329799009612583,
"loss": 3.1801,
"step": 76400
},
{
"epoch": 22.25624599615631,
"grad_norm": 0.38700979948043823,
"learning_rate": 0.00033312321584619866,
"loss": 3.1954,
"step": 76450
},
{
"epoch": 22.270805427756102,
"grad_norm": 0.38101381063461304,
"learning_rate": 0.0003329484415962715,
"loss": 3.1823,
"step": 76500
},
{
"epoch": 22.28536485935589,
"grad_norm": 0.3876706063747406,
"learning_rate": 0.0003327736673463443,
"loss": 3.1935,
"step": 76550
},
{
"epoch": 22.29992429095568,
"grad_norm": 0.41006049513816833,
"learning_rate": 0.0003325988930964171,
"loss": 3.1965,
"step": 76600
},
{
"epoch": 22.314483722555472,
"grad_norm": 0.3990377187728882,
"learning_rate": 0.00033242411884648994,
"loss": 3.1862,
"step": 76650
},
{
"epoch": 22.32904315415526,
"grad_norm": 0.408883661031723,
"learning_rate": 0.0003322493445965628,
"loss": 3.1911,
"step": 76700
},
{
"epoch": 22.34360258575505,
"grad_norm": 0.4165717363357544,
"learning_rate": 0.0003320745703466356,
"loss": 3.1866,
"step": 76750
},
{
"epoch": 22.358162017354843,
"grad_norm": 0.40794703364372253,
"learning_rate": 0.0003318997960967084,
"loss": 3.201,
"step": 76800
},
{
"epoch": 22.372721448954632,
"grad_norm": 0.43184053897857666,
"learning_rate": 0.0003317250218467812,
"loss": 3.1934,
"step": 76850
},
{
"epoch": 22.387280880554425,
"grad_norm": 0.3932684063911438,
"learning_rate": 0.00033155024759685405,
"loss": 3.1961,
"step": 76900
},
{
"epoch": 22.401840312154214,
"grad_norm": 0.4096086621284485,
"learning_rate": 0.0003313754733469269,
"loss": 3.2176,
"step": 76950
},
{
"epoch": 22.416399743754003,
"grad_norm": 0.39115527272224426,
"learning_rate": 0.00033120069909699966,
"loss": 3.1943,
"step": 77000
},
{
"epoch": 22.416399743754003,
"eval_accuracy": 0.37367707937835326,
"eval_loss": 3.5434892177581787,
"eval_runtime": 55.0354,
"eval_samples_per_second": 302.405,
"eval_steps_per_second": 18.915,
"step": 77000
},
{
"epoch": 22.430959175353795,
"grad_norm": 0.40539973974227905,
"learning_rate": 0.0003310259248470725,
"loss": 3.2077,
"step": 77050
},
{
"epoch": 22.445518606953584,
"grad_norm": 0.44398367404937744,
"learning_rate": 0.00033085115059714533,
"loss": 3.2003,
"step": 77100
},
{
"epoch": 22.460078038553373,
"grad_norm": 0.3839823305606842,
"learning_rate": 0.00033067637634721816,
"loss": 3.2015,
"step": 77150
},
{
"epoch": 22.474637470153166,
"grad_norm": 0.41733473539352417,
"learning_rate": 0.000330501602097291,
"loss": 3.1997,
"step": 77200
},
{
"epoch": 22.489196901752955,
"grad_norm": 0.41814976930618286,
"learning_rate": 0.0003303268278473638,
"loss": 3.2144,
"step": 77250
},
{
"epoch": 22.503756333352747,
"grad_norm": 0.4052809178829193,
"learning_rate": 0.0003301520535974366,
"loss": 3.1983,
"step": 77300
},
{
"epoch": 22.518315764952536,
"grad_norm": 0.3910194933414459,
"learning_rate": 0.00032997727934750944,
"loss": 3.2074,
"step": 77350
},
{
"epoch": 22.532875196552325,
"grad_norm": 0.4210590422153473,
"learning_rate": 0.0003298025050975823,
"loss": 3.2031,
"step": 77400
},
{
"epoch": 22.547434628152118,
"grad_norm": 0.4334495961666107,
"learning_rate": 0.0003296277308476551,
"loss": 3.2054,
"step": 77450
},
{
"epoch": 22.561994059751907,
"grad_norm": 0.3857288360595703,
"learning_rate": 0.0003294529565977279,
"loss": 3.2119,
"step": 77500
},
{
"epoch": 22.576553491351696,
"grad_norm": 0.3976559638977051,
"learning_rate": 0.0003292781823478007,
"loss": 3.1939,
"step": 77550
},
{
"epoch": 22.59111292295149,
"grad_norm": 0.3808916211128235,
"learning_rate": 0.00032910340809787355,
"loss": 3.2074,
"step": 77600
},
{
"epoch": 22.605672354551277,
"grad_norm": 0.38793352246284485,
"learning_rate": 0.0003289286338479464,
"loss": 3.2058,
"step": 77650
},
{
"epoch": 22.62023178615107,
"grad_norm": 0.40990737080574036,
"learning_rate": 0.00032875385959801916,
"loss": 3.2082,
"step": 77700
},
{
"epoch": 22.63479121775086,
"grad_norm": 0.40758588910102844,
"learning_rate": 0.000328579085348092,
"loss": 3.2192,
"step": 77750
},
{
"epoch": 22.649350649350648,
"grad_norm": 0.3899351954460144,
"learning_rate": 0.00032840431109816483,
"loss": 3.2044,
"step": 77800
},
{
"epoch": 22.66391008095044,
"grad_norm": 0.4190896153450012,
"learning_rate": 0.00032822953684823766,
"loss": 3.2246,
"step": 77850
},
{
"epoch": 22.67846951255023,
"grad_norm": 0.4242609739303589,
"learning_rate": 0.0003280547625983105,
"loss": 3.2182,
"step": 77900
},
{
"epoch": 22.693028944150022,
"grad_norm": 0.4129343330860138,
"learning_rate": 0.0003278799883483833,
"loss": 3.2331,
"step": 77950
},
{
"epoch": 22.70758837574981,
"grad_norm": 0.41839760541915894,
"learning_rate": 0.0003277052140984561,
"loss": 3.2206,
"step": 78000
},
{
"epoch": 22.70758837574981,
"eval_accuracy": 0.3743237902714222,
"eval_loss": 3.5353493690490723,
"eval_runtime": 55.1684,
"eval_samples_per_second": 301.676,
"eval_steps_per_second": 18.869,
"step": 78000
},
{
"epoch": 22.7221478073496,
"grad_norm": 0.3958353102207184,
"learning_rate": 0.00032753043984852894,
"loss": 3.2286,
"step": 78050
},
{
"epoch": 22.736707238949393,
"grad_norm": 0.4032213091850281,
"learning_rate": 0.0003273556655986018,
"loss": 3.2232,
"step": 78100
},
{
"epoch": 22.75126667054918,
"grad_norm": 0.3718810975551605,
"learning_rate": 0.00032718089134867455,
"loss": 3.225,
"step": 78150
},
{
"epoch": 22.76582610214897,
"grad_norm": 0.39873242378234863,
"learning_rate": 0.0003270061170987474,
"loss": 3.2038,
"step": 78200
},
{
"epoch": 22.780385533748763,
"grad_norm": 0.41898322105407715,
"learning_rate": 0.0003268313428488202,
"loss": 3.2325,
"step": 78250
},
{
"epoch": 22.794944965348552,
"grad_norm": 0.3915160596370697,
"learning_rate": 0.00032665656859889305,
"loss": 3.2263,
"step": 78300
},
{
"epoch": 22.80950439694834,
"grad_norm": 0.3975723087787628,
"learning_rate": 0.0003264817943489659,
"loss": 3.225,
"step": 78350
},
{
"epoch": 22.824063828548134,
"grad_norm": 0.4013522267341614,
"learning_rate": 0.00032630702009903866,
"loss": 3.2191,
"step": 78400
},
{
"epoch": 22.838623260147923,
"grad_norm": 0.40975821018218994,
"learning_rate": 0.0003261322458491115,
"loss": 3.2224,
"step": 78450
},
{
"epoch": 22.853182691747715,
"grad_norm": 0.4058254361152649,
"learning_rate": 0.00032595747159918433,
"loss": 3.2206,
"step": 78500
},
{
"epoch": 22.867742123347504,
"grad_norm": 0.414315789937973,
"learning_rate": 0.00032578269734925716,
"loss": 3.2192,
"step": 78550
},
{
"epoch": 22.882301554947293,
"grad_norm": 0.41001883149147034,
"learning_rate": 0.00032560792309933,
"loss": 3.2294,
"step": 78600
},
{
"epoch": 22.896860986547086,
"grad_norm": 0.4034368097782135,
"learning_rate": 0.0003254331488494029,
"loss": 3.2209,
"step": 78650
},
{
"epoch": 22.911420418146875,
"grad_norm": 0.39516010880470276,
"learning_rate": 0.00032525837459947566,
"loss": 3.2287,
"step": 78700
},
{
"epoch": 22.925979849746668,
"grad_norm": 0.4021008610725403,
"learning_rate": 0.0003250836003495485,
"loss": 3.2343,
"step": 78750
},
{
"epoch": 22.940539281346457,
"grad_norm": 0.4237155318260193,
"learning_rate": 0.0003249088260996213,
"loss": 3.2408,
"step": 78800
},
{
"epoch": 22.955098712946246,
"grad_norm": 0.3744175434112549,
"learning_rate": 0.00032473405184969416,
"loss": 3.2244,
"step": 78850
},
{
"epoch": 22.969658144546038,
"grad_norm": 0.4011383652687073,
"learning_rate": 0.00032455927759976694,
"loss": 3.2274,
"step": 78900
},
{
"epoch": 22.984217576145827,
"grad_norm": 0.3921768367290497,
"learning_rate": 0.00032438450334983977,
"loss": 3.2219,
"step": 78950
},
{
"epoch": 22.998777007745616,
"grad_norm": 0.41207054257392883,
"learning_rate": 0.0003242097290999126,
"loss": 3.2293,
"step": 79000
},
{
"epoch": 22.998777007745616,
"eval_accuracy": 0.3748646757456253,
"eval_loss": 3.5263609886169434,
"eval_runtime": 55.2416,
"eval_samples_per_second": 301.276,
"eval_steps_per_second": 18.844,
"step": 79000
},
{
"epoch": 23.01310348843981,
"grad_norm": 0.4697108566761017,
"learning_rate": 0.00032403495484998544,
"loss": 3.1308,
"step": 79050
},
{
"epoch": 23.027662920039603,
"grad_norm": 0.433270126581192,
"learning_rate": 0.00032386018060005827,
"loss": 3.1183,
"step": 79100
},
{
"epoch": 23.042222351639392,
"grad_norm": 0.3799859285354614,
"learning_rate": 0.00032368540635013105,
"loss": 3.1369,
"step": 79150
},
{
"epoch": 23.05678178323918,
"grad_norm": 0.4204369783401489,
"learning_rate": 0.0003235106321002039,
"loss": 3.1339,
"step": 79200
},
{
"epoch": 23.071341214838974,
"grad_norm": 0.42532503604888916,
"learning_rate": 0.0003233358578502767,
"loss": 3.1521,
"step": 79250
},
{
"epoch": 23.085900646438763,
"grad_norm": 0.39714616537094116,
"learning_rate": 0.00032316108360034955,
"loss": 3.1428,
"step": 79300
},
{
"epoch": 23.10046007803855,
"grad_norm": 0.4164256453514099,
"learning_rate": 0.0003229863093504224,
"loss": 3.1444,
"step": 79350
},
{
"epoch": 23.115019509638344,
"grad_norm": 0.41725417971611023,
"learning_rate": 0.00032281153510049516,
"loss": 3.1417,
"step": 79400
},
{
"epoch": 23.129578941238133,
"grad_norm": 0.4337908923625946,
"learning_rate": 0.000322636760850568,
"loss": 3.1551,
"step": 79450
},
{
"epoch": 23.144138372837926,
"grad_norm": 0.4192178249359131,
"learning_rate": 0.0003224619866006408,
"loss": 3.1568,
"step": 79500
},
{
"epoch": 23.158697804437715,
"grad_norm": 0.38770732283592224,
"learning_rate": 0.00032228721235071366,
"loss": 3.1595,
"step": 79550
},
{
"epoch": 23.173257236037504,
"grad_norm": 0.4365769922733307,
"learning_rate": 0.00032211243810078644,
"loss": 3.1469,
"step": 79600
},
{
"epoch": 23.187816667637296,
"grad_norm": 0.41161519289016724,
"learning_rate": 0.00032193766385085927,
"loss": 3.1547,
"step": 79650
},
{
"epoch": 23.202376099237085,
"grad_norm": 0.41916653513908386,
"learning_rate": 0.0003217628896009321,
"loss": 3.1662,
"step": 79700
},
{
"epoch": 23.216935530836878,
"grad_norm": 0.4116290807723999,
"learning_rate": 0.00032158811535100494,
"loss": 3.1636,
"step": 79750
},
{
"epoch": 23.231494962436667,
"grad_norm": 0.3962598443031311,
"learning_rate": 0.00032141334110107777,
"loss": 3.1618,
"step": 79800
},
{
"epoch": 23.246054394036456,
"grad_norm": 0.3972434401512146,
"learning_rate": 0.00032123856685115055,
"loss": 3.1679,
"step": 79850
},
{
"epoch": 23.26061382563625,
"grad_norm": 0.42853844165802,
"learning_rate": 0.0003210637926012234,
"loss": 3.1707,
"step": 79900
},
{
"epoch": 23.275173257236037,
"grad_norm": 0.4082277715206146,
"learning_rate": 0.0003208890183512962,
"loss": 3.1847,
"step": 79950
},
{
"epoch": 23.289732688835826,
"grad_norm": 0.4235358238220215,
"learning_rate": 0.00032071424410136905,
"loss": 3.1645,
"step": 80000
},
{
"epoch": 23.289732688835826,
"eval_accuracy": 0.3736458020878885,
"eval_loss": 3.548175811767578,
"eval_runtime": 55.4012,
"eval_samples_per_second": 300.409,
"eval_steps_per_second": 18.79,
"step": 80000
},
{
"epoch": 23.30429212043562,
"grad_norm": 0.4308682680130005,
"learning_rate": 0.0003205394698514419,
"loss": 3.1709,
"step": 80050
},
{
"epoch": 23.318851552035408,
"grad_norm": 0.40191811323165894,
"learning_rate": 0.00032036469560151466,
"loss": 3.1735,
"step": 80100
},
{
"epoch": 23.3334109836352,
"grad_norm": 0.43274742364883423,
"learning_rate": 0.0003201899213515875,
"loss": 3.1868,
"step": 80150
},
{
"epoch": 23.34797041523499,
"grad_norm": 0.4104597270488739,
"learning_rate": 0.0003200151471016603,
"loss": 3.1871,
"step": 80200
},
{
"epoch": 23.36252984683478,
"grad_norm": 0.419541597366333,
"learning_rate": 0.00031984037285173316,
"loss": 3.1847,
"step": 80250
},
{
"epoch": 23.37708927843457,
"grad_norm": 0.3963826596736908,
"learning_rate": 0.00031966559860180594,
"loss": 3.1771,
"step": 80300
},
{
"epoch": 23.39164871003436,
"grad_norm": 0.4283881187438965,
"learning_rate": 0.00031949082435187877,
"loss": 3.1902,
"step": 80350
},
{
"epoch": 23.40620814163415,
"grad_norm": 0.40874427556991577,
"learning_rate": 0.0003193160501019516,
"loss": 3.1907,
"step": 80400
},
{
"epoch": 23.42076757323394,
"grad_norm": 0.41835615038871765,
"learning_rate": 0.00031914127585202444,
"loss": 3.1836,
"step": 80450
},
{
"epoch": 23.43532700483373,
"grad_norm": 0.39473778009414673,
"learning_rate": 0.00031896650160209727,
"loss": 3.1851,
"step": 80500
},
{
"epoch": 23.449886436433523,
"grad_norm": 0.4216783344745636,
"learning_rate": 0.00031879172735217005,
"loss": 3.187,
"step": 80550
},
{
"epoch": 23.464445868033312,
"grad_norm": 0.4428766071796417,
"learning_rate": 0.0003186169531022429,
"loss": 3.1891,
"step": 80600
},
{
"epoch": 23.4790052996331,
"grad_norm": 0.40239444375038147,
"learning_rate": 0.0003184421788523157,
"loss": 3.1913,
"step": 80650
},
{
"epoch": 23.493564731232894,
"grad_norm": 0.4475367069244385,
"learning_rate": 0.00031826740460238855,
"loss": 3.201,
"step": 80700
},
{
"epoch": 23.508124162832683,
"grad_norm": 0.4128247797489166,
"learning_rate": 0.0003180926303524614,
"loss": 3.1934,
"step": 80750
},
{
"epoch": 23.522683594432472,
"grad_norm": 0.42478007078170776,
"learning_rate": 0.00031791785610253416,
"loss": 3.2025,
"step": 80800
},
{
"epoch": 23.537243026032264,
"grad_norm": 0.39820677042007446,
"learning_rate": 0.000317743081852607,
"loss": 3.1953,
"step": 80850
},
{
"epoch": 23.551802457632053,
"grad_norm": 0.41164451837539673,
"learning_rate": 0.0003175683076026798,
"loss": 3.2044,
"step": 80900
},
{
"epoch": 23.566361889231846,
"grad_norm": 0.42769721150398254,
"learning_rate": 0.00031739353335275266,
"loss": 3.2048,
"step": 80950
},
{
"epoch": 23.580921320831635,
"grad_norm": 0.37441954016685486,
"learning_rate": 0.00031721875910282544,
"loss": 3.201,
"step": 81000
},
{
"epoch": 23.580921320831635,
"eval_accuracy": 0.3742237264586946,
"eval_loss": 3.538792610168457,
"eval_runtime": 55.0335,
"eval_samples_per_second": 302.416,
"eval_steps_per_second": 18.916,
"step": 81000
},
{
"epoch": 23.595480752431424,
"grad_norm": 0.42640015482902527,
"learning_rate": 0.00031704398485289827,
"loss": 3.2071,
"step": 81050
},
{
"epoch": 23.610040184031217,
"grad_norm": 0.40239205956459045,
"learning_rate": 0.0003168692106029711,
"loss": 3.2035,
"step": 81100
},
{
"epoch": 23.624599615631006,
"grad_norm": 0.3871544897556305,
"learning_rate": 0.000316694436353044,
"loss": 3.2138,
"step": 81150
},
{
"epoch": 23.639159047230795,
"grad_norm": 0.3887910842895508,
"learning_rate": 0.0003165196621031168,
"loss": 3.2048,
"step": 81200
},
{
"epoch": 23.653718478830587,
"grad_norm": 0.3865518569946289,
"learning_rate": 0.00031634488785318966,
"loss": 3.214,
"step": 81250
},
{
"epoch": 23.668277910430376,
"grad_norm": 0.403149276971817,
"learning_rate": 0.00031617011360326243,
"loss": 3.2221,
"step": 81300
},
{
"epoch": 23.68283734203017,
"grad_norm": 0.40452221035957336,
"learning_rate": 0.00031599533935333527,
"loss": 3.1981,
"step": 81350
},
{
"epoch": 23.697396773629958,
"grad_norm": 0.4011068642139435,
"learning_rate": 0.0003158205651034081,
"loss": 3.1904,
"step": 81400
},
{
"epoch": 23.711956205229747,
"grad_norm": 0.4230940043926239,
"learning_rate": 0.00031564579085348093,
"loss": 3.2106,
"step": 81450
},
{
"epoch": 23.72651563682954,
"grad_norm": 0.3911479711532593,
"learning_rate": 0.0003154710166035537,
"loss": 3.2152,
"step": 81500
},
{
"epoch": 23.74107506842933,
"grad_norm": 0.41586339473724365,
"learning_rate": 0.00031529624235362655,
"loss": 3.2144,
"step": 81550
},
{
"epoch": 23.755634500029117,
"grad_norm": 0.4051753580570221,
"learning_rate": 0.0003151214681036994,
"loss": 3.2231,
"step": 81600
},
{
"epoch": 23.77019393162891,
"grad_norm": 0.41884034872055054,
"learning_rate": 0.0003149466938537722,
"loss": 3.2185,
"step": 81650
},
{
"epoch": 23.7847533632287,
"grad_norm": 0.38643860816955566,
"learning_rate": 0.00031477191960384504,
"loss": 3.2289,
"step": 81700
},
{
"epoch": 23.79931279482849,
"grad_norm": 0.3834930956363678,
"learning_rate": 0.0003145971453539178,
"loss": 3.2076,
"step": 81750
},
{
"epoch": 23.81387222642828,
"grad_norm": 0.43425455689430237,
"learning_rate": 0.00031442237110399066,
"loss": 3.2109,
"step": 81800
},
{
"epoch": 23.82843165802807,
"grad_norm": 0.40798330307006836,
"learning_rate": 0.0003142475968540635,
"loss": 3.2153,
"step": 81850
},
{
"epoch": 23.842991089627862,
"grad_norm": 0.387335866689682,
"learning_rate": 0.0003140728226041363,
"loss": 3.2292,
"step": 81900
},
{
"epoch": 23.85755052122765,
"grad_norm": 0.4191502332687378,
"learning_rate": 0.00031389804835420915,
"loss": 3.2335,
"step": 81950
},
{
"epoch": 23.87210995282744,
"grad_norm": 0.4108533263206482,
"learning_rate": 0.00031372327410428193,
"loss": 3.2211,
"step": 82000
},
{
"epoch": 23.87210995282744,
"eval_accuracy": 0.3747248686089237,
"eval_loss": 3.529864549636841,
"eval_runtime": 55.2779,
"eval_samples_per_second": 301.079,
"eval_steps_per_second": 18.832,
"step": 82000
},
{
"epoch": 23.886669384427233,
"grad_norm": 0.38015156984329224,
"learning_rate": 0.00031354849985435477,
"loss": 3.2234,
"step": 82050
},
{
"epoch": 23.90122881602702,
"grad_norm": 0.4143611490726471,
"learning_rate": 0.0003133737256044276,
"loss": 3.2185,
"step": 82100
},
{
"epoch": 23.915788247626814,
"grad_norm": 0.4278761148452759,
"learning_rate": 0.00031319895135450043,
"loss": 3.223,
"step": 82150
},
{
"epoch": 23.930347679226603,
"grad_norm": 0.3869278132915497,
"learning_rate": 0.0003130241771045732,
"loss": 3.2287,
"step": 82200
},
{
"epoch": 23.944907110826392,
"grad_norm": 0.40504610538482666,
"learning_rate": 0.00031284940285464604,
"loss": 3.2226,
"step": 82250
},
{
"epoch": 23.959466542426185,
"grad_norm": 0.4033997654914856,
"learning_rate": 0.0003126746286047189,
"loss": 3.2121,
"step": 82300
},
{
"epoch": 23.974025974025974,
"grad_norm": 0.4058132469654083,
"learning_rate": 0.0003124998543547917,
"loss": 3.2233,
"step": 82350
},
{
"epoch": 23.988585405625763,
"grad_norm": 0.4490283727645874,
"learning_rate": 0.00031232508010486454,
"loss": 3.2168,
"step": 82400
},
{
"epoch": 24.002911886319957,
"grad_norm": 0.4183407723903656,
"learning_rate": 0.0003121503058549373,
"loss": 3.2061,
"step": 82450
},
{
"epoch": 24.01747131791975,
"grad_norm": 0.42069587111473083,
"learning_rate": 0.00031197553160501016,
"loss": 3.1255,
"step": 82500
},
{
"epoch": 24.03203074951954,
"grad_norm": 0.39097297191619873,
"learning_rate": 0.000311800757355083,
"loss": 3.1343,
"step": 82550
},
{
"epoch": 24.046590181119328,
"grad_norm": 0.3835448920726776,
"learning_rate": 0.0003116259831051558,
"loss": 3.1294,
"step": 82600
},
{
"epoch": 24.06114961271912,
"grad_norm": 0.46320199966430664,
"learning_rate": 0.00031145120885522865,
"loss": 3.1294,
"step": 82650
},
{
"epoch": 24.07570904431891,
"grad_norm": 0.4252343773841858,
"learning_rate": 0.00031127643460530143,
"loss": 3.1271,
"step": 82700
},
{
"epoch": 24.090268475918702,
"grad_norm": 0.41149669885635376,
"learning_rate": 0.00031110166035537427,
"loss": 3.1377,
"step": 82750
},
{
"epoch": 24.10482790751849,
"grad_norm": 0.4045622944831848,
"learning_rate": 0.0003109268861054471,
"loss": 3.1455,
"step": 82800
},
{
"epoch": 24.11938733911828,
"grad_norm": 0.40084847807884216,
"learning_rate": 0.00031075211185551993,
"loss": 3.1424,
"step": 82850
},
{
"epoch": 24.133946770718072,
"grad_norm": 0.40093955397605896,
"learning_rate": 0.0003105773376055927,
"loss": 3.1459,
"step": 82900
},
{
"epoch": 24.14850620231786,
"grad_norm": 0.40974506735801697,
"learning_rate": 0.00031040256335566554,
"loss": 3.1353,
"step": 82950
},
{
"epoch": 24.16306563391765,
"grad_norm": 0.4075878858566284,
"learning_rate": 0.0003102277891057384,
"loss": 3.1515,
"step": 83000
},
{
"epoch": 24.16306563391765,
"eval_accuracy": 0.37420162070453156,
"eval_loss": 3.5482983589172363,
"eval_runtime": 55.4288,
"eval_samples_per_second": 300.259,
"eval_steps_per_second": 18.781,
"step": 83000
},
{
"epoch": 24.177625065517443,
"grad_norm": 0.47770339250564575,
"learning_rate": 0.0003100530148558112,
"loss": 3.14,
"step": 83050
},
{
"epoch": 24.192184497117232,
"grad_norm": 0.42492687702178955,
"learning_rate": 0.00030987824060588404,
"loss": 3.1502,
"step": 83100
},
{
"epoch": 24.206743928717025,
"grad_norm": 0.4377771317958832,
"learning_rate": 0.0003097034663559568,
"loss": 3.135,
"step": 83150
},
{
"epoch": 24.221303360316814,
"grad_norm": 0.407498300075531,
"learning_rate": 0.00030952869210602965,
"loss": 3.1564,
"step": 83200
},
{
"epoch": 24.235862791916603,
"grad_norm": 0.40489810705184937,
"learning_rate": 0.0003093539178561025,
"loss": 3.1645,
"step": 83250
},
{
"epoch": 24.250422223516395,
"grad_norm": 0.4017831087112427,
"learning_rate": 0.0003091791436061753,
"loss": 3.1623,
"step": 83300
},
{
"epoch": 24.264981655116184,
"grad_norm": 0.4044218063354492,
"learning_rate": 0.00030900436935624815,
"loss": 3.1628,
"step": 83350
},
{
"epoch": 24.279541086715973,
"grad_norm": 0.4220918118953705,
"learning_rate": 0.00030882959510632093,
"loss": 3.1707,
"step": 83400
},
{
"epoch": 24.294100518315766,
"grad_norm": 0.4163309335708618,
"learning_rate": 0.00030865482085639377,
"loss": 3.1552,
"step": 83450
},
{
"epoch": 24.308659949915555,
"grad_norm": 0.41771310567855835,
"learning_rate": 0.0003084800466064666,
"loss": 3.1748,
"step": 83500
},
{
"epoch": 24.323219381515347,
"grad_norm": 0.43180546164512634,
"learning_rate": 0.00030830527235653943,
"loss": 3.1792,
"step": 83550
},
{
"epoch": 24.337778813115136,
"grad_norm": 0.4262148141860962,
"learning_rate": 0.0003081304981066122,
"loss": 3.1772,
"step": 83600
},
{
"epoch": 24.352338244714925,
"grad_norm": 0.4312186539173126,
"learning_rate": 0.00030795572385668504,
"loss": 3.1681,
"step": 83650
},
{
"epoch": 24.366897676314718,
"grad_norm": 0.40577229857444763,
"learning_rate": 0.00030778094960675793,
"loss": 3.1762,
"step": 83700
},
{
"epoch": 24.381457107914507,
"grad_norm": 0.3947165906429291,
"learning_rate": 0.00030760617535683076,
"loss": 3.1619,
"step": 83750
},
{
"epoch": 24.396016539514296,
"grad_norm": 0.40584006905555725,
"learning_rate": 0.0003074314011069036,
"loss": 3.1753,
"step": 83800
},
{
"epoch": 24.41057597111409,
"grad_norm": 0.4310337007045746,
"learning_rate": 0.00030725662685697643,
"loss": 3.169,
"step": 83850
},
{
"epoch": 24.425135402713877,
"grad_norm": 0.4535801410675049,
"learning_rate": 0.0003070818526070492,
"loss": 3.1758,
"step": 83900
},
{
"epoch": 24.43969483431367,
"grad_norm": 0.4137560725212097,
"learning_rate": 0.00030690707835712204,
"loss": 3.1876,
"step": 83950
},
{
"epoch": 24.45425426591346,
"grad_norm": 0.4285685420036316,
"learning_rate": 0.0003067323041071949,
"loss": 3.1878,
"step": 84000
},
{
"epoch": 24.45425426591346,
"eval_accuracy": 0.37403418137512606,
"eval_loss": 3.5453009605407715,
"eval_runtime": 55.213,
"eval_samples_per_second": 301.432,
"eval_steps_per_second": 18.854,
"step": 84000
},
{
"epoch": 24.468813697513248,
"grad_norm": 0.40945473313331604,
"learning_rate": 0.0003065575298572677,
"loss": 3.1896,
"step": 84050
},
{
"epoch": 24.48337312911304,
"grad_norm": 0.44182032346725464,
"learning_rate": 0.00030638275560734054,
"loss": 3.1842,
"step": 84100
},
{
"epoch": 24.49793256071283,
"grad_norm": 0.43453431129455566,
"learning_rate": 0.0003062079813574133,
"loss": 3.1918,
"step": 84150
},
{
"epoch": 24.51249199231262,
"grad_norm": 0.3916553556919098,
"learning_rate": 0.00030603320710748615,
"loss": 3.1792,
"step": 84200
},
{
"epoch": 24.52705142391241,
"grad_norm": 0.4035761058330536,
"learning_rate": 0.000305858432857559,
"loss": 3.1862,
"step": 84250
},
{
"epoch": 24.5416108555122,
"grad_norm": 0.4316408932209015,
"learning_rate": 0.0003056836586076318,
"loss": 3.1937,
"step": 84300
},
{
"epoch": 24.556170287111993,
"grad_norm": 0.43183833360671997,
"learning_rate": 0.0003055088843577046,
"loss": 3.1832,
"step": 84350
},
{
"epoch": 24.57072971871178,
"grad_norm": 0.40193501114845276,
"learning_rate": 0.00030533411010777743,
"loss": 3.199,
"step": 84400
},
{
"epoch": 24.58528915031157,
"grad_norm": 0.4043532609939575,
"learning_rate": 0.00030515933585785026,
"loss": 3.1868,
"step": 84450
},
{
"epoch": 24.599848581911363,
"grad_norm": 0.40620195865631104,
"learning_rate": 0.0003049845616079231,
"loss": 3.1785,
"step": 84500
},
{
"epoch": 24.614408013511152,
"grad_norm": 0.41706621646881104,
"learning_rate": 0.00030480978735799593,
"loss": 3.205,
"step": 84550
},
{
"epoch": 24.62896744511094,
"grad_norm": 0.41417932510375977,
"learning_rate": 0.0003046350131080687,
"loss": 3.186,
"step": 84600
},
{
"epoch": 24.643526876710734,
"grad_norm": 0.45793968439102173,
"learning_rate": 0.00030446023885814154,
"loss": 3.1948,
"step": 84650
},
{
"epoch": 24.658086308310523,
"grad_norm": 0.4425913393497467,
"learning_rate": 0.0003042854646082144,
"loss": 3.1952,
"step": 84700
},
{
"epoch": 24.672645739910315,
"grad_norm": 0.41631364822387695,
"learning_rate": 0.0003041106903582872,
"loss": 3.1955,
"step": 84750
},
{
"epoch": 24.687205171510104,
"grad_norm": 0.395309180021286,
"learning_rate": 0.00030393591610836,
"loss": 3.2076,
"step": 84800
},
{
"epoch": 24.701764603109893,
"grad_norm": 0.4197699725627899,
"learning_rate": 0.0003037611418584328,
"loss": 3.2049,
"step": 84850
},
{
"epoch": 24.716324034709686,
"grad_norm": 0.44752877950668335,
"learning_rate": 0.00030358636760850565,
"loss": 3.2045,
"step": 84900
},
{
"epoch": 24.730883466309475,
"grad_norm": 0.43083128333091736,
"learning_rate": 0.0003034115933585785,
"loss": 3.203,
"step": 84950
},
{
"epoch": 24.745442897909264,
"grad_norm": 0.4351908564567566,
"learning_rate": 0.0003032368191086513,
"loss": 3.2018,
"step": 85000
},
{
"epoch": 24.745442897909264,
"eval_accuracy": 0.3745138056901857,
"eval_loss": 3.534050941467285,
"eval_runtime": 54.5127,
"eval_samples_per_second": 305.305,
"eval_steps_per_second": 19.096,
"step": 85000
},
{
"epoch": 24.760002329509057,
"grad_norm": 0.3845418393611908,
"learning_rate": 0.0003030620448587241,
"loss": 3.2058,
"step": 85050
},
{
"epoch": 24.774561761108846,
"grad_norm": 0.3963243365287781,
"learning_rate": 0.00030288727060879693,
"loss": 3.2008,
"step": 85100
},
{
"epoch": 24.789121192708638,
"grad_norm": 0.4321513772010803,
"learning_rate": 0.00030271249635886976,
"loss": 3.1983,
"step": 85150
},
{
"epoch": 24.803680624308427,
"grad_norm": 0.44876664876937866,
"learning_rate": 0.0003025377221089426,
"loss": 3.2135,
"step": 85200
},
{
"epoch": 24.818240055908216,
"grad_norm": 0.409246563911438,
"learning_rate": 0.00030236294785901543,
"loss": 3.217,
"step": 85250
},
{
"epoch": 24.83279948750801,
"grad_norm": 0.4117927551269531,
"learning_rate": 0.0003021881736090882,
"loss": 3.215,
"step": 85300
},
{
"epoch": 24.847358919107798,
"grad_norm": 0.40099072456359863,
"learning_rate": 0.00030201339935916104,
"loss": 3.2129,
"step": 85350
},
{
"epoch": 24.861918350707587,
"grad_norm": 0.3935149908065796,
"learning_rate": 0.00030183862510923387,
"loss": 3.218,
"step": 85400
},
{
"epoch": 24.87647778230738,
"grad_norm": 0.39060723781585693,
"learning_rate": 0.0003016638508593067,
"loss": 3.1989,
"step": 85450
},
{
"epoch": 24.89103721390717,
"grad_norm": 0.41106799244880676,
"learning_rate": 0.0003014890766093795,
"loss": 3.2128,
"step": 85500
},
{
"epoch": 24.90559664550696,
"grad_norm": 0.39192450046539307,
"learning_rate": 0.0003013143023594523,
"loss": 3.2181,
"step": 85550
},
{
"epoch": 24.92015607710675,
"grad_norm": 0.40416061878204346,
"learning_rate": 0.00030113952810952515,
"loss": 3.2211,
"step": 85600
},
{
"epoch": 24.93471550870654,
"grad_norm": 0.3965405821800232,
"learning_rate": 0.000300964753859598,
"loss": 3.2135,
"step": 85650
},
{
"epoch": 24.94927494030633,
"grad_norm": 0.42308685183525085,
"learning_rate": 0.0003007899796096708,
"loss": 3.2049,
"step": 85700
},
{
"epoch": 24.96383437190612,
"grad_norm": 0.41107550263404846,
"learning_rate": 0.0003006152053597436,
"loss": 3.2127,
"step": 85750
},
{
"epoch": 24.97839380350591,
"grad_norm": 0.42214998602867126,
"learning_rate": 0.00030044043110981643,
"loss": 3.2114,
"step": 85800
},
{
"epoch": 24.992953235105702,
"grad_norm": 0.4449729323387146,
"learning_rate": 0.00030026565685988926,
"loss": 3.2252,
"step": 85850
},
{
"epoch": 25.007279715799896,
"grad_norm": 0.4267348349094391,
"learning_rate": 0.0003000908826099621,
"loss": 3.1622,
"step": 85900
},
{
"epoch": 25.021839147399685,
"grad_norm": 0.40839192271232605,
"learning_rate": 0.0002999161083600349,
"loss": 3.106,
"step": 85950
},
{
"epoch": 25.036398578999474,
"grad_norm": 0.4623876214027405,
"learning_rate": 0.00029974133411010776,
"loss": 3.1168,
"step": 86000
},
{
"epoch": 25.036398578999474,
"eval_accuracy": 0.37416763898669575,
"eval_loss": 3.5501725673675537,
"eval_runtime": 55.1681,
"eval_samples_per_second": 301.678,
"eval_steps_per_second": 18.87,
"step": 86000
},
{
"epoch": 25.050958010599267,
"grad_norm": 0.41614830493927,
"learning_rate": 0.0002995665598601806,
"loss": 3.125,
"step": 86050
},
{
"epoch": 25.065517442199056,
"grad_norm": 0.3899344801902771,
"learning_rate": 0.00029939178561025337,
"loss": 3.1259,
"step": 86100
},
{
"epoch": 25.08007687379885,
"grad_norm": 0.4196447432041168,
"learning_rate": 0.0002992170113603262,
"loss": 3.1196,
"step": 86150
},
{
"epoch": 25.094636305398637,
"grad_norm": 0.43210721015930176,
"learning_rate": 0.00029904223711039904,
"loss": 3.1397,
"step": 86200
},
{
"epoch": 25.109195736998426,
"grad_norm": 0.4325079321861267,
"learning_rate": 0.00029886746286047187,
"loss": 3.1354,
"step": 86250
},
{
"epoch": 25.12375516859822,
"grad_norm": 0.39312827587127686,
"learning_rate": 0.0002986926886105447,
"loss": 3.1388,
"step": 86300
},
{
"epoch": 25.138314600198008,
"grad_norm": 0.4397601783275604,
"learning_rate": 0.0002985179143606175,
"loss": 3.1425,
"step": 86350
},
{
"epoch": 25.152874031797797,
"grad_norm": 0.446545273065567,
"learning_rate": 0.0002983431401106903,
"loss": 3.1282,
"step": 86400
},
{
"epoch": 25.16743346339759,
"grad_norm": 0.43214336037635803,
"learning_rate": 0.00029816836586076315,
"loss": 3.1458,
"step": 86450
},
{
"epoch": 25.18199289499738,
"grad_norm": 0.3953275680541992,
"learning_rate": 0.000297993591610836,
"loss": 3.1296,
"step": 86500
},
{
"epoch": 25.19655232659717,
"grad_norm": 0.40356674790382385,
"learning_rate": 0.0002978188173609088,
"loss": 3.1467,
"step": 86550
},
{
"epoch": 25.21111175819696,
"grad_norm": 0.41587722301483154,
"learning_rate": 0.0002976440431109816,
"loss": 3.1566,
"step": 86600
},
{
"epoch": 25.22567118979675,
"grad_norm": 0.39485782384872437,
"learning_rate": 0.0002974692688610544,
"loss": 3.1511,
"step": 86650
},
{
"epoch": 25.24023062139654,
"grad_norm": 0.41744348406791687,
"learning_rate": 0.00029729449461112726,
"loss": 3.1639,
"step": 86700
},
{
"epoch": 25.25479005299633,
"grad_norm": 0.4251966178417206,
"learning_rate": 0.0002971197203612001,
"loss": 3.1446,
"step": 86750
},
{
"epoch": 25.26934948459612,
"grad_norm": 0.4173142611980438,
"learning_rate": 0.00029694494611127287,
"loss": 3.1646,
"step": 86800
},
{
"epoch": 25.283908916195912,
"grad_norm": 0.401950478553772,
"learning_rate": 0.00029677017186134576,
"loss": 3.1517,
"step": 86850
},
{
"epoch": 25.2984683477957,
"grad_norm": 0.4227960407733917,
"learning_rate": 0.0002965953976114186,
"loss": 3.1665,
"step": 86900
},
{
"epoch": 25.313027779395494,
"grad_norm": 0.44079259037971497,
"learning_rate": 0.00029642062336149137,
"loss": 3.161,
"step": 86950
},
{
"epoch": 25.327587210995283,
"grad_norm": 0.3958124816417694,
"learning_rate": 0.0002962458491115642,
"loss": 3.158,
"step": 87000
},
{
"epoch": 25.327587210995283,
"eval_accuracy": 0.37426346978266867,
"eval_loss": 3.545079469680786,
"eval_runtime": 55.2417,
"eval_samples_per_second": 301.276,
"eval_steps_per_second": 18.844,
"step": 87000
},
{
"epoch": 25.342146642595072,
"grad_norm": 0.4321572184562683,
"learning_rate": 0.00029607107486163704,
"loss": 3.1602,
"step": 87050
},
{
"epoch": 25.356706074194864,
"grad_norm": 0.4014240503311157,
"learning_rate": 0.00029589630061170987,
"loss": 3.1579,
"step": 87100
},
{
"epoch": 25.371265505794653,
"grad_norm": 0.4316968619823456,
"learning_rate": 0.0002957215263617827,
"loss": 3.1832,
"step": 87150
},
{
"epoch": 25.385824937394442,
"grad_norm": 0.423284649848938,
"learning_rate": 0.0002955467521118555,
"loss": 3.162,
"step": 87200
},
{
"epoch": 25.400384368994235,
"grad_norm": 0.4421418011188507,
"learning_rate": 0.0002953719778619283,
"loss": 3.1639,
"step": 87250
},
{
"epoch": 25.414943800594024,
"grad_norm": 0.4119024872779846,
"learning_rate": 0.00029519720361200115,
"loss": 3.1674,
"step": 87300
},
{
"epoch": 25.429503232193817,
"grad_norm": 0.39992815256118774,
"learning_rate": 0.000295022429362074,
"loss": 3.1677,
"step": 87350
},
{
"epoch": 25.444062663793606,
"grad_norm": 0.42137086391448975,
"learning_rate": 0.00029484765511214676,
"loss": 3.1666,
"step": 87400
},
{
"epoch": 25.458622095393395,
"grad_norm": 0.42965787649154663,
"learning_rate": 0.0002946728808622196,
"loss": 3.1711,
"step": 87450
},
{
"epoch": 25.473181526993187,
"grad_norm": 0.4427332878112793,
"learning_rate": 0.0002944981066122924,
"loss": 3.1795,
"step": 87500
},
{
"epoch": 25.487740958592976,
"grad_norm": 0.43246063590049744,
"learning_rate": 0.00029432333236236526,
"loss": 3.1767,
"step": 87550
},
{
"epoch": 25.502300390192765,
"grad_norm": 0.44357001781463623,
"learning_rate": 0.0002941485581124381,
"loss": 3.1766,
"step": 87600
},
{
"epoch": 25.516859821792558,
"grad_norm": 0.4451310932636261,
"learning_rate": 0.00029397378386251087,
"loss": 3.1828,
"step": 87650
},
{
"epoch": 25.531419253392347,
"grad_norm": 0.4538399875164032,
"learning_rate": 0.0002937990096125837,
"loss": 3.1741,
"step": 87700
},
{
"epoch": 25.54597868499214,
"grad_norm": 0.4227699339389801,
"learning_rate": 0.00029362423536265654,
"loss": 3.1792,
"step": 87750
},
{
"epoch": 25.56053811659193,
"grad_norm": 0.4142102003097534,
"learning_rate": 0.00029344946111272937,
"loss": 3.1811,
"step": 87800
},
{
"epoch": 25.575097548191717,
"grad_norm": 0.4136430323123932,
"learning_rate": 0.0002932746868628022,
"loss": 3.1922,
"step": 87850
},
{
"epoch": 25.58965697979151,
"grad_norm": 0.435993492603302,
"learning_rate": 0.000293099912612875,
"loss": 3.1807,
"step": 87900
},
{
"epoch": 25.6042164113913,
"grad_norm": 0.43335679173469543,
"learning_rate": 0.0002929251383629478,
"loss": 3.1829,
"step": 87950
},
{
"epoch": 25.61877584299109,
"grad_norm": 0.4427852928638458,
"learning_rate": 0.00029275036411302065,
"loss": 3.1849,
"step": 88000
},
{
"epoch": 25.61877584299109,
"eval_accuracy": 0.37467183831569206,
"eval_loss": 3.540363311767578,
"eval_runtime": 54.1234,
"eval_samples_per_second": 307.501,
"eval_steps_per_second": 19.234,
"step": 88000
},
{
"epoch": 25.63333527459088,
"grad_norm": 0.42897364497184753,
"learning_rate": 0.0002925755898630935,
"loss": 3.1897,
"step": 88050
},
{
"epoch": 25.64789470619067,
"grad_norm": 0.42489439249038696,
"learning_rate": 0.0002924008156131663,
"loss": 3.1953,
"step": 88100
},
{
"epoch": 25.662454137790462,
"grad_norm": 0.4370625615119934,
"learning_rate": 0.00029222604136323915,
"loss": 3.1984,
"step": 88150
},
{
"epoch": 25.67701356939025,
"grad_norm": 0.4505648612976074,
"learning_rate": 0.000292051267113312,
"loss": 3.2036,
"step": 88200
},
{
"epoch": 25.69157300099004,
"grad_norm": 0.4252602458000183,
"learning_rate": 0.00029187649286338476,
"loss": 3.1819,
"step": 88250
},
{
"epoch": 25.706132432589833,
"grad_norm": 0.4348510205745697,
"learning_rate": 0.0002917017186134576,
"loss": 3.1889,
"step": 88300
},
{
"epoch": 25.72069186418962,
"grad_norm": 0.3967111110687256,
"learning_rate": 0.0002915269443635304,
"loss": 3.1895,
"step": 88350
},
{
"epoch": 25.73525129578941,
"grad_norm": 0.4344504773616791,
"learning_rate": 0.00029135217011360326,
"loss": 3.1952,
"step": 88400
},
{
"epoch": 25.749810727389203,
"grad_norm": 0.3971911072731018,
"learning_rate": 0.0002911773958636761,
"loss": 3.1879,
"step": 88450
},
{
"epoch": 25.764370158988992,
"grad_norm": 0.41786593198776245,
"learning_rate": 0.00029100262161374887,
"loss": 3.2031,
"step": 88500
},
{
"epoch": 25.778929590588785,
"grad_norm": 0.4138876497745514,
"learning_rate": 0.0002908278473638217,
"loss": 3.1932,
"step": 88550
},
{
"epoch": 25.793489022188574,
"grad_norm": 0.42914995551109314,
"learning_rate": 0.00029065307311389453,
"loss": 3.1864,
"step": 88600
},
{
"epoch": 25.808048453788363,
"grad_norm": 0.41269779205322266,
"learning_rate": 0.00029047829886396737,
"loss": 3.2,
"step": 88650
},
{
"epoch": 25.822607885388155,
"grad_norm": 0.39072921872138977,
"learning_rate": 0.0002903035246140402,
"loss": 3.195,
"step": 88700
},
{
"epoch": 25.837167316987944,
"grad_norm": 0.42368221282958984,
"learning_rate": 0.000290128750364113,
"loss": 3.1982,
"step": 88750
},
{
"epoch": 25.851726748587737,
"grad_norm": 0.38363558053970337,
"learning_rate": 0.0002899539761141858,
"loss": 3.2075,
"step": 88800
},
{
"epoch": 25.866286180187526,
"grad_norm": 0.4270095229148865,
"learning_rate": 0.00028977920186425864,
"loss": 3.2044,
"step": 88850
},
{
"epoch": 25.880845611787315,
"grad_norm": 0.4175295829772949,
"learning_rate": 0.0002896044276143315,
"loss": 3.1821,
"step": 88900
},
{
"epoch": 25.895405043387107,
"grad_norm": 0.4077088534832001,
"learning_rate": 0.00028942965336440426,
"loss": 3.1904,
"step": 88950
},
{
"epoch": 25.909964474986896,
"grad_norm": 0.41781890392303467,
"learning_rate": 0.0002892548791144771,
"loss": 3.1933,
"step": 89000
},
{
"epoch": 25.909964474986896,
"eval_accuracy": 0.37507350457218724,
"eval_loss": 3.53283953666687,
"eval_runtime": 53.8952,
"eval_samples_per_second": 308.803,
"eval_steps_per_second": 19.315,
"step": 89000
},
{
"epoch": 25.924523906586685,
"grad_norm": 0.412068247795105,
"learning_rate": 0.0002890801048645499,
"loss": 3.1933,
"step": 89050
},
{
"epoch": 25.939083338186478,
"grad_norm": 0.4057926535606384,
"learning_rate": 0.00028890533061462276,
"loss": 3.2093,
"step": 89100
},
{
"epoch": 25.953642769786267,
"grad_norm": 0.41411447525024414,
"learning_rate": 0.0002887305563646956,
"loss": 3.2055,
"step": 89150
},
{
"epoch": 25.968202201386056,
"grad_norm": 0.3971509039402008,
"learning_rate": 0.00028855578211476837,
"loss": 3.2091,
"step": 89200
},
{
"epoch": 25.98276163298585,
"grad_norm": 0.40727517008781433,
"learning_rate": 0.0002883810078648412,
"loss": 3.2159,
"step": 89250
},
{
"epoch": 25.997321064585638,
"grad_norm": 0.4136078655719757,
"learning_rate": 0.00028820623361491403,
"loss": 3.2117,
"step": 89300
},
{
"epoch": 26.011647545279832,
"grad_norm": 0.40786686539649963,
"learning_rate": 0.00028803145936498687,
"loss": 3.1265,
"step": 89350
},
{
"epoch": 26.02620697687962,
"grad_norm": 0.4166758060455322,
"learning_rate": 0.0002878566851150597,
"loss": 3.1048,
"step": 89400
},
{
"epoch": 26.040766408479413,
"grad_norm": 0.46875035762786865,
"learning_rate": 0.00028768191086513253,
"loss": 3.109,
"step": 89450
},
{
"epoch": 26.055325840079202,
"grad_norm": 0.44817620515823364,
"learning_rate": 0.00028750713661520536,
"loss": 3.1166,
"step": 89500
},
{
"epoch": 26.069885271678995,
"grad_norm": 0.423794150352478,
"learning_rate": 0.00028733236236527814,
"loss": 3.1114,
"step": 89550
},
{
"epoch": 26.084444703278784,
"grad_norm": 0.4060429036617279,
"learning_rate": 0.000287157588115351,
"loss": 3.1176,
"step": 89600
},
{
"epoch": 26.099004134878573,
"grad_norm": 0.4524560272693634,
"learning_rate": 0.0002869828138654238,
"loss": 3.1338,
"step": 89650
},
{
"epoch": 26.113563566478366,
"grad_norm": 0.40616855025291443,
"learning_rate": 0.00028680803961549664,
"loss": 3.1233,
"step": 89700
},
{
"epoch": 26.128122998078155,
"grad_norm": 0.4383051097393036,
"learning_rate": 0.0002866332653655695,
"loss": 3.1301,
"step": 89750
},
{
"epoch": 26.142682429677944,
"grad_norm": 0.4387468099594116,
"learning_rate": 0.00028645849111564225,
"loss": 3.1249,
"step": 89800
},
{
"epoch": 26.157241861277736,
"grad_norm": 0.40583929419517517,
"learning_rate": 0.0002862837168657151,
"loss": 3.1334,
"step": 89850
},
{
"epoch": 26.171801292877525,
"grad_norm": 0.4405916631221771,
"learning_rate": 0.0002861089426157879,
"loss": 3.1373,
"step": 89900
},
{
"epoch": 26.186360724477318,
"grad_norm": 0.42231664061546326,
"learning_rate": 0.00028593416836586075,
"loss": 3.1288,
"step": 89950
},
{
"epoch": 26.200920156077107,
"grad_norm": 0.4573913514614105,
"learning_rate": 0.0002857593941159336,
"loss": 3.1394,
"step": 90000
},
{
"epoch": 26.200920156077107,
"eval_accuracy": 0.3741908029950475,
"eval_loss": 3.55145263671875,
"eval_runtime": 53.8903,
"eval_samples_per_second": 308.831,
"eval_steps_per_second": 19.317,
"step": 90000
},
{
"epoch": 26.215479587676896,
"grad_norm": 0.427679181098938,
"learning_rate": 0.00028558461986600637,
"loss": 3.1458,
"step": 90050
},
{
"epoch": 26.23003901927669,
"grad_norm": 0.4170896112918854,
"learning_rate": 0.0002854098456160792,
"loss": 3.1466,
"step": 90100
},
{
"epoch": 26.244598450876477,
"grad_norm": 0.42612671852111816,
"learning_rate": 0.00028523507136615203,
"loss": 3.1484,
"step": 90150
},
{
"epoch": 26.259157882476266,
"grad_norm": 0.4112977385520935,
"learning_rate": 0.00028506029711622486,
"loss": 3.1561,
"step": 90200
},
{
"epoch": 26.27371731407606,
"grad_norm": 0.42549455165863037,
"learning_rate": 0.00028488552286629764,
"loss": 3.148,
"step": 90250
},
{
"epoch": 26.288276745675848,
"grad_norm": 0.41805240511894226,
"learning_rate": 0.0002847107486163705,
"loss": 3.1499,
"step": 90300
},
{
"epoch": 26.30283617727564,
"grad_norm": 0.42571932077407837,
"learning_rate": 0.0002845359743664433,
"loss": 3.1329,
"step": 90350
},
{
"epoch": 26.31739560887543,
"grad_norm": 0.4267009198665619,
"learning_rate": 0.00028436120011651614,
"loss": 3.1595,
"step": 90400
},
{
"epoch": 26.33195504047522,
"grad_norm": 0.402904748916626,
"learning_rate": 0.000284186425866589,
"loss": 3.1578,
"step": 90450
},
{
"epoch": 26.34651447207501,
"grad_norm": 0.41704311966896057,
"learning_rate": 0.00028401165161666175,
"loss": 3.1664,
"step": 90500
},
{
"epoch": 26.3610739036748,
"grad_norm": 0.43055227398872375,
"learning_rate": 0.0002838368773667346,
"loss": 3.1438,
"step": 90550
},
{
"epoch": 26.375633335274593,
"grad_norm": 0.42857736349105835,
"learning_rate": 0.0002836621031168075,
"loss": 3.1536,
"step": 90600
},
{
"epoch": 26.39019276687438,
"grad_norm": 0.40300217270851135,
"learning_rate": 0.00028348732886688025,
"loss": 3.1539,
"step": 90650
},
{
"epoch": 26.40475219847417,
"grad_norm": 0.4351729452610016,
"learning_rate": 0.0002833125546169531,
"loss": 3.1485,
"step": 90700
},
{
"epoch": 26.419311630073963,
"grad_norm": 0.4290107786655426,
"learning_rate": 0.0002831377803670259,
"loss": 3.1567,
"step": 90750
},
{
"epoch": 26.433871061673752,
"grad_norm": 0.45089462399482727,
"learning_rate": 0.00028296300611709875,
"loss": 3.1598,
"step": 90800
},
{
"epoch": 26.44843049327354,
"grad_norm": 0.4224042296409607,
"learning_rate": 0.00028278823186717153,
"loss": 3.1537,
"step": 90850
},
{
"epoch": 26.462989924873334,
"grad_norm": 0.42360836267471313,
"learning_rate": 0.00028261345761724436,
"loss": 3.1632,
"step": 90900
},
{
"epoch": 26.477549356473123,
"grad_norm": 0.42730751633644104,
"learning_rate": 0.0002824386833673172,
"loss": 3.1746,
"step": 90950
},
{
"epoch": 26.492108788072915,
"grad_norm": 0.4391239583492279,
"learning_rate": 0.00028226390911739003,
"loss": 3.1698,
"step": 91000
},
{
"epoch": 26.492108788072915,
"eval_accuracy": 0.3748994805500523,
"eval_loss": 3.539992094039917,
"eval_runtime": 53.9072,
"eval_samples_per_second": 308.734,
"eval_steps_per_second": 19.311,
"step": 91000
},
{
"epoch": 26.506668219672704,
"grad_norm": 0.42610499262809753,
"learning_rate": 0.00028208913486746286,
"loss": 3.1573,
"step": 91050
},
{
"epoch": 26.521227651272493,
"grad_norm": 0.44624412059783936,
"learning_rate": 0.00028191436061753564,
"loss": 3.1727,
"step": 91100
},
{
"epoch": 26.535787082872286,
"grad_norm": 0.3991565704345703,
"learning_rate": 0.0002817395863676085,
"loss": 3.1758,
"step": 91150
},
{
"epoch": 26.550346514472075,
"grad_norm": 0.4363832175731659,
"learning_rate": 0.0002815648121176813,
"loss": 3.1751,
"step": 91200
},
{
"epoch": 26.564905946071864,
"grad_norm": 0.4245220720767975,
"learning_rate": 0.00028139003786775414,
"loss": 3.1757,
"step": 91250
},
{
"epoch": 26.579465377671657,
"grad_norm": 0.4245074391365051,
"learning_rate": 0.000281215263617827,
"loss": 3.1701,
"step": 91300
},
{
"epoch": 26.594024809271446,
"grad_norm": 0.45923250913619995,
"learning_rate": 0.00028104048936789975,
"loss": 3.1626,
"step": 91350
},
{
"epoch": 26.608584240871238,
"grad_norm": 0.4250701367855072,
"learning_rate": 0.0002808657151179726,
"loss": 3.1795,
"step": 91400
},
{
"epoch": 26.623143672471027,
"grad_norm": 0.41687679290771484,
"learning_rate": 0.0002806909408680454,
"loss": 3.1825,
"step": 91450
},
{
"epoch": 26.637703104070816,
"grad_norm": 0.4420589804649353,
"learning_rate": 0.00028051616661811825,
"loss": 3.1845,
"step": 91500
},
{
"epoch": 26.65226253567061,
"grad_norm": 0.46142151951789856,
"learning_rate": 0.00028034139236819103,
"loss": 3.1727,
"step": 91550
},
{
"epoch": 26.666821967270398,
"grad_norm": 0.4562520682811737,
"learning_rate": 0.00028016661811826386,
"loss": 3.1747,
"step": 91600
},
{
"epoch": 26.681381398870187,
"grad_norm": 0.43289703130722046,
"learning_rate": 0.0002799918438683367,
"loss": 3.1772,
"step": 91650
},
{
"epoch": 26.69594083046998,
"grad_norm": 0.4018307030200958,
"learning_rate": 0.00027981706961840953,
"loss": 3.1794,
"step": 91700
},
{
"epoch": 26.71050026206977,
"grad_norm": 0.4325880706310272,
"learning_rate": 0.00027964229536848236,
"loss": 3.1831,
"step": 91750
},
{
"epoch": 26.72505969366956,
"grad_norm": 0.4957205057144165,
"learning_rate": 0.00027946752111855514,
"loss": 3.1828,
"step": 91800
},
{
"epoch": 26.73961912526935,
"grad_norm": 0.4486503303050995,
"learning_rate": 0.000279292746868628,
"loss": 3.1889,
"step": 91850
},
{
"epoch": 26.75417855686914,
"grad_norm": 0.4349845051765442,
"learning_rate": 0.00027911797261870086,
"loss": 3.1738,
"step": 91900
},
{
"epoch": 26.76873798846893,
"grad_norm": 0.39484739303588867,
"learning_rate": 0.00027894319836877364,
"loss": 3.1805,
"step": 91950
},
{
"epoch": 26.78329742006872,
"grad_norm": 0.427202433347702,
"learning_rate": 0.00027876842411884647,
"loss": 3.1911,
"step": 92000
},
{
"epoch": 26.78329742006872,
"eval_accuracy": 0.37494863057792555,
"eval_loss": 3.538325309753418,
"eval_runtime": 55.5984,
"eval_samples_per_second": 299.343,
"eval_steps_per_second": 18.724,
"step": 92000
},
{
"epoch": 26.79785685166851,
"grad_norm": 0.4337463080883026,
"learning_rate": 0.0002785936498689193,
"loss": 3.1873,
"step": 92050
},
{
"epoch": 26.812416283268302,
"grad_norm": 0.4569494426250458,
"learning_rate": 0.00027841887561899214,
"loss": 3.1808,
"step": 92100
},
{
"epoch": 26.82697571486809,
"grad_norm": 0.4768456816673279,
"learning_rate": 0.0002782441013690649,
"loss": 3.201,
"step": 92150
},
{
"epoch": 26.841535146467884,
"grad_norm": 0.4294453263282776,
"learning_rate": 0.00027806932711913775,
"loss": 3.1983,
"step": 92200
},
{
"epoch": 26.856094578067673,
"grad_norm": 0.42703986167907715,
"learning_rate": 0.0002778945528692106,
"loss": 3.1866,
"step": 92250
},
{
"epoch": 26.87065400966746,
"grad_norm": 0.4226187467575073,
"learning_rate": 0.0002777197786192834,
"loss": 3.1958,
"step": 92300
},
{
"epoch": 26.885213441267254,
"grad_norm": 0.41608643531799316,
"learning_rate": 0.00027754500436935625,
"loss": 3.1835,
"step": 92350
},
{
"epoch": 26.899772872867043,
"grad_norm": 0.4281024634838104,
"learning_rate": 0.00027737023011942903,
"loss": 3.1929,
"step": 92400
},
{
"epoch": 26.914332304466832,
"grad_norm": 0.4086182713508606,
"learning_rate": 0.00027719545586950186,
"loss": 3.1878,
"step": 92450
},
{
"epoch": 26.928891736066625,
"grad_norm": 0.4695216119289398,
"learning_rate": 0.0002770206816195747,
"loss": 3.2038,
"step": 92500
},
{
"epoch": 26.943451167666414,
"grad_norm": 0.4161894619464874,
"learning_rate": 0.0002768459073696475,
"loss": 3.1999,
"step": 92550
},
{
"epoch": 26.958010599266206,
"grad_norm": 0.4216294586658478,
"learning_rate": 0.00027667113311972036,
"loss": 3.1936,
"step": 92600
},
{
"epoch": 26.972570030865995,
"grad_norm": 0.4127732217311859,
"learning_rate": 0.00027649635886979314,
"loss": 3.2093,
"step": 92650
},
{
"epoch": 26.987129462465784,
"grad_norm": 0.40765687823295593,
"learning_rate": 0.00027632158461986597,
"loss": 3.1847,
"step": 92700
},
{
"epoch": 27.00145594315998,
"grad_norm": 0.44568103551864624,
"learning_rate": 0.0002761468103699388,
"loss": 3.203,
"step": 92750
},
{
"epoch": 27.01601537475977,
"grad_norm": 0.46214574575424194,
"learning_rate": 0.00027597203612001164,
"loss": 3.1023,
"step": 92800
},
{
"epoch": 27.03057480635956,
"grad_norm": 0.4300304651260376,
"learning_rate": 0.0002757972618700844,
"loss": 3.1,
"step": 92850
},
{
"epoch": 27.04513423795935,
"grad_norm": 0.4407559335231781,
"learning_rate": 0.00027562248762015725,
"loss": 3.1003,
"step": 92900
},
{
"epoch": 27.05969366955914,
"grad_norm": 0.45435330271720886,
"learning_rate": 0.0002754477133702301,
"loss": 3.1014,
"step": 92950
},
{
"epoch": 27.07425310115893,
"grad_norm": 0.4291175603866577,
"learning_rate": 0.0002752729391203029,
"loss": 3.099,
"step": 93000
},
{
"epoch": 27.07425310115893,
"eval_accuracy": 0.37463797418165495,
"eval_loss": 3.550109624862671,
"eval_runtime": 55.4523,
"eval_samples_per_second": 300.132,
"eval_steps_per_second": 18.773,
"step": 93000
},
{
"epoch": 27.08881253275872,
"grad_norm": 0.4241757392883301,
"learning_rate": 0.00027509816487037575,
"loss": 3.1121,
"step": 93050
},
{
"epoch": 27.103371964358512,
"grad_norm": 0.4496310353279114,
"learning_rate": 0.00027492339062044853,
"loss": 3.1109,
"step": 93100
},
{
"epoch": 27.1179313959583,
"grad_norm": 0.4374052882194519,
"learning_rate": 0.0002747486163705214,
"loss": 3.1225,
"step": 93150
},
{
"epoch": 27.132490827558094,
"grad_norm": 0.4774245321750641,
"learning_rate": 0.00027457384212059425,
"loss": 3.1188,
"step": 93200
},
{
"epoch": 27.147050259157883,
"grad_norm": 0.4365275204181671,
"learning_rate": 0.000274399067870667,
"loss": 3.1333,
"step": 93250
},
{
"epoch": 27.161609690757672,
"grad_norm": 0.429744154214859,
"learning_rate": 0.00027422429362073986,
"loss": 3.1266,
"step": 93300
},
{
"epoch": 27.176169122357464,
"grad_norm": 0.4345669746398926,
"learning_rate": 0.0002740495193708127,
"loss": 3.1184,
"step": 93350
},
{
"epoch": 27.190728553957253,
"grad_norm": 0.43556392192840576,
"learning_rate": 0.0002738747451208855,
"loss": 3.1196,
"step": 93400
},
{
"epoch": 27.205287985557042,
"grad_norm": 0.4418703317642212,
"learning_rate": 0.0002736999708709583,
"loss": 3.126,
"step": 93450
},
{
"epoch": 27.219847417156835,
"grad_norm": 0.4402376115322113,
"learning_rate": 0.00027352519662103114,
"loss": 3.1286,
"step": 93500
},
{
"epoch": 27.234406848756624,
"grad_norm": 0.42841628193855286,
"learning_rate": 0.00027335042237110397,
"loss": 3.1342,
"step": 93550
},
{
"epoch": 27.248966280356417,
"grad_norm": 0.42505958676338196,
"learning_rate": 0.0002731756481211768,
"loss": 3.1276,
"step": 93600
},
{
"epoch": 27.263525711956206,
"grad_norm": 0.4213034212589264,
"learning_rate": 0.00027300087387124964,
"loss": 3.1337,
"step": 93650
},
{
"epoch": 27.278085143555995,
"grad_norm": 0.4645802974700928,
"learning_rate": 0.0002728260996213224,
"loss": 3.1321,
"step": 93700
},
{
"epoch": 27.292644575155787,
"grad_norm": 0.4217985272407532,
"learning_rate": 0.00027265132537139525,
"loss": 3.1386,
"step": 93750
},
{
"epoch": 27.307204006755576,
"grad_norm": 0.4257556200027466,
"learning_rate": 0.0002724765511214681,
"loss": 3.1362,
"step": 93800
},
{
"epoch": 27.321763438355365,
"grad_norm": 0.4358249008655548,
"learning_rate": 0.0002723017768715409,
"loss": 3.14,
"step": 93850
},
{
"epoch": 27.336322869955158,
"grad_norm": 0.42620640993118286,
"learning_rate": 0.00027212700262161375,
"loss": 3.1548,
"step": 93900
},
{
"epoch": 27.350882301554947,
"grad_norm": 0.43325406312942505,
"learning_rate": 0.0002719522283716865,
"loss": 3.1507,
"step": 93950
},
{
"epoch": 27.36544173315474,
"grad_norm": 0.4155369997024536,
"learning_rate": 0.00027177745412175936,
"loss": 3.1493,
"step": 94000
},
{
"epoch": 27.36544173315474,
"eval_accuracy": 0.3745591930364993,
"eval_loss": 3.543552875518799,
"eval_runtime": 55.2955,
"eval_samples_per_second": 300.983,
"eval_steps_per_second": 18.826,
"step": 94000
},
{
"epoch": 27.38000116475453,
"grad_norm": 0.4649721682071686,
"learning_rate": 0.0002716026798718322,
"loss": 3.1425,
"step": 94050
},
{
"epoch": 27.394560596354317,
"grad_norm": 0.424880713224411,
"learning_rate": 0.000271427905621905,
"loss": 3.1603,
"step": 94100
},
{
"epoch": 27.40912002795411,
"grad_norm": 0.4196343421936035,
"learning_rate": 0.0002712531313719778,
"loss": 3.1546,
"step": 94150
},
{
"epoch": 27.4236794595539,
"grad_norm": 0.425191193819046,
"learning_rate": 0.00027107835712205064,
"loss": 3.1566,
"step": 94200
},
{
"epoch": 27.438238891153688,
"grad_norm": 0.44504839181900024,
"learning_rate": 0.00027090358287212347,
"loss": 3.149,
"step": 94250
},
{
"epoch": 27.45279832275348,
"grad_norm": 0.4285842478275299,
"learning_rate": 0.0002707288086221963,
"loss": 3.1469,
"step": 94300
},
{
"epoch": 27.46735775435327,
"grad_norm": 0.45686304569244385,
"learning_rate": 0.00027055403437226914,
"loss": 3.1682,
"step": 94350
},
{
"epoch": 27.481917185953062,
"grad_norm": 0.4554402530193329,
"learning_rate": 0.00027037926012234197,
"loss": 3.1603,
"step": 94400
},
{
"epoch": 27.49647661755285,
"grad_norm": 0.4219898283481598,
"learning_rate": 0.0002702044858724148,
"loss": 3.1582,
"step": 94450
},
{
"epoch": 27.51103604915264,
"grad_norm": 0.42963922023773193,
"learning_rate": 0.00027002971162248763,
"loss": 3.1599,
"step": 94500
},
{
"epoch": 27.525595480752433,
"grad_norm": 0.4354709982872009,
"learning_rate": 0.0002698549373725604,
"loss": 3.157,
"step": 94550
},
{
"epoch": 27.54015491235222,
"grad_norm": 0.4310063123703003,
"learning_rate": 0.00026968016312263325,
"loss": 3.1667,
"step": 94600
},
{
"epoch": 27.55471434395201,
"grad_norm": 0.4260794520378113,
"learning_rate": 0.0002695053888727061,
"loss": 3.1647,
"step": 94650
},
{
"epoch": 27.569273775551803,
"grad_norm": 0.4308391809463501,
"learning_rate": 0.0002693306146227789,
"loss": 3.1756,
"step": 94700
},
{
"epoch": 27.583833207151592,
"grad_norm": 0.43271583318710327,
"learning_rate": 0.0002691558403728517,
"loss": 3.1751,
"step": 94750
},
{
"epoch": 27.598392638751385,
"grad_norm": 0.4199526011943817,
"learning_rate": 0.0002689810661229245,
"loss": 3.1625,
"step": 94800
},
{
"epoch": 27.612952070351174,
"grad_norm": 0.4638511538505554,
"learning_rate": 0.00026880629187299736,
"loss": 3.1835,
"step": 94850
},
{
"epoch": 27.627511501950963,
"grad_norm": 0.4496512711048126,
"learning_rate": 0.0002686315176230702,
"loss": 3.1731,
"step": 94900
},
{
"epoch": 27.642070933550755,
"grad_norm": 0.44952234625816345,
"learning_rate": 0.000268456743373143,
"loss": 3.1743,
"step": 94950
},
{
"epoch": 27.656630365150544,
"grad_norm": 0.42717215418815613,
"learning_rate": 0.0002682819691232158,
"loss": 3.1648,
"step": 95000
},
{
"epoch": 27.656630365150544,
"eval_accuracy": 0.3751180688319096,
"eval_loss": 3.536431312561035,
"eval_runtime": 55.4927,
"eval_samples_per_second": 299.914,
"eval_steps_per_second": 18.759,
"step": 95000
},
{
"epoch": 27.671189796750333,
"grad_norm": 0.4398097097873688,
"learning_rate": 0.00026810719487328863,
"loss": 3.1798,
"step": 95050
},
{
"epoch": 27.685749228350126,
"grad_norm": 0.4571005702018738,
"learning_rate": 0.00026793242062336147,
"loss": 3.1786,
"step": 95100
},
{
"epoch": 27.700308659949915,
"grad_norm": 0.42584386467933655,
"learning_rate": 0.0002677576463734343,
"loss": 3.1811,
"step": 95150
},
{
"epoch": 27.714868091549707,
"grad_norm": 0.4381604790687561,
"learning_rate": 0.00026758287212350713,
"loss": 3.1708,
"step": 95200
},
{
"epoch": 27.729427523149496,
"grad_norm": 0.42696070671081543,
"learning_rate": 0.0002674080978735799,
"loss": 3.1845,
"step": 95250
},
{
"epoch": 27.743986954749285,
"grad_norm": 0.4467693865299225,
"learning_rate": 0.00026723332362365275,
"loss": 3.1682,
"step": 95300
},
{
"epoch": 27.758546386349078,
"grad_norm": 0.4203813076019287,
"learning_rate": 0.0002670585493737256,
"loss": 3.1724,
"step": 95350
},
{
"epoch": 27.773105817948867,
"grad_norm": 0.4817917048931122,
"learning_rate": 0.0002668837751237984,
"loss": 3.1876,
"step": 95400
},
{
"epoch": 27.787665249548656,
"grad_norm": 0.4254100024700165,
"learning_rate": 0.0002667090008738712,
"loss": 3.1805,
"step": 95450
},
{
"epoch": 27.80222468114845,
"grad_norm": 0.44012710452079773,
"learning_rate": 0.000266534226623944,
"loss": 3.1863,
"step": 95500
},
{
"epoch": 27.816784112748238,
"grad_norm": 0.45511409640312195,
"learning_rate": 0.00026635945237401686,
"loss": 3.1857,
"step": 95550
},
{
"epoch": 27.83134354434803,
"grad_norm": 0.431951105594635,
"learning_rate": 0.0002661846781240897,
"loss": 3.1765,
"step": 95600
},
{
"epoch": 27.84590297594782,
"grad_norm": 0.4615152180194855,
"learning_rate": 0.0002660099038741625,
"loss": 3.1802,
"step": 95650
},
{
"epoch": 27.860462407547608,
"grad_norm": 0.43755945563316345,
"learning_rate": 0.00026583512962423536,
"loss": 3.1821,
"step": 95700
},
{
"epoch": 27.8750218391474,
"grad_norm": 0.424096941947937,
"learning_rate": 0.0002656603553743082,
"loss": 3.1786,
"step": 95750
},
{
"epoch": 27.88958127074719,
"grad_norm": 0.44469699263572693,
"learning_rate": 0.000265485581124381,
"loss": 3.1789,
"step": 95800
},
{
"epoch": 27.90414070234698,
"grad_norm": 0.4233236610889435,
"learning_rate": 0.0002653108068744538,
"loss": 3.1792,
"step": 95850
},
{
"epoch": 27.91870013394677,
"grad_norm": 0.43293777108192444,
"learning_rate": 0.00026513603262452663,
"loss": 3.1763,
"step": 95900
},
{
"epoch": 27.93325956554656,
"grad_norm": 0.45778122544288635,
"learning_rate": 0.00026496125837459947,
"loss": 3.1841,
"step": 95950
},
{
"epoch": 27.947818997146353,
"grad_norm": 0.4322860538959503,
"learning_rate": 0.0002647864841246723,
"loss": 3.1793,
"step": 96000
},
{
"epoch": 27.947818997146353,
"eval_accuracy": 0.3751108962201865,
"eval_loss": 3.5336105823516846,
"eval_runtime": 55.4476,
"eval_samples_per_second": 300.157,
"eval_steps_per_second": 18.774,
"step": 96000
},
{
"epoch": 27.962378428746142,
"grad_norm": 0.4258585274219513,
"learning_rate": 0.0002646117098747451,
"loss": 3.1967,
"step": 96050
},
{
"epoch": 27.97693786034593,
"grad_norm": 0.4479847848415375,
"learning_rate": 0.0002644369356248179,
"loss": 3.1921,
"step": 96100
},
{
"epoch": 27.991497291945723,
"grad_norm": 0.4440501928329468,
"learning_rate": 0.00026426216137489074,
"loss": 3.1911,
"step": 96150
},
{
"epoch": 28.005823772639918,
"grad_norm": 0.4409486651420593,
"learning_rate": 0.0002640873871249636,
"loss": 3.1243,
"step": 96200
},
{
"epoch": 28.020383204239707,
"grad_norm": 0.4427194595336914,
"learning_rate": 0.0002639126128750364,
"loss": 3.0868,
"step": 96250
},
{
"epoch": 28.034942635839496,
"grad_norm": 0.43971019983291626,
"learning_rate": 0.0002637378386251092,
"loss": 3.0921,
"step": 96300
},
{
"epoch": 28.04950206743929,
"grad_norm": 0.45166823267936707,
"learning_rate": 0.000263563064375182,
"loss": 3.0929,
"step": 96350
},
{
"epoch": 28.064061499039077,
"grad_norm": 0.45439887046813965,
"learning_rate": 0.00026338829012525485,
"loss": 3.0843,
"step": 96400
},
{
"epoch": 28.078620930638866,
"grad_norm": 0.44814586639404297,
"learning_rate": 0.0002632135158753277,
"loss": 3.1028,
"step": 96450
},
{
"epoch": 28.09318036223866,
"grad_norm": 0.42964088916778564,
"learning_rate": 0.0002630387416254005,
"loss": 3.1121,
"step": 96500
},
{
"epoch": 28.107739793838448,
"grad_norm": 0.40774595737457275,
"learning_rate": 0.0002628639673754733,
"loss": 3.1128,
"step": 96550
},
{
"epoch": 28.12229922543824,
"grad_norm": 0.45805495977401733,
"learning_rate": 0.00026268919312554613,
"loss": 3.1068,
"step": 96600
},
{
"epoch": 28.13685865703803,
"grad_norm": 0.4234671890735626,
"learning_rate": 0.00026251441887561897,
"loss": 3.1048,
"step": 96650
},
{
"epoch": 28.15141808863782,
"grad_norm": 0.4316425621509552,
"learning_rate": 0.0002623396446256918,
"loss": 3.1249,
"step": 96700
},
{
"epoch": 28.16597752023761,
"grad_norm": 0.4478599429130554,
"learning_rate": 0.0002621648703757646,
"loss": 3.1169,
"step": 96750
},
{
"epoch": 28.1805369518374,
"grad_norm": 0.4409092962741852,
"learning_rate": 0.0002619900961258374,
"loss": 3.111,
"step": 96800
},
{
"epoch": 28.19509638343719,
"grad_norm": 0.48611727356910706,
"learning_rate": 0.00026181532187591024,
"loss": 3.1161,
"step": 96850
},
{
"epoch": 28.20965581503698,
"grad_norm": 0.4241076409816742,
"learning_rate": 0.0002616405476259831,
"loss": 3.1215,
"step": 96900
},
{
"epoch": 28.22421524663677,
"grad_norm": 0.44242626428604126,
"learning_rate": 0.0002614657733760559,
"loss": 3.1297,
"step": 96950
},
{
"epoch": 28.238774678236563,
"grad_norm": 0.45010676980018616,
"learning_rate": 0.00026129099912612874,
"loss": 3.1201,
"step": 97000
},
{
"epoch": 28.238774678236563,
"eval_accuracy": 0.37466113819000674,
"eval_loss": 3.5505800247192383,
"eval_runtime": 55.3733,
"eval_samples_per_second": 300.56,
"eval_steps_per_second": 18.8,
"step": 97000
},
{
"epoch": 28.253334109836352,
"grad_norm": 0.4329627752304077,
"learning_rate": 0.0002611162248762016,
"loss": 3.1274,
"step": 97050
},
{
"epoch": 28.26789354143614,
"grad_norm": 0.4276256263256073,
"learning_rate": 0.0002609414506262744,
"loss": 3.1237,
"step": 97100
},
{
"epoch": 28.282452973035934,
"grad_norm": 0.43386736512184143,
"learning_rate": 0.0002607666763763472,
"loss": 3.1408,
"step": 97150
},
{
"epoch": 28.297012404635723,
"grad_norm": 0.4516732394695282,
"learning_rate": 0.00026059190212642,
"loss": 3.1539,
"step": 97200
},
{
"epoch": 28.31157183623551,
"grad_norm": 0.4807344675064087,
"learning_rate": 0.00026041712787649285,
"loss": 3.1368,
"step": 97250
},
{
"epoch": 28.326131267835304,
"grad_norm": 0.4625699818134308,
"learning_rate": 0.0002602423536265657,
"loss": 3.1323,
"step": 97300
},
{
"epoch": 28.340690699435093,
"grad_norm": 0.4425176978111267,
"learning_rate": 0.00026006757937663846,
"loss": 3.1344,
"step": 97350
},
{
"epoch": 28.355250131034886,
"grad_norm": 0.46198466420173645,
"learning_rate": 0.0002598928051267113,
"loss": 3.1409,
"step": 97400
},
{
"epoch": 28.369809562634675,
"grad_norm": 0.4490656852722168,
"learning_rate": 0.00025971803087678413,
"loss": 3.1404,
"step": 97450
},
{
"epoch": 28.384368994234464,
"grad_norm": 0.436927855014801,
"learning_rate": 0.00025954325662685696,
"loss": 3.1442,
"step": 97500
},
{
"epoch": 28.398928425834256,
"grad_norm": 0.472896009683609,
"learning_rate": 0.0002593684823769298,
"loss": 3.1434,
"step": 97550
},
{
"epoch": 28.413487857434045,
"grad_norm": 0.4772699773311615,
"learning_rate": 0.0002591937081270026,
"loss": 3.1384,
"step": 97600
},
{
"epoch": 28.428047289033834,
"grad_norm": 0.44761937856674194,
"learning_rate": 0.0002590189338770754,
"loss": 3.1614,
"step": 97650
},
{
"epoch": 28.442606720633627,
"grad_norm": 0.4448677897453308,
"learning_rate": 0.00025884415962714824,
"loss": 3.1475,
"step": 97700
},
{
"epoch": 28.457166152233416,
"grad_norm": 0.4507066309452057,
"learning_rate": 0.0002586693853772211,
"loss": 3.1554,
"step": 97750
},
{
"epoch": 28.47172558383321,
"grad_norm": 0.4424573481082916,
"learning_rate": 0.0002584946111272939,
"loss": 3.1512,
"step": 97800
},
{
"epoch": 28.486285015432998,
"grad_norm": 0.42159804701805115,
"learning_rate": 0.0002583198368773667,
"loss": 3.14,
"step": 97850
},
{
"epoch": 28.500844447032787,
"grad_norm": 0.441180944442749,
"learning_rate": 0.0002581450626274395,
"loss": 3.1606,
"step": 97900
},
{
"epoch": 28.51540387863258,
"grad_norm": 0.4541659355163574,
"learning_rate": 0.00025797028837751235,
"loss": 3.16,
"step": 97950
},
{
"epoch": 28.529963310232368,
"grad_norm": 0.4408447742462158,
"learning_rate": 0.0002577955141275852,
"loss": 3.1451,
"step": 98000
},
{
"epoch": 28.529963310232368,
"eval_accuracy": 0.37521836781223467,
"eval_loss": 3.5380377769470215,
"eval_runtime": 55.3762,
"eval_samples_per_second": 300.545,
"eval_steps_per_second": 18.799,
"step": 98000
},
{
"epoch": 28.544522741832157,
"grad_norm": 0.42458575963974,
"learning_rate": 0.00025762073987765796,
"loss": 3.1664,
"step": 98050
},
{
"epoch": 28.55908217343195,
"grad_norm": 0.46024999022483826,
"learning_rate": 0.0002574459656277308,
"loss": 3.1775,
"step": 98100
},
{
"epoch": 28.57364160503174,
"grad_norm": 0.4679096043109894,
"learning_rate": 0.00025727119137780363,
"loss": 3.1596,
"step": 98150
},
{
"epoch": 28.58820103663153,
"grad_norm": 0.4214268922805786,
"learning_rate": 0.00025709641712787646,
"loss": 3.1469,
"step": 98200
},
{
"epoch": 28.60276046823132,
"grad_norm": 0.45428961515426636,
"learning_rate": 0.0002569216428779493,
"loss": 3.1632,
"step": 98250
},
{
"epoch": 28.61731989983111,
"grad_norm": 0.4411943256855011,
"learning_rate": 0.00025674686862802213,
"loss": 3.1604,
"step": 98300
},
{
"epoch": 28.631879331430902,
"grad_norm": 0.4553915560245514,
"learning_rate": 0.00025657209437809496,
"loss": 3.1644,
"step": 98350
},
{
"epoch": 28.64643876303069,
"grad_norm": 0.4235278069972992,
"learning_rate": 0.0002563973201281678,
"loss": 3.1465,
"step": 98400
},
{
"epoch": 28.66099819463048,
"grad_norm": 0.42616787552833557,
"learning_rate": 0.0002562225458782406,
"loss": 3.1613,
"step": 98450
},
{
"epoch": 28.675557626230272,
"grad_norm": 0.44103261828422546,
"learning_rate": 0.0002560477716283134,
"loss": 3.168,
"step": 98500
},
{
"epoch": 28.69011705783006,
"grad_norm": 0.457415908575058,
"learning_rate": 0.00025587299737838624,
"loss": 3.1648,
"step": 98550
},
{
"epoch": 28.704676489429854,
"grad_norm": 0.44758105278015137,
"learning_rate": 0.00025569822312845907,
"loss": 3.1643,
"step": 98600
},
{
"epoch": 28.719235921029643,
"grad_norm": 0.4369789659976959,
"learning_rate": 0.00025552344887853185,
"loss": 3.1629,
"step": 98650
},
{
"epoch": 28.733795352629432,
"grad_norm": 0.4466242492198944,
"learning_rate": 0.0002553486746286047,
"loss": 3.1671,
"step": 98700
},
{
"epoch": 28.748354784229225,
"grad_norm": 0.4449853301048279,
"learning_rate": 0.0002551739003786775,
"loss": 3.172,
"step": 98750
},
{
"epoch": 28.762914215829014,
"grad_norm": 0.4349530339241028,
"learning_rate": 0.00025499912612875035,
"loss": 3.1667,
"step": 98800
},
{
"epoch": 28.777473647428806,
"grad_norm": 0.4433923661708832,
"learning_rate": 0.0002548243518788232,
"loss": 3.1605,
"step": 98850
},
{
"epoch": 28.792033079028595,
"grad_norm": 0.45331159234046936,
"learning_rate": 0.00025464957762889596,
"loss": 3.1688,
"step": 98900
},
{
"epoch": 28.806592510628384,
"grad_norm": 0.4952585697174072,
"learning_rate": 0.0002544748033789688,
"loss": 3.1656,
"step": 98950
},
{
"epoch": 28.821151942228177,
"grad_norm": 0.438007116317749,
"learning_rate": 0.00025430002912904163,
"loss": 3.1765,
"step": 99000
},
{
"epoch": 28.821151942228177,
"eval_accuracy": 0.37500518838511937,
"eval_loss": 3.537261724472046,
"eval_runtime": 55.4042,
"eval_samples_per_second": 300.393,
"eval_steps_per_second": 18.789,
"step": 99000
},
{
"epoch": 28.821151942228177,
"step": 99000,
"total_flos": 2.068928077234176e+18,
"train_loss": 3.3924570856576013,
"train_runtime": 44486.5704,
"train_samples_per_second": 308.78,
"train_steps_per_second": 3.861
}
],
"logging_steps": 50,
"max_steps": 171750,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 10000,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 20,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 11
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.068928077234176e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}