{
"best_global_step": 72000,
"best_metric": 3.5321924686431885,
"best_model_checkpoint": "/scratch/cl5625/exceptions/models/last_to_drop_frequency_40817/checkpoint-40000",
"epoch": 29.11239881195038,
"eval_steps": 1000,
"global_step": 100000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014559431599790344,
"grad_norm": 1.5551334619522095,
"learning_rate": 0.000294,
"loss": 8.4667,
"step": 50
},
{
"epoch": 0.029118863199580687,
"grad_norm": 0.7336986064910889,
"learning_rate": 0.0005939999999999999,
"loss": 6.7245,
"step": 100
},
{
"epoch": 0.043678294799371034,
"grad_norm": 0.4792507588863373,
"learning_rate": 0.0005998287212350713,
"loss": 6.3255,
"step": 150
},
{
"epoch": 0.058237726399161374,
"grad_norm": 0.47392821311950684,
"learning_rate": 0.0005996539469851441,
"loss": 6.1138,
"step": 200
},
{
"epoch": 0.07279715799895171,
"grad_norm": 0.442217618227005,
"learning_rate": 0.000599479172735217,
"loss": 5.9746,
"step": 250
},
{
"epoch": 0.08735658959874207,
"grad_norm": 0.4978708028793335,
"learning_rate": 0.0005993043984852897,
"loss": 5.8573,
"step": 300
},
{
"epoch": 0.10191602119853241,
"grad_norm": 0.5078408122062683,
"learning_rate": 0.0005991296242353626,
"loss": 5.7377,
"step": 350
},
{
"epoch": 0.11647545279832275,
"grad_norm": 0.4501552879810333,
"learning_rate": 0.0005989548499854355,
"loss": 5.613,
"step": 400
},
{
"epoch": 0.1310348843981131,
"grad_norm": 0.41562119126319885,
"learning_rate": 0.0005987800757355083,
"loss": 5.5049,
"step": 450
},
{
"epoch": 0.14559431599790343,
"grad_norm": 0.39685097336769104,
"learning_rate": 0.0005986053014855811,
"loss": 5.4153,
"step": 500
},
{
"epoch": 0.1601537475976938,
"grad_norm": 0.4735598862171173,
"learning_rate": 0.000598430527235654,
"loss": 5.3339,
"step": 550
},
{
"epoch": 0.17471317919748414,
"grad_norm": 0.4490765929222107,
"learning_rate": 0.0005982557529857267,
"loss": 5.2571,
"step": 600
},
{
"epoch": 0.18927261079727448,
"grad_norm": 0.5662270188331604,
"learning_rate": 0.0005980809787357995,
"loss": 5.183,
"step": 650
},
{
"epoch": 0.20383204239706482,
"grad_norm": 0.4178728759288788,
"learning_rate": 0.0005979062044858724,
"loss": 5.1337,
"step": 700
},
{
"epoch": 0.21839147399685516,
"grad_norm": 0.4277268648147583,
"learning_rate": 0.0005977314302359452,
"loss": 5.082,
"step": 750
},
{
"epoch": 0.2329509055966455,
"grad_norm": 0.49093976616859436,
"learning_rate": 0.0005975566559860181,
"loss": 5.0414,
"step": 800
},
{
"epoch": 0.24751033719643586,
"grad_norm": 0.40832236409187317,
"learning_rate": 0.0005973818817360908,
"loss": 4.9782,
"step": 850
},
{
"epoch": 0.2620697687962262,
"grad_norm": 0.42992544174194336,
"learning_rate": 0.0005972071074861636,
"loss": 4.9343,
"step": 900
},
{
"epoch": 0.2766292003960165,
"grad_norm": 0.5416184067726135,
"learning_rate": 0.0005970323332362365,
"loss": 4.8685,
"step": 950
},
{
"epoch": 0.29118863199580686,
"grad_norm": 0.5198241472244263,
"learning_rate": 0.0005968575589863093,
"loss": 4.849,
"step": 1000
},
{
"epoch": 0.29118863199580686,
"eval_accuracy": 0.25379510529217636,
"eval_loss": 4.761143684387207,
"eval_runtime": 183.641,
"eval_samples_per_second": 90.628,
"eval_steps_per_second": 5.669,
"step": 1000
},
{
"epoch": 0.30574806359559725,
"grad_norm": 0.47911056876182556,
"learning_rate": 0.0005966827847363822,
"loss": 4.7726,
"step": 1050
},
{
"epoch": 0.3203074951953876,
"grad_norm": 0.4758392572402954,
"learning_rate": 0.000596508010486455,
"loss": 4.7537,
"step": 1100
},
{
"epoch": 0.33486692679517793,
"grad_norm": 0.47129762172698975,
"learning_rate": 0.0005963332362365277,
"loss": 4.7115,
"step": 1150
},
{
"epoch": 0.3494263583949683,
"grad_norm": 0.42803141474723816,
"learning_rate": 0.0005961584619866006,
"loss": 4.6751,
"step": 1200
},
{
"epoch": 0.3639857899947586,
"grad_norm": 0.4740878641605377,
"learning_rate": 0.0005959836877366734,
"loss": 4.6417,
"step": 1250
},
{
"epoch": 0.37854522159454895,
"grad_norm": 0.40221309661865234,
"learning_rate": 0.0005958089134867463,
"loss": 4.6053,
"step": 1300
},
{
"epoch": 0.3931046531943393,
"grad_norm": 0.44672706723213196,
"learning_rate": 0.0005956341392368191,
"loss": 4.5801,
"step": 1350
},
{
"epoch": 0.40766408479412963,
"grad_norm": 0.4823697507381439,
"learning_rate": 0.0005954593649868918,
"loss": 4.5599,
"step": 1400
},
{
"epoch": 0.42222351639392,
"grad_norm": 0.5122449398040771,
"learning_rate": 0.0005952845907369647,
"loss": 4.5344,
"step": 1450
},
{
"epoch": 0.4367829479937103,
"grad_norm": 0.4088864028453827,
"learning_rate": 0.0005951098164870375,
"loss": 4.4951,
"step": 1500
},
{
"epoch": 0.45134237959350065,
"grad_norm": 0.40731462836265564,
"learning_rate": 0.0005949350422371104,
"loss": 4.5018,
"step": 1550
},
{
"epoch": 0.465901811193291,
"grad_norm": 0.4263319671154022,
"learning_rate": 0.0005947602679871832,
"loss": 4.4755,
"step": 1600
},
{
"epoch": 0.48046124279308133,
"grad_norm": 0.38340768218040466,
"learning_rate": 0.000594585493737256,
"loss": 4.4569,
"step": 1650
},
{
"epoch": 0.49502067439287173,
"grad_norm": 0.3979549705982208,
"learning_rate": 0.0005944107194873288,
"loss": 4.4444,
"step": 1700
},
{
"epoch": 0.509580105992662,
"grad_norm": 0.4176700711250305,
"learning_rate": 0.0005942359452374016,
"loss": 4.4173,
"step": 1750
},
{
"epoch": 0.5241395375924524,
"grad_norm": 0.3926246464252472,
"learning_rate": 0.0005940611709874745,
"loss": 4.4004,
"step": 1800
},
{
"epoch": 0.5386989691922427,
"grad_norm": 0.3877740502357483,
"learning_rate": 0.0005938863967375473,
"loss": 4.3869,
"step": 1850
},
{
"epoch": 0.553258400792033,
"grad_norm": 0.4315814971923828,
"learning_rate": 0.0005937116224876201,
"loss": 4.3636,
"step": 1900
},
{
"epoch": 0.5678178323918234,
"grad_norm": 0.403978556394577,
"learning_rate": 0.000593536848237693,
"loss": 4.3628,
"step": 1950
},
{
"epoch": 0.5823772639916137,
"grad_norm": 0.39984941482543945,
"learning_rate": 0.0005933620739877657,
"loss": 4.3402,
"step": 2000
},
{
"epoch": 0.5823772639916137,
"eval_accuracy": 0.29928804185701036,
"eval_loss": 4.288719177246094,
"eval_runtime": 180.6232,
"eval_samples_per_second": 92.142,
"eval_steps_per_second": 5.763,
"step": 2000
},
{
"epoch": 0.5969366955914042,
"grad_norm": 0.4173935055732727,
"learning_rate": 0.0005931872997378385,
"loss": 4.3411,
"step": 2050
},
{
"epoch": 0.6114961271911945,
"grad_norm": 0.37241849303245544,
"learning_rate": 0.0005930125254879114,
"loss": 4.3243,
"step": 2100
},
{
"epoch": 0.6260555587909848,
"grad_norm": 0.4274754822254181,
"learning_rate": 0.0005928377512379842,
"loss": 4.2883,
"step": 2150
},
{
"epoch": 0.6406149903907752,
"grad_norm": 0.4375714063644409,
"learning_rate": 0.0005926629769880571,
"loss": 4.2941,
"step": 2200
},
{
"epoch": 0.6551744219905655,
"grad_norm": 0.39245837926864624,
"learning_rate": 0.0005924882027381298,
"loss": 4.2863,
"step": 2250
},
{
"epoch": 0.6697338535903559,
"grad_norm": 0.3508373498916626,
"learning_rate": 0.0005923134284882026,
"loss": 4.2683,
"step": 2300
},
{
"epoch": 0.6842932851901462,
"grad_norm": 0.37966057658195496,
"learning_rate": 0.0005921386542382755,
"loss": 4.268,
"step": 2350
},
{
"epoch": 0.6988527167899365,
"grad_norm": 0.4270515441894531,
"learning_rate": 0.0005919638799883483,
"loss": 4.2548,
"step": 2400
},
{
"epoch": 0.7134121483897269,
"grad_norm": 0.36582618951797485,
"learning_rate": 0.0005917891057384212,
"loss": 4.2418,
"step": 2450
},
{
"epoch": 0.7279715799895172,
"grad_norm": 0.3588745594024658,
"learning_rate": 0.000591614331488494,
"loss": 4.2315,
"step": 2500
},
{
"epoch": 0.7425310115893076,
"grad_norm": 0.3805822730064392,
"learning_rate": 0.0005914395572385667,
"loss": 4.2263,
"step": 2550
},
{
"epoch": 0.7570904431890979,
"grad_norm": 0.37862271070480347,
"learning_rate": 0.0005912647829886396,
"loss": 4.2177,
"step": 2600
},
{
"epoch": 0.7716498747888882,
"grad_norm": 0.40694668889045715,
"learning_rate": 0.0005910900087387124,
"loss": 4.1886,
"step": 2650
},
{
"epoch": 0.7862093063886786,
"grad_norm": 0.3988340497016907,
"learning_rate": 0.0005909152344887853,
"loss": 4.1907,
"step": 2700
},
{
"epoch": 0.8007687379884689,
"grad_norm": 0.4412493109703064,
"learning_rate": 0.0005907404602388581,
"loss": 4.1929,
"step": 2750
},
{
"epoch": 0.8153281695882593,
"grad_norm": 0.37306517362594604,
"learning_rate": 0.0005905656859889308,
"loss": 4.1721,
"step": 2800
},
{
"epoch": 0.8298876011880496,
"grad_norm": 0.36752834916114807,
"learning_rate": 0.0005903909117390037,
"loss": 4.1729,
"step": 2850
},
{
"epoch": 0.84444703278784,
"grad_norm": 0.38249292969703674,
"learning_rate": 0.0005902161374890766,
"loss": 4.17,
"step": 2900
},
{
"epoch": 0.8590064643876303,
"grad_norm": 0.3479909598827362,
"learning_rate": 0.0005900413632391494,
"loss": 4.1629,
"step": 2950
},
{
"epoch": 0.8735658959874206,
"grad_norm": 0.34885624051094055,
"learning_rate": 0.0005898665889892223,
"loss": 4.1563,
"step": 3000
},
{
"epoch": 0.8735658959874206,
"eval_accuracy": 0.31519818808069494,
"eval_loss": 4.099164009094238,
"eval_runtime": 183.4247,
"eval_samples_per_second": 90.735,
"eval_steps_per_second": 5.675,
"step": 3000
},
{
"epoch": 0.888125327587211,
"grad_norm": 0.38681846857070923,
"learning_rate": 0.0005896918147392951,
"loss": 4.1567,
"step": 3050
},
{
"epoch": 0.9026847591870013,
"grad_norm": 0.3432327210903168,
"learning_rate": 0.0005895170404893678,
"loss": 4.1293,
"step": 3100
},
{
"epoch": 0.9172441907867916,
"grad_norm": 0.3937830626964569,
"learning_rate": 0.0005893422662394407,
"loss": 4.1285,
"step": 3150
},
{
"epoch": 0.931803622386582,
"grad_norm": 0.39171546697616577,
"learning_rate": 0.0005891674919895135,
"loss": 4.1279,
"step": 3200
},
{
"epoch": 0.9463630539863723,
"grad_norm": 0.37026646733283997,
"learning_rate": 0.0005889927177395864,
"loss": 4.1106,
"step": 3250
},
{
"epoch": 0.9609224855861627,
"grad_norm": 0.3460790812969208,
"learning_rate": 0.0005888179434896592,
"loss": 4.1132,
"step": 3300
},
{
"epoch": 0.975481917185953,
"grad_norm": 0.36886388063430786,
"learning_rate": 0.000588643169239732,
"loss": 4.0977,
"step": 3350
},
{
"epoch": 0.9900413487857435,
"grad_norm": 0.36020082235336304,
"learning_rate": 0.0005884683949898048,
"loss": 4.0966,
"step": 3400
},
{
"epoch": 1.0043678294799372,
"grad_norm": 0.33763444423675537,
"learning_rate": 0.0005882936207398776,
"loss": 4.0577,
"step": 3450
},
{
"epoch": 1.0189272610797275,
"grad_norm": 0.34525808691978455,
"learning_rate": 0.0005881188464899504,
"loss": 4.0248,
"step": 3500
},
{
"epoch": 1.0334866926795179,
"grad_norm": 0.37068355083465576,
"learning_rate": 0.0005879440722400233,
"loss": 4.0183,
"step": 3550
},
{
"epoch": 1.0480461242793082,
"grad_norm": 0.34973421692848206,
"learning_rate": 0.0005877692979900961,
"loss": 4.0291,
"step": 3600
},
{
"epoch": 1.0626055558790986,
"grad_norm": 0.3637358248233795,
"learning_rate": 0.000587594523740169,
"loss": 4.0199,
"step": 3650
},
{
"epoch": 1.077164987478889,
"grad_norm": 0.34920114278793335,
"learning_rate": 0.0005874197494902417,
"loss": 4.0247,
"step": 3700
},
{
"epoch": 1.0917244190786792,
"grad_norm": 0.3420464098453522,
"learning_rate": 0.0005872449752403145,
"loss": 4.0189,
"step": 3750
},
{
"epoch": 1.1062838506784696,
"grad_norm": 0.34696176648139954,
"learning_rate": 0.0005870702009903874,
"loss": 4.0089,
"step": 3800
},
{
"epoch": 1.12084328227826,
"grad_norm": 0.3416752815246582,
"learning_rate": 0.0005868954267404602,
"loss": 3.9978,
"step": 3850
},
{
"epoch": 1.1354027138780503,
"grad_norm": 0.3729047179222107,
"learning_rate": 0.0005867206524905331,
"loss": 3.9976,
"step": 3900
},
{
"epoch": 1.1499621454778406,
"grad_norm": 0.34707263112068176,
"learning_rate": 0.0005865458782406058,
"loss": 3.9927,
"step": 3950
},
{
"epoch": 1.164521577077631,
"grad_norm": 0.3424519896507263,
"learning_rate": 0.0005863711039906786,
"loss": 3.9798,
"step": 4000
},
{
"epoch": 1.164521577077631,
"eval_accuracy": 0.32528629009357674,
"eval_loss": 3.9908077716827393,
"eval_runtime": 180.5563,
"eval_samples_per_second": 92.176,
"eval_steps_per_second": 5.766,
"step": 4000
},
{
"epoch": 1.1790810086774213,
"grad_norm": 0.3473677635192871,
"learning_rate": 0.0005861963297407515,
"loss": 3.9837,
"step": 4050
},
{
"epoch": 1.1936404402772116,
"grad_norm": 0.3695130944252014,
"learning_rate": 0.0005860215554908243,
"loss": 3.9857,
"step": 4100
},
{
"epoch": 1.208199871877002,
"grad_norm": 0.3494517207145691,
"learning_rate": 0.0005858467812408972,
"loss": 3.9749,
"step": 4150
},
{
"epoch": 1.2227593034767923,
"grad_norm": 0.3514440655708313,
"learning_rate": 0.00058567200699097,
"loss": 3.9773,
"step": 4200
},
{
"epoch": 1.2373187350765826,
"grad_norm": 0.33939051628112793,
"learning_rate": 0.0005854972327410427,
"loss": 3.9868,
"step": 4250
},
{
"epoch": 1.251878166676373,
"grad_norm": 0.39269140362739563,
"learning_rate": 0.0005853224584911156,
"loss": 3.9676,
"step": 4300
},
{
"epoch": 1.2664375982761633,
"grad_norm": 0.3487934470176697,
"learning_rate": 0.0005851476842411884,
"loss": 3.973,
"step": 4350
},
{
"epoch": 1.2809970298759537,
"grad_norm": 0.33803650736808777,
"learning_rate": 0.0005849729099912613,
"loss": 3.9805,
"step": 4400
},
{
"epoch": 1.295556461475744,
"grad_norm": 0.34375283122062683,
"learning_rate": 0.0005847981357413341,
"loss": 3.9729,
"step": 4450
},
{
"epoch": 1.3101158930755343,
"grad_norm": 0.3429529070854187,
"learning_rate": 0.0005846233614914068,
"loss": 3.9492,
"step": 4500
},
{
"epoch": 1.3246753246753247,
"grad_norm": 0.3482668399810791,
"learning_rate": 0.0005844485872414797,
"loss": 3.9654,
"step": 4550
},
{
"epoch": 1.339234756275115,
"grad_norm": 0.3361050486564636,
"learning_rate": 0.0005842738129915525,
"loss": 3.9693,
"step": 4600
},
{
"epoch": 1.3537941878749054,
"grad_norm": 0.34350207448005676,
"learning_rate": 0.0005840990387416253,
"loss": 3.9628,
"step": 4650
},
{
"epoch": 1.3683536194746957,
"grad_norm": 0.35732749104499817,
"learning_rate": 0.0005839242644916982,
"loss": 3.9383,
"step": 4700
},
{
"epoch": 1.382913051074486,
"grad_norm": 0.32812654972076416,
"learning_rate": 0.000583749490241771,
"loss": 3.9402,
"step": 4750
},
{
"epoch": 1.3974724826742764,
"grad_norm": 0.3359614312648773,
"learning_rate": 0.0005835747159918438,
"loss": 3.9409,
"step": 4800
},
{
"epoch": 1.4120319142740667,
"grad_norm": 0.36291930079460144,
"learning_rate": 0.0005833999417419166,
"loss": 3.9373,
"step": 4850
},
{
"epoch": 1.426591345873857,
"grad_norm": 0.3357282876968384,
"learning_rate": 0.0005832251674919894,
"loss": 3.9373,
"step": 4900
},
{
"epoch": 1.4411507774736474,
"grad_norm": 0.3662075996398926,
"learning_rate": 0.0005830503932420623,
"loss": 3.9326,
"step": 4950
},
{
"epoch": 1.4557102090734377,
"grad_norm": 0.3387506604194641,
"learning_rate": 0.0005828756189921351,
"loss": 3.9189,
"step": 5000
},
{
"epoch": 1.4557102090734377,
"eval_accuracy": 0.3320894535210645,
"eval_loss": 3.91398549079895,
"eval_runtime": 185.101,
"eval_samples_per_second": 89.913,
"eval_steps_per_second": 5.624,
"step": 5000
},
{
"epoch": 1.470269640673228,
"grad_norm": 0.32989710569381714,
"learning_rate": 0.000582700844742208,
"loss": 3.9282,
"step": 5050
},
{
"epoch": 1.4848290722730184,
"grad_norm": 0.3328815996646881,
"learning_rate": 0.0005825260704922807,
"loss": 3.9183,
"step": 5100
},
{
"epoch": 1.4993885038728088,
"grad_norm": 0.33961018919944763,
"learning_rate": 0.0005823512962423535,
"loss": 3.9253,
"step": 5150
},
{
"epoch": 1.5139479354725993,
"grad_norm": 0.33562958240509033,
"learning_rate": 0.0005821765219924264,
"loss": 3.9222,
"step": 5200
},
{
"epoch": 1.5285073670723897,
"grad_norm": 0.3406899571418762,
"learning_rate": 0.0005820017477424992,
"loss": 3.9185,
"step": 5250
},
{
"epoch": 1.54306679867218,
"grad_norm": 0.3406858742237091,
"learning_rate": 0.0005818269734925721,
"loss": 3.9156,
"step": 5300
},
{
"epoch": 1.5576262302719703,
"grad_norm": 0.34090015292167664,
"learning_rate": 0.0005816521992426448,
"loss": 3.8969,
"step": 5350
},
{
"epoch": 1.5721856618717607,
"grad_norm": 0.31158268451690674,
"learning_rate": 0.0005814774249927176,
"loss": 3.9143,
"step": 5400
},
{
"epoch": 1.586745093471551,
"grad_norm": 0.34926122426986694,
"learning_rate": 0.0005813026507427905,
"loss": 3.9132,
"step": 5450
},
{
"epoch": 1.6013045250713414,
"grad_norm": 0.34333717823028564,
"learning_rate": 0.0005811278764928634,
"loss": 3.9041,
"step": 5500
},
{
"epoch": 1.6158639566711317,
"grad_norm": 0.3164921998977661,
"learning_rate": 0.0005809531022429362,
"loss": 3.908,
"step": 5550
},
{
"epoch": 1.630423388270922,
"grad_norm": 0.3325600028038025,
"learning_rate": 0.0005807783279930091,
"loss": 3.8937,
"step": 5600
},
{
"epoch": 1.6449828198707124,
"grad_norm": 0.3716844916343689,
"learning_rate": 0.0005806035537430818,
"loss": 3.913,
"step": 5650
},
{
"epoch": 1.6595422514705027,
"grad_norm": 0.3302454352378845,
"learning_rate": 0.0005804287794931546,
"loss": 3.8894,
"step": 5700
},
{
"epoch": 1.674101683070293,
"grad_norm": 0.3286576271057129,
"learning_rate": 0.0005802540052432275,
"loss": 3.9061,
"step": 5750
},
{
"epoch": 1.6886611146700834,
"grad_norm": 0.31899774074554443,
"learning_rate": 0.0005800792309933003,
"loss": 3.885,
"step": 5800
},
{
"epoch": 1.7032205462698737,
"grad_norm": 0.38346347212791443,
"learning_rate": 0.0005799044567433732,
"loss": 3.8978,
"step": 5850
},
{
"epoch": 1.717779977869664,
"grad_norm": 0.32501021027565,
"learning_rate": 0.000579729682493446,
"loss": 3.8928,
"step": 5900
},
{
"epoch": 1.7323394094694544,
"grad_norm": 0.33264926075935364,
"learning_rate": 0.0005795549082435187,
"loss": 3.8917,
"step": 5950
},
{
"epoch": 1.7468988410692448,
"grad_norm": 0.35515546798706055,
"learning_rate": 0.0005793801339935916,
"loss": 3.8836,
"step": 6000
},
{
"epoch": 1.7468988410692448,
"eval_accuracy": 0.33675024013551297,
"eval_loss": 3.8585171699523926,
"eval_runtime": 185.0399,
"eval_samples_per_second": 89.943,
"eval_steps_per_second": 5.626,
"step": 6000
},
{
"epoch": 1.761458272669035,
"grad_norm": 0.3250105679035187,
"learning_rate": 0.0005792053597436644,
"loss": 3.8774,
"step": 6050
},
{
"epoch": 1.7760177042688254,
"grad_norm": 0.333280473947525,
"learning_rate": 0.0005790305854937372,
"loss": 3.8726,
"step": 6100
},
{
"epoch": 1.7905771358686158,
"grad_norm": 0.32873275876045227,
"learning_rate": 0.0005788558112438101,
"loss": 3.8701,
"step": 6150
},
{
"epoch": 1.8051365674684061,
"grad_norm": 0.3332742154598236,
"learning_rate": 0.0005786810369938828,
"loss": 3.8699,
"step": 6200
},
{
"epoch": 1.8196959990681965,
"grad_norm": 0.3222472369670868,
"learning_rate": 0.0005785062627439557,
"loss": 3.874,
"step": 6250
},
{
"epoch": 1.8342554306679868,
"grad_norm": 0.3324868381023407,
"learning_rate": 0.0005783314884940285,
"loss": 3.869,
"step": 6300
},
{
"epoch": 1.8488148622677771,
"grad_norm": 0.32730036973953247,
"learning_rate": 0.0005781567142441013,
"loss": 3.8536,
"step": 6350
},
{
"epoch": 1.8633742938675675,
"grad_norm": 0.3353622257709503,
"learning_rate": 0.0005779819399941742,
"loss": 3.869,
"step": 6400
},
{
"epoch": 1.8779337254673578,
"grad_norm": 0.33830076456069946,
"learning_rate": 0.000577807165744247,
"loss": 3.8726,
"step": 6450
},
{
"epoch": 1.8924931570671482,
"grad_norm": 0.31618306040763855,
"learning_rate": 0.0005776323914943198,
"loss": 3.8508,
"step": 6500
},
{
"epoch": 1.9070525886669385,
"grad_norm": 0.33165860176086426,
"learning_rate": 0.0005774576172443926,
"loss": 3.8566,
"step": 6550
},
{
"epoch": 1.9216120202667288,
"grad_norm": 0.3387751579284668,
"learning_rate": 0.0005772828429944654,
"loss": 3.8548,
"step": 6600
},
{
"epoch": 1.9361714518665192,
"grad_norm": 0.3364385664463043,
"learning_rate": 0.0005771080687445383,
"loss": 3.8539,
"step": 6650
},
{
"epoch": 1.9507308834663095,
"grad_norm": 0.34390878677368164,
"learning_rate": 0.0005769332944946111,
"loss": 3.8631,
"step": 6700
},
{
"epoch": 1.9652903150660999,
"grad_norm": 0.3324083685874939,
"learning_rate": 0.0005767585202446839,
"loss": 3.8482,
"step": 6750
},
{
"epoch": 1.9798497466658902,
"grad_norm": 0.32365697622299194,
"learning_rate": 0.0005765837459947567,
"loss": 3.8303,
"step": 6800
},
{
"epoch": 1.9944091782656805,
"grad_norm": 0.3342290222644806,
"learning_rate": 0.0005764089717448295,
"loss": 3.8508,
"step": 6850
},
{
"epoch": 2.0087356589598744,
"grad_norm": 0.3290010392665863,
"learning_rate": 0.0005762341974949024,
"loss": 3.7915,
"step": 6900
},
{
"epoch": 2.0232950905596647,
"grad_norm": 0.3240971565246582,
"learning_rate": 0.0005760594232449752,
"loss": 3.7587,
"step": 6950
},
{
"epoch": 2.037854522159455,
"grad_norm": 0.3391764163970947,
"learning_rate": 0.0005758846489950481,
"loss": 3.7526,
"step": 7000
},
{
"epoch": 2.037854522159455,
"eval_accuracy": 0.34109519666654636,
"eval_loss": 3.816195249557495,
"eval_runtime": 184.953,
"eval_samples_per_second": 89.985,
"eval_steps_per_second": 5.628,
"step": 7000
},
{
"epoch": 2.0524139537592454,
"grad_norm": 0.33266958594322205,
"learning_rate": 0.0005757098747451208,
"loss": 3.7541,
"step": 7050
},
{
"epoch": 2.0669733853590357,
"grad_norm": 0.34850549697875977,
"learning_rate": 0.0005755351004951936,
"loss": 3.7518,
"step": 7100
},
{
"epoch": 2.081532816958826,
"grad_norm": 0.3229345679283142,
"learning_rate": 0.0005753603262452665,
"loss": 3.7485,
"step": 7150
},
{
"epoch": 2.0960922485586164,
"grad_norm": 0.31956946849823,
"learning_rate": 0.0005751855519953393,
"loss": 3.7446,
"step": 7200
},
{
"epoch": 2.1106516801584068,
"grad_norm": 0.3483135402202606,
"learning_rate": 0.0005750107777454121,
"loss": 3.76,
"step": 7250
},
{
"epoch": 2.125211111758197,
"grad_norm": 0.3251873850822449,
"learning_rate": 0.0005748360034954849,
"loss": 3.7494,
"step": 7300
},
{
"epoch": 2.1397705433579874,
"grad_norm": 0.3456031382083893,
"learning_rate": 0.0005746612292455577,
"loss": 3.7564,
"step": 7350
},
{
"epoch": 2.154329974957778,
"grad_norm": 0.3253571093082428,
"learning_rate": 0.0005744864549956306,
"loss": 3.7517,
"step": 7400
},
{
"epoch": 2.168889406557568,
"grad_norm": 0.322238564491272,
"learning_rate": 0.0005743116807457034,
"loss": 3.7582,
"step": 7450
},
{
"epoch": 2.1834488381573585,
"grad_norm": 0.33640897274017334,
"learning_rate": 0.0005741369064957762,
"loss": 3.7567,
"step": 7500
},
{
"epoch": 2.198008269757149,
"grad_norm": 0.3346073627471924,
"learning_rate": 0.0005739621322458491,
"loss": 3.7461,
"step": 7550
},
{
"epoch": 2.212567701356939,
"grad_norm": 0.3327328860759735,
"learning_rate": 0.0005737873579959218,
"loss": 3.7594,
"step": 7600
},
{
"epoch": 2.2271271329567295,
"grad_norm": 0.3236997723579407,
"learning_rate": 0.0005736125837459947,
"loss": 3.7726,
"step": 7650
},
{
"epoch": 2.24168656455652,
"grad_norm": 0.33130574226379395,
"learning_rate": 0.0005734378094960675,
"loss": 3.7486,
"step": 7700
},
{
"epoch": 2.25624599615631,
"grad_norm": 0.34919485449790955,
"learning_rate": 0.0005732630352461403,
"loss": 3.7578,
"step": 7750
},
{
"epoch": 2.2708054277561005,
"grad_norm": 0.3182968497276306,
"learning_rate": 0.0005730882609962132,
"loss": 3.7604,
"step": 7800
},
{
"epoch": 2.285364859355891,
"grad_norm": 0.30436646938323975,
"learning_rate": 0.0005729134867462859,
"loss": 3.7412,
"step": 7850
},
{
"epoch": 2.299924290955681,
"grad_norm": 0.3302886188030243,
"learning_rate": 0.0005727387124963588,
"loss": 3.7515,
"step": 7900
},
{
"epoch": 2.3144837225554715,
"grad_norm": 0.30620837211608887,
"learning_rate": 0.0005725639382464317,
"loss": 3.7695,
"step": 7950
},
{
"epoch": 2.329043154155262,
"grad_norm": 0.3169257640838623,
"learning_rate": 0.0005723891639965045,
"loss": 3.7682,
"step": 8000
},
{
"epoch": 2.329043154155262,
"eval_accuracy": 0.34396188967982283,
"eval_loss": 3.788954496383667,
"eval_runtime": 182.8165,
"eval_samples_per_second": 91.037,
"eval_steps_per_second": 5.694,
"step": 8000
},
{
"epoch": 2.343602585755052,
"grad_norm": 0.3280718922615051,
"learning_rate": 0.0005722143897465773,
"loss": 3.7452,
"step": 8050
},
{
"epoch": 2.3581620173548425,
"grad_norm": 0.3237084746360779,
"learning_rate": 0.0005720396154966502,
"loss": 3.762,
"step": 8100
},
{
"epoch": 2.372721448954633,
"grad_norm": 0.31791386008262634,
"learning_rate": 0.0005718648412467229,
"loss": 3.7504,
"step": 8150
},
{
"epoch": 2.3872808805544232,
"grad_norm": 0.32723358273506165,
"learning_rate": 0.0005716900669967958,
"loss": 3.7561,
"step": 8200
},
{
"epoch": 2.4018403121542136,
"grad_norm": 0.3216814398765564,
"learning_rate": 0.0005715152927468686,
"loss": 3.7496,
"step": 8250
},
{
"epoch": 2.416399743754004,
"grad_norm": 0.32928794622421265,
"learning_rate": 0.0005713405184969414,
"loss": 3.7533,
"step": 8300
},
{
"epoch": 2.4309591753537942,
"grad_norm": 0.3223062753677368,
"learning_rate": 0.0005711657442470143,
"loss": 3.766,
"step": 8350
},
{
"epoch": 2.4455186069535846,
"grad_norm": 0.3292803168296814,
"learning_rate": 0.000570990969997087,
"loss": 3.7502,
"step": 8400
},
{
"epoch": 2.460078038553375,
"grad_norm": 0.3402736783027649,
"learning_rate": 0.0005708161957471599,
"loss": 3.744,
"step": 8450
},
{
"epoch": 2.4746374701531653,
"grad_norm": 0.3164720833301544,
"learning_rate": 0.0005706414214972327,
"loss": 3.7426,
"step": 8500
},
{
"epoch": 2.4891969017529556,
"grad_norm": 0.33465683460235596,
"learning_rate": 0.0005704666472473055,
"loss": 3.756,
"step": 8550
},
{
"epoch": 2.503756333352746,
"grad_norm": 0.3301171362400055,
"learning_rate": 0.0005702918729973784,
"loss": 3.7448,
"step": 8600
},
{
"epoch": 2.5183157649525363,
"grad_norm": 0.3436541259288788,
"learning_rate": 0.0005701170987474512,
"loss": 3.7449,
"step": 8650
},
{
"epoch": 2.5328751965523266,
"grad_norm": 0.3333314061164856,
"learning_rate": 0.0005699423244975239,
"loss": 3.7381,
"step": 8700
},
{
"epoch": 2.547434628152117,
"grad_norm": 0.3258245885372162,
"learning_rate": 0.0005697675502475968,
"loss": 3.7338,
"step": 8750
},
{
"epoch": 2.5619940597519073,
"grad_norm": 0.34784647822380066,
"learning_rate": 0.0005695927759976696,
"loss": 3.734,
"step": 8800
},
{
"epoch": 2.5765534913516976,
"grad_norm": 0.31109482049942017,
"learning_rate": 0.0005694180017477425,
"loss": 3.7372,
"step": 8850
},
{
"epoch": 2.591112922951488,
"grad_norm": 0.31201112270355225,
"learning_rate": 0.0005692432274978153,
"loss": 3.7499,
"step": 8900
},
{
"epoch": 2.6056723545512783,
"grad_norm": 0.31193050742149353,
"learning_rate": 0.000569068453247888,
"loss": 3.7385,
"step": 8950
},
{
"epoch": 2.6202317861510687,
"grad_norm": 0.3446432948112488,
"learning_rate": 0.0005688936789979609,
"loss": 3.7477,
"step": 9000
},
{
"epoch": 2.6202317861510687,
"eval_accuracy": 0.3468738524556142,
"eval_loss": 3.757246255874634,
"eval_runtime": 182.4423,
"eval_samples_per_second": 91.223,
"eval_steps_per_second": 5.706,
"step": 9000
},
{
"epoch": 2.634791217750859,
"grad_norm": 0.31883829832077026,
"learning_rate": 0.0005687189047480337,
"loss": 3.7364,
"step": 9050
},
{
"epoch": 2.6493506493506493,
"grad_norm": 0.3273116946220398,
"learning_rate": 0.0005685441304981066,
"loss": 3.7312,
"step": 9100
},
{
"epoch": 2.6639100809504397,
"grad_norm": 0.3443247973918915,
"learning_rate": 0.0005683693562481794,
"loss": 3.7366,
"step": 9150
},
{
"epoch": 2.67846951255023,
"grad_norm": 0.30951568484306335,
"learning_rate": 0.0005681945819982522,
"loss": 3.7425,
"step": 9200
},
{
"epoch": 2.6930289441500204,
"grad_norm": 0.3140866756439209,
"learning_rate": 0.000568019807748325,
"loss": 3.7396,
"step": 9250
},
{
"epoch": 2.7075883757498107,
"grad_norm": 0.32707467675209045,
"learning_rate": 0.0005678450334983978,
"loss": 3.7348,
"step": 9300
},
{
"epoch": 2.722147807349601,
"grad_norm": 0.32110151648521423,
"learning_rate": 0.0005676702592484707,
"loss": 3.7223,
"step": 9350
},
{
"epoch": 2.7367072389493914,
"grad_norm": 0.3235968053340912,
"learning_rate": 0.0005674954849985435,
"loss": 3.7379,
"step": 9400
},
{
"epoch": 2.7512666705491817,
"grad_norm": 0.34924793243408203,
"learning_rate": 0.0005673207107486163,
"loss": 3.7503,
"step": 9450
},
{
"epoch": 2.765826102148972,
"grad_norm": 0.32524895668029785,
"learning_rate": 0.0005671459364986892,
"loss": 3.7302,
"step": 9500
},
{
"epoch": 2.7803855337487624,
"grad_norm": 0.3183753490447998,
"learning_rate": 0.0005669711622487619,
"loss": 3.7229,
"step": 9550
},
{
"epoch": 2.7949449653485527,
"grad_norm": 0.31938815116882324,
"learning_rate": 0.0005667963879988348,
"loss": 3.7208,
"step": 9600
},
{
"epoch": 2.809504396948343,
"grad_norm": 0.3149973154067993,
"learning_rate": 0.0005666216137489076,
"loss": 3.7312,
"step": 9650
},
{
"epoch": 2.8240638285481334,
"grad_norm": 0.32664161920547485,
"learning_rate": 0.0005664468394989804,
"loss": 3.7436,
"step": 9700
},
{
"epoch": 2.8386232601479238,
"grad_norm": 0.31149327754974365,
"learning_rate": 0.0005662720652490533,
"loss": 3.728,
"step": 9750
},
{
"epoch": 2.853182691747714,
"grad_norm": 0.3289666175842285,
"learning_rate": 0.000566097290999126,
"loss": 3.7286,
"step": 9800
},
{
"epoch": 2.8677421233475044,
"grad_norm": 0.3204244077205658,
"learning_rate": 0.0005659225167491988,
"loss": 3.7122,
"step": 9850
},
{
"epoch": 2.882301554947295,
"grad_norm": 0.33363139629364014,
"learning_rate": 0.0005657477424992717,
"loss": 3.7409,
"step": 9900
},
{
"epoch": 2.896860986547085,
"grad_norm": 0.3554539084434509,
"learning_rate": 0.0005655729682493445,
"loss": 3.7301,
"step": 9950
},
{
"epoch": 2.9114204181468755,
"grad_norm": 0.306832879781723,
"learning_rate": 0.0005653981939994174,
"loss": 3.73,
"step": 10000
},
{
"epoch": 2.9114204181468755,
"eval_accuracy": 0.3494360034301546,
"eval_loss": 3.729952573776245,
"eval_runtime": 181.5285,
"eval_samples_per_second": 91.683,
"eval_steps_per_second": 5.735,
"step": 10000
},
{
"epoch": 2.925979849746666,
"grad_norm": 0.31433573365211487,
"learning_rate": 0.0005652234197494902,
"loss": 3.7247,
"step": 10050
},
{
"epoch": 2.940539281346456,
"grad_norm": 0.3179089426994324,
"learning_rate": 0.0005650486454995629,
"loss": 3.7153,
"step": 10100
},
{
"epoch": 2.9550987129462465,
"grad_norm": 0.3196451961994171,
"learning_rate": 0.0005648738712496358,
"loss": 3.7189,
"step": 10150
},
{
"epoch": 2.969658144546037,
"grad_norm": 0.30295759439468384,
"learning_rate": 0.0005646990969997086,
"loss": 3.7165,
"step": 10200
},
{
"epoch": 2.984217576145827,
"grad_norm": 0.32530921697616577,
"learning_rate": 0.0005645243227497815,
"loss": 3.715,
"step": 10250
},
{
"epoch": 2.9987770077456175,
"grad_norm": 0.30198994278907776,
"learning_rate": 0.0005643495484998543,
"loss": 3.7192,
"step": 10300
},
{
"epoch": 3.0131034884398114,
"grad_norm": 0.31793293356895447,
"learning_rate": 0.000564174774249927,
"loss": 3.6316,
"step": 10350
},
{
"epoch": 3.0276629200396017,
"grad_norm": 0.3131251633167267,
"learning_rate": 0.0005639999999999999,
"loss": 3.6161,
"step": 10400
},
{
"epoch": 3.042222351639392,
"grad_norm": 0.3221314251422882,
"learning_rate": 0.0005638252257500727,
"loss": 3.6239,
"step": 10450
},
{
"epoch": 3.0567817832391824,
"grad_norm": 0.3299553096294403,
"learning_rate": 0.0005636504515001456,
"loss": 3.6255,
"step": 10500
},
{
"epoch": 3.0713412148389727,
"grad_norm": 0.3239217698574066,
"learning_rate": 0.0005634756772502185,
"loss": 3.6207,
"step": 10550
},
{
"epoch": 3.085900646438763,
"grad_norm": 0.3120846152305603,
"learning_rate": 0.0005633009030002913,
"loss": 3.6305,
"step": 10600
},
{
"epoch": 3.1004600780385534,
"grad_norm": 0.324990838766098,
"learning_rate": 0.000563126128750364,
"loss": 3.6298,
"step": 10650
},
{
"epoch": 3.1150195096383437,
"grad_norm": 0.3125215172767639,
"learning_rate": 0.0005629513545004369,
"loss": 3.617,
"step": 10700
},
{
"epoch": 3.129578941238134,
"grad_norm": 0.3323279917240143,
"learning_rate": 0.0005627765802505097,
"loss": 3.6235,
"step": 10750
},
{
"epoch": 3.1441383728379244,
"grad_norm": 0.3290170133113861,
"learning_rate": 0.0005626018060005826,
"loss": 3.6227,
"step": 10800
},
{
"epoch": 3.1586978044377148,
"grad_norm": 0.3450184762477875,
"learning_rate": 0.0005624270317506554,
"loss": 3.64,
"step": 10850
},
{
"epoch": 3.173257236037505,
"grad_norm": 0.32774847745895386,
"learning_rate": 0.0005622522575007282,
"loss": 3.646,
"step": 10900
},
{
"epoch": 3.1878166676372954,
"grad_norm": 0.32285189628601074,
"learning_rate": 0.000562077483250801,
"loss": 3.643,
"step": 10950
},
{
"epoch": 3.2023760992370858,
"grad_norm": 0.3201664686203003,
"learning_rate": 0.0005619027090008738,
"loss": 3.6397,
"step": 11000
},
{
"epoch": 3.2023760992370858,
"eval_accuracy": 0.3513435653971105,
"eval_loss": 3.7152557373046875,
"eval_runtime": 180.6216,
"eval_samples_per_second": 92.143,
"eval_steps_per_second": 5.763,
"step": 11000
},
{
"epoch": 3.216935530836876,
"grad_norm": 0.32860246300697327,
"learning_rate": 0.0005617279347509467,
"loss": 3.6478,
"step": 11050
},
{
"epoch": 3.2314949624366665,
"grad_norm": 0.32338783144950867,
"learning_rate": 0.0005615531605010195,
"loss": 3.6419,
"step": 11100
},
{
"epoch": 3.246054394036457,
"grad_norm": 0.3216056823730469,
"learning_rate": 0.0005613783862510923,
"loss": 3.6497,
"step": 11150
},
{
"epoch": 3.260613825636247,
"grad_norm": 0.36512988805770874,
"learning_rate": 0.0005612036120011652,
"loss": 3.6238,
"step": 11200
},
{
"epoch": 3.2751732572360375,
"grad_norm": 0.33006951212882996,
"learning_rate": 0.0005610288377512379,
"loss": 3.65,
"step": 11250
},
{
"epoch": 3.289732688835828,
"grad_norm": 0.32506290078163147,
"learning_rate": 0.0005608540635013107,
"loss": 3.6369,
"step": 11300
},
{
"epoch": 3.304292120435618,
"grad_norm": 0.3291010856628418,
"learning_rate": 0.0005606792892513836,
"loss": 3.644,
"step": 11350
},
{
"epoch": 3.3188515520354085,
"grad_norm": 0.3134164810180664,
"learning_rate": 0.0005605045150014564,
"loss": 3.6428,
"step": 11400
},
{
"epoch": 3.333410983635199,
"grad_norm": 0.3079008162021637,
"learning_rate": 0.0005603297407515293,
"loss": 3.638,
"step": 11450
},
{
"epoch": 3.347970415234989,
"grad_norm": 0.2959432899951935,
"learning_rate": 0.000560154966501602,
"loss": 3.6469,
"step": 11500
},
{
"epoch": 3.3625298468347795,
"grad_norm": 0.3210470676422119,
"learning_rate": 0.0005599801922516748,
"loss": 3.6441,
"step": 11550
},
{
"epoch": 3.37708927843457,
"grad_norm": 0.3303925395011902,
"learning_rate": 0.0005598054180017477,
"loss": 3.6448,
"step": 11600
},
{
"epoch": 3.39164871003436,
"grad_norm": 0.3426654040813446,
"learning_rate": 0.0005596306437518205,
"loss": 3.638,
"step": 11650
},
{
"epoch": 3.4062081416341505,
"grad_norm": 0.35107845067977905,
"learning_rate": 0.0005594558695018934,
"loss": 3.6483,
"step": 11700
},
{
"epoch": 3.420767573233941,
"grad_norm": 0.3188258111476898,
"learning_rate": 0.0005592810952519662,
"loss": 3.6422,
"step": 11750
},
{
"epoch": 3.435327004833731,
"grad_norm": 0.33043134212493896,
"learning_rate": 0.0005591063210020389,
"loss": 3.6448,
"step": 11800
},
{
"epoch": 3.4498864364335216,
"grad_norm": 0.31511127948760986,
"learning_rate": 0.0005589315467521118,
"loss": 3.648,
"step": 11850
},
{
"epoch": 3.464445868033312,
"grad_norm": 0.3306327164173126,
"learning_rate": 0.0005587567725021846,
"loss": 3.6258,
"step": 11900
},
{
"epoch": 3.4790052996331022,
"grad_norm": 0.3343588411808014,
"learning_rate": 0.0005585819982522575,
"loss": 3.646,
"step": 11950
},
{
"epoch": 3.4935647312328926,
"grad_norm": 0.3293665945529938,
"learning_rate": 0.0005584072240023303,
"loss": 3.6405,
"step": 12000
},
{
"epoch": 3.4935647312328926,
"eval_accuracy": 0.35307957260170497,
"eval_loss": 3.6972014904022217,
"eval_runtime": 181.5639,
"eval_samples_per_second": 91.665,
"eval_steps_per_second": 5.734,
"step": 12000
},
{
"epoch": 3.508124162832683,
"grad_norm": 0.3309422433376312,
"learning_rate": 0.000558232449752403,
"loss": 3.6445,
"step": 12050
},
{
"epoch": 3.5226835944324733,
"grad_norm": 0.3296276032924652,
"learning_rate": 0.0005580576755024759,
"loss": 3.6433,
"step": 12100
},
{
"epoch": 3.5372430260322636,
"grad_norm": 0.3203052580356598,
"learning_rate": 0.0005578829012525487,
"loss": 3.6408,
"step": 12150
},
{
"epoch": 3.551802457632054,
"grad_norm": 0.31741246581077576,
"learning_rate": 0.0005577081270026216,
"loss": 3.6379,
"step": 12200
},
{
"epoch": 3.5663618892318443,
"grad_norm": 0.32449865341186523,
"learning_rate": 0.0005575333527526944,
"loss": 3.6515,
"step": 12250
},
{
"epoch": 3.5809213208316346,
"grad_norm": 0.3303356170654297,
"learning_rate": 0.0005573585785027672,
"loss": 3.6346,
"step": 12300
},
{
"epoch": 3.595480752431425,
"grad_norm": 0.3001437783241272,
"learning_rate": 0.00055718380425284,
"loss": 3.6476,
"step": 12350
},
{
"epoch": 3.6100401840312153,
"grad_norm": 0.3065738379955292,
"learning_rate": 0.0005570090300029128,
"loss": 3.6495,
"step": 12400
},
{
"epoch": 3.6245996156310056,
"grad_norm": 0.3155801594257355,
"learning_rate": 0.0005568342557529856,
"loss": 3.6398,
"step": 12450
},
{
"epoch": 3.639159047230796,
"grad_norm": 0.3072325587272644,
"learning_rate": 0.0005566594815030585,
"loss": 3.6446,
"step": 12500
},
{
"epoch": 3.6537184788305863,
"grad_norm": 0.331887811422348,
"learning_rate": 0.0005564847072531313,
"loss": 3.6402,
"step": 12550
},
{
"epoch": 3.6682779104303767,
"grad_norm": 0.30090418457984924,
"learning_rate": 0.0005563099330032042,
"loss": 3.6303,
"step": 12600
},
{
"epoch": 3.682837342030167,
"grad_norm": 0.3239140808582306,
"learning_rate": 0.0005561351587532769,
"loss": 3.6552,
"step": 12650
},
{
"epoch": 3.6973967736299573,
"grad_norm": 0.320881724357605,
"learning_rate": 0.0005559603845033497,
"loss": 3.6356,
"step": 12700
},
{
"epoch": 3.7119562052297477,
"grad_norm": 0.3165138363838196,
"learning_rate": 0.0005557856102534226,
"loss": 3.6434,
"step": 12750
},
{
"epoch": 3.726515636829538,
"grad_norm": 0.3095230162143707,
"learning_rate": 0.0005556108360034954,
"loss": 3.6385,
"step": 12800
},
{
"epoch": 3.7410750684293284,
"grad_norm": 0.34694117307662964,
"learning_rate": 0.0005554360617535683,
"loss": 3.6463,
"step": 12850
},
{
"epoch": 3.755634500029119,
"grad_norm": 0.32559525966644287,
"learning_rate": 0.000555261287503641,
"loss": 3.6323,
"step": 12900
},
{
"epoch": 3.770193931628909,
"grad_norm": 0.3220575451850891,
"learning_rate": 0.0005550865132537138,
"loss": 3.6369,
"step": 12950
},
{
"epoch": 3.7847533632287,
"grad_norm": 0.31526488065719604,
"learning_rate": 0.0005549117390037867,
"loss": 3.6412,
"step": 13000
},
{
"epoch": 3.7847533632287,
"eval_accuracy": 0.35453631828429244,
"eval_loss": 3.682695150375366,
"eval_runtime": 183.1807,
"eval_samples_per_second": 90.856,
"eval_steps_per_second": 5.683,
"step": 13000
},
{
"epoch": 3.7993127948284897,
"grad_norm": 0.3306889832019806,
"learning_rate": 0.0005547369647538596,
"loss": 3.6618,
"step": 13050
},
{
"epoch": 3.8138722264282805,
"grad_norm": 0.33385586738586426,
"learning_rate": 0.0005545621905039324,
"loss": 3.6427,
"step": 13100
},
{
"epoch": 3.8284316580280704,
"grad_norm": 0.30829793214797974,
"learning_rate": 0.0005543874162540053,
"loss": 3.6345,
"step": 13150
},
{
"epoch": 3.842991089627861,
"grad_norm": 0.3245658576488495,
"learning_rate": 0.000554212642004078,
"loss": 3.6519,
"step": 13200
},
{
"epoch": 3.857550521227651,
"grad_norm": 0.29873931407928467,
"learning_rate": 0.0005540378677541508,
"loss": 3.639,
"step": 13250
},
{
"epoch": 3.872109952827442,
"grad_norm": 0.3140360414981842,
"learning_rate": 0.0005538630935042237,
"loss": 3.644,
"step": 13300
},
{
"epoch": 3.8866693844272318,
"grad_norm": 0.31487107276916504,
"learning_rate": 0.0005536883192542965,
"loss": 3.6451,
"step": 13350
},
{
"epoch": 3.9012288160270225,
"grad_norm": 0.31665652990341187,
"learning_rate": 0.0005535135450043694,
"loss": 3.63,
"step": 13400
},
{
"epoch": 3.9157882476268124,
"grad_norm": 0.3285450339317322,
"learning_rate": 0.0005533387707544422,
"loss": 3.6402,
"step": 13450
},
{
"epoch": 3.930347679226603,
"grad_norm": 0.3168368935585022,
"learning_rate": 0.0005531639965045149,
"loss": 3.6433,
"step": 13500
},
{
"epoch": 3.944907110826393,
"grad_norm": 0.3096484839916229,
"learning_rate": 0.0005529892222545878,
"loss": 3.6292,
"step": 13550
},
{
"epoch": 3.959466542426184,
"grad_norm": 0.31400060653686523,
"learning_rate": 0.0005528144480046606,
"loss": 3.6337,
"step": 13600
},
{
"epoch": 3.974025974025974,
"grad_norm": 0.32995402812957764,
"learning_rate": 0.0005526396737547335,
"loss": 3.644,
"step": 13650
},
{
"epoch": 3.9885854056257646,
"grad_norm": 0.30545228719711304,
"learning_rate": 0.0005524648995048063,
"loss": 3.6337,
"step": 13700
},
{
"epoch": 4.002911886319958,
"grad_norm": 0.3340036869049072,
"learning_rate": 0.000552290125254879,
"loss": 3.6049,
"step": 13750
},
{
"epoch": 4.017471317919749,
"grad_norm": 0.3237653076648712,
"learning_rate": 0.0005521153510049519,
"loss": 3.5263,
"step": 13800
},
{
"epoch": 4.032030749519539,
"grad_norm": 0.33258405327796936,
"learning_rate": 0.0005519405767550247,
"loss": 3.5231,
"step": 13850
},
{
"epoch": 4.046590181119329,
"grad_norm": 0.33560073375701904,
"learning_rate": 0.0005517658025050975,
"loss": 3.5422,
"step": 13900
},
{
"epoch": 4.061149612719119,
"grad_norm": 0.32539400458335876,
"learning_rate": 0.0005515910282551704,
"loss": 3.5393,
"step": 13950
},
{
"epoch": 4.07570904431891,
"grad_norm": 0.3466116786003113,
"learning_rate": 0.0005514162540052432,
"loss": 3.5371,
"step": 14000
},
{
"epoch": 4.07570904431891,
"eval_accuracy": 0.35599906074061566,
"eval_loss": 3.6699209213256836,
"eval_runtime": 180.5976,
"eval_samples_per_second": 92.155,
"eval_steps_per_second": 5.764,
"step": 14000
},
{
"epoch": 4.0902684759187,
"grad_norm": 0.35234954953193665,
"learning_rate": 0.000551241479755316,
"loss": 3.5405,
"step": 14050
},
{
"epoch": 4.104827907518491,
"grad_norm": 0.3241097629070282,
"learning_rate": 0.0005510667055053888,
"loss": 3.5312,
"step": 14100
},
{
"epoch": 4.119387339118281,
"grad_norm": 0.35480767488479614,
"learning_rate": 0.0005508919312554616,
"loss": 3.541,
"step": 14150
},
{
"epoch": 4.1339467707180715,
"grad_norm": 0.31226274371147156,
"learning_rate": 0.0005507171570055345,
"loss": 3.5525,
"step": 14200
},
{
"epoch": 4.148506202317861,
"grad_norm": 0.3221980631351471,
"learning_rate": 0.0005505423827556073,
"loss": 3.5545,
"step": 14250
},
{
"epoch": 4.163065633917652,
"grad_norm": 0.33322617411613464,
"learning_rate": 0.0005503676085056802,
"loss": 3.5607,
"step": 14300
},
{
"epoch": 4.177625065517442,
"grad_norm": 0.31406116485595703,
"learning_rate": 0.0005501928342557529,
"loss": 3.5597,
"step": 14350
},
{
"epoch": 4.192184497117233,
"grad_norm": 0.30982154607772827,
"learning_rate": 0.0005500180600058257,
"loss": 3.5544,
"step": 14400
},
{
"epoch": 4.206743928717023,
"grad_norm": 0.31833505630493164,
"learning_rate": 0.0005498432857558986,
"loss": 3.5589,
"step": 14450
},
{
"epoch": 4.2213033603168135,
"grad_norm": 0.31112346053123474,
"learning_rate": 0.0005496685115059714,
"loss": 3.5535,
"step": 14500
},
{
"epoch": 4.235862791916603,
"grad_norm": 0.3102998733520508,
"learning_rate": 0.0005494937372560443,
"loss": 3.5584,
"step": 14550
},
{
"epoch": 4.250422223516394,
"grad_norm": 0.3442176878452301,
"learning_rate": 0.000549318963006117,
"loss": 3.5691,
"step": 14600
},
{
"epoch": 4.264981655116184,
"grad_norm": 0.3217466175556183,
"learning_rate": 0.0005491441887561898,
"loss": 3.5748,
"step": 14650
},
{
"epoch": 4.279541086715975,
"grad_norm": 0.32345715165138245,
"learning_rate": 0.0005489694145062627,
"loss": 3.5711,
"step": 14700
},
{
"epoch": 4.294100518315765,
"grad_norm": 0.31309959292411804,
"learning_rate": 0.0005487946402563355,
"loss": 3.5544,
"step": 14750
},
{
"epoch": 4.308659949915556,
"grad_norm": 0.31507858633995056,
"learning_rate": 0.0005486198660064084,
"loss": 3.5806,
"step": 14800
},
{
"epoch": 4.3232193815153455,
"grad_norm": 0.3113386631011963,
"learning_rate": 0.0005484450917564812,
"loss": 3.5698,
"step": 14850
},
{
"epoch": 4.337778813115136,
"grad_norm": 0.30662500858306885,
"learning_rate": 0.0005482703175065539,
"loss": 3.5684,
"step": 14900
},
{
"epoch": 4.352338244714926,
"grad_norm": 0.33159640431404114,
"learning_rate": 0.0005480955432566268,
"loss": 3.5681,
"step": 14950
},
{
"epoch": 4.366897676314717,
"grad_norm": 0.3497229218482971,
"learning_rate": 0.0005479207690066996,
"loss": 3.5768,
"step": 15000
},
{
"epoch": 4.366897676314717,
"eval_accuracy": 0.35680662627036064,
"eval_loss": 3.663057565689087,
"eval_runtime": 180.5674,
"eval_samples_per_second": 92.171,
"eval_steps_per_second": 5.765,
"step": 15000
},
{
"epoch": 4.381457107914507,
"grad_norm": 0.3152848184108734,
"learning_rate": 0.0005477459947567725,
"loss": 3.5651,
"step": 15050
},
{
"epoch": 4.396016539514298,
"grad_norm": 0.31485655903816223,
"learning_rate": 0.0005475712205068453,
"loss": 3.5724,
"step": 15100
},
{
"epoch": 4.4105759711140875,
"grad_norm": 0.3210237920284271,
"learning_rate": 0.000547396446256918,
"loss": 3.5743,
"step": 15150
},
{
"epoch": 4.425135402713878,
"grad_norm": 0.31647804379463196,
"learning_rate": 0.0005472216720069909,
"loss": 3.5643,
"step": 15200
},
{
"epoch": 4.439694834313668,
"grad_norm": 0.3220058083534241,
"learning_rate": 0.0005470468977570637,
"loss": 3.5777,
"step": 15250
},
{
"epoch": 4.454254265913459,
"grad_norm": 0.31475868821144104,
"learning_rate": 0.0005468721235071365,
"loss": 3.5759,
"step": 15300
},
{
"epoch": 4.468813697513249,
"grad_norm": 0.31258007884025574,
"learning_rate": 0.0005466973492572094,
"loss": 3.58,
"step": 15350
},
{
"epoch": 4.48337312911304,
"grad_norm": 0.3323783874511719,
"learning_rate": 0.0005465225750072822,
"loss": 3.5717,
"step": 15400
},
{
"epoch": 4.4979325607128295,
"grad_norm": 0.31647196412086487,
"learning_rate": 0.000546347800757355,
"loss": 3.5666,
"step": 15450
},
{
"epoch": 4.51249199231262,
"grad_norm": 0.3166157007217407,
"learning_rate": 0.0005461730265074279,
"loss": 3.5661,
"step": 15500
},
{
"epoch": 4.52705142391241,
"grad_norm": 0.33359718322753906,
"learning_rate": 0.0005459982522575007,
"loss": 3.581,
"step": 15550
},
{
"epoch": 4.541610855512201,
"grad_norm": 0.30880287289619446,
"learning_rate": 0.0005458234780075735,
"loss": 3.5767,
"step": 15600
},
{
"epoch": 4.556170287111991,
"grad_norm": 0.3321440517902374,
"learning_rate": 0.0005456487037576464,
"loss": 3.5927,
"step": 15650
},
{
"epoch": 4.570729718711782,
"grad_norm": 0.35169097781181335,
"learning_rate": 0.0005454739295077192,
"loss": 3.5777,
"step": 15700
},
{
"epoch": 4.585289150311572,
"grad_norm": 0.3210912048816681,
"learning_rate": 0.000545299155257792,
"loss": 3.5641,
"step": 15750
},
{
"epoch": 4.599848581911362,
"grad_norm": 0.3266526460647583,
"learning_rate": 0.0005451243810078648,
"loss": 3.5624,
"step": 15800
},
{
"epoch": 4.614408013511152,
"grad_norm": 0.3169322609901428,
"learning_rate": 0.0005449496067579376,
"loss": 3.582,
"step": 15850
},
{
"epoch": 4.628967445110943,
"grad_norm": 0.30979159474372864,
"learning_rate": 0.0005447748325080105,
"loss": 3.5808,
"step": 15900
},
{
"epoch": 4.643526876710733,
"grad_norm": 0.3104844391345978,
"learning_rate": 0.0005446000582580833,
"loss": 3.5779,
"step": 15950
},
{
"epoch": 4.658086308310524,
"grad_norm": 0.3167930543422699,
"learning_rate": 0.0005444252840081562,
"loss": 3.577,
"step": 16000
},
{
"epoch": 4.658086308310524,
"eval_accuracy": 0.3581025173162721,
"eval_loss": 3.6506025791168213,
"eval_runtime": 184.1909,
"eval_samples_per_second": 90.357,
"eval_steps_per_second": 5.652,
"step": 16000
},
{
"epoch": 4.672645739910314,
"grad_norm": 0.3104100823402405,
"learning_rate": 0.0005442505097582289,
"loss": 3.5755,
"step": 16050
},
{
"epoch": 4.687205171510104,
"grad_norm": 0.32251089811325073,
"learning_rate": 0.0005440757355083017,
"loss": 3.5785,
"step": 16100
},
{
"epoch": 4.701764603109894,
"grad_norm": 0.30579274892807007,
"learning_rate": 0.0005439009612583746,
"loss": 3.5736,
"step": 16150
},
{
"epoch": 4.716324034709685,
"grad_norm": 0.32924431562423706,
"learning_rate": 0.0005437261870084474,
"loss": 3.5859,
"step": 16200
},
{
"epoch": 4.730883466309475,
"grad_norm": 0.32339397072792053,
"learning_rate": 0.0005435514127585203,
"loss": 3.5714,
"step": 16250
},
{
"epoch": 4.745442897909266,
"grad_norm": 0.3301834762096405,
"learning_rate": 0.000543376638508593,
"loss": 3.581,
"step": 16300
},
{
"epoch": 4.760002329509056,
"grad_norm": 0.3323529064655304,
"learning_rate": 0.0005432018642586658,
"loss": 3.5745,
"step": 16350
},
{
"epoch": 4.7745617611088464,
"grad_norm": 0.31460458040237427,
"learning_rate": 0.0005430270900087387,
"loss": 3.5752,
"step": 16400
},
{
"epoch": 4.789121192708636,
"grad_norm": 0.30962061882019043,
"learning_rate": 0.0005428523157588115,
"loss": 3.5847,
"step": 16450
},
{
"epoch": 4.803680624308427,
"grad_norm": 0.31121689081192017,
"learning_rate": 0.0005426775415088843,
"loss": 3.581,
"step": 16500
},
{
"epoch": 4.818240055908217,
"grad_norm": 0.3271123170852661,
"learning_rate": 0.0005425027672589572,
"loss": 3.5747,
"step": 16550
},
{
"epoch": 4.832799487508008,
"grad_norm": 0.34155216813087463,
"learning_rate": 0.0005423279930090299,
"loss": 3.5757,
"step": 16600
},
{
"epoch": 4.847358919107798,
"grad_norm": 0.31826114654541016,
"learning_rate": 0.0005421532187591028,
"loss": 3.5863,
"step": 16650
},
{
"epoch": 4.8619183507075885,
"grad_norm": 0.3213462829589844,
"learning_rate": 0.0005419784445091756,
"loss": 3.5846,
"step": 16700
},
{
"epoch": 4.876477782307378,
"grad_norm": 0.3335978388786316,
"learning_rate": 0.0005418036702592484,
"loss": 3.5778,
"step": 16750
},
{
"epoch": 4.891037213907169,
"grad_norm": 0.32565537095069885,
"learning_rate": 0.0005416288960093213,
"loss": 3.5903,
"step": 16800
},
{
"epoch": 4.905596645506959,
"grad_norm": 0.31601616740226746,
"learning_rate": 0.000541454121759394,
"loss": 3.581,
"step": 16850
},
{
"epoch": 4.92015607710675,
"grad_norm": 0.3034924268722534,
"learning_rate": 0.0005412793475094669,
"loss": 3.5731,
"step": 16900
},
{
"epoch": 4.93471550870654,
"grad_norm": 0.30528074502944946,
"learning_rate": 0.0005411045732595397,
"loss": 3.5775,
"step": 16950
},
{
"epoch": 4.9492749403063305,
"grad_norm": 0.32346123456954956,
"learning_rate": 0.0005409297990096125,
"loss": 3.5711,
"step": 17000
},
{
"epoch": 4.9492749403063305,
"eval_accuracy": 0.3593712465046746,
"eval_loss": 3.6345937252044678,
"eval_runtime": 183.7217,
"eval_samples_per_second": 90.588,
"eval_steps_per_second": 5.666,
"step": 17000
},
{
"epoch": 4.96383437190612,
"grad_norm": 0.3116399049758911,
"learning_rate": 0.0005407550247596854,
"loss": 3.5657,
"step": 17050
},
{
"epoch": 4.978393803505911,
"grad_norm": 0.3291073143482208,
"learning_rate": 0.0005405802505097582,
"loss": 3.5751,
"step": 17100
},
{
"epoch": 4.992953235105701,
"grad_norm": 0.3149360716342926,
"learning_rate": 0.000540405476259831,
"loss": 3.5743,
"step": 17150
},
{
"epoch": 5.007279715799895,
"grad_norm": 0.3213154971599579,
"learning_rate": 0.0005402307020099038,
"loss": 3.5347,
"step": 17200
},
{
"epoch": 5.021839147399685,
"grad_norm": 0.3356756567955017,
"learning_rate": 0.0005400559277599766,
"loss": 3.4594,
"step": 17250
},
{
"epoch": 5.036398578999476,
"grad_norm": 0.3190675973892212,
"learning_rate": 0.0005398811535100495,
"loss": 3.4737,
"step": 17300
},
{
"epoch": 5.050958010599266,
"grad_norm": 0.30441927909851074,
"learning_rate": 0.0005397063792601223,
"loss": 3.4683,
"step": 17350
},
{
"epoch": 5.065517442199057,
"grad_norm": 0.3276670277118683,
"learning_rate": 0.0005395316050101951,
"loss": 3.4779,
"step": 17400
},
{
"epoch": 5.080076873798847,
"grad_norm": 0.3393913805484772,
"learning_rate": 0.0005393568307602679,
"loss": 3.4886,
"step": 17450
},
{
"epoch": 5.094636305398637,
"grad_norm": 0.33122798800468445,
"learning_rate": 0.0005391820565103407,
"loss": 3.4829,
"step": 17500
},
{
"epoch": 5.109195736998427,
"grad_norm": 0.32901448011398315,
"learning_rate": 0.0005390072822604136,
"loss": 3.4872,
"step": 17550
},
{
"epoch": 5.123755168598218,
"grad_norm": 0.3309627175331116,
"learning_rate": 0.0005388325080104864,
"loss": 3.4831,
"step": 17600
},
{
"epoch": 5.138314600198008,
"grad_norm": 0.32044172286987305,
"learning_rate": 0.0005386577337605593,
"loss": 3.4888,
"step": 17650
},
{
"epoch": 5.152874031797799,
"grad_norm": 0.3464089334011078,
"learning_rate": 0.000538482959510632,
"loss": 3.4972,
"step": 17700
},
{
"epoch": 5.167433463397589,
"grad_norm": 0.3171513080596924,
"learning_rate": 0.0005383081852607048,
"loss": 3.5026,
"step": 17750
},
{
"epoch": 5.1819928949973795,
"grad_norm": 0.3164452612400055,
"learning_rate": 0.0005381334110107777,
"loss": 3.4926,
"step": 17800
},
{
"epoch": 5.196552326597169,
"grad_norm": 0.32658103108406067,
"learning_rate": 0.0005379586367608505,
"loss": 3.5046,
"step": 17850
},
{
"epoch": 5.21111175819696,
"grad_norm": 0.32511815428733826,
"learning_rate": 0.0005377838625109233,
"loss": 3.4953,
"step": 17900
},
{
"epoch": 5.22567118979675,
"grad_norm": 0.343904972076416,
"learning_rate": 0.0005376090882609961,
"loss": 3.5065,
"step": 17950
},
{
"epoch": 5.240230621396541,
"grad_norm": 0.33408525586128235,
"learning_rate": 0.0005374343140110689,
"loss": 3.5066,
"step": 18000
},
{
"epoch": 5.240230621396541,
"eval_accuracy": 0.35997068871065013,
"eval_loss": 3.6377220153808594,
"eval_runtime": 181.2212,
"eval_samples_per_second": 91.838,
"eval_steps_per_second": 5.744,
"step": 18000
},
{
"epoch": 5.254790052996331,
"grad_norm": 0.3558831512928009,
"learning_rate": 0.0005372595397611418,
"loss": 3.5154,
"step": 18050
},
{
"epoch": 5.2693494845961215,
"grad_norm": 0.3240915536880493,
"learning_rate": 0.0005370847655112147,
"loss": 3.5104,
"step": 18100
},
{
"epoch": 5.283908916195911,
"grad_norm": 0.3641294538974762,
"learning_rate": 0.0005369099912612875,
"loss": 3.5125,
"step": 18150
},
{
"epoch": 5.298468347795702,
"grad_norm": 0.323595255613327,
"learning_rate": 0.0005367352170113603,
"loss": 3.5091,
"step": 18200
},
{
"epoch": 5.313027779395492,
"grad_norm": 0.31085318326950073,
"learning_rate": 0.0005365604427614331,
"loss": 3.5061,
"step": 18250
},
{
"epoch": 5.327587210995283,
"grad_norm": 0.3321459889411926,
"learning_rate": 0.0005363856685115059,
"loss": 3.5128,
"step": 18300
},
{
"epoch": 5.342146642595073,
"grad_norm": 0.3359740674495697,
"learning_rate": 0.0005362108942615788,
"loss": 3.5207,
"step": 18350
},
{
"epoch": 5.3567060741948636,
"grad_norm": 0.35164040327072144,
"learning_rate": 0.0005360361200116516,
"loss": 3.5206,
"step": 18400
},
{
"epoch": 5.3712655057946534,
"grad_norm": 0.33065569400787354,
"learning_rate": 0.0005358613457617244,
"loss": 3.5137,
"step": 18450
},
{
"epoch": 5.385824937394444,
"grad_norm": 0.31795698404312134,
"learning_rate": 0.0005356865715117973,
"loss": 3.5181,
"step": 18500
},
{
"epoch": 5.400384368994234,
"grad_norm": 0.3166426718235016,
"learning_rate": 0.00053551179726187,
"loss": 3.5129,
"step": 18550
},
{
"epoch": 5.414943800594025,
"grad_norm": 0.3113225996494293,
"learning_rate": 0.0005353370230119429,
"loss": 3.5224,
"step": 18600
},
{
"epoch": 5.429503232193815,
"grad_norm": 0.3037504553794861,
"learning_rate": 0.0005351622487620157,
"loss": 3.5212,
"step": 18650
},
{
"epoch": 5.444062663793606,
"grad_norm": 0.3170977830886841,
"learning_rate": 0.0005349874745120885,
"loss": 3.5185,
"step": 18700
},
{
"epoch": 5.4586220953933955,
"grad_norm": 0.3276199698448181,
"learning_rate": 0.0005348127002621614,
"loss": 3.5143,
"step": 18750
},
{
"epoch": 5.473181526993186,
"grad_norm": 0.35049423575401306,
"learning_rate": 0.0005346379260122341,
"loss": 3.5178,
"step": 18800
},
{
"epoch": 5.487740958592976,
"grad_norm": 0.3257882595062256,
"learning_rate": 0.000534463151762307,
"loss": 3.513,
"step": 18850
},
{
"epoch": 5.502300390192767,
"grad_norm": 0.3254280686378479,
"learning_rate": 0.0005342883775123798,
"loss": 3.5157,
"step": 18900
},
{
"epoch": 5.516859821792557,
"grad_norm": 0.35354653000831604,
"learning_rate": 0.0005341136032624526,
"loss": 3.5323,
"step": 18950
},
{
"epoch": 5.531419253392348,
"grad_norm": 0.3293665945529938,
"learning_rate": 0.0005339388290125255,
"loss": 3.5294,
"step": 19000
},
{
"epoch": 5.531419253392348,
"eval_accuracy": 0.36105410583223874,
"eval_loss": 3.6271042823791504,
"eval_runtime": 181.268,
"eval_samples_per_second": 91.814,
"eval_steps_per_second": 5.743,
"step": 19000
},
{
"epoch": 5.5459786849921375,
"grad_norm": 0.32479095458984375,
"learning_rate": 0.0005337640547625983,
"loss": 3.5257,
"step": 19050
},
{
"epoch": 5.560538116591928,
"grad_norm": 0.30282458662986755,
"learning_rate": 0.000533589280512671,
"loss": 3.5376,
"step": 19100
},
{
"epoch": 5.575097548191718,
"grad_norm": 0.3051811754703522,
"learning_rate": 0.0005334145062627439,
"loss": 3.5188,
"step": 19150
},
{
"epoch": 5.589656979791509,
"grad_norm": 0.34127405285835266,
"learning_rate": 0.0005332397320128167,
"loss": 3.5171,
"step": 19200
},
{
"epoch": 5.604216411391299,
"grad_norm": 0.3210941553115845,
"learning_rate": 0.0005330649577628896,
"loss": 3.5248,
"step": 19250
},
{
"epoch": 5.61877584299109,
"grad_norm": 0.3192020654678345,
"learning_rate": 0.0005328901835129624,
"loss": 3.533,
"step": 19300
},
{
"epoch": 5.6333352745908805,
"grad_norm": 0.34110450744628906,
"learning_rate": 0.0005327154092630351,
"loss": 3.5295,
"step": 19350
},
{
"epoch": 5.64789470619067,
"grad_norm": 0.3144545555114746,
"learning_rate": 0.000532540635013108,
"loss": 3.5335,
"step": 19400
},
{
"epoch": 5.66245413779046,
"grad_norm": 0.3245835304260254,
"learning_rate": 0.0005323658607631808,
"loss": 3.5229,
"step": 19450
},
{
"epoch": 5.677013569390251,
"grad_norm": 0.3528177738189697,
"learning_rate": 0.0005321910865132537,
"loss": 3.5209,
"step": 19500
},
{
"epoch": 5.691573000990042,
"grad_norm": 0.3312878906726837,
"learning_rate": 0.0005320163122633265,
"loss": 3.5321,
"step": 19550
},
{
"epoch": 5.706132432589832,
"grad_norm": 0.3077809512615204,
"learning_rate": 0.0005318415380133993,
"loss": 3.5183,
"step": 19600
},
{
"epoch": 5.720691864189622,
"grad_norm": 0.32409968972206116,
"learning_rate": 0.0005316667637634721,
"loss": 3.5276,
"step": 19650
},
{
"epoch": 5.735251295789412,
"grad_norm": 0.3110126852989197,
"learning_rate": 0.0005314919895135449,
"loss": 3.5253,
"step": 19700
},
{
"epoch": 5.749810727389203,
"grad_norm": 0.33343297243118286,
"learning_rate": 0.0005313172152636178,
"loss": 3.5272,
"step": 19750
},
{
"epoch": 5.764370158988993,
"grad_norm": 0.3315747082233429,
"learning_rate": 0.0005311424410136906,
"loss": 3.534,
"step": 19800
},
{
"epoch": 5.778929590588783,
"grad_norm": 0.3091914653778076,
"learning_rate": 0.0005309676667637634,
"loss": 3.5276,
"step": 19850
},
{
"epoch": 5.793489022188574,
"grad_norm": 0.30921050906181335,
"learning_rate": 0.0005307928925138363,
"loss": 3.5219,
"step": 19900
},
{
"epoch": 5.8080484537883645,
"grad_norm": 0.30907315015792847,
"learning_rate": 0.000530618118263909,
"loss": 3.534,
"step": 19950
},
{
"epoch": 5.822607885388154,
"grad_norm": 0.36628568172454834,
"learning_rate": 0.0005304433440139819,
"loss": 3.538,
"step": 20000
},
{
"epoch": 5.822607885388154,
"eval_accuracy": 0.36194950645964236,
"eval_loss": 3.61657452583313,
"eval_runtime": 183.6498,
"eval_samples_per_second": 90.624,
"eval_steps_per_second": 5.668,
"step": 20000
},
{
"epoch": 5.837167316987944,
"grad_norm": 0.3185259997844696,
"learning_rate": 0.0005302685697640547,
"loss": 3.5243,
"step": 20050
},
{
"epoch": 5.851726748587735,
"grad_norm": 0.3328113257884979,
"learning_rate": 0.0005300937955141275,
"loss": 3.5306,
"step": 20100
},
{
"epoch": 5.866286180187526,
"grad_norm": 0.31715288758277893,
"learning_rate": 0.0005299190212642004,
"loss": 3.5368,
"step": 20150
},
{
"epoch": 5.880845611787316,
"grad_norm": 0.3114943206310272,
"learning_rate": 0.0005297442470142731,
"loss": 3.5279,
"step": 20200
},
{
"epoch": 5.895405043387106,
"grad_norm": 0.3375224471092224,
"learning_rate": 0.000529569472764346,
"loss": 3.5214,
"step": 20250
},
{
"epoch": 5.9099644749868965,
"grad_norm": 0.29627102613449097,
"learning_rate": 0.0005293946985144188,
"loss": 3.5182,
"step": 20300
},
{
"epoch": 5.924523906586687,
"grad_norm": 0.33964815735816956,
"learning_rate": 0.0005292199242644916,
"loss": 3.541,
"step": 20350
},
{
"epoch": 5.939083338186477,
"grad_norm": 0.3077552914619446,
"learning_rate": 0.0005290451500145645,
"loss": 3.5246,
"step": 20400
},
{
"epoch": 5.953642769786267,
"grad_norm": 0.3167116641998291,
"learning_rate": 0.0005288703757646373,
"loss": 3.5294,
"step": 20450
},
{
"epoch": 5.968202201386058,
"grad_norm": 0.3327026665210724,
"learning_rate": 0.00052869560151471,
"loss": 3.5322,
"step": 20500
},
{
"epoch": 5.982761632985849,
"grad_norm": 0.3215795159339905,
"learning_rate": 0.0005285208272647829,
"loss": 3.5378,
"step": 20550
},
{
"epoch": 5.9973210645856385,
"grad_norm": 0.3464929759502411,
"learning_rate": 0.0005283460530148558,
"loss": 3.544,
"step": 20600
},
{
"epoch": 6.011647545279832,
"grad_norm": 0.37006425857543945,
"learning_rate": 0.0005281712787649286,
"loss": 3.4569,
"step": 20650
},
{
"epoch": 6.026206976879623,
"grad_norm": 0.32685425877571106,
"learning_rate": 0.0005279965045150015,
"loss": 3.4143,
"step": 20700
},
{
"epoch": 6.040766408479413,
"grad_norm": 0.31896543502807617,
"learning_rate": 0.0005278217302650743,
"loss": 3.4284,
"step": 20750
},
{
"epoch": 6.055325840079203,
"grad_norm": 0.3501061499118805,
"learning_rate": 0.000527646956015147,
"loss": 3.4257,
"step": 20800
},
{
"epoch": 6.069885271678993,
"grad_norm": 0.3293428421020508,
"learning_rate": 0.0005274721817652199,
"loss": 3.4253,
"step": 20850
},
{
"epoch": 6.084444703278784,
"grad_norm": 0.33916565775871277,
"learning_rate": 0.0005272974075152927,
"loss": 3.4532,
"step": 20900
},
{
"epoch": 6.099004134878574,
"grad_norm": 0.3229523301124573,
"learning_rate": 0.0005271226332653656,
"loss": 3.4476,
"step": 20950
},
{
"epoch": 6.113563566478365,
"grad_norm": 0.3364764153957367,
"learning_rate": 0.0005269478590154384,
"loss": 3.4415,
"step": 21000
},
{
"epoch": 6.113563566478365,
"eval_accuracy": 0.3621457538197391,
"eval_loss": 3.6198906898498535,
"eval_runtime": 183.4856,
"eval_samples_per_second": 90.705,
"eval_steps_per_second": 5.673,
"step": 21000
},
{
"epoch": 6.128122998078155,
"grad_norm": 0.3735044300556183,
"learning_rate": 0.0005267730847655111,
"loss": 3.4457,
"step": 21050
},
{
"epoch": 6.142682429677945,
"grad_norm": 0.34455105662345886,
"learning_rate": 0.000526598310515584,
"loss": 3.4528,
"step": 21100
},
{
"epoch": 6.157241861277735,
"grad_norm": 0.33916333317756653,
"learning_rate": 0.0005264235362656568,
"loss": 3.4609,
"step": 21150
},
{
"epoch": 6.171801292877526,
"grad_norm": 0.3121279180049896,
"learning_rate": 0.0005262487620157297,
"loss": 3.4478,
"step": 21200
},
{
"epoch": 6.186360724477316,
"grad_norm": 0.30740803480148315,
"learning_rate": 0.0005260739877658025,
"loss": 3.4448,
"step": 21250
},
{
"epoch": 6.200920156077107,
"grad_norm": 0.3505891263484955,
"learning_rate": 0.0005258992135158753,
"loss": 3.4585,
"step": 21300
},
{
"epoch": 6.215479587676897,
"grad_norm": 0.33900803327560425,
"learning_rate": 0.0005257244392659481,
"loss": 3.46,
"step": 21350
},
{
"epoch": 6.2300390192766875,
"grad_norm": 0.3224051892757416,
"learning_rate": 0.0005255496650160209,
"loss": 3.4557,
"step": 21400
},
{
"epoch": 6.244598450876477,
"grad_norm": 0.35417911410331726,
"learning_rate": 0.0005253748907660938,
"loss": 3.4583,
"step": 21450
},
{
"epoch": 6.259157882476268,
"grad_norm": 0.34107911586761475,
"learning_rate": 0.0005252001165161666,
"loss": 3.4659,
"step": 21500
},
{
"epoch": 6.273717314076059,
"grad_norm": 0.32315975427627563,
"learning_rate": 0.0005250253422662394,
"loss": 3.4613,
"step": 21550
},
{
"epoch": 6.288276745675849,
"grad_norm": 0.3344326615333557,
"learning_rate": 0.0005248505680163123,
"loss": 3.4729,
"step": 21600
},
{
"epoch": 6.302836177275639,
"grad_norm": 0.34388530254364014,
"learning_rate": 0.000524675793766385,
"loss": 3.4723,
"step": 21650
},
{
"epoch": 6.3173956088754295,
"grad_norm": 0.34264546632766724,
"learning_rate": 0.0005245010195164579,
"loss": 3.4751,
"step": 21700
},
{
"epoch": 6.33195504047522,
"grad_norm": 0.32228031754493713,
"learning_rate": 0.0005243262452665307,
"loss": 3.4586,
"step": 21750
},
{
"epoch": 6.34651447207501,
"grad_norm": 0.34229689836502075,
"learning_rate": 0.0005241514710166035,
"loss": 3.4657,
"step": 21800
},
{
"epoch": 6.3610739036748,
"grad_norm": 0.3267248570919037,
"learning_rate": 0.0005239766967666764,
"loss": 3.4747,
"step": 21850
},
{
"epoch": 6.375633335274591,
"grad_norm": 0.3363324999809265,
"learning_rate": 0.0005238019225167491,
"loss": 3.4736,
"step": 21900
},
{
"epoch": 6.390192766874382,
"grad_norm": 0.32636144757270813,
"learning_rate": 0.0005236271482668219,
"loss": 3.4731,
"step": 21950
},
{
"epoch": 6.4047521984741715,
"grad_norm": 0.3209141194820404,
"learning_rate": 0.0005234523740168948,
"loss": 3.475,
"step": 22000
},
{
"epoch": 6.4047521984741715,
"eval_accuracy": 0.36269357673806785,
"eval_loss": 3.6098952293395996,
"eval_runtime": 183.5371,
"eval_samples_per_second": 90.679,
"eval_steps_per_second": 5.672,
"step": 22000
},
{
"epoch": 6.419311630073962,
"grad_norm": 0.3223513066768646,
"learning_rate": 0.0005232775997669676,
"loss": 3.4759,
"step": 22050
},
{
"epoch": 6.433871061673752,
"grad_norm": 0.3284885585308075,
"learning_rate": 0.0005231028255170405,
"loss": 3.4796,
"step": 22100
},
{
"epoch": 6.448430493273543,
"grad_norm": 0.32980912923812866,
"learning_rate": 0.0005229280512671133,
"loss": 3.4839,
"step": 22150
},
{
"epoch": 6.462989924873333,
"grad_norm": 0.33856451511383057,
"learning_rate": 0.000522753277017186,
"loss": 3.4825,
"step": 22200
},
{
"epoch": 6.477549356473124,
"grad_norm": 0.3303597867488861,
"learning_rate": 0.0005225785027672589,
"loss": 3.4827,
"step": 22250
},
{
"epoch": 6.492108788072914,
"grad_norm": 0.32675686478614807,
"learning_rate": 0.0005224037285173317,
"loss": 3.4781,
"step": 22300
},
{
"epoch": 6.506668219672704,
"grad_norm": 0.3315143883228302,
"learning_rate": 0.0005222289542674046,
"loss": 3.4786,
"step": 22350
},
{
"epoch": 6.521227651272494,
"grad_norm": 0.35115185379981995,
"learning_rate": 0.0005220541800174774,
"loss": 3.4777,
"step": 22400
},
{
"epoch": 6.535787082872285,
"grad_norm": 0.32922348380088806,
"learning_rate": 0.0005218794057675501,
"loss": 3.4764,
"step": 22450
},
{
"epoch": 6.550346514472075,
"grad_norm": 0.32848137617111206,
"learning_rate": 0.000521704631517623,
"loss": 3.4864,
"step": 22500
},
{
"epoch": 6.564905946071866,
"grad_norm": 0.3455169200897217,
"learning_rate": 0.0005215298572676958,
"loss": 3.4872,
"step": 22550
},
{
"epoch": 6.579465377671656,
"grad_norm": 0.3491528034210205,
"learning_rate": 0.0005213550830177687,
"loss": 3.4941,
"step": 22600
},
{
"epoch": 6.594024809271446,
"grad_norm": 0.3292933404445648,
"learning_rate": 0.0005211803087678415,
"loss": 3.4849,
"step": 22650
},
{
"epoch": 6.608584240871236,
"grad_norm": 0.33583250641822815,
"learning_rate": 0.0005210055345179143,
"loss": 3.4787,
"step": 22700
},
{
"epoch": 6.623143672471027,
"grad_norm": 0.32590252161026,
"learning_rate": 0.0005208307602679871,
"loss": 3.4819,
"step": 22750
},
{
"epoch": 6.637703104070817,
"grad_norm": 0.34313255548477173,
"learning_rate": 0.0005206559860180599,
"loss": 3.4679,
"step": 22800
},
{
"epoch": 6.652262535670608,
"grad_norm": 0.3168715238571167,
"learning_rate": 0.0005204812117681328,
"loss": 3.4812,
"step": 22850
},
{
"epoch": 6.666821967270398,
"grad_norm": 0.33726438879966736,
"learning_rate": 0.0005203064375182056,
"loss": 3.4791,
"step": 22900
},
{
"epoch": 6.6813813988701884,
"grad_norm": 0.33907851576805115,
"learning_rate": 0.0005201316632682784,
"loss": 3.4817,
"step": 22950
},
{
"epoch": 6.695940830469978,
"grad_norm": 0.3657963275909424,
"learning_rate": 0.0005199568890183513,
"loss": 3.4859,
"step": 23000
},
{
"epoch": 6.695940830469978,
"eval_accuracy": 0.3636106127844396,
"eval_loss": 3.6045539379119873,
"eval_runtime": 184.0903,
"eval_samples_per_second": 90.407,
"eval_steps_per_second": 5.655,
"step": 23000
},
{
"epoch": 6.710500262069769,
"grad_norm": 0.3322959840297699,
"learning_rate": 0.000519782114768424,
"loss": 3.4824,
"step": 23050
},
{
"epoch": 6.725059693669559,
"grad_norm": 0.343662828207016,
"learning_rate": 0.0005196073405184969,
"loss": 3.4897,
"step": 23100
},
{
"epoch": 6.73961912526935,
"grad_norm": 0.32909801602363586,
"learning_rate": 0.0005194325662685697,
"loss": 3.4858,
"step": 23150
},
{
"epoch": 6.75417855686914,
"grad_norm": 0.3375694751739502,
"learning_rate": 0.0005192577920186426,
"loss": 3.4808,
"step": 23200
},
{
"epoch": 6.7687379884689305,
"grad_norm": 0.3138526678085327,
"learning_rate": 0.0005190830177687154,
"loss": 3.483,
"step": 23250
},
{
"epoch": 6.78329742006872,
"grad_norm": 0.3340669572353363,
"learning_rate": 0.0005189082435187883,
"loss": 3.4903,
"step": 23300
},
{
"epoch": 6.797856851668511,
"grad_norm": 0.3336253762245178,
"learning_rate": 0.000518733469268861,
"loss": 3.4864,
"step": 23350
},
{
"epoch": 6.812416283268301,
"grad_norm": 0.3235922157764435,
"learning_rate": 0.0005185586950189338,
"loss": 3.5037,
"step": 23400
},
{
"epoch": 6.826975714868092,
"grad_norm": 0.3445108234882355,
"learning_rate": 0.0005183839207690067,
"loss": 3.4908,
"step": 23450
},
{
"epoch": 6.841535146467882,
"grad_norm": 0.3229808211326599,
"learning_rate": 0.0005182091465190795,
"loss": 3.4906,
"step": 23500
},
{
"epoch": 6.8560945780676725,
"grad_norm": 0.29649391770362854,
"learning_rate": 0.0005180343722691524,
"loss": 3.4964,
"step": 23550
},
{
"epoch": 6.870654009667462,
"grad_norm": 0.3273935616016388,
"learning_rate": 0.0005178595980192251,
"loss": 3.493,
"step": 23600
},
{
"epoch": 6.885213441267253,
"grad_norm": 0.33352574706077576,
"learning_rate": 0.0005176848237692979,
"loss": 3.4915,
"step": 23650
},
{
"epoch": 6.899772872867043,
"grad_norm": 0.3277892768383026,
"learning_rate": 0.0005175100495193708,
"loss": 3.4986,
"step": 23700
},
{
"epoch": 6.914332304466834,
"grad_norm": 0.3182038366794586,
"learning_rate": 0.0005173352752694436,
"loss": 3.5041,
"step": 23750
},
{
"epoch": 6.928891736066624,
"grad_norm": 0.3153535723686218,
"learning_rate": 0.0005171605010195165,
"loss": 3.5035,
"step": 23800
},
{
"epoch": 6.943451167666415,
"grad_norm": 0.34128624200820923,
"learning_rate": 0.0005169857267695893,
"loss": 3.4943,
"step": 23850
},
{
"epoch": 6.9580105992662045,
"grad_norm": 0.3200225234031677,
"learning_rate": 0.000516810952519662,
"loss": 3.5004,
"step": 23900
},
{
"epoch": 6.972570030865995,
"grad_norm": 0.35053977370262146,
"learning_rate": 0.0005166361782697349,
"loss": 3.4938,
"step": 23950
},
{
"epoch": 6.987129462465785,
"grad_norm": 0.35640257596969604,
"learning_rate": 0.0005164614040198077,
"loss": 3.4951,
"step": 24000
},
{
"epoch": 6.987129462465785,
"eval_accuracy": 0.36452964775538993,
"eval_loss": 3.5928568840026855,
"eval_runtime": 184.1497,
"eval_samples_per_second": 90.378,
"eval_steps_per_second": 5.653,
"step": 24000
},
{
"epoch": 7.001455943159979,
"grad_norm": 0.3846636712551117,
"learning_rate": 0.0005162866297698806,
"loss": 3.4935,
"step": 24050
},
{
"epoch": 7.016015374759769,
"grad_norm": 0.3523205518722534,
"learning_rate": 0.0005161118555199534,
"loss": 3.3822,
"step": 24100
},
{
"epoch": 7.03057480635956,
"grad_norm": 0.36663973331451416,
"learning_rate": 0.0005159370812700261,
"loss": 3.3874,
"step": 24150
},
{
"epoch": 7.04513423795935,
"grad_norm": 0.38096940517425537,
"learning_rate": 0.000515762307020099,
"loss": 3.3899,
"step": 24200
},
{
"epoch": 7.059693669559141,
"grad_norm": 0.35516002774238586,
"learning_rate": 0.0005155875327701718,
"loss": 3.3847,
"step": 24250
},
{
"epoch": 7.074253101158931,
"grad_norm": 0.3651926815509796,
"learning_rate": 0.0005154127585202447,
"loss": 3.4049,
"step": 24300
},
{
"epoch": 7.0888125327587215,
"grad_norm": 0.36075493693351746,
"learning_rate": 0.0005152379842703175,
"loss": 3.3965,
"step": 24350
},
{
"epoch": 7.103371964358511,
"grad_norm": 0.38245540857315063,
"learning_rate": 0.0005150632100203903,
"loss": 3.4028,
"step": 24400
},
{
"epoch": 7.117931395958302,
"grad_norm": 0.32894188165664673,
"learning_rate": 0.0005148884357704631,
"loss": 3.3985,
"step": 24450
},
{
"epoch": 7.132490827558092,
"grad_norm": 0.3118518590927124,
"learning_rate": 0.0005147136615205359,
"loss": 3.4151,
"step": 24500
},
{
"epoch": 7.147050259157883,
"grad_norm": 0.3686443269252777,
"learning_rate": 0.0005145388872706087,
"loss": 3.4092,
"step": 24550
},
{
"epoch": 7.161609690757673,
"grad_norm": 0.35504400730133057,
"learning_rate": 0.0005143641130206816,
"loss": 3.4128,
"step": 24600
},
{
"epoch": 7.1761691223574635,
"grad_norm": 0.371929794549942,
"learning_rate": 0.0005141893387707544,
"loss": 3.4088,
"step": 24650
},
{
"epoch": 7.190728553957253,
"grad_norm": 0.35544171929359436,
"learning_rate": 0.0005140145645208272,
"loss": 3.4102,
"step": 24700
},
{
"epoch": 7.205287985557044,
"grad_norm": 0.32105565071105957,
"learning_rate": 0.0005138397902709,
"loss": 3.4146,
"step": 24750
},
{
"epoch": 7.219847417156834,
"grad_norm": 0.3172771632671356,
"learning_rate": 0.0005136650160209728,
"loss": 3.4218,
"step": 24800
},
{
"epoch": 7.234406848756625,
"grad_norm": 0.3447094261646271,
"learning_rate": 0.0005134902417710457,
"loss": 3.4251,
"step": 24850
},
{
"epoch": 7.248966280356415,
"grad_norm": 0.3414628505706787,
"learning_rate": 0.0005133154675211185,
"loss": 3.4205,
"step": 24900
},
{
"epoch": 7.2635257119562056,
"grad_norm": 0.36512497067451477,
"learning_rate": 0.0005131406932711914,
"loss": 3.4273,
"step": 24950
},
{
"epoch": 7.2780851435559955,
"grad_norm": 0.3672768771648407,
"learning_rate": 0.0005129659190212641,
"loss": 3.4339,
"step": 25000
},
{
"epoch": 7.2780851435559955,
"eval_accuracy": 0.36433104871931843,
"eval_loss": 3.5997819900512695,
"eval_runtime": 180.8992,
"eval_samples_per_second": 92.002,
"eval_steps_per_second": 5.755,
"step": 25000
},
{
"epoch": 7.292644575155786,
"grad_norm": 0.3394540846347809,
"learning_rate": 0.0005127911447713369,
"loss": 3.4232,
"step": 25050
},
{
"epoch": 7.307204006755576,
"grad_norm": 0.3045949637889862,
"learning_rate": 0.0005126163705214098,
"loss": 3.4267,
"step": 25100
},
{
"epoch": 7.321763438355367,
"grad_norm": 0.32903987169265747,
"learning_rate": 0.0005124415962714826,
"loss": 3.4341,
"step": 25150
},
{
"epoch": 7.336322869955157,
"grad_norm": 0.3628155589103699,
"learning_rate": 0.0005122668220215555,
"loss": 3.4336,
"step": 25200
},
{
"epoch": 7.350882301554948,
"grad_norm": 0.3750855624675751,
"learning_rate": 0.0005120920477716282,
"loss": 3.436,
"step": 25250
},
{
"epoch": 7.3654417331547375,
"grad_norm": 0.31662774085998535,
"learning_rate": 0.000511917273521701,
"loss": 3.4372,
"step": 25300
},
{
"epoch": 7.380001164754528,
"grad_norm": 0.3318006694316864,
"learning_rate": 0.0005117424992717739,
"loss": 3.4377,
"step": 25350
},
{
"epoch": 7.394560596354318,
"grad_norm": 0.3489433526992798,
"learning_rate": 0.0005115677250218467,
"loss": 3.4364,
"step": 25400
},
{
"epoch": 7.409120027954109,
"grad_norm": 0.3378850817680359,
"learning_rate": 0.0005113929507719196,
"loss": 3.4392,
"step": 25450
},
{
"epoch": 7.423679459553899,
"grad_norm": 0.3490906357765198,
"learning_rate": 0.0005112181765219924,
"loss": 3.4404,
"step": 25500
},
{
"epoch": 7.43823889115369,
"grad_norm": 0.33684709668159485,
"learning_rate": 0.0005110434022720651,
"loss": 3.4316,
"step": 25550
},
{
"epoch": 7.4527983227534795,
"grad_norm": 0.3533405363559723,
"learning_rate": 0.000510868628022138,
"loss": 3.4519,
"step": 25600
},
{
"epoch": 7.46735775435327,
"grad_norm": 0.364666610956192,
"learning_rate": 0.0005106938537722109,
"loss": 3.4428,
"step": 25650
},
{
"epoch": 7.48191718595306,
"grad_norm": 0.3563931882381439,
"learning_rate": 0.0005105190795222837,
"loss": 3.442,
"step": 25700
},
{
"epoch": 7.496476617552851,
"grad_norm": 0.35002008080482483,
"learning_rate": 0.0005103443052723565,
"loss": 3.4379,
"step": 25750
},
{
"epoch": 7.511036049152641,
"grad_norm": 0.3543298542499542,
"learning_rate": 0.0005101695310224294,
"loss": 3.457,
"step": 25800
},
{
"epoch": 7.525595480752432,
"grad_norm": 0.33176884055137634,
"learning_rate": 0.0005099947567725021,
"loss": 3.4399,
"step": 25850
},
{
"epoch": 7.540154912352222,
"grad_norm": 0.34475451707839966,
"learning_rate": 0.000509819982522575,
"loss": 3.4443,
"step": 25900
},
{
"epoch": 7.554714343952012,
"grad_norm": 0.33004602789878845,
"learning_rate": 0.0005096452082726478,
"loss": 3.4584,
"step": 25950
},
{
"epoch": 7.569273775551802,
"grad_norm": 0.3163653016090393,
"learning_rate": 0.0005094704340227206,
"loss": 3.45,
"step": 26000
},
{
"epoch": 7.569273775551802,
"eval_accuracy": 0.36467333515745,
"eval_loss": 3.5947012901306152,
"eval_runtime": 180.7981,
"eval_samples_per_second": 92.053,
"eval_steps_per_second": 5.758,
"step": 26000
},
{
"epoch": 7.583833207151593,
"grad_norm": 0.3281993567943573,
"learning_rate": 0.0005092956597727935,
"loss": 3.4512,
"step": 26050
},
{
"epoch": 7.598392638751383,
"grad_norm": 0.31753283739089966,
"learning_rate": 0.0005091208855228662,
"loss": 3.4646,
"step": 26100
},
{
"epoch": 7.612952070351174,
"grad_norm": 0.3362863063812256,
"learning_rate": 0.0005089461112729391,
"loss": 3.4544,
"step": 26150
},
{
"epoch": 7.627511501950964,
"grad_norm": 0.34793269634246826,
"learning_rate": 0.0005087713370230119,
"loss": 3.4481,
"step": 26200
},
{
"epoch": 7.642070933550754,
"grad_norm": 0.3432117700576782,
"learning_rate": 0.0005085965627730847,
"loss": 3.4591,
"step": 26250
},
{
"epoch": 7.656630365150544,
"grad_norm": 0.3630698323249817,
"learning_rate": 0.0005084217885231576,
"loss": 3.4618,
"step": 26300
},
{
"epoch": 7.671189796750335,
"grad_norm": 0.3361819088459015,
"learning_rate": 0.0005082470142732304,
"loss": 3.4646,
"step": 26350
},
{
"epoch": 7.685749228350125,
"grad_norm": 0.3673403263092041,
"learning_rate": 0.0005080722400233032,
"loss": 3.4539,
"step": 26400
},
{
"epoch": 7.700308659949916,
"grad_norm": 0.33987388014793396,
"learning_rate": 0.000507897465773376,
"loss": 3.4693,
"step": 26450
},
{
"epoch": 7.714868091549706,
"grad_norm": 0.32190704345703125,
"learning_rate": 0.0005077226915234488,
"loss": 3.4468,
"step": 26500
},
{
"epoch": 7.729427523149496,
"grad_norm": 0.3864888846874237,
"learning_rate": 0.0005075479172735217,
"loss": 3.4556,
"step": 26550
},
{
"epoch": 7.743986954749286,
"grad_norm": 0.3400271534919739,
"learning_rate": 0.0005073731430235945,
"loss": 3.4587,
"step": 26600
},
{
"epoch": 7.758546386349077,
"grad_norm": 0.3375173509120941,
"learning_rate": 0.0005071983687736674,
"loss": 3.4628,
"step": 26650
},
{
"epoch": 7.773105817948867,
"grad_norm": 0.3561650216579437,
"learning_rate": 0.0005070235945237401,
"loss": 3.4572,
"step": 26700
},
{
"epoch": 7.787665249548658,
"grad_norm": 0.3330904543399811,
"learning_rate": 0.0005068488202738129,
"loss": 3.4615,
"step": 26750
},
{
"epoch": 7.802224681148448,
"grad_norm": 0.3155699074268341,
"learning_rate": 0.0005066740460238858,
"loss": 3.4407,
"step": 26800
},
{
"epoch": 7.8167841127482385,
"grad_norm": 0.3466147780418396,
"learning_rate": 0.0005064992717739586,
"loss": 3.457,
"step": 26850
},
{
"epoch": 7.831343544348028,
"grad_norm": 0.3634095788002014,
"learning_rate": 0.0005063244975240315,
"loss": 3.4655,
"step": 26900
},
{
"epoch": 7.845902975947819,
"grad_norm": 0.3383113741874695,
"learning_rate": 0.0005061497232741042,
"loss": 3.4613,
"step": 26950
},
{
"epoch": 7.860462407547609,
"grad_norm": 0.33195146918296814,
"learning_rate": 0.000505974949024177,
"loss": 3.4628,
"step": 27000
},
{
"epoch": 7.860462407547609,
"eval_accuracy": 0.3657741546812521,
"eval_loss": 3.5823051929473877,
"eval_runtime": 183.1613,
"eval_samples_per_second": 90.865,
"eval_steps_per_second": 5.684,
"step": 27000
},
{
"epoch": 7.8750218391474,
"grad_norm": 0.35151028633117676,
"learning_rate": 0.0005058001747742499,
"loss": 3.4647,
"step": 27050
},
{
"epoch": 7.88958127074719,
"grad_norm": 0.35772132873535156,
"learning_rate": 0.0005056254005243227,
"loss": 3.4699,
"step": 27100
},
{
"epoch": 7.9041407023469805,
"grad_norm": 0.3402451276779175,
"learning_rate": 0.0005054506262743955,
"loss": 3.4738,
"step": 27150
},
{
"epoch": 7.91870013394677,
"grad_norm": 0.33174848556518555,
"learning_rate": 0.0005052758520244684,
"loss": 3.4582,
"step": 27200
},
{
"epoch": 7.933259565546561,
"grad_norm": 0.33006104826927185,
"learning_rate": 0.0005051010777745411,
"loss": 3.4596,
"step": 27250
},
{
"epoch": 7.947818997146351,
"grad_norm": 0.347843199968338,
"learning_rate": 0.000504926303524614,
"loss": 3.4695,
"step": 27300
},
{
"epoch": 7.962378428746142,
"grad_norm": 0.32010769844055176,
"learning_rate": 0.0005047515292746868,
"loss": 3.4707,
"step": 27350
},
{
"epoch": 7.976937860345932,
"grad_norm": 0.3584131896495819,
"learning_rate": 0.0005045767550247596,
"loss": 3.4584,
"step": 27400
},
{
"epoch": 7.991497291945723,
"grad_norm": 0.3257739543914795,
"learning_rate": 0.0005044019807748325,
"loss": 3.4689,
"step": 27450
},
{
"epoch": 8.005823772639916,
"grad_norm": 0.33740708231925964,
"learning_rate": 0.0005042272065249052,
"loss": 3.4213,
"step": 27500
},
{
"epoch": 8.020383204239707,
"grad_norm": 0.33311763405799866,
"learning_rate": 0.0005040524322749781,
"loss": 3.3665,
"step": 27550
},
{
"epoch": 8.034942635839498,
"grad_norm": 0.32844987511634827,
"learning_rate": 0.0005038776580250509,
"loss": 3.3403,
"step": 27600
},
{
"epoch": 8.049502067439287,
"grad_norm": 0.33761027455329895,
"learning_rate": 0.0005037028837751237,
"loss": 3.3569,
"step": 27650
},
{
"epoch": 8.064061499039077,
"grad_norm": 0.35406097769737244,
"learning_rate": 0.0005035281095251966,
"loss": 3.3567,
"step": 27700
},
{
"epoch": 8.078620930638868,
"grad_norm": 0.38495901226997375,
"learning_rate": 0.0005033533352752694,
"loss": 3.3788,
"step": 27750
},
{
"epoch": 8.093180362238659,
"grad_norm": 0.331709086894989,
"learning_rate": 0.0005031785610253422,
"loss": 3.3591,
"step": 27800
},
{
"epoch": 8.107739793838448,
"grad_norm": 0.3502473533153534,
"learning_rate": 0.000503003786775415,
"loss": 3.3805,
"step": 27850
},
{
"epoch": 8.122299225438239,
"grad_norm": 0.3384426534175873,
"learning_rate": 0.0005028290125254878,
"loss": 3.3839,
"step": 27900
},
{
"epoch": 8.13685865703803,
"grad_norm": 0.36867547035217285,
"learning_rate": 0.0005026542382755607,
"loss": 3.3739,
"step": 27950
},
{
"epoch": 8.15141808863782,
"grad_norm": 0.342602014541626,
"learning_rate": 0.0005024794640256335,
"loss": 3.3902,
"step": 28000
},
{
"epoch": 8.15141808863782,
"eval_accuracy": 0.3654784314274215,
"eval_loss": 3.5930871963500977,
"eval_runtime": 181.7974,
"eval_samples_per_second": 91.547,
"eval_steps_per_second": 5.726,
"step": 28000
},
{
"epoch": 8.16597752023761,
"grad_norm": 0.34287703037261963,
"learning_rate": 0.0005023046897757064,
"loss": 3.3962,
"step": 28050
},
{
"epoch": 8.1805369518374,
"grad_norm": 0.3469769060611725,
"learning_rate": 0.0005021299155257791,
"loss": 3.3802,
"step": 28100
},
{
"epoch": 8.19509638343719,
"grad_norm": 0.33318281173706055,
"learning_rate": 0.000501955141275852,
"loss": 3.3841,
"step": 28150
},
{
"epoch": 8.209655815036982,
"grad_norm": 0.3634045720100403,
"learning_rate": 0.0005017803670259248,
"loss": 3.3948,
"step": 28200
},
{
"epoch": 8.22421524663677,
"grad_norm": 0.33881765604019165,
"learning_rate": 0.0005016055927759977,
"loss": 3.3884,
"step": 28250
},
{
"epoch": 8.238774678236561,
"grad_norm": 0.3295370638370514,
"learning_rate": 0.0005014308185260705,
"loss": 3.3949,
"step": 28300
},
{
"epoch": 8.253334109836352,
"grad_norm": 0.32435914874076843,
"learning_rate": 0.0005012560442761432,
"loss": 3.3864,
"step": 28350
},
{
"epoch": 8.267893541436143,
"grad_norm": 0.3608424663543701,
"learning_rate": 0.0005010812700262161,
"loss": 3.3904,
"step": 28400
},
{
"epoch": 8.282452973035932,
"grad_norm": 0.34927839040756226,
"learning_rate": 0.0005009064957762889,
"loss": 3.4077,
"step": 28450
},
{
"epoch": 8.297012404635723,
"grad_norm": 0.37262028455734253,
"learning_rate": 0.0005007317215263618,
"loss": 3.3987,
"step": 28500
},
{
"epoch": 8.311571836235514,
"grad_norm": 0.335907906293869,
"learning_rate": 0.0005005569472764346,
"loss": 3.4031,
"step": 28550
},
{
"epoch": 8.326131267835304,
"grad_norm": 0.32725778222084045,
"learning_rate": 0.0005003821730265074,
"loss": 3.4097,
"step": 28600
},
{
"epoch": 8.340690699435093,
"grad_norm": 0.34939050674438477,
"learning_rate": 0.0005002073987765802,
"loss": 3.4022,
"step": 28650
},
{
"epoch": 8.355250131034884,
"grad_norm": 0.329519659280777,
"learning_rate": 0.000500032624526653,
"loss": 3.395,
"step": 28700
},
{
"epoch": 8.369809562634675,
"grad_norm": 0.342352032661438,
"learning_rate": 0.0004998578502767259,
"loss": 3.4104,
"step": 28750
},
{
"epoch": 8.384368994234466,
"grad_norm": 0.33699142932891846,
"learning_rate": 0.0004996830760267987,
"loss": 3.4088,
"step": 28800
},
{
"epoch": 8.398928425834255,
"grad_norm": 0.3412262797355652,
"learning_rate": 0.0004995083017768715,
"loss": 3.4005,
"step": 28850
},
{
"epoch": 8.413487857434045,
"grad_norm": 0.3386961817741394,
"learning_rate": 0.0004993335275269444,
"loss": 3.4233,
"step": 28900
},
{
"epoch": 8.428047289033836,
"grad_norm": 0.3285767138004303,
"learning_rate": 0.0004991587532770171,
"loss": 3.4173,
"step": 28950
},
{
"epoch": 8.442606720633627,
"grad_norm": 0.36489197611808777,
"learning_rate": 0.00049898397902709,
"loss": 3.4052,
"step": 29000
},
{
"epoch": 8.442606720633627,
"eval_accuracy": 0.3661564196109552,
"eval_loss": 3.5855534076690674,
"eval_runtime": 181.6388,
"eval_samples_per_second": 91.627,
"eval_steps_per_second": 5.731,
"step": 29000
},
{
"epoch": 8.457166152233416,
"grad_norm": 0.33611413836479187,
"learning_rate": 0.0004988092047771628,
"loss": 3.4126,
"step": 29050
},
{
"epoch": 8.471725583833207,
"grad_norm": 0.353683203458786,
"learning_rate": 0.0004986344305272356,
"loss": 3.411,
"step": 29100
},
{
"epoch": 8.486285015432998,
"grad_norm": 0.3237438201904297,
"learning_rate": 0.0004984596562773085,
"loss": 3.4116,
"step": 29150
},
{
"epoch": 8.500844447032788,
"grad_norm": 0.3344637155532837,
"learning_rate": 0.0004982848820273812,
"loss": 3.4252,
"step": 29200
},
{
"epoch": 8.515403878632577,
"grad_norm": 0.36675405502319336,
"learning_rate": 0.0004981101077774541,
"loss": 3.4157,
"step": 29250
},
{
"epoch": 8.529963310232368,
"grad_norm": 0.3269510269165039,
"learning_rate": 0.0004979353335275269,
"loss": 3.4108,
"step": 29300
},
{
"epoch": 8.544522741832159,
"grad_norm": 0.32414621114730835,
"learning_rate": 0.0004977605592775997,
"loss": 3.4189,
"step": 29350
},
{
"epoch": 8.55908217343195,
"grad_norm": 0.34106162190437317,
"learning_rate": 0.0004975857850276726,
"loss": 3.413,
"step": 29400
},
{
"epoch": 8.573641605031739,
"grad_norm": 0.3581826388835907,
"learning_rate": 0.0004974110107777454,
"loss": 3.4321,
"step": 29450
},
{
"epoch": 8.58820103663153,
"grad_norm": 0.34445253014564514,
"learning_rate": 0.0004972362365278182,
"loss": 3.4358,
"step": 29500
},
{
"epoch": 8.60276046823132,
"grad_norm": 0.3163345158100128,
"learning_rate": 0.000497061462277891,
"loss": 3.4149,
"step": 29550
},
{
"epoch": 8.617319899831111,
"grad_norm": 0.3480691909790039,
"learning_rate": 0.0004968866880279638,
"loss": 3.4231,
"step": 29600
},
{
"epoch": 8.6318793314309,
"grad_norm": 0.35285916924476624,
"learning_rate": 0.0004967119137780367,
"loss": 3.438,
"step": 29650
},
{
"epoch": 8.646438763030691,
"grad_norm": 0.3620506823062897,
"learning_rate": 0.0004965371395281095,
"loss": 3.428,
"step": 29700
},
{
"epoch": 8.660998194630482,
"grad_norm": 0.3322892189025879,
"learning_rate": 0.0004963623652781822,
"loss": 3.4191,
"step": 29750
},
{
"epoch": 8.675557626230272,
"grad_norm": 0.3197033405303955,
"learning_rate": 0.0004961875910282551,
"loss": 3.4174,
"step": 29800
},
{
"epoch": 8.690117057830061,
"grad_norm": 0.34963804483413696,
"learning_rate": 0.0004960128167783279,
"loss": 3.4357,
"step": 29850
},
{
"epoch": 8.704676489429852,
"grad_norm": 0.3247370421886444,
"learning_rate": 0.0004958380425284008,
"loss": 3.421,
"step": 29900
},
{
"epoch": 8.719235921029643,
"grad_norm": 0.32233262062072754,
"learning_rate": 0.0004956632682784736,
"loss": 3.4173,
"step": 29950
},
{
"epoch": 8.733795352629434,
"grad_norm": 0.36941850185394287,
"learning_rate": 0.0004954884940285464,
"loss": 3.4335,
"step": 30000
},
{
"epoch": 8.733795352629434,
"eval_accuracy": 0.36665227049024096,
"eval_loss": 3.578705310821533,
"eval_runtime": 182.0872,
"eval_samples_per_second": 91.401,
"eval_steps_per_second": 5.717,
"step": 30000
},
{
"epoch": 8.748354784229225,
"grad_norm": 0.3275495767593384,
"learning_rate": 0.0004953137197786192,
"loss": 3.4322,
"step": 30050
},
{
"epoch": 8.762914215829014,
"grad_norm": 0.3445492684841156,
"learning_rate": 0.000495138945528692,
"loss": 3.4295,
"step": 30100
},
{
"epoch": 8.777473647428804,
"grad_norm": 0.364786297082901,
"learning_rate": 0.0004949641712787649,
"loss": 3.4453,
"step": 30150
},
{
"epoch": 8.792033079028595,
"grad_norm": 0.3113223612308502,
"learning_rate": 0.0004947893970288377,
"loss": 3.4291,
"step": 30200
},
{
"epoch": 8.806592510628384,
"grad_norm": 0.3300077021121979,
"learning_rate": 0.0004946146227789105,
"loss": 3.4139,
"step": 30250
},
{
"epoch": 8.821151942228175,
"grad_norm": 0.3236207067966461,
"learning_rate": 0.0004944398485289834,
"loss": 3.4379,
"step": 30300
},
{
"epoch": 8.835711373827966,
"grad_norm": 0.34814879298210144,
"learning_rate": 0.0004942650742790561,
"loss": 3.4444,
"step": 30350
},
{
"epoch": 8.850270805427757,
"grad_norm": 0.35743340849876404,
"learning_rate": 0.000494090300029129,
"loss": 3.4288,
"step": 30400
},
{
"epoch": 8.864830237027547,
"grad_norm": 0.3275567889213562,
"learning_rate": 0.0004939155257792018,
"loss": 3.4436,
"step": 30450
},
{
"epoch": 8.879389668627336,
"grad_norm": 0.3158799707889557,
"learning_rate": 0.0004937407515292746,
"loss": 3.4395,
"step": 30500
},
{
"epoch": 8.893949100227127,
"grad_norm": 0.33471766114234924,
"learning_rate": 0.0004935659772793475,
"loss": 3.4325,
"step": 30550
},
{
"epoch": 8.908508531826918,
"grad_norm": 0.34228864312171936,
"learning_rate": 0.0004933912030294202,
"loss": 3.427,
"step": 30600
},
{
"epoch": 8.923067963426707,
"grad_norm": 0.3427802324295044,
"learning_rate": 0.0004932164287794931,
"loss": 3.4251,
"step": 30650
},
{
"epoch": 8.937627395026498,
"grad_norm": 0.35321247577667236,
"learning_rate": 0.000493041654529566,
"loss": 3.4431,
"step": 30700
},
{
"epoch": 8.952186826626289,
"grad_norm": 0.3708088994026184,
"learning_rate": 0.0004928668802796388,
"loss": 3.4538,
"step": 30750
},
{
"epoch": 8.96674625822608,
"grad_norm": 0.34218692779541016,
"learning_rate": 0.0004926921060297116,
"loss": 3.4313,
"step": 30800
},
{
"epoch": 8.98130568982587,
"grad_norm": 0.35662880539894104,
"learning_rate": 0.0004925173317797845,
"loss": 3.4317,
"step": 30850
},
{
"epoch": 8.995865121425659,
"grad_norm": 0.3506118655204773,
"learning_rate": 0.0004923425575298572,
"loss": 3.4502,
"step": 30900
},
{
"epoch": 9.010191602119853,
"grad_norm": 0.34358587861061096,
"learning_rate": 0.0004921677832799301,
"loss": 3.3637,
"step": 30950
},
{
"epoch": 9.024751033719644,
"grad_norm": 0.3556085526943207,
"learning_rate": 0.0004919930090300029,
"loss": 3.3189,
"step": 31000
},
{
"epoch": 9.024751033719644,
"eval_accuracy": 0.36717398980524946,
"eval_loss": 3.5799267292022705,
"eval_runtime": 181.8055,
"eval_samples_per_second": 91.543,
"eval_steps_per_second": 5.726,
"step": 31000
},
{
"epoch": 9.039310465319433,
"grad_norm": 0.34578433632850647,
"learning_rate": 0.0004918182347800757,
"loss": 3.3231,
"step": 31050
},
{
"epoch": 9.053869896919224,
"grad_norm": 0.3525102138519287,
"learning_rate": 0.0004916434605301486,
"loss": 3.3193,
"step": 31100
},
{
"epoch": 9.068429328519015,
"grad_norm": 0.3447619080543518,
"learning_rate": 0.0004914686862802213,
"loss": 3.3293,
"step": 31150
},
{
"epoch": 9.082988760118806,
"grad_norm": 0.316193550825119,
"learning_rate": 0.0004912939120302941,
"loss": 3.35,
"step": 31200
},
{
"epoch": 9.097548191718595,
"grad_norm": 0.3357117772102356,
"learning_rate": 0.000491119137780367,
"loss": 3.3503,
"step": 31250
},
{
"epoch": 9.112107623318385,
"grad_norm": 0.3565595746040344,
"learning_rate": 0.0004909443635304398,
"loss": 3.3394,
"step": 31300
},
{
"epoch": 9.126667054918176,
"grad_norm": 0.35598695278167725,
"learning_rate": 0.0004907695892805127,
"loss": 3.3571,
"step": 31350
},
{
"epoch": 9.141226486517967,
"grad_norm": 0.3496910035610199,
"learning_rate": 0.0004905948150305855,
"loss": 3.354,
"step": 31400
},
{
"epoch": 9.155785918117756,
"grad_norm": 0.34782034158706665,
"learning_rate": 0.0004904200407806582,
"loss": 3.3431,
"step": 31450
},
{
"epoch": 9.170345349717547,
"grad_norm": 0.34046244621276855,
"learning_rate": 0.0004902452665307311,
"loss": 3.3657,
"step": 31500
},
{
"epoch": 9.184904781317337,
"grad_norm": 0.37150949239730835,
"learning_rate": 0.0004900704922808039,
"loss": 3.3665,
"step": 31550
},
{
"epoch": 9.199464212917128,
"grad_norm": 0.36348044872283936,
"learning_rate": 0.0004898957180308768,
"loss": 3.3567,
"step": 31600
},
{
"epoch": 9.214023644516917,
"grad_norm": 0.3551836311817169,
"learning_rate": 0.0004897209437809496,
"loss": 3.3674,
"step": 31650
},
{
"epoch": 9.228583076116708,
"grad_norm": 0.3500552475452423,
"learning_rate": 0.0004895461695310223,
"loss": 3.3814,
"step": 31700
},
{
"epoch": 9.243142507716499,
"grad_norm": 0.3479650318622589,
"learning_rate": 0.0004893713952810952,
"loss": 3.3613,
"step": 31750
},
{
"epoch": 9.25770193931629,
"grad_norm": 0.3503901958465576,
"learning_rate": 0.000489196621031168,
"loss": 3.3602,
"step": 31800
},
{
"epoch": 9.272261370916079,
"grad_norm": 0.33610227704048157,
"learning_rate": 0.0004890218467812409,
"loss": 3.3631,
"step": 31850
},
{
"epoch": 9.28682080251587,
"grad_norm": 0.3341948091983795,
"learning_rate": 0.0004888470725313137,
"loss": 3.3609,
"step": 31900
},
{
"epoch": 9.30138023411566,
"grad_norm": 0.3447319567203522,
"learning_rate": 0.0004886722982813865,
"loss": 3.3727,
"step": 31950
},
{
"epoch": 9.315939665715451,
"grad_norm": 0.32863977551460266,
"learning_rate": 0.0004884975240314593,
"loss": 3.3782,
"step": 32000
},
{
"epoch": 9.315939665715451,
"eval_accuracy": 0.366884968827947,
"eval_loss": 3.581573724746704,
"eval_runtime": 182.0337,
"eval_samples_per_second": 91.428,
"eval_steps_per_second": 5.719,
"step": 32000
},
{
"epoch": 9.33049909731524,
"grad_norm": 0.3508942127227783,
"learning_rate": 0.0004883227497815321,
"loss": 3.3778,
"step": 32050
},
{
"epoch": 9.34505852891503,
"grad_norm": 0.3674251437187195,
"learning_rate": 0.00048814797553160496,
"loss": 3.3807,
"step": 32100
},
{
"epoch": 9.359617960514822,
"grad_norm": 0.3387126922607422,
"learning_rate": 0.0004879732012816778,
"loss": 3.3823,
"step": 32150
},
{
"epoch": 9.374177392114612,
"grad_norm": 0.3542914390563965,
"learning_rate": 0.0004877984270317506,
"loss": 3.398,
"step": 32200
},
{
"epoch": 9.388736823714403,
"grad_norm": 0.354044109582901,
"learning_rate": 0.0004876236527818234,
"loss": 3.3764,
"step": 32250
},
{
"epoch": 9.403296255314192,
"grad_norm": 0.3662169575691223,
"learning_rate": 0.00048744887853189624,
"loss": 3.3919,
"step": 32300
},
{
"epoch": 9.417855686913983,
"grad_norm": 0.33728882670402527,
"learning_rate": 0.00048727410428196907,
"loss": 3.383,
"step": 32350
},
{
"epoch": 9.432415118513774,
"grad_norm": 0.32222864031791687,
"learning_rate": 0.0004870993300320419,
"loss": 3.3877,
"step": 32400
},
{
"epoch": 9.446974550113563,
"grad_norm": 0.3222348988056183,
"learning_rate": 0.00048692455578211474,
"loss": 3.3822,
"step": 32450
},
{
"epoch": 9.461533981713353,
"grad_norm": 0.3391883671283722,
"learning_rate": 0.0004867497815321875,
"loss": 3.3887,
"step": 32500
},
{
"epoch": 9.476093413313144,
"grad_norm": 0.3517501652240753,
"learning_rate": 0.00048657500728226035,
"loss": 3.3825,
"step": 32550
},
{
"epoch": 9.490652844912935,
"grad_norm": 0.3315829932689667,
"learning_rate": 0.0004864002330323332,
"loss": 3.3849,
"step": 32600
},
{
"epoch": 9.505212276512726,
"grad_norm": 0.33583584427833557,
"learning_rate": 0.000486225458782406,
"loss": 3.3938,
"step": 32650
},
{
"epoch": 9.519771708112515,
"grad_norm": 0.3496243357658386,
"learning_rate": 0.0004860506845324788,
"loss": 3.3901,
"step": 32700
},
{
"epoch": 9.534331139712306,
"grad_norm": 0.34915950894355774,
"learning_rate": 0.0004858759102825516,
"loss": 3.402,
"step": 32750
},
{
"epoch": 9.548890571312096,
"grad_norm": 0.3658216893672943,
"learning_rate": 0.00048570113603262446,
"loss": 3.391,
"step": 32800
},
{
"epoch": 9.563450002911885,
"grad_norm": 0.3504136800765991,
"learning_rate": 0.0004855263617826973,
"loss": 3.3906,
"step": 32850
},
{
"epoch": 9.578009434511676,
"grad_norm": 0.33254560828208923,
"learning_rate": 0.0004853515875327701,
"loss": 3.4056,
"step": 32900
},
{
"epoch": 9.592568866111467,
"grad_norm": 0.34906646609306335,
"learning_rate": 0.0004851768132828429,
"loss": 3.4075,
"step": 32950
},
{
"epoch": 9.607128297711258,
"grad_norm": 0.34559518098831177,
"learning_rate": 0.00048500203903291574,
"loss": 3.4026,
"step": 33000
},
{
"epoch": 9.607128297711258,
"eval_accuracy": 0.36745666125742,
"eval_loss": 3.5726640224456787,
"eval_runtime": 181.7757,
"eval_samples_per_second": 91.558,
"eval_steps_per_second": 5.727,
"step": 33000
},
{
"epoch": 9.621687729311049,
"grad_norm": 0.3735829293727875,
"learning_rate": 0.00048482726478298857,
"loss": 3.4065,
"step": 33050
},
{
"epoch": 9.636247160910838,
"grad_norm": 0.3518868684768677,
"learning_rate": 0.0004846524905330614,
"loss": 3.4036,
"step": 33100
},
{
"epoch": 9.650806592510628,
"grad_norm": 0.3787810802459717,
"learning_rate": 0.00048447771628313424,
"loss": 3.4012,
"step": 33150
},
{
"epoch": 9.66536602411042,
"grad_norm": 0.36960500478744507,
"learning_rate": 0.0004843029420332071,
"loss": 3.408,
"step": 33200
},
{
"epoch": 9.67992545571021,
"grad_norm": 0.34325626492500305,
"learning_rate": 0.0004841281677832799,
"loss": 3.4017,
"step": 33250
},
{
"epoch": 9.694484887309999,
"grad_norm": 0.3455840051174164,
"learning_rate": 0.00048395339353335273,
"loss": 3.4139,
"step": 33300
},
{
"epoch": 9.70904431890979,
"grad_norm": 0.35434481501579285,
"learning_rate": 0.00048377861928342557,
"loss": 3.3996,
"step": 33350
},
{
"epoch": 9.72360375050958,
"grad_norm": 0.33681508898735046,
"learning_rate": 0.0004836038450334984,
"loss": 3.4125,
"step": 33400
},
{
"epoch": 9.738163182109371,
"grad_norm": 0.35238656401634216,
"learning_rate": 0.0004834290707835712,
"loss": 3.4157,
"step": 33450
},
{
"epoch": 9.75272261370916,
"grad_norm": 0.37718260288238525,
"learning_rate": 0.000483254296533644,
"loss": 3.4033,
"step": 33500
},
{
"epoch": 9.767282045308951,
"grad_norm": 0.3434363901615143,
"learning_rate": 0.00048307952228371685,
"loss": 3.4143,
"step": 33550
},
{
"epoch": 9.781841476908742,
"grad_norm": 0.34627440571784973,
"learning_rate": 0.0004829047480337897,
"loss": 3.4043,
"step": 33600
},
{
"epoch": 9.796400908508533,
"grad_norm": 0.33534497022628784,
"learning_rate": 0.0004827299737838625,
"loss": 3.4029,
"step": 33650
},
{
"epoch": 9.810960340108322,
"grad_norm": 0.3508129417896271,
"learning_rate": 0.0004825551995339353,
"loss": 3.406,
"step": 33700
},
{
"epoch": 9.825519771708112,
"grad_norm": 0.34650343656539917,
"learning_rate": 0.0004823804252840081,
"loss": 3.404,
"step": 33750
},
{
"epoch": 9.840079203307903,
"grad_norm": 0.33442333340644836,
"learning_rate": 0.00048220565103408096,
"loss": 3.4015,
"step": 33800
},
{
"epoch": 9.854638634907694,
"grad_norm": 0.3506050407886505,
"learning_rate": 0.0004820308767841538,
"loss": 3.4156,
"step": 33850
},
{
"epoch": 9.869198066507483,
"grad_norm": 0.341828316450119,
"learning_rate": 0.0004818561025342266,
"loss": 3.4171,
"step": 33900
},
{
"epoch": 9.883757498107274,
"grad_norm": 0.3377910554409027,
"learning_rate": 0.0004816813282842994,
"loss": 3.4102,
"step": 33950
},
{
"epoch": 9.898316929707065,
"grad_norm": 0.35400837659835815,
"learning_rate": 0.00048150655403437223,
"loss": 3.4082,
"step": 34000
},
{
"epoch": 9.898316929707065,
"eval_accuracy": 0.36838721944064684,
"eval_loss": 3.5640623569488525,
"eval_runtime": 182.8947,
"eval_samples_per_second": 90.998,
"eval_steps_per_second": 5.692,
"step": 34000
},
{
"epoch": 9.912876361306855,
"grad_norm": 0.3472040593624115,
"learning_rate": 0.00048133177978444507,
"loss": 3.4179,
"step": 34050
},
{
"epoch": 9.927435792906644,
"grad_norm": 0.3496232032775879,
"learning_rate": 0.0004811570055345179,
"loss": 3.4113,
"step": 34100
},
{
"epoch": 9.941995224506435,
"grad_norm": 0.33684638142585754,
"learning_rate": 0.0004809822312845907,
"loss": 3.4137,
"step": 34150
},
{
"epoch": 9.956554656106226,
"grad_norm": 0.34335857629776,
"learning_rate": 0.0004808074570346635,
"loss": 3.4172,
"step": 34200
},
{
"epoch": 9.971114087706017,
"grad_norm": 0.34269091486930847,
"learning_rate": 0.00048063268278473634,
"loss": 3.4183,
"step": 34250
},
{
"epoch": 9.985673519305806,
"grad_norm": 0.3301508128643036,
"learning_rate": 0.0004804579085348092,
"loss": 3.4135,
"step": 34300
},
{
"epoch": 10.0,
"grad_norm": 0.8190501928329468,
"learning_rate": 0.000480283134284882,
"loss": 3.4084,
"step": 34350
},
{
"epoch": 10.01455943159979,
"grad_norm": 0.34881967306137085,
"learning_rate": 0.0004801083600349548,
"loss": 3.3121,
"step": 34400
},
{
"epoch": 10.029118863199582,
"grad_norm": 0.3504365086555481,
"learning_rate": 0.0004799335857850276,
"loss": 3.3012,
"step": 34450
},
{
"epoch": 10.04367829479937,
"grad_norm": 0.3723757565021515,
"learning_rate": 0.00047975881153510046,
"loss": 3.3123,
"step": 34500
},
{
"epoch": 10.058237726399161,
"grad_norm": 0.3652939200401306,
"learning_rate": 0.0004795840372851733,
"loss": 3.3082,
"step": 34550
},
{
"epoch": 10.072797157998952,
"grad_norm": 0.36539286375045776,
"learning_rate": 0.00047940926303524607,
"loss": 3.3053,
"step": 34600
},
{
"epoch": 10.087356589598743,
"grad_norm": 0.34552112221717834,
"learning_rate": 0.0004792344887853189,
"loss": 3.3203,
"step": 34650
},
{
"epoch": 10.101916021198532,
"grad_norm": 0.34289079904556274,
"learning_rate": 0.00047905971453539173,
"loss": 3.3335,
"step": 34700
},
{
"epoch": 10.116475452798323,
"grad_norm": 0.34614643454551697,
"learning_rate": 0.00047888494028546457,
"loss": 3.3293,
"step": 34750
},
{
"epoch": 10.131034884398114,
"grad_norm": 0.365692675113678,
"learning_rate": 0.0004787101660355374,
"loss": 3.3347,
"step": 34800
},
{
"epoch": 10.145594315997904,
"grad_norm": 0.3478696644306183,
"learning_rate": 0.0004785353917856102,
"loss": 3.3419,
"step": 34850
},
{
"epoch": 10.160153747597693,
"grad_norm": 0.345829576253891,
"learning_rate": 0.000478360617535683,
"loss": 3.3263,
"step": 34900
},
{
"epoch": 10.174713179197484,
"grad_norm": 0.4017032980918884,
"learning_rate": 0.00047818584328575584,
"loss": 3.3348,
"step": 34950
},
{
"epoch": 10.189272610797275,
"grad_norm": 0.34451884031295776,
"learning_rate": 0.0004780110690358287,
"loss": 3.3356,
"step": 35000
},
{
"epoch": 10.189272610797275,
"eval_accuracy": 0.36771605111744,
"eval_loss": 3.5778610706329346,
"eval_runtime": 183.2483,
"eval_samples_per_second": 90.822,
"eval_steps_per_second": 5.681,
"step": 35000
},
{
"epoch": 10.203832042397066,
"grad_norm": 0.35025554895401,
"learning_rate": 0.0004778362947859015,
"loss": 3.3442,
"step": 35050
},
{
"epoch": 10.218391473996855,
"grad_norm": 0.34518471360206604,
"learning_rate": 0.0004776615205359743,
"loss": 3.3374,
"step": 35100
},
{
"epoch": 10.232950905596645,
"grad_norm": 0.35896578431129456,
"learning_rate": 0.0004774867462860471,
"loss": 3.3453,
"step": 35150
},
{
"epoch": 10.247510337196436,
"grad_norm": 0.3396795094013214,
"learning_rate": 0.00047731197203611995,
"loss": 3.3457,
"step": 35200
},
{
"epoch": 10.262069768796227,
"grad_norm": 0.3721248209476471,
"learning_rate": 0.0004771371977861928,
"loss": 3.3458,
"step": 35250
},
{
"epoch": 10.276629200396016,
"grad_norm": 0.3700907230377197,
"learning_rate": 0.00047696242353626557,
"loss": 3.3381,
"step": 35300
},
{
"epoch": 10.291188631995807,
"grad_norm": 0.3764047622680664,
"learning_rate": 0.0004767876492863384,
"loss": 3.3418,
"step": 35350
},
{
"epoch": 10.305748063595598,
"grad_norm": 0.3617747724056244,
"learning_rate": 0.00047661287503641123,
"loss": 3.347,
"step": 35400
},
{
"epoch": 10.320307495195388,
"grad_norm": 0.34759700298309326,
"learning_rate": 0.00047643810078648407,
"loss": 3.3512,
"step": 35450
},
{
"epoch": 10.334866926795177,
"grad_norm": 0.35689282417297363,
"learning_rate": 0.0004762633265365569,
"loss": 3.3663,
"step": 35500
},
{
"epoch": 10.349426358394968,
"grad_norm": 0.32792720198631287,
"learning_rate": 0.0004760885522866297,
"loss": 3.3568,
"step": 35550
},
{
"epoch": 10.363985789994759,
"grad_norm": 0.3390996754169464,
"learning_rate": 0.0004759137780367025,
"loss": 3.3689,
"step": 35600
},
{
"epoch": 10.37854522159455,
"grad_norm": 0.35693955421447754,
"learning_rate": 0.00047573900378677534,
"loss": 3.3575,
"step": 35650
},
{
"epoch": 10.393104653194339,
"grad_norm": 0.3452168405056,
"learning_rate": 0.00047556422953684823,
"loss": 3.3642,
"step": 35700
},
{
"epoch": 10.40766408479413,
"grad_norm": 0.370328426361084,
"learning_rate": 0.00047538945528692106,
"loss": 3.3595,
"step": 35750
},
{
"epoch": 10.42222351639392,
"grad_norm": 0.37136757373809814,
"learning_rate": 0.0004752146810369939,
"loss": 3.346,
"step": 35800
},
{
"epoch": 10.436782947993711,
"grad_norm": 0.3773367702960968,
"learning_rate": 0.0004750399067870667,
"loss": 3.3645,
"step": 35850
},
{
"epoch": 10.4513423795935,
"grad_norm": 0.3447873592376709,
"learning_rate": 0.0004748651325371395,
"loss": 3.3598,
"step": 35900
},
{
"epoch": 10.46590181119329,
"grad_norm": 0.355688214302063,
"learning_rate": 0.00047469035828721234,
"loss": 3.3672,
"step": 35950
},
{
"epoch": 10.480461242793082,
"grad_norm": 0.3678136169910431,
"learning_rate": 0.0004745155840372852,
"loss": 3.3828,
"step": 36000
},
{
"epoch": 10.480461242793082,
"eval_accuracy": 0.36819038416155636,
"eval_loss": 3.568837881088257,
"eval_runtime": 183.926,
"eval_samples_per_second": 90.487,
"eval_steps_per_second": 5.66,
"step": 36000
},
{
"epoch": 10.495020674392872,
"grad_norm": 0.3659283220767975,
"learning_rate": 0.00047434080978735795,
"loss": 3.3647,
"step": 36050
},
{
"epoch": 10.509580105992661,
"grad_norm": 0.3798047602176666,
"learning_rate": 0.0004741660355374308,
"loss": 3.3631,
"step": 36100
},
{
"epoch": 10.524139537592452,
"grad_norm": 0.3466806411743164,
"learning_rate": 0.0004739912612875036,
"loss": 3.359,
"step": 36150
},
{
"epoch": 10.538698969192243,
"grad_norm": 0.35511037707328796,
"learning_rate": 0.00047381648703757645,
"loss": 3.3755,
"step": 36200
},
{
"epoch": 10.553258400792034,
"grad_norm": 0.3418614864349365,
"learning_rate": 0.0004736417127876493,
"loss": 3.3799,
"step": 36250
},
{
"epoch": 10.567817832391823,
"grad_norm": 0.38244953751564026,
"learning_rate": 0.00047346693853772206,
"loss": 3.382,
"step": 36300
},
{
"epoch": 10.582377263991614,
"grad_norm": 0.3323763310909271,
"learning_rate": 0.0004732921642877949,
"loss": 3.3828,
"step": 36350
},
{
"epoch": 10.596936695591404,
"grad_norm": 0.3437618315219879,
"learning_rate": 0.00047311739003786773,
"loss": 3.391,
"step": 36400
},
{
"epoch": 10.611496127191195,
"grad_norm": 0.36182549595832825,
"learning_rate": 0.00047294261578794056,
"loss": 3.3829,
"step": 36450
},
{
"epoch": 10.626055558790984,
"grad_norm": 0.38253724575042725,
"learning_rate": 0.0004727678415380134,
"loss": 3.3803,
"step": 36500
},
{
"epoch": 10.640614990390775,
"grad_norm": 0.36465519666671753,
"learning_rate": 0.0004725930672880862,
"loss": 3.3703,
"step": 36550
},
{
"epoch": 10.655174421990566,
"grad_norm": 0.3479657769203186,
"learning_rate": 0.000472418293038159,
"loss": 3.3709,
"step": 36600
},
{
"epoch": 10.669733853590357,
"grad_norm": 0.3454592227935791,
"learning_rate": 0.00047224351878823184,
"loss": 3.3876,
"step": 36650
},
{
"epoch": 10.684293285190146,
"grad_norm": 0.34455588459968567,
"learning_rate": 0.0004720687445383047,
"loss": 3.3788,
"step": 36700
},
{
"epoch": 10.698852716789936,
"grad_norm": 0.357598215341568,
"learning_rate": 0.00047189397028837745,
"loss": 3.3759,
"step": 36750
},
{
"epoch": 10.713412148389727,
"grad_norm": 0.36810582876205444,
"learning_rate": 0.0004717191960384503,
"loss": 3.3817,
"step": 36800
},
{
"epoch": 10.727971579989518,
"grad_norm": 0.37969326972961426,
"learning_rate": 0.0004715444217885231,
"loss": 3.3845,
"step": 36850
},
{
"epoch": 10.742531011589307,
"grad_norm": 0.362560898065567,
"learning_rate": 0.00047136964753859595,
"loss": 3.397,
"step": 36900
},
{
"epoch": 10.757090443189098,
"grad_norm": 0.36402398347854614,
"learning_rate": 0.0004711948732886688,
"loss": 3.3797,
"step": 36950
},
{
"epoch": 10.771649874788888,
"grad_norm": 0.3478822410106659,
"learning_rate": 0.00047102009903874156,
"loss": 3.3911,
"step": 37000
},
{
"epoch": 10.771649874788888,
"eval_accuracy": 0.368904117819907,
"eval_loss": 3.5626118183135986,
"eval_runtime": 183.8574,
"eval_samples_per_second": 90.521,
"eval_steps_per_second": 5.662,
"step": 37000
},
{
"epoch": 10.78620930638868,
"grad_norm": 0.34672781825065613,
"learning_rate": 0.0004708453247888144,
"loss": 3.3796,
"step": 37050
},
{
"epoch": 10.800768737988468,
"grad_norm": 0.35510483384132385,
"learning_rate": 0.00047067055053888723,
"loss": 3.3921,
"step": 37100
},
{
"epoch": 10.815328169588259,
"grad_norm": 0.3330132067203522,
"learning_rate": 0.00047049577628896006,
"loss": 3.3707,
"step": 37150
},
{
"epoch": 10.82988760118805,
"grad_norm": 0.35041606426239014,
"learning_rate": 0.0004703210020390329,
"loss": 3.3993,
"step": 37200
},
{
"epoch": 10.84444703278784,
"grad_norm": 0.34748944640159607,
"learning_rate": 0.0004701462277891057,
"loss": 3.3854,
"step": 37250
},
{
"epoch": 10.85900646438763,
"grad_norm": 0.3505236506462097,
"learning_rate": 0.0004699714535391785,
"loss": 3.3933,
"step": 37300
},
{
"epoch": 10.87356589598742,
"grad_norm": 0.3472146689891815,
"learning_rate": 0.00046979667928925134,
"loss": 3.3877,
"step": 37350
},
{
"epoch": 10.888125327587211,
"grad_norm": 0.33038902282714844,
"learning_rate": 0.0004696219050393242,
"loss": 3.3872,
"step": 37400
},
{
"epoch": 10.902684759187002,
"grad_norm": 0.33716917037963867,
"learning_rate": 0.00046944713078939695,
"loss": 3.3962,
"step": 37450
},
{
"epoch": 10.917244190786791,
"grad_norm": 0.3526748716831207,
"learning_rate": 0.0004692723565394698,
"loss": 3.3928,
"step": 37500
},
{
"epoch": 10.931803622386582,
"grad_norm": 0.36475178599357605,
"learning_rate": 0.0004690975822895426,
"loss": 3.3842,
"step": 37550
},
{
"epoch": 10.946363053986373,
"grad_norm": 0.36359477043151855,
"learning_rate": 0.00046892280803961545,
"loss": 3.401,
"step": 37600
},
{
"epoch": 10.960922485586163,
"grad_norm": 0.35189494490623474,
"learning_rate": 0.0004687480337896883,
"loss": 3.3937,
"step": 37650
},
{
"epoch": 10.975481917185952,
"grad_norm": 0.3400118350982666,
"learning_rate": 0.00046857325953976106,
"loss": 3.3934,
"step": 37700
},
{
"epoch": 10.990041348785743,
"grad_norm": 0.3473895490169525,
"learning_rate": 0.0004683984852898339,
"loss": 3.3902,
"step": 37750
},
{
"epoch": 11.004367829479937,
"grad_norm": 0.3693157732486725,
"learning_rate": 0.00046822371103990673,
"loss": 3.3584,
"step": 37800
},
{
"epoch": 11.018927261079728,
"grad_norm": 0.34884193539619446,
"learning_rate": 0.00046804893678997956,
"loss": 3.2712,
"step": 37850
},
{
"epoch": 11.033486692679517,
"grad_norm": 0.331039696931839,
"learning_rate": 0.00046787416254005234,
"loss": 3.2856,
"step": 37900
},
{
"epoch": 11.048046124279308,
"grad_norm": 0.34825077652931213,
"learning_rate": 0.0004676993882901252,
"loss": 3.2941,
"step": 37950
},
{
"epoch": 11.062605555879099,
"grad_norm": 0.3396894633769989,
"learning_rate": 0.000467524614040198,
"loss": 3.2861,
"step": 38000
},
{
"epoch": 11.062605555879099,
"eval_accuracy": 0.36855336534826616,
"eval_loss": 3.5719475746154785,
"eval_runtime": 180.4716,
"eval_samples_per_second": 92.219,
"eval_steps_per_second": 5.768,
"step": 38000
},
{
"epoch": 11.07716498747889,
"grad_norm": 0.41541653871536255,
"learning_rate": 0.00046734983979027084,
"loss": 3.2967,
"step": 38050
},
{
"epoch": 11.091724419078679,
"grad_norm": 0.34760013222694397,
"learning_rate": 0.00046717506554034367,
"loss": 3.2988,
"step": 38100
},
{
"epoch": 11.10628385067847,
"grad_norm": 0.3493053913116455,
"learning_rate": 0.00046700029129041645,
"loss": 3.2994,
"step": 38150
},
{
"epoch": 11.12084328227826,
"grad_norm": 0.35706987977027893,
"learning_rate": 0.0004668255170404893,
"loss": 3.315,
"step": 38200
},
{
"epoch": 11.135402713878051,
"grad_norm": 0.3363507390022278,
"learning_rate": 0.00046665074279056217,
"loss": 3.3071,
"step": 38250
},
{
"epoch": 11.14996214547784,
"grad_norm": 0.3618837296962738,
"learning_rate": 0.000466475968540635,
"loss": 3.3106,
"step": 38300
},
{
"epoch": 11.16452157707763,
"grad_norm": 0.33892515301704407,
"learning_rate": 0.00046630119429070784,
"loss": 3.3133,
"step": 38350
},
{
"epoch": 11.179081008677421,
"grad_norm": 0.33202266693115234,
"learning_rate": 0.00046612642004078067,
"loss": 3.3235,
"step": 38400
},
{
"epoch": 11.193640440277212,
"grad_norm": 0.3930901288986206,
"learning_rate": 0.00046595164579085345,
"loss": 3.3299,
"step": 38450
},
{
"epoch": 11.208199871877001,
"grad_norm": 0.4052780568599701,
"learning_rate": 0.0004657768715409263,
"loss": 3.3197,
"step": 38500
},
{
"epoch": 11.222759303476792,
"grad_norm": 0.3582177460193634,
"learning_rate": 0.0004656020972909991,
"loss": 3.3247,
"step": 38550
},
{
"epoch": 11.237318735076583,
"grad_norm": 0.3405052423477173,
"learning_rate": 0.00046542732304107195,
"loss": 3.3235,
"step": 38600
},
{
"epoch": 11.251878166676374,
"grad_norm": 0.32738906145095825,
"learning_rate": 0.0004652525487911447,
"loss": 3.3191,
"step": 38650
},
{
"epoch": 11.266437598276163,
"grad_norm": 0.36800041794776917,
"learning_rate": 0.00046507777454121756,
"loss": 3.3328,
"step": 38700
},
{
"epoch": 11.280997029875953,
"grad_norm": 0.37207457423210144,
"learning_rate": 0.0004649030002912904,
"loss": 3.3304,
"step": 38750
},
{
"epoch": 11.295556461475744,
"grad_norm": 0.36415359377861023,
"learning_rate": 0.0004647282260413632,
"loss": 3.3409,
"step": 38800
},
{
"epoch": 11.310115893075535,
"grad_norm": 0.3438774049282074,
"learning_rate": 0.00046455345179143606,
"loss": 3.3288,
"step": 38850
},
{
"epoch": 11.324675324675324,
"grad_norm": 0.3514200448989868,
"learning_rate": 0.00046437867754150884,
"loss": 3.331,
"step": 38900
},
{
"epoch": 11.339234756275115,
"grad_norm": 0.34444525837898254,
"learning_rate": 0.00046420390329158167,
"loss": 3.3253,
"step": 38950
},
{
"epoch": 11.353794187874906,
"grad_norm": 0.34927886724472046,
"learning_rate": 0.0004640291290416545,
"loss": 3.3361,
"step": 39000
},
{
"epoch": 11.353794187874906,
"eval_accuracy": 0.36882192674458786,
"eval_loss": 3.568171739578247,
"eval_runtime": 180.5055,
"eval_samples_per_second": 92.202,
"eval_steps_per_second": 5.767,
"step": 39000
},
{
"epoch": 11.368353619474696,
"grad_norm": 0.34765294194221497,
"learning_rate": 0.00046385435479172734,
"loss": 3.3385,
"step": 39050
},
{
"epoch": 11.382913051074485,
"grad_norm": 0.37567201256752014,
"learning_rate": 0.00046367958054180017,
"loss": 3.3372,
"step": 39100
},
{
"epoch": 11.397472482674276,
"grad_norm": 0.3298972547054291,
"learning_rate": 0.00046350480629187295,
"loss": 3.3449,
"step": 39150
},
{
"epoch": 11.412031914274067,
"grad_norm": 0.3385719656944275,
"learning_rate": 0.0004633300320419458,
"loss": 3.3381,
"step": 39200
},
{
"epoch": 11.426591345873858,
"grad_norm": 0.3834417164325714,
"learning_rate": 0.0004631552577920186,
"loss": 3.3453,
"step": 39250
},
{
"epoch": 11.441150777473647,
"grad_norm": 0.36645200848579407,
"learning_rate": 0.00046298048354209145,
"loss": 3.3585,
"step": 39300
},
{
"epoch": 11.455710209073438,
"grad_norm": 0.3596128523349762,
"learning_rate": 0.0004628057092921642,
"loss": 3.3505,
"step": 39350
},
{
"epoch": 11.470269640673228,
"grad_norm": 0.37306201457977295,
"learning_rate": 0.00046263093504223706,
"loss": 3.3489,
"step": 39400
},
{
"epoch": 11.484829072273019,
"grad_norm": 0.40729859471321106,
"learning_rate": 0.0004624561607923099,
"loss": 3.3555,
"step": 39450
},
{
"epoch": 11.499388503872808,
"grad_norm": 0.35100769996643066,
"learning_rate": 0.0004622813865423827,
"loss": 3.3443,
"step": 39500
},
{
"epoch": 11.513947935472599,
"grad_norm": 0.347989559173584,
"learning_rate": 0.00046210661229245556,
"loss": 3.3647,
"step": 39550
},
{
"epoch": 11.52850736707239,
"grad_norm": 0.35340970754623413,
"learning_rate": 0.00046193183804252834,
"loss": 3.346,
"step": 39600
},
{
"epoch": 11.54306679867218,
"grad_norm": 0.3439280092716217,
"learning_rate": 0.00046175706379260117,
"loss": 3.3613,
"step": 39650
},
{
"epoch": 11.55762623027197,
"grad_norm": 0.34520137310028076,
"learning_rate": 0.000461582289542674,
"loss": 3.347,
"step": 39700
},
{
"epoch": 11.57218566187176,
"grad_norm": 0.3320297598838806,
"learning_rate": 0.00046140751529274684,
"loss": 3.3489,
"step": 39750
},
{
"epoch": 11.586745093471551,
"grad_norm": 0.35040003061294556,
"learning_rate": 0.00046123274104281967,
"loss": 3.3462,
"step": 39800
},
{
"epoch": 11.601304525071342,
"grad_norm": 0.3691483438014984,
"learning_rate": 0.00046105796679289245,
"loss": 3.3593,
"step": 39850
},
{
"epoch": 11.61586395667113,
"grad_norm": 0.3896438777446747,
"learning_rate": 0.0004608831925429653,
"loss": 3.3616,
"step": 39900
},
{
"epoch": 11.630423388270922,
"grad_norm": 0.36567434668540955,
"learning_rate": 0.0004607084182930381,
"loss": 3.3553,
"step": 39950
},
{
"epoch": 11.644982819870712,
"grad_norm": 0.343128502368927,
"learning_rate": 0.00046053364404311095,
"loss": 3.361,
"step": 40000
},
{
"epoch": 11.644982819870712,
"eval_accuracy": 0.36932248097582326,
"eval_loss": 3.5595271587371826,
"eval_runtime": 180.4161,
"eval_samples_per_second": 92.248,
"eval_steps_per_second": 5.77,
"step": 40000
},
{
"epoch": 11.659542251470503,
"grad_norm": 0.35191434621810913,
"learning_rate": 0.0004603588697931837,
"loss": 3.3577,
"step": 40050
},
{
"epoch": 11.674101683070292,
"grad_norm": 0.36230576038360596,
"learning_rate": 0.00046018409554325656,
"loss": 3.357,
"step": 40100
},
{
"epoch": 11.688661114670083,
"grad_norm": 0.3622187077999115,
"learning_rate": 0.0004600093212933294,
"loss": 3.3582,
"step": 40150
},
{
"epoch": 11.703220546269874,
"grad_norm": 0.3318762183189392,
"learning_rate": 0.0004598345470434022,
"loss": 3.3514,
"step": 40200
},
{
"epoch": 11.717779977869665,
"grad_norm": 0.39714378118515015,
"learning_rate": 0.00045965977279347506,
"loss": 3.3712,
"step": 40250
},
{
"epoch": 11.732339409469454,
"grad_norm": 0.3500266969203949,
"learning_rate": 0.00045948499854354784,
"loss": 3.3497,
"step": 40300
},
{
"epoch": 11.746898841069244,
"grad_norm": 0.3604873716831207,
"learning_rate": 0.00045931022429362067,
"loss": 3.3741,
"step": 40350
},
{
"epoch": 11.761458272669035,
"grad_norm": 0.3486907482147217,
"learning_rate": 0.0004591354500436935,
"loss": 3.3652,
"step": 40400
},
{
"epoch": 11.776017704268826,
"grad_norm": 0.3572530448436737,
"learning_rate": 0.00045896067579376634,
"loss": 3.3718,
"step": 40450
},
{
"epoch": 11.790577135868615,
"grad_norm": 0.3657876253128052,
"learning_rate": 0.0004587859015438391,
"loss": 3.3635,
"step": 40500
},
{
"epoch": 11.805136567468406,
"grad_norm": 0.3681361675262451,
"learning_rate": 0.00045861112729391195,
"loss": 3.3774,
"step": 40550
},
{
"epoch": 11.819695999068196,
"grad_norm": 0.3658032715320587,
"learning_rate": 0.0004584363530439848,
"loss": 3.3633,
"step": 40600
},
{
"epoch": 11.834255430667987,
"grad_norm": 0.35775232315063477,
"learning_rate": 0.0004582615787940576,
"loss": 3.3666,
"step": 40650
},
{
"epoch": 11.848814862267776,
"grad_norm": 0.3474526107311249,
"learning_rate": 0.00045808680454413045,
"loss": 3.3651,
"step": 40700
},
{
"epoch": 11.863374293867567,
"grad_norm": 0.332225501537323,
"learning_rate": 0.00045791203029420333,
"loss": 3.3805,
"step": 40750
},
{
"epoch": 11.877933725467358,
"grad_norm": 0.3563697338104248,
"learning_rate": 0.0004577372560442761,
"loss": 3.3677,
"step": 40800
},
{
"epoch": 11.892493157067149,
"grad_norm": 0.35877200961112976,
"learning_rate": 0.00045756248179434894,
"loss": 3.36,
"step": 40850
},
{
"epoch": 11.90705258866694,
"grad_norm": 0.3673311471939087,
"learning_rate": 0.0004573877075444218,
"loss": 3.3754,
"step": 40900
},
{
"epoch": 11.921612020266728,
"grad_norm": 0.3623284697532654,
"learning_rate": 0.0004572129332944946,
"loss": 3.3718,
"step": 40950
},
{
"epoch": 11.93617145186652,
"grad_norm": 0.3350276052951813,
"learning_rate": 0.00045703815904456744,
"loss": 3.382,
"step": 41000
},
{
"epoch": 11.93617145186652,
"eval_accuracy": 0.3699698973716846,
"eval_loss": 3.5505945682525635,
"eval_runtime": 180.4963,
"eval_samples_per_second": 92.207,
"eval_steps_per_second": 5.767,
"step": 41000
},
{
"epoch": 11.95073088346631,
"grad_norm": 0.38111695647239685,
"learning_rate": 0.0004568633847946402,
"loss": 3.3643,
"step": 41050
},
{
"epoch": 11.965290315066099,
"grad_norm": 0.3372560739517212,
"learning_rate": 0.00045668861054471306,
"loss": 3.3782,
"step": 41100
},
{
"epoch": 11.97984974666589,
"grad_norm": 0.3638279139995575,
"learning_rate": 0.0004565138362947859,
"loss": 3.3817,
"step": 41150
},
{
"epoch": 11.99440917826568,
"grad_norm": 0.3317911922931671,
"learning_rate": 0.0004563390620448587,
"loss": 3.3756,
"step": 41200
},
{
"epoch": 12.008735658959875,
"grad_norm": 0.36474958062171936,
"learning_rate": 0.0004561642877949315,
"loss": 3.3129,
"step": 41250
},
{
"epoch": 12.023295090559664,
"grad_norm": 0.3482862412929535,
"learning_rate": 0.00045598951354500433,
"loss": 3.2551,
"step": 41300
},
{
"epoch": 12.037854522159455,
"grad_norm": 0.3640674948692322,
"learning_rate": 0.00045581473929507717,
"loss": 3.2729,
"step": 41350
},
{
"epoch": 12.052413953759245,
"grad_norm": 0.3480179011821747,
"learning_rate": 0.00045563996504515,
"loss": 3.2839,
"step": 41400
},
{
"epoch": 12.066973385359036,
"grad_norm": 0.36980733275413513,
"learning_rate": 0.00045546519079522283,
"loss": 3.2803,
"step": 41450
},
{
"epoch": 12.081532816958825,
"grad_norm": 0.3632776141166687,
"learning_rate": 0.0004552904165452956,
"loss": 3.276,
"step": 41500
},
{
"epoch": 12.096092248558616,
"grad_norm": 0.3438667058944702,
"learning_rate": 0.00045511564229536844,
"loss": 3.2754,
"step": 41550
},
{
"epoch": 12.110651680158407,
"grad_norm": 0.37722644209861755,
"learning_rate": 0.0004549408680454413,
"loss": 3.2876,
"step": 41600
},
{
"epoch": 12.125211111758198,
"grad_norm": 0.3496084213256836,
"learning_rate": 0.0004547660937955141,
"loss": 3.2876,
"step": 41650
},
{
"epoch": 12.139770543357987,
"grad_norm": 0.35013580322265625,
"learning_rate": 0.00045459131954558694,
"loss": 3.2979,
"step": 41700
},
{
"epoch": 12.154329974957777,
"grad_norm": 0.36948785185813904,
"learning_rate": 0.0004544165452956597,
"loss": 3.2904,
"step": 41750
},
{
"epoch": 12.168889406557568,
"grad_norm": 0.3557933568954468,
"learning_rate": 0.00045424177104573255,
"loss": 3.3091,
"step": 41800
},
{
"epoch": 12.183448838157359,
"grad_norm": 0.3801754415035248,
"learning_rate": 0.0004540669967958054,
"loss": 3.3023,
"step": 41850
},
{
"epoch": 12.198008269757148,
"grad_norm": 0.3558266758918762,
"learning_rate": 0.0004538922225458782,
"loss": 3.3032,
"step": 41900
},
{
"epoch": 12.212567701356939,
"grad_norm": 0.347100168466568,
"learning_rate": 0.000453717448295951,
"loss": 3.3132,
"step": 41950
},
{
"epoch": 12.22712713295673,
"grad_norm": 0.38097837567329407,
"learning_rate": 0.00045354267404602383,
"loss": 3.3023,
"step": 42000
},
{
"epoch": 12.22712713295673,
"eval_accuracy": 0.3693688089925267,
"eval_loss": 3.5664913654327393,
"eval_runtime": 180.5022,
"eval_samples_per_second": 92.204,
"eval_steps_per_second": 5.767,
"step": 42000
},
{
"epoch": 12.24168656455652,
"grad_norm": 0.3856671154499054,
"learning_rate": 0.00045336789979609667,
"loss": 3.3158,
"step": 42050
},
{
"epoch": 12.25624599615631,
"grad_norm": 0.38062354922294617,
"learning_rate": 0.0004531931255461695,
"loss": 3.3048,
"step": 42100
},
{
"epoch": 12.2708054277561,
"grad_norm": 0.36241772770881653,
"learning_rate": 0.00045301835129624233,
"loss": 3.3038,
"step": 42150
},
{
"epoch": 12.28536485935589,
"grad_norm": 0.3613075315952301,
"learning_rate": 0.0004528435770463151,
"loss": 3.3044,
"step": 42200
},
{
"epoch": 12.299924290955682,
"grad_norm": 0.3558962941169739,
"learning_rate": 0.00045266880279638794,
"loss": 3.319,
"step": 42250
},
{
"epoch": 12.31448372255547,
"grad_norm": 0.3771170973777771,
"learning_rate": 0.0004524940285464608,
"loss": 3.3102,
"step": 42300
},
{
"epoch": 12.329043154155261,
"grad_norm": 0.3604891896247864,
"learning_rate": 0.0004523192542965336,
"loss": 3.3279,
"step": 42350
},
{
"epoch": 12.343602585755052,
"grad_norm": 0.3826010525226593,
"learning_rate": 0.00045214448004660644,
"loss": 3.3174,
"step": 42400
},
{
"epoch": 12.358162017354843,
"grad_norm": 0.36643317341804504,
"learning_rate": 0.0004519697057966792,
"loss": 3.3172,
"step": 42450
},
{
"epoch": 12.372721448954632,
"grad_norm": 0.3626962900161743,
"learning_rate": 0.00045179493154675205,
"loss": 3.3235,
"step": 42500
},
{
"epoch": 12.387280880554423,
"grad_norm": 0.3473532199859619,
"learning_rate": 0.0004516201572968249,
"loss": 3.328,
"step": 42550
},
{
"epoch": 12.401840312154214,
"grad_norm": 0.3678642213344574,
"learning_rate": 0.0004514453830468977,
"loss": 3.328,
"step": 42600
},
{
"epoch": 12.416399743754004,
"grad_norm": 0.3755843937397003,
"learning_rate": 0.0004512706087969705,
"loss": 3.3249,
"step": 42650
},
{
"epoch": 12.430959175353793,
"grad_norm": 0.36632055044174194,
"learning_rate": 0.00045109583454704333,
"loss": 3.3178,
"step": 42700
},
{
"epoch": 12.445518606953584,
"grad_norm": 0.348257839679718,
"learning_rate": 0.00045092106029711616,
"loss": 3.3205,
"step": 42750
},
{
"epoch": 12.460078038553375,
"grad_norm": 0.39049002528190613,
"learning_rate": 0.000450746286047189,
"loss": 3.3324,
"step": 42800
},
{
"epoch": 12.474637470153166,
"grad_norm": 0.35907331109046936,
"learning_rate": 0.00045057151179726183,
"loss": 3.3413,
"step": 42850
},
{
"epoch": 12.489196901752955,
"grad_norm": 0.3372901380062103,
"learning_rate": 0.0004503967375473346,
"loss": 3.3362,
"step": 42900
},
{
"epoch": 12.503756333352746,
"grad_norm": 0.3593348562717438,
"learning_rate": 0.00045022196329740744,
"loss": 3.3363,
"step": 42950
},
{
"epoch": 12.518315764952536,
"grad_norm": 0.3299802541732788,
"learning_rate": 0.0004500471890474803,
"loss": 3.3451,
"step": 43000
},
{
"epoch": 12.518315764952536,
"eval_accuracy": 0.37018825048594445,
"eval_loss": 3.559201955795288,
"eval_runtime": 180.5771,
"eval_samples_per_second": 92.166,
"eval_steps_per_second": 5.765,
"step": 43000
},
{
"epoch": 12.532875196552327,
"grad_norm": 0.390259712934494,
"learning_rate": 0.0004498724147975531,
"loss": 3.3272,
"step": 43050
},
{
"epoch": 12.547434628152118,
"grad_norm": 0.3740730583667755,
"learning_rate": 0.00044969764054762594,
"loss": 3.3344,
"step": 43100
},
{
"epoch": 12.561994059751907,
"grad_norm": 0.3831499516963959,
"learning_rate": 0.0004495228662976987,
"loss": 3.3283,
"step": 43150
},
{
"epoch": 12.576553491351698,
"grad_norm": 0.3770614266395569,
"learning_rate": 0.00044934809204777155,
"loss": 3.3304,
"step": 43200
},
{
"epoch": 12.591112922951488,
"grad_norm": 0.3610830008983612,
"learning_rate": 0.0004491733177978444,
"loss": 3.3299,
"step": 43250
},
{
"epoch": 12.605672354551277,
"grad_norm": 0.36223649978637695,
"learning_rate": 0.0004489985435479173,
"loss": 3.338,
"step": 43300
},
{
"epoch": 12.620231786151068,
"grad_norm": 0.3683512210845947,
"learning_rate": 0.0004488237692979901,
"loss": 3.3381,
"step": 43350
},
{
"epoch": 12.634791217750859,
"grad_norm": 0.3539344370365143,
"learning_rate": 0.0004486489950480629,
"loss": 3.3481,
"step": 43400
},
{
"epoch": 12.64935064935065,
"grad_norm": 0.35910946130752563,
"learning_rate": 0.0004484742207981357,
"loss": 3.3372,
"step": 43450
},
{
"epoch": 12.66391008095044,
"grad_norm": 0.354937881231308,
"learning_rate": 0.00044829944654820855,
"loss": 3.3418,
"step": 43500
},
{
"epoch": 12.67846951255023,
"grad_norm": 0.3593963384628296,
"learning_rate": 0.0004481246722982814,
"loss": 3.3433,
"step": 43550
},
{
"epoch": 12.69302894415002,
"grad_norm": 0.3790937066078186,
"learning_rate": 0.0004479498980483542,
"loss": 3.344,
"step": 43600
},
{
"epoch": 12.707588375749811,
"grad_norm": 0.37113437056541443,
"learning_rate": 0.000447775123798427,
"loss": 3.3403,
"step": 43650
},
{
"epoch": 12.7221478073496,
"grad_norm": 0.3723011612892151,
"learning_rate": 0.00044760034954849983,
"loss": 3.3454,
"step": 43700
},
{
"epoch": 12.736707238949391,
"grad_norm": 0.42552995681762695,
"learning_rate": 0.00044742557529857266,
"loss": 3.3587,
"step": 43750
},
{
"epoch": 12.751266670549182,
"grad_norm": 0.37783387303352356,
"learning_rate": 0.0004472508010486455,
"loss": 3.3452,
"step": 43800
},
{
"epoch": 12.765826102148973,
"grad_norm": 0.36067041754722595,
"learning_rate": 0.0004470760267987183,
"loss": 3.3381,
"step": 43850
},
{
"epoch": 12.780385533748763,
"grad_norm": 0.34738510847091675,
"learning_rate": 0.0004469012525487911,
"loss": 3.3478,
"step": 43900
},
{
"epoch": 12.794944965348552,
"grad_norm": 0.37115344405174255,
"learning_rate": 0.00044672647829886394,
"loss": 3.3477,
"step": 43950
},
{
"epoch": 12.809504396948343,
"grad_norm": 0.3720683753490448,
"learning_rate": 0.00044655170404893677,
"loss": 3.3421,
"step": 44000
},
{
"epoch": 12.809504396948343,
"eval_accuracy": 0.3703436962678785,
"eval_loss": 3.550240993499756,
"eval_runtime": 180.6249,
"eval_samples_per_second": 92.141,
"eval_steps_per_second": 5.763,
"step": 44000
},
{
"epoch": 12.824063828548134,
"grad_norm": 0.38867267966270447,
"learning_rate": 0.0004463769297990096,
"loss": 3.3632,
"step": 44050
},
{
"epoch": 12.838623260147925,
"grad_norm": 0.3558189868927002,
"learning_rate": 0.0004462021555490824,
"loss": 3.3466,
"step": 44100
},
{
"epoch": 12.853182691747714,
"grad_norm": 0.37346190214157104,
"learning_rate": 0.0004460273812991552,
"loss": 3.3497,
"step": 44150
},
{
"epoch": 12.867742123347504,
"grad_norm": 0.3574129641056061,
"learning_rate": 0.00044585260704922805,
"loss": 3.3586,
"step": 44200
},
{
"epoch": 12.882301554947295,
"grad_norm": 0.3636460602283478,
"learning_rate": 0.0004456778327993009,
"loss": 3.3511,
"step": 44250
},
{
"epoch": 12.896860986547086,
"grad_norm": 0.38483962416648865,
"learning_rate": 0.0004455030585493737,
"loss": 3.3554,
"step": 44300
},
{
"epoch": 12.911420418146875,
"grad_norm": 0.3610190749168396,
"learning_rate": 0.0004453282842994465,
"loss": 3.3622,
"step": 44350
},
{
"epoch": 12.925979849746666,
"grad_norm": 0.3633497357368469,
"learning_rate": 0.00044515351004951933,
"loss": 3.3642,
"step": 44400
},
{
"epoch": 12.940539281346457,
"grad_norm": 0.37331637740135193,
"learning_rate": 0.00044497873579959216,
"loss": 3.3451,
"step": 44450
},
{
"epoch": 12.955098712946247,
"grad_norm": 0.3372592031955719,
"learning_rate": 0.000444803961549665,
"loss": 3.3385,
"step": 44500
},
{
"epoch": 12.969658144546036,
"grad_norm": 0.38096725940704346,
"learning_rate": 0.0004446291872997378,
"loss": 3.3511,
"step": 44550
},
{
"epoch": 12.984217576145827,
"grad_norm": 0.3771226704120636,
"learning_rate": 0.0004444544130498106,
"loss": 3.3651,
"step": 44600
},
{
"epoch": 12.998777007745618,
"grad_norm": 0.37394359707832336,
"learning_rate": 0.00044427963879988344,
"loss": 3.3551,
"step": 44650
},
{
"epoch": 13.01310348843981,
"grad_norm": 0.3792758882045746,
"learning_rate": 0.00044410486454995627,
"loss": 3.2584,
"step": 44700
},
{
"epoch": 13.027662920039601,
"grad_norm": 0.3561805188655853,
"learning_rate": 0.0004439300903000291,
"loss": 3.2468,
"step": 44750
},
{
"epoch": 13.042222351639392,
"grad_norm": 0.3759787678718567,
"learning_rate": 0.0004437553160501019,
"loss": 3.2549,
"step": 44800
},
{
"epoch": 13.056781783239183,
"grad_norm": 0.3348606824874878,
"learning_rate": 0.0004435805418001747,
"loss": 3.2564,
"step": 44850
},
{
"epoch": 13.071341214838972,
"grad_norm": 0.36148425936698914,
"learning_rate": 0.00044340576755024755,
"loss": 3.2635,
"step": 44900
},
{
"epoch": 13.085900646438763,
"grad_norm": 0.37378937005996704,
"learning_rate": 0.0004432309933003204,
"loss": 3.2627,
"step": 44950
},
{
"epoch": 13.100460078038553,
"grad_norm": 0.39774009585380554,
"learning_rate": 0.0004430562190503932,
"loss": 3.278,
"step": 45000
},
{
"epoch": 13.100460078038553,
"eval_accuracy": 0.36968699075191663,
"eval_loss": 3.564668893814087,
"eval_runtime": 180.6228,
"eval_samples_per_second": 92.142,
"eval_steps_per_second": 5.763,
"step": 45000
},
{
"epoch": 13.115019509638344,
"grad_norm": 0.3558865785598755,
"learning_rate": 0.000442881444800466,
"loss": 3.2663,
"step": 45050
},
{
"epoch": 13.129578941238133,
"grad_norm": 0.3488845229148865,
"learning_rate": 0.00044270667055053883,
"loss": 3.2783,
"step": 45100
},
{
"epoch": 13.144138372837924,
"grad_norm": 0.3732559382915497,
"learning_rate": 0.00044253189630061166,
"loss": 3.2773,
"step": 45150
},
{
"epoch": 13.158697804437715,
"grad_norm": 0.3805554211139679,
"learning_rate": 0.0004423571220506845,
"loss": 3.285,
"step": 45200
},
{
"epoch": 13.173257236037506,
"grad_norm": 0.3527976870536804,
"learning_rate": 0.00044218234780075727,
"loss": 3.2736,
"step": 45250
},
{
"epoch": 13.187816667637296,
"grad_norm": 0.36648017168045044,
"learning_rate": 0.0004420075735508301,
"loss": 3.2835,
"step": 45300
},
{
"epoch": 13.202376099237085,
"grad_norm": 0.3752287030220032,
"learning_rate": 0.00044183279930090294,
"loss": 3.2882,
"step": 45350
},
{
"epoch": 13.216935530836876,
"grad_norm": 0.3686719536781311,
"learning_rate": 0.00044165802505097577,
"loss": 3.2853,
"step": 45400
},
{
"epoch": 13.231494962436667,
"grad_norm": 0.37559449672698975,
"learning_rate": 0.0004414832508010486,
"loss": 3.2975,
"step": 45450
},
{
"epoch": 13.246054394036458,
"grad_norm": 0.37365761399269104,
"learning_rate": 0.0004413084765511214,
"loss": 3.2917,
"step": 45500
},
{
"epoch": 13.260613825636247,
"grad_norm": 0.3821715712547302,
"learning_rate": 0.0004411337023011942,
"loss": 3.3,
"step": 45550
},
{
"epoch": 13.275173257236037,
"grad_norm": 0.38011810183525085,
"learning_rate": 0.00044095892805126705,
"loss": 3.2926,
"step": 45600
},
{
"epoch": 13.289732688835828,
"grad_norm": 0.3801783621311188,
"learning_rate": 0.0004407841538013399,
"loss": 3.2993,
"step": 45650
},
{
"epoch": 13.304292120435619,
"grad_norm": 0.3595413267612457,
"learning_rate": 0.0004406093795514127,
"loss": 3.3049,
"step": 45700
},
{
"epoch": 13.318851552035408,
"grad_norm": 0.3675829768180847,
"learning_rate": 0.0004404346053014855,
"loss": 3.296,
"step": 45750
},
{
"epoch": 13.333410983635199,
"grad_norm": 0.37648913264274597,
"learning_rate": 0.0004402598310515584,
"loss": 3.3018,
"step": 45800
},
{
"epoch": 13.34797041523499,
"grad_norm": 0.3896103799343109,
"learning_rate": 0.0004400850568016312,
"loss": 3.3021,
"step": 45850
},
{
"epoch": 13.36252984683478,
"grad_norm": 0.39437761902809143,
"learning_rate": 0.00043991028255170405,
"loss": 3.3033,
"step": 45900
},
{
"epoch": 13.37708927843457,
"grad_norm": 0.3523409962654114,
"learning_rate": 0.0004397355083017769,
"loss": 3.3159,
"step": 45950
},
{
"epoch": 13.39164871003436,
"grad_norm": 0.3909803628921509,
"learning_rate": 0.00043956073405184966,
"loss": 3.3183,
"step": 46000
},
{
"epoch": 13.39164871003436,
"eval_accuracy": 0.3703245301086839,
"eval_loss": 3.55873703956604,
"eval_runtime": 180.5898,
"eval_samples_per_second": 92.159,
"eval_steps_per_second": 5.764,
"step": 46000
},
{
"epoch": 13.406208141634151,
"grad_norm": 0.3750564455986023,
"learning_rate": 0.0004393859598019225,
"loss": 3.3159,
"step": 46050
},
{
"epoch": 13.420767573233942,
"grad_norm": 0.36859869956970215,
"learning_rate": 0.0004392111855519953,
"loss": 3.3034,
"step": 46100
},
{
"epoch": 13.43532700483373,
"grad_norm": 0.3968806564807892,
"learning_rate": 0.00043903641130206816,
"loss": 3.3098,
"step": 46150
},
{
"epoch": 13.449886436433522,
"grad_norm": 0.34583473205566406,
"learning_rate": 0.000438861637052141,
"loss": 3.3178,
"step": 46200
},
{
"epoch": 13.464445868033312,
"grad_norm": 0.37164896726608276,
"learning_rate": 0.00043868686280221377,
"loss": 3.3134,
"step": 46250
},
{
"epoch": 13.479005299633103,
"grad_norm": 0.367662638425827,
"learning_rate": 0.0004385120885522866,
"loss": 3.3195,
"step": 46300
},
{
"epoch": 13.493564731232892,
"grad_norm": 0.38434478640556335,
"learning_rate": 0.00043833731430235944,
"loss": 3.3219,
"step": 46350
},
{
"epoch": 13.508124162832683,
"grad_norm": 0.3725048303604126,
"learning_rate": 0.00043816254005243227,
"loss": 3.3191,
"step": 46400
},
{
"epoch": 13.522683594432474,
"grad_norm": 0.38499268889427185,
"learning_rate": 0.0004379877658025051,
"loss": 3.3106,
"step": 46450
},
{
"epoch": 13.537243026032264,
"grad_norm": 0.36700862646102905,
"learning_rate": 0.0004378129915525779,
"loss": 3.3155,
"step": 46500
},
{
"epoch": 13.551802457632053,
"grad_norm": 0.3642922639846802,
"learning_rate": 0.0004376382173026507,
"loss": 3.3128,
"step": 46550
},
{
"epoch": 13.566361889231844,
"grad_norm": 0.38024237751960754,
"learning_rate": 0.00043746344305272355,
"loss": 3.3268,
"step": 46600
},
{
"epoch": 13.580921320831635,
"grad_norm": 0.3899970054626465,
"learning_rate": 0.0004372886688027964,
"loss": 3.3127,
"step": 46650
},
{
"epoch": 13.595480752431426,
"grad_norm": 0.3449365198612213,
"learning_rate": 0.00043711389455286916,
"loss": 3.3182,
"step": 46700
},
{
"epoch": 13.610040184031215,
"grad_norm": 0.36648425459861755,
"learning_rate": 0.000436939120302942,
"loss": 3.3191,
"step": 46750
},
{
"epoch": 13.624599615631006,
"grad_norm": 0.3479945957660675,
"learning_rate": 0.0004367643460530148,
"loss": 3.3269,
"step": 46800
},
{
"epoch": 13.639159047230796,
"grad_norm": 0.3739745616912842,
"learning_rate": 0.00043658957180308766,
"loss": 3.3166,
"step": 46850
},
{
"epoch": 13.653718478830587,
"grad_norm": 0.37553584575653076,
"learning_rate": 0.0004364147975531605,
"loss": 3.3376,
"step": 46900
},
{
"epoch": 13.668277910430376,
"grad_norm": 0.3680751621723175,
"learning_rate": 0.00043624002330323327,
"loss": 3.3333,
"step": 46950
},
{
"epoch": 13.682837342030167,
"grad_norm": 0.41123461723327637,
"learning_rate": 0.0004360652490533061,
"loss": 3.3367,
"step": 47000
},
{
"epoch": 13.682837342030167,
"eval_accuracy": 0.37088411140688665,
"eval_loss": 3.5522854328155518,
"eval_runtime": 180.6331,
"eval_samples_per_second": 92.137,
"eval_steps_per_second": 5.763,
"step": 47000
},
{
"epoch": 13.697396773629958,
"grad_norm": 0.37972718477249146,
"learning_rate": 0.00043589047480337893,
"loss": 3.3264,
"step": 47050
},
{
"epoch": 13.711956205229749,
"grad_norm": 0.3651711642742157,
"learning_rate": 0.00043571570055345177,
"loss": 3.3072,
"step": 47100
},
{
"epoch": 13.726515636829538,
"grad_norm": 0.38708093762397766,
"learning_rate": 0.00043554092630352455,
"loss": 3.3205,
"step": 47150
},
{
"epoch": 13.741075068429328,
"grad_norm": 0.39110127091407776,
"learning_rate": 0.0004353661520535974,
"loss": 3.3218,
"step": 47200
},
{
"epoch": 13.75563450002912,
"grad_norm": 0.37536707520484924,
"learning_rate": 0.0004351913778036702,
"loss": 3.3211,
"step": 47250
},
{
"epoch": 13.77019393162891,
"grad_norm": 0.359542578458786,
"learning_rate": 0.00043501660355374305,
"loss": 3.3267,
"step": 47300
},
{
"epoch": 13.784753363228699,
"grad_norm": 0.3808390200138092,
"learning_rate": 0.0004348418293038159,
"loss": 3.3377,
"step": 47350
},
{
"epoch": 13.79931279482849,
"grad_norm": 0.3661261200904846,
"learning_rate": 0.00043466705505388866,
"loss": 3.3298,
"step": 47400
},
{
"epoch": 13.81387222642828,
"grad_norm": 0.3553769588470459,
"learning_rate": 0.0004344922808039615,
"loss": 3.3201,
"step": 47450
},
{
"epoch": 13.828431658028071,
"grad_norm": 0.3497569262981415,
"learning_rate": 0.0004343175065540343,
"loss": 3.3404,
"step": 47500
},
{
"epoch": 13.84299108962786,
"grad_norm": 0.37346166372299194,
"learning_rate": 0.00043414273230410716,
"loss": 3.3399,
"step": 47550
},
{
"epoch": 13.857550521227651,
"grad_norm": 0.3590134382247925,
"learning_rate": 0.00043396795805418,
"loss": 3.3308,
"step": 47600
},
{
"epoch": 13.872109952827442,
"grad_norm": 0.3460633158683777,
"learning_rate": 0.00043379318380425277,
"loss": 3.3433,
"step": 47650
},
{
"epoch": 13.886669384427233,
"grad_norm": 0.38021934032440186,
"learning_rate": 0.0004336184095543256,
"loss": 3.3404,
"step": 47700
},
{
"epoch": 13.901228816027022,
"grad_norm": 0.3488508462905884,
"learning_rate": 0.00043344363530439843,
"loss": 3.3313,
"step": 47750
},
{
"epoch": 13.915788247626812,
"grad_norm": 0.3584929406642914,
"learning_rate": 0.00043326886105447127,
"loss": 3.3392,
"step": 47800
},
{
"epoch": 13.930347679226603,
"grad_norm": 0.35224011540412903,
"learning_rate": 0.00043309408680454405,
"loss": 3.3349,
"step": 47850
},
{
"epoch": 13.944907110826394,
"grad_norm": 0.4024561643600464,
"learning_rate": 0.0004329193125546169,
"loss": 3.3364,
"step": 47900
},
{
"epoch": 13.959466542426183,
"grad_norm": 0.397568941116333,
"learning_rate": 0.0004327445383046897,
"loss": 3.3406,
"step": 47950
},
{
"epoch": 13.974025974025974,
"grad_norm": 0.3521833121776581,
"learning_rate": 0.00043256976405476255,
"loss": 3.3425,
"step": 48000
},
{
"epoch": 13.974025974025974,
"eval_accuracy": 0.3711820687528933,
"eval_loss": 3.5451555252075195,
"eval_runtime": 180.6693,
"eval_samples_per_second": 92.119,
"eval_steps_per_second": 5.762,
"step": 48000
},
{
"epoch": 13.988585405625765,
"grad_norm": 0.37055131793022156,
"learning_rate": 0.0004323949898048354,
"loss": 3.3417,
"step": 48050
},
{
"epoch": 14.002911886319959,
"grad_norm": 0.3706069588661194,
"learning_rate": 0.00043222021555490816,
"loss": 3.3111,
"step": 48100
},
{
"epoch": 14.017471317919748,
"grad_norm": 0.36993443965911865,
"learning_rate": 0.000432045441304981,
"loss": 3.242,
"step": 48150
},
{
"epoch": 14.032030749519539,
"grad_norm": 0.3860079348087311,
"learning_rate": 0.0004318706670550538,
"loss": 3.2285,
"step": 48200
},
{
"epoch": 14.04659018111933,
"grad_norm": 0.36225396394729614,
"learning_rate": 0.00043169589280512666,
"loss": 3.2359,
"step": 48250
},
{
"epoch": 14.06114961271912,
"grad_norm": 0.3634689450263977,
"learning_rate": 0.0004315211185551995,
"loss": 3.241,
"step": 48300
},
{
"epoch": 14.07570904431891,
"grad_norm": 0.40568917989730835,
"learning_rate": 0.0004313463443052724,
"loss": 3.2463,
"step": 48350
},
{
"epoch": 14.0902684759187,
"grad_norm": 0.3718388080596924,
"learning_rate": 0.00043117157005534515,
"loss": 3.2494,
"step": 48400
},
{
"epoch": 14.10482790751849,
"grad_norm": 0.3734684884548187,
"learning_rate": 0.000430996795805418,
"loss": 3.2557,
"step": 48450
},
{
"epoch": 14.119387339118282,
"grad_norm": 0.35955342650413513,
"learning_rate": 0.0004308220215554908,
"loss": 3.2482,
"step": 48500
},
{
"epoch": 14.13394677071807,
"grad_norm": 0.36027783155441284,
"learning_rate": 0.00043064724730556365,
"loss": 3.2626,
"step": 48550
},
{
"epoch": 14.148506202317861,
"grad_norm": 0.3621423542499542,
"learning_rate": 0.00043047247305563643,
"loss": 3.267,
"step": 48600
},
{
"epoch": 14.163065633917652,
"grad_norm": 0.37086912989616394,
"learning_rate": 0.00043029769880570927,
"loss": 3.2634,
"step": 48650
},
{
"epoch": 14.177625065517443,
"grad_norm": 0.3684757351875305,
"learning_rate": 0.0004301229245557821,
"loss": 3.2605,
"step": 48700
},
{
"epoch": 14.192184497117232,
"grad_norm": 0.37380942702293396,
"learning_rate": 0.00042994815030585493,
"loss": 3.2687,
"step": 48750
},
{
"epoch": 14.206743928717023,
"grad_norm": 0.39025264978408813,
"learning_rate": 0.00042977337605592776,
"loss": 3.2659,
"step": 48800
},
{
"epoch": 14.221303360316814,
"grad_norm": 0.38189178705215454,
"learning_rate": 0.00042959860180600054,
"loss": 3.2656,
"step": 48850
},
{
"epoch": 14.235862791916604,
"grad_norm": 0.3802640438079834,
"learning_rate": 0.0004294238275560734,
"loss": 3.2824,
"step": 48900
},
{
"epoch": 14.250422223516393,
"grad_norm": 0.3753884732723236,
"learning_rate": 0.0004292490533061462,
"loss": 3.2765,
"step": 48950
},
{
"epoch": 14.264981655116184,
"grad_norm": 0.35979166626930237,
"learning_rate": 0.00042907427905621904,
"loss": 3.2799,
"step": 49000
},
{
"epoch": 14.264981655116184,
"eval_accuracy": 0.37069092122555714,
"eval_loss": 3.5586068630218506,
"eval_runtime": 180.5727,
"eval_samples_per_second": 92.168,
"eval_steps_per_second": 5.765,
"step": 49000
},
{
"epoch": 14.279541086715975,
"grad_norm": 0.3511539101600647,
"learning_rate": 0.0004288995048062919,
"loss": 3.2865,
"step": 49050
},
{
"epoch": 14.294100518315766,
"grad_norm": 0.37331297993659973,
"learning_rate": 0.00042872473055636465,
"loss": 3.2907,
"step": 49100
},
{
"epoch": 14.308659949915555,
"grad_norm": 0.402055948972702,
"learning_rate": 0.0004285499563064375,
"loss": 3.287,
"step": 49150
},
{
"epoch": 14.323219381515345,
"grad_norm": 0.37558865547180176,
"learning_rate": 0.0004283751820565103,
"loss": 3.2928,
"step": 49200
},
{
"epoch": 14.337778813115136,
"grad_norm": 0.36927327513694763,
"learning_rate": 0.00042820040780658315,
"loss": 3.2923,
"step": 49250
},
{
"epoch": 14.352338244714927,
"grad_norm": 0.37961676716804504,
"learning_rate": 0.00042802563355665593,
"loss": 3.2827,
"step": 49300
},
{
"epoch": 14.366897676314716,
"grad_norm": 0.3793201744556427,
"learning_rate": 0.00042785085930672876,
"loss": 3.2895,
"step": 49350
},
{
"epoch": 14.381457107914507,
"grad_norm": 0.38360291719436646,
"learning_rate": 0.0004276760850568016,
"loss": 3.2897,
"step": 49400
},
{
"epoch": 14.396016539514298,
"grad_norm": 0.37893158197402954,
"learning_rate": 0.00042750131080687443,
"loss": 3.3067,
"step": 49450
},
{
"epoch": 14.410575971114088,
"grad_norm": 0.3840549886226654,
"learning_rate": 0.00042732653655694726,
"loss": 3.2867,
"step": 49500
},
{
"epoch": 14.425135402713877,
"grad_norm": 0.37985333800315857,
"learning_rate": 0.00042715176230702004,
"loss": 3.2966,
"step": 49550
},
{
"epoch": 14.439694834313668,
"grad_norm": 0.3771022856235504,
"learning_rate": 0.0004269769880570929,
"loss": 3.287,
"step": 49600
},
{
"epoch": 14.454254265913459,
"grad_norm": 0.38164374232292175,
"learning_rate": 0.0004268022138071657,
"loss": 3.3017,
"step": 49650
},
{
"epoch": 14.46881369751325,
"grad_norm": 0.38309139013290405,
"learning_rate": 0.00042662743955723854,
"loss": 3.298,
"step": 49700
},
{
"epoch": 14.483373129113039,
"grad_norm": 0.3698599338531494,
"learning_rate": 0.0004264526653073114,
"loss": 3.3014,
"step": 49750
},
{
"epoch": 14.49793256071283,
"grad_norm": 0.3559224605560303,
"learning_rate": 0.00042627789105738415,
"loss": 3.2961,
"step": 49800
},
{
"epoch": 14.51249199231262,
"grad_norm": 0.38171273469924927,
"learning_rate": 0.000426103116807457,
"loss": 3.297,
"step": 49850
},
{
"epoch": 14.527051423912411,
"grad_norm": 0.3736954629421234,
"learning_rate": 0.0004259283425575298,
"loss": 3.3043,
"step": 49900
},
{
"epoch": 14.5416108555122,
"grad_norm": 0.36771056056022644,
"learning_rate": 0.00042575356830760265,
"loss": 3.3111,
"step": 49950
},
{
"epoch": 14.556170287111991,
"grad_norm": 0.39156320691108704,
"learning_rate": 0.00042557879405767543,
"loss": 3.3193,
"step": 50000
},
{
"epoch": 14.556170287111991,
"eval_accuracy": 0.37085107035944076,
"eval_loss": 3.5553669929504395,
"eval_runtime": 180.9457,
"eval_samples_per_second": 91.978,
"eval_steps_per_second": 5.753,
"step": 50000
},
{
"epoch": 14.570729718711782,
"grad_norm": 0.39517736434936523,
"learning_rate": 0.00042540401980774826,
"loss": 3.3082,
"step": 50050
},
{
"epoch": 14.585289150311572,
"grad_norm": 0.35409751534461975,
"learning_rate": 0.0004252292455578211,
"loss": 3.2987,
"step": 50100
},
{
"epoch": 14.599848581911361,
"grad_norm": 0.40323737263679504,
"learning_rate": 0.00042505447130789393,
"loss": 3.3129,
"step": 50150
},
{
"epoch": 14.614408013511152,
"grad_norm": 0.3909080922603607,
"learning_rate": 0.00042487969705796676,
"loss": 3.3088,
"step": 50200
},
{
"epoch": 14.628967445110943,
"grad_norm": 0.36540573835372925,
"learning_rate": 0.00042470492280803954,
"loss": 3.3142,
"step": 50250
},
{
"epoch": 14.643526876710734,
"grad_norm": 0.3602832555770874,
"learning_rate": 0.0004245301485581124,
"loss": 3.3221,
"step": 50300
},
{
"epoch": 14.658086308310523,
"grad_norm": 0.3897080421447754,
"learning_rate": 0.0004243553743081852,
"loss": 3.2985,
"step": 50350
},
{
"epoch": 14.672645739910314,
"grad_norm": 0.36867547035217285,
"learning_rate": 0.00042418060005825804,
"loss": 3.3072,
"step": 50400
},
{
"epoch": 14.687205171510104,
"grad_norm": 0.3675730228424072,
"learning_rate": 0.0004240058258083308,
"loss": 3.3038,
"step": 50450
},
{
"epoch": 14.701764603109895,
"grad_norm": 0.37027379870414734,
"learning_rate": 0.00042383105155840365,
"loss": 3.3026,
"step": 50500
},
{
"epoch": 14.716324034709684,
"grad_norm": 0.359173446893692,
"learning_rate": 0.0004236562773084765,
"loss": 3.3026,
"step": 50550
},
{
"epoch": 14.730883466309475,
"grad_norm": 0.35587379336357117,
"learning_rate": 0.0004234815030585493,
"loss": 3.3067,
"step": 50600
},
{
"epoch": 14.745442897909266,
"grad_norm": 0.3435940146446228,
"learning_rate": 0.00042330672880862215,
"loss": 3.3207,
"step": 50650
},
{
"epoch": 14.760002329509057,
"grad_norm": 0.35753193497657776,
"learning_rate": 0.00042313195455869493,
"loss": 3.3199,
"step": 50700
},
{
"epoch": 14.774561761108846,
"grad_norm": 0.377048134803772,
"learning_rate": 0.00042295718030876776,
"loss": 3.3216,
"step": 50750
},
{
"epoch": 14.789121192708636,
"grad_norm": 0.35497966408729553,
"learning_rate": 0.0004227824060588406,
"loss": 3.3176,
"step": 50800
},
{
"epoch": 14.803680624308427,
"grad_norm": 0.3708688020706177,
"learning_rate": 0.0004226076318089135,
"loss": 3.3151,
"step": 50850
},
{
"epoch": 14.818240055908218,
"grad_norm": 0.40276867151260376,
"learning_rate": 0.0004224328575589863,
"loss": 3.3208,
"step": 50900
},
{
"epoch": 14.832799487508007,
"grad_norm": 0.37389078736305237,
"learning_rate": 0.00042225808330905915,
"loss": 3.3148,
"step": 50950
},
{
"epoch": 14.847358919107798,
"grad_norm": 0.3689541220664978,
"learning_rate": 0.00042208330905913193,
"loss": 3.3317,
"step": 51000
},
{
"epoch": 14.847358919107798,
"eval_accuracy": 0.3714391069369385,
"eval_loss": 3.5418295860290527,
"eval_runtime": 194.6343,
"eval_samples_per_second": 85.509,
"eval_steps_per_second": 5.348,
"step": 51000
},
{
"epoch": 14.861918350707588,
"grad_norm": 0.3774998188018799,
"learning_rate": 0.00042190853480920476,
"loss": 3.3296,
"step": 51050
},
{
"epoch": 14.87647778230738,
"grad_norm": 0.37848007678985596,
"learning_rate": 0.0004217337605592776,
"loss": 3.3077,
"step": 51100
},
{
"epoch": 14.891037213907168,
"grad_norm": 0.37494605779647827,
"learning_rate": 0.00042155898630935043,
"loss": 3.3271,
"step": 51150
},
{
"epoch": 14.905596645506959,
"grad_norm": 0.3562757074832916,
"learning_rate": 0.0004213842120594232,
"loss": 3.3193,
"step": 51200
},
{
"epoch": 14.92015607710675,
"grad_norm": 0.3716096878051758,
"learning_rate": 0.00042120943780949604,
"loss": 3.3134,
"step": 51250
},
{
"epoch": 14.93471550870654,
"grad_norm": 0.36975473165512085,
"learning_rate": 0.00042103466355956887,
"loss": 3.3207,
"step": 51300
},
{
"epoch": 14.94927494030633,
"grad_norm": 0.37558260560035706,
"learning_rate": 0.0004208598893096417,
"loss": 3.3307,
"step": 51350
},
{
"epoch": 14.96383437190612,
"grad_norm": 0.36577916145324707,
"learning_rate": 0.00042068511505971454,
"loss": 3.3233,
"step": 51400
},
{
"epoch": 14.978393803505911,
"grad_norm": 0.3728995621204376,
"learning_rate": 0.0004205103408097873,
"loss": 3.3257,
"step": 51450
},
{
"epoch": 14.992953235105702,
"grad_norm": 0.41513046622276306,
"learning_rate": 0.00042033556655986015,
"loss": 3.3362,
"step": 51500
},
{
"epoch": 15.007279715799895,
"grad_norm": 0.38033053278923035,
"learning_rate": 0.000420160792309933,
"loss": 3.2579,
"step": 51550
},
{
"epoch": 15.021839147399685,
"grad_norm": 0.4050993323326111,
"learning_rate": 0.0004199860180600058,
"loss": 3.2234,
"step": 51600
},
{
"epoch": 15.036398578999476,
"grad_norm": 0.3728967607021332,
"learning_rate": 0.00041981124381007865,
"loss": 3.2089,
"step": 51650
},
{
"epoch": 15.050958010599267,
"grad_norm": 0.3688093423843384,
"learning_rate": 0.00041963646956015143,
"loss": 3.2277,
"step": 51700
},
{
"epoch": 15.065517442199056,
"grad_norm": 0.38589268922805786,
"learning_rate": 0.00041946169531022426,
"loss": 3.2312,
"step": 51750
},
{
"epoch": 15.080076873798847,
"grad_norm": 0.37138667702674866,
"learning_rate": 0.0004192869210602971,
"loss": 3.2244,
"step": 51800
},
{
"epoch": 15.094636305398637,
"grad_norm": 0.3603217303752899,
"learning_rate": 0.0004191121468103699,
"loss": 3.2385,
"step": 51850
},
{
"epoch": 15.109195736998428,
"grad_norm": 0.3568740487098694,
"learning_rate": 0.0004189373725604427,
"loss": 3.2404,
"step": 51900
},
{
"epoch": 15.123755168598217,
"grad_norm": 0.3630964457988739,
"learning_rate": 0.00041876259831051554,
"loss": 3.2507,
"step": 51950
},
{
"epoch": 15.138314600198008,
"grad_norm": 0.37862637639045715,
"learning_rate": 0.00041858782406058837,
"loss": 3.2427,
"step": 52000
},
{
"epoch": 15.138314600198008,
"eval_accuracy": 0.37097041791516167,
"eval_loss": 3.5578880310058594,
"eval_runtime": 180.9393,
"eval_samples_per_second": 91.981,
"eval_steps_per_second": 5.753,
"step": 52000
},
{
"epoch": 15.152874031797799,
"grad_norm": 0.37787380814552307,
"learning_rate": 0.0004184130498106612,
"loss": 3.2461,
"step": 52050
},
{
"epoch": 15.16743346339759,
"grad_norm": 0.38548505306243896,
"learning_rate": 0.00041823827556073404,
"loss": 3.2523,
"step": 52100
},
{
"epoch": 15.181992894997379,
"grad_norm": 0.4000053107738495,
"learning_rate": 0.0004180635013108068,
"loss": 3.2496,
"step": 52150
},
{
"epoch": 15.19655232659717,
"grad_norm": 0.3886640667915344,
"learning_rate": 0.00041788872706087965,
"loss": 3.2533,
"step": 52200
},
{
"epoch": 15.21111175819696,
"grad_norm": 0.38245636224746704,
"learning_rate": 0.0004177139528109525,
"loss": 3.2632,
"step": 52250
},
{
"epoch": 15.225671189796751,
"grad_norm": 0.36337175965309143,
"learning_rate": 0.0004175391785610253,
"loss": 3.2547,
"step": 52300
},
{
"epoch": 15.24023062139654,
"grad_norm": 0.4020264446735382,
"learning_rate": 0.00041736440431109815,
"loss": 3.2694,
"step": 52350
},
{
"epoch": 15.25479005299633,
"grad_norm": 0.41589444875717163,
"learning_rate": 0.0004171896300611709,
"loss": 3.2538,
"step": 52400
},
{
"epoch": 15.269349484596122,
"grad_norm": 0.3860560655593872,
"learning_rate": 0.00041701485581124376,
"loss": 3.2743,
"step": 52450
},
{
"epoch": 15.283908916195912,
"grad_norm": 0.3931313157081604,
"learning_rate": 0.0004168400815613166,
"loss": 3.2647,
"step": 52500
},
{
"epoch": 15.298468347795701,
"grad_norm": 0.38691258430480957,
"learning_rate": 0.0004166653073113894,
"loss": 3.2563,
"step": 52550
},
{
"epoch": 15.313027779395492,
"grad_norm": 0.34192565083503723,
"learning_rate": 0.0004164905330614622,
"loss": 3.2569,
"step": 52600
},
{
"epoch": 15.327587210995283,
"grad_norm": 0.3795337975025177,
"learning_rate": 0.00041631575881153504,
"loss": 3.267,
"step": 52650
},
{
"epoch": 15.342146642595074,
"grad_norm": 0.3903842270374298,
"learning_rate": 0.00041614098456160787,
"loss": 3.2731,
"step": 52700
},
{
"epoch": 15.356706074194863,
"grad_norm": 0.36304110288619995,
"learning_rate": 0.0004159662103116807,
"loss": 3.2816,
"step": 52750
},
{
"epoch": 15.371265505794653,
"grad_norm": 0.3842661678791046,
"learning_rate": 0.00041579143606175354,
"loss": 3.2652,
"step": 52800
},
{
"epoch": 15.385824937394444,
"grad_norm": 0.37199848890304565,
"learning_rate": 0.0004156166618118263,
"loss": 3.2733,
"step": 52850
},
{
"epoch": 15.400384368994235,
"grad_norm": 0.38361623883247375,
"learning_rate": 0.00041544188756189915,
"loss": 3.2763,
"step": 52900
},
{
"epoch": 15.414943800594024,
"grad_norm": 0.41524621844291687,
"learning_rate": 0.000415267113311972,
"loss": 3.2739,
"step": 52950
},
{
"epoch": 15.429503232193815,
"grad_norm": 0.3668960630893707,
"learning_rate": 0.0004150923390620448,
"loss": 3.2801,
"step": 53000
},
{
"epoch": 15.429503232193815,
"eval_accuracy": 0.371453804911781,
"eval_loss": 3.5519752502441406,
"eval_runtime": 180.7835,
"eval_samples_per_second": 92.06,
"eval_steps_per_second": 5.758,
"step": 53000
},
{
"epoch": 15.444062663793606,
"grad_norm": 0.36112311482429504,
"learning_rate": 0.0004149175648121176,
"loss": 3.2864,
"step": 53050
},
{
"epoch": 15.458622095393396,
"grad_norm": 0.40427324175834656,
"learning_rate": 0.0004147427905621904,
"loss": 3.2863,
"step": 53100
},
{
"epoch": 15.473181526993185,
"grad_norm": 0.3597639203071594,
"learning_rate": 0.00041456801631226326,
"loss": 3.2863,
"step": 53150
},
{
"epoch": 15.487740958592976,
"grad_norm": 0.3552030920982361,
"learning_rate": 0.0004143932420623361,
"loss": 3.2777,
"step": 53200
},
{
"epoch": 15.502300390192767,
"grad_norm": 0.3815247714519501,
"learning_rate": 0.0004142184678124089,
"loss": 3.2863,
"step": 53250
},
{
"epoch": 15.516859821792558,
"grad_norm": 0.3695749342441559,
"learning_rate": 0.0004140436935624817,
"loss": 3.2861,
"step": 53300
},
{
"epoch": 15.531419253392347,
"grad_norm": 0.39430856704711914,
"learning_rate": 0.00041386891931255454,
"loss": 3.2831,
"step": 53350
},
{
"epoch": 15.545978684992138,
"grad_norm": 0.38121476769447327,
"learning_rate": 0.0004136941450626274,
"loss": 3.2911,
"step": 53400
},
{
"epoch": 15.560538116591928,
"grad_norm": 0.4004911482334137,
"learning_rate": 0.00041351937081270026,
"loss": 3.2937,
"step": 53450
},
{
"epoch": 15.575097548191719,
"grad_norm": 0.3881937563419342,
"learning_rate": 0.0004133445965627731,
"loss": 3.301,
"step": 53500
},
{
"epoch": 15.58965697979151,
"grad_norm": 0.35038501024246216,
"learning_rate": 0.0004131698223128459,
"loss": 3.2991,
"step": 53550
},
{
"epoch": 15.604216411391299,
"grad_norm": 0.3707396686077118,
"learning_rate": 0.0004129950480629187,
"loss": 3.2934,
"step": 53600
},
{
"epoch": 15.61877584299109,
"grad_norm": 0.40238645672798157,
"learning_rate": 0.00041282027381299153,
"loss": 3.3011,
"step": 53650
},
{
"epoch": 15.63333527459088,
"grad_norm": 0.4219394624233246,
"learning_rate": 0.00041264549956306437,
"loss": 3.296,
"step": 53700
},
{
"epoch": 15.64789470619067,
"grad_norm": 0.4156340956687927,
"learning_rate": 0.0004124707253131372,
"loss": 3.3031,
"step": 53750
},
{
"epoch": 15.66245413779046,
"grad_norm": 0.3960428237915039,
"learning_rate": 0.00041229595106321,
"loss": 3.298,
"step": 53800
},
{
"epoch": 15.677013569390251,
"grad_norm": 0.3877508044242859,
"learning_rate": 0.0004121211768132828,
"loss": 3.295,
"step": 53850
},
{
"epoch": 15.691573000990042,
"grad_norm": 0.3831869661808014,
"learning_rate": 0.00041194640256335565,
"loss": 3.3089,
"step": 53900
},
{
"epoch": 15.706132432589833,
"grad_norm": 0.37973588705062866,
"learning_rate": 0.0004117716283134285,
"loss": 3.2996,
"step": 53950
},
{
"epoch": 15.720691864189622,
"grad_norm": 0.36781612038612366,
"learning_rate": 0.0004115968540635013,
"loss": 3.3063,
"step": 54000
},
{
"epoch": 15.720691864189622,
"eval_accuracy": 0.371678389967374,
"eval_loss": 3.5436747074127197,
"eval_runtime": 194.0997,
"eval_samples_per_second": 85.745,
"eval_steps_per_second": 5.363,
"step": 54000
},
{
"epoch": 15.735251295789412,
"grad_norm": 0.38517138361930847,
"learning_rate": 0.0004114220798135741,
"loss": 3.2983,
"step": 54050
},
{
"epoch": 15.749810727389203,
"grad_norm": 0.38750118017196655,
"learning_rate": 0.0004112473055636469,
"loss": 3.287,
"step": 54100
},
{
"epoch": 15.764370158988992,
"grad_norm": 0.36446696519851685,
"learning_rate": 0.00041107253131371976,
"loss": 3.3055,
"step": 54150
},
{
"epoch": 15.778929590588783,
"grad_norm": 0.36559680104255676,
"learning_rate": 0.0004108977570637926,
"loss": 3.3041,
"step": 54200
},
{
"epoch": 15.793489022188574,
"grad_norm": 0.3546500504016876,
"learning_rate": 0.0004107229828138654,
"loss": 3.3086,
"step": 54250
},
{
"epoch": 15.808048453788365,
"grad_norm": 0.3691169023513794,
"learning_rate": 0.0004105482085639382,
"loss": 3.3098,
"step": 54300
},
{
"epoch": 15.822607885388155,
"grad_norm": 0.37200814485549927,
"learning_rate": 0.00041037343431401103,
"loss": 3.3147,
"step": 54350
},
{
"epoch": 15.837167316987944,
"grad_norm": 0.3752872943878174,
"learning_rate": 0.00041019866006408387,
"loss": 3.3032,
"step": 54400
},
{
"epoch": 15.851726748587735,
"grad_norm": 0.3832313120365143,
"learning_rate": 0.0004100238858141567,
"loss": 3.3188,
"step": 54450
},
{
"epoch": 15.866286180187526,
"grad_norm": 0.3642217218875885,
"learning_rate": 0.0004098491115642295,
"loss": 3.3026,
"step": 54500
},
{
"epoch": 15.880845611787315,
"grad_norm": 0.3717597723007202,
"learning_rate": 0.0004096743373143023,
"loss": 3.3106,
"step": 54550
},
{
"epoch": 15.895405043387106,
"grad_norm": 0.39715245366096497,
"learning_rate": 0.00040949956306437514,
"loss": 3.3218,
"step": 54600
},
{
"epoch": 15.909964474986896,
"grad_norm": 0.37992674112319946,
"learning_rate": 0.000409324788814448,
"loss": 3.3185,
"step": 54650
},
{
"epoch": 15.924523906586687,
"grad_norm": 0.38378632068634033,
"learning_rate": 0.0004091500145645208,
"loss": 3.3142,
"step": 54700
},
{
"epoch": 15.939083338186478,
"grad_norm": 0.3747106194496155,
"learning_rate": 0.0004089752403145936,
"loss": 3.3099,
"step": 54750
},
{
"epoch": 15.953642769786267,
"grad_norm": 0.3706667721271515,
"learning_rate": 0.0004088004660646664,
"loss": 3.3069,
"step": 54800
},
{
"epoch": 15.968202201386058,
"grad_norm": 0.4007405936717987,
"learning_rate": 0.00040862569181473926,
"loss": 3.3156,
"step": 54850
},
{
"epoch": 15.982761632985849,
"grad_norm": 0.3907228410243988,
"learning_rate": 0.0004084509175648121,
"loss": 3.3087,
"step": 54900
},
{
"epoch": 15.99732106458564,
"grad_norm": 0.37928932905197144,
"learning_rate": 0.0004082761433148849,
"loss": 3.3079,
"step": 54950
},
{
"epoch": 16.011647545279832,
"grad_norm": 0.36409544944763184,
"learning_rate": 0.0004081013690649577,
"loss": 3.2192,
"step": 55000
},
{
"epoch": 16.011647545279832,
"eval_accuracy": 0.37157150629431956,
"eval_loss": 3.5547502040863037,
"eval_runtime": 218.6088,
"eval_samples_per_second": 76.131,
"eval_steps_per_second": 4.762,
"step": 55000
},
{
"epoch": 16.02620697687962,
"grad_norm": 0.3841230273246765,
"learning_rate": 0.00040792659481503053,
"loss": 3.2109,
"step": 55050
},
{
"epoch": 16.040766408479413,
"grad_norm": 0.38151082396507263,
"learning_rate": 0.00040775182056510337,
"loss": 3.2051,
"step": 55100
},
{
"epoch": 16.055325840079202,
"grad_norm": 0.39493605494499207,
"learning_rate": 0.0004075770463151762,
"loss": 3.2216,
"step": 55150
},
{
"epoch": 16.069885271678995,
"grad_norm": 0.3837626278400421,
"learning_rate": 0.000407402272065249,
"loss": 3.2114,
"step": 55200
},
{
"epoch": 16.084444703278784,
"grad_norm": 0.42783597111701965,
"learning_rate": 0.0004072274978153218,
"loss": 3.2084,
"step": 55250
},
{
"epoch": 16.099004134878573,
"grad_norm": 0.41208311915397644,
"learning_rate": 0.00040705272356539464,
"loss": 3.2166,
"step": 55300
},
{
"epoch": 16.113563566478366,
"grad_norm": 0.4104818105697632,
"learning_rate": 0.0004068779493154675,
"loss": 3.2429,
"step": 55350
},
{
"epoch": 16.128122998078155,
"grad_norm": 0.3684764802455902,
"learning_rate": 0.0004067031750655403,
"loss": 3.2416,
"step": 55400
},
{
"epoch": 16.142682429677944,
"grad_norm": 0.38913047313690186,
"learning_rate": 0.0004065284008156131,
"loss": 3.2386,
"step": 55450
},
{
"epoch": 16.157241861277736,
"grad_norm": 0.3729836344718933,
"learning_rate": 0.0004063536265656859,
"loss": 3.2345,
"step": 55500
},
{
"epoch": 16.171801292877525,
"grad_norm": 0.3831164240837097,
"learning_rate": 0.00040617885231575876,
"loss": 3.2205,
"step": 55550
},
{
"epoch": 16.186360724477318,
"grad_norm": 0.370057612657547,
"learning_rate": 0.0004060040780658316,
"loss": 3.2449,
"step": 55600
},
{
"epoch": 16.200920156077107,
"grad_norm": 0.4026546776294708,
"learning_rate": 0.0004058293038159044,
"loss": 3.2473,
"step": 55650
},
{
"epoch": 16.215479587676896,
"grad_norm": 0.3730154037475586,
"learning_rate": 0.0004056545295659772,
"loss": 3.2499,
"step": 55700
},
{
"epoch": 16.23003901927669,
"grad_norm": 0.40726903080940247,
"learning_rate": 0.00040547975531605003,
"loss": 3.2402,
"step": 55750
},
{
"epoch": 16.244598450876477,
"grad_norm": 0.36538970470428467,
"learning_rate": 0.00040530498106612287,
"loss": 3.2381,
"step": 55800
},
{
"epoch": 16.259157882476266,
"grad_norm": 0.4038563668727875,
"learning_rate": 0.0004051302068161957,
"loss": 3.2464,
"step": 55850
},
{
"epoch": 16.27371731407606,
"grad_norm": 0.3808690905570984,
"learning_rate": 0.0004049554325662686,
"loss": 3.2503,
"step": 55900
},
{
"epoch": 16.288276745675848,
"grad_norm": 0.39795416593551636,
"learning_rate": 0.00040478065831634136,
"loss": 3.2531,
"step": 55950
},
{
"epoch": 16.30283617727564,
"grad_norm": 0.3833180367946625,
"learning_rate": 0.0004046058840664142,
"loss": 3.2589,
"step": 56000
},
{
"epoch": 16.30283617727564,
"eval_accuracy": 0.3717569359449322,
"eval_loss": 3.5537829399108887,
"eval_runtime": 180.812,
"eval_samples_per_second": 92.046,
"eval_steps_per_second": 5.757,
"step": 56000
},
{
"epoch": 16.31739560887543,
"grad_norm": 0.38893625140190125,
"learning_rate": 0.00040443110981648703,
"loss": 3.2673,
"step": 56050
},
{
"epoch": 16.33195504047522,
"grad_norm": 0.3620428144931793,
"learning_rate": 0.00040425633556655986,
"loss": 3.2631,
"step": 56100
},
{
"epoch": 16.34651447207501,
"grad_norm": 0.3953818082809448,
"learning_rate": 0.0004040815613166327,
"loss": 3.253,
"step": 56150
},
{
"epoch": 16.3610739036748,
"grad_norm": 0.42537441849708557,
"learning_rate": 0.0004039067870667055,
"loss": 3.2594,
"step": 56200
},
{
"epoch": 16.375633335274593,
"grad_norm": 0.39338961243629456,
"learning_rate": 0.0004037320128167783,
"loss": 3.266,
"step": 56250
},
{
"epoch": 16.39019276687438,
"grad_norm": 0.4023808538913727,
"learning_rate": 0.00040355723856685114,
"loss": 3.2705,
"step": 56300
},
{
"epoch": 16.40475219847417,
"grad_norm": 0.4043920934200287,
"learning_rate": 0.000403382464316924,
"loss": 3.2644,
"step": 56350
},
{
"epoch": 16.419311630073963,
"grad_norm": 0.3746441602706909,
"learning_rate": 0.00040320769006699675,
"loss": 3.2603,
"step": 56400
},
{
"epoch": 16.433871061673752,
"grad_norm": 0.4176045060157776,
"learning_rate": 0.0004030329158170696,
"loss": 3.2748,
"step": 56450
},
{
"epoch": 16.44843049327354,
"grad_norm": 0.3769119381904602,
"learning_rate": 0.0004028581415671424,
"loss": 3.2631,
"step": 56500
},
{
"epoch": 16.462989924873334,
"grad_norm": 0.4049900770187378,
"learning_rate": 0.00040268336731721525,
"loss": 3.2749,
"step": 56550
},
{
"epoch": 16.477549356473123,
"grad_norm": 0.3694433569908142,
"learning_rate": 0.0004025085930672881,
"loss": 3.2737,
"step": 56600
},
{
"epoch": 16.492108788072915,
"grad_norm": 0.38329896330833435,
"learning_rate": 0.00040233381881736086,
"loss": 3.2764,
"step": 56650
},
{
"epoch": 16.506668219672704,
"grad_norm": 0.3678717613220215,
"learning_rate": 0.0004021590445674337,
"loss": 3.2785,
"step": 56700
},
{
"epoch": 16.521227651272493,
"grad_norm": 0.42633742094039917,
"learning_rate": 0.00040198427031750653,
"loss": 3.2742,
"step": 56750
},
{
"epoch": 16.535787082872286,
"grad_norm": 0.38539767265319824,
"learning_rate": 0.00040180949606757936,
"loss": 3.2687,
"step": 56800
},
{
"epoch": 16.550346514472075,
"grad_norm": 0.39155086874961853,
"learning_rate": 0.0004016347218176522,
"loss": 3.2782,
"step": 56850
},
{
"epoch": 16.564905946071864,
"grad_norm": 0.4056413471698761,
"learning_rate": 0.000401459947567725,
"loss": 3.2825,
"step": 56900
},
{
"epoch": 16.579465377671657,
"grad_norm": 0.3961426019668579,
"learning_rate": 0.0004012851733177978,
"loss": 3.2809,
"step": 56950
},
{
"epoch": 16.594024809271446,
"grad_norm": 0.37948471307754517,
"learning_rate": 0.00040111039906787064,
"loss": 3.2819,
"step": 57000
},
{
"epoch": 16.594024809271446,
"eval_accuracy": 0.3720454865870397,
"eval_loss": 3.5462021827697754,
"eval_runtime": 180.5198,
"eval_samples_per_second": 92.195,
"eval_steps_per_second": 5.767,
"step": 57000
},
{
"epoch": 16.608584240871238,
"grad_norm": 0.3997423052787781,
"learning_rate": 0.0004009356248179435,
"loss": 3.2757,
"step": 57050
},
{
"epoch": 16.623143672471027,
"grad_norm": 0.38000285625457764,
"learning_rate": 0.00040076085056801625,
"loss": 3.2857,
"step": 57100
},
{
"epoch": 16.637703104070816,
"grad_norm": 0.36798590421676636,
"learning_rate": 0.0004005860763180891,
"loss": 3.2861,
"step": 57150
},
{
"epoch": 16.65226253567061,
"grad_norm": 0.38152462244033813,
"learning_rate": 0.0004004113020681619,
"loss": 3.2859,
"step": 57200
},
{
"epoch": 16.666821967270398,
"grad_norm": 0.4101053774356842,
"learning_rate": 0.00040023652781823475,
"loss": 3.2847,
"step": 57250
},
{
"epoch": 16.681381398870187,
"grad_norm": 0.37895187735557556,
"learning_rate": 0.0004000617535683076,
"loss": 3.286,
"step": 57300
},
{
"epoch": 16.69594083046998,
"grad_norm": 0.4000397324562073,
"learning_rate": 0.00039988697931838036,
"loss": 3.2801,
"step": 57350
},
{
"epoch": 16.71050026206977,
"grad_norm": 0.3596523404121399,
"learning_rate": 0.0003997122050684532,
"loss": 3.2806,
"step": 57400
},
{
"epoch": 16.72505969366956,
"grad_norm": 0.40595123171806335,
"learning_rate": 0.00039953743081852603,
"loss": 3.2842,
"step": 57450
},
{
"epoch": 16.73961912526935,
"grad_norm": 0.3860524594783783,
"learning_rate": 0.00039936265656859886,
"loss": 3.284,
"step": 57500
},
{
"epoch": 16.75417855686914,
"grad_norm": 0.3636539876461029,
"learning_rate": 0.0003991878823186717,
"loss": 3.2895,
"step": 57550
},
{
"epoch": 16.76873798846893,
"grad_norm": 0.3696141242980957,
"learning_rate": 0.0003990131080687445,
"loss": 3.2896,
"step": 57600
},
{
"epoch": 16.78329742006872,
"grad_norm": 0.3895752429962158,
"learning_rate": 0.0003988383338188173,
"loss": 3.287,
"step": 57650
},
{
"epoch": 16.79785685166851,
"grad_norm": 0.393362820148468,
"learning_rate": 0.00039866355956889014,
"loss": 3.3037,
"step": 57700
},
{
"epoch": 16.812416283268302,
"grad_norm": 0.379912406206131,
"learning_rate": 0.000398488785318963,
"loss": 3.2958,
"step": 57750
},
{
"epoch": 16.82697571486809,
"grad_norm": 0.37024620175361633,
"learning_rate": 0.00039831401106903575,
"loss": 3.2925,
"step": 57800
},
{
"epoch": 16.841535146467884,
"grad_norm": 0.38401541113853455,
"learning_rate": 0.0003981392368191086,
"loss": 3.296,
"step": 57850
},
{
"epoch": 16.856094578067673,
"grad_norm": 0.3762282729148865,
"learning_rate": 0.0003979644625691814,
"loss": 3.2997,
"step": 57900
},
{
"epoch": 16.87065400966746,
"grad_norm": 0.38887494802474976,
"learning_rate": 0.00039778968831925425,
"loss": 3.295,
"step": 57950
},
{
"epoch": 16.885213441267254,
"grad_norm": 0.3960239589214325,
"learning_rate": 0.0003976149140693271,
"loss": 3.2991,
"step": 58000
},
{
"epoch": 16.885213441267254,
"eval_accuracy": 0.37233003937999004,
"eval_loss": 3.538398265838623,
"eval_runtime": 180.9439,
"eval_samples_per_second": 91.979,
"eval_steps_per_second": 5.753,
"step": 58000
},
{
"epoch": 16.899772872867043,
"grad_norm": 0.35838446021080017,
"learning_rate": 0.00039744013981939986,
"loss": 3.2993,
"step": 58050
},
{
"epoch": 16.914332304466832,
"grad_norm": 0.3769555985927582,
"learning_rate": 0.0003972653655694727,
"loss": 3.2954,
"step": 58100
},
{
"epoch": 16.928891736066625,
"grad_norm": 0.3769146800041199,
"learning_rate": 0.00039709059131954553,
"loss": 3.3055,
"step": 58150
},
{
"epoch": 16.943451167666414,
"grad_norm": 0.37942448258399963,
"learning_rate": 0.00039691581706961836,
"loss": 3.3018,
"step": 58200
},
{
"epoch": 16.958010599266206,
"grad_norm": 0.3871458172798157,
"learning_rate": 0.0003967410428196912,
"loss": 3.3066,
"step": 58250
},
{
"epoch": 16.972570030865995,
"grad_norm": 0.37447428703308105,
"learning_rate": 0.000396566268569764,
"loss": 3.2896,
"step": 58300
},
{
"epoch": 16.987129462465784,
"grad_norm": 0.39451682567596436,
"learning_rate": 0.0003963914943198368,
"loss": 3.294,
"step": 58350
},
{
"epoch": 17.00145594315998,
"grad_norm": 0.37564557790756226,
"learning_rate": 0.00039621672006990964,
"loss": 3.2853,
"step": 58400
},
{
"epoch": 17.01601537475977,
"grad_norm": 0.3866485357284546,
"learning_rate": 0.0003960419458199825,
"loss": 3.1838,
"step": 58450
},
{
"epoch": 17.03057480635956,
"grad_norm": 0.39509961009025574,
"learning_rate": 0.00039586717157005536,
"loss": 3.1882,
"step": 58500
},
{
"epoch": 17.04513423795935,
"grad_norm": 0.40553566813468933,
"learning_rate": 0.00039569239732012814,
"loss": 3.2081,
"step": 58550
},
{
"epoch": 17.05969366955914,
"grad_norm": 0.3974907100200653,
"learning_rate": 0.00039551762307020097,
"loss": 3.201,
"step": 58600
},
{
"epoch": 17.07425310115893,
"grad_norm": 0.4323204457759857,
"learning_rate": 0.0003953428488202738,
"loss": 3.2201,
"step": 58650
},
{
"epoch": 17.08881253275872,
"grad_norm": 0.39381760358810425,
"learning_rate": 0.00039516807457034664,
"loss": 3.2085,
"step": 58700
},
{
"epoch": 17.103371964358512,
"grad_norm": 0.3874780237674713,
"learning_rate": 0.00039499330032041947,
"loss": 3.2036,
"step": 58750
},
{
"epoch": 17.1179313959583,
"grad_norm": 0.3998047411441803,
"learning_rate": 0.00039481852607049225,
"loss": 3.2124,
"step": 58800
},
{
"epoch": 17.132490827558094,
"grad_norm": 0.37455350160598755,
"learning_rate": 0.0003946437518205651,
"loss": 3.2184,
"step": 58850
},
{
"epoch": 17.147050259157883,
"grad_norm": 0.4187104403972626,
"learning_rate": 0.0003944689775706379,
"loss": 3.2301,
"step": 58900
},
{
"epoch": 17.161609690757672,
"grad_norm": 0.3866749107837677,
"learning_rate": 0.00039429420332071075,
"loss": 3.2204,
"step": 58950
},
{
"epoch": 17.176169122357464,
"grad_norm": 0.39651066064834595,
"learning_rate": 0.0003941194290707836,
"loss": 3.2153,
"step": 59000
},
{
"epoch": 17.176169122357464,
"eval_accuracy": 0.3716996726349459,
"eval_loss": 3.555577278137207,
"eval_runtime": 181.0557,
"eval_samples_per_second": 91.922,
"eval_steps_per_second": 5.75,
"step": 59000
},
{
"epoch": 17.190728553957253,
"grad_norm": 0.41101446747779846,
"learning_rate": 0.00039394465482085636,
"loss": 3.2375,
"step": 59050
},
{
"epoch": 17.205287985557042,
"grad_norm": 0.400997132062912,
"learning_rate": 0.0003937698805709292,
"loss": 3.2352,
"step": 59100
},
{
"epoch": 17.219847417156835,
"grad_norm": 0.4068007171154022,
"learning_rate": 0.000393595106321002,
"loss": 3.2375,
"step": 59150
},
{
"epoch": 17.234406848756624,
"grad_norm": 0.38392940163612366,
"learning_rate": 0.00039342033207107486,
"loss": 3.2337,
"step": 59200
},
{
"epoch": 17.248966280356417,
"grad_norm": 0.38920333981513977,
"learning_rate": 0.00039324555782114764,
"loss": 3.2361,
"step": 59250
},
{
"epoch": 17.263525711956206,
"grad_norm": 0.408083438873291,
"learning_rate": 0.00039307078357122047,
"loss": 3.2247,
"step": 59300
},
{
"epoch": 17.278085143555995,
"grad_norm": 0.43184590339660645,
"learning_rate": 0.0003928960093212933,
"loss": 3.245,
"step": 59350
},
{
"epoch": 17.292644575155787,
"grad_norm": 0.39720863103866577,
"learning_rate": 0.00039272123507136614,
"loss": 3.2417,
"step": 59400
},
{
"epoch": 17.307204006755576,
"grad_norm": 0.38785409927368164,
"learning_rate": 0.00039254646082143897,
"loss": 3.2473,
"step": 59450
},
{
"epoch": 17.321763438355365,
"grad_norm": 0.37579146027565,
"learning_rate": 0.00039237168657151175,
"loss": 3.2439,
"step": 59500
},
{
"epoch": 17.336322869955158,
"grad_norm": 0.4311056435108185,
"learning_rate": 0.0003921969123215846,
"loss": 3.2639,
"step": 59550
},
{
"epoch": 17.350882301554947,
"grad_norm": 0.40039804577827454,
"learning_rate": 0.0003920221380716574,
"loss": 3.2553,
"step": 59600
},
{
"epoch": 17.36544173315474,
"grad_norm": 0.37928736209869385,
"learning_rate": 0.00039184736382173025,
"loss": 3.2459,
"step": 59650
},
{
"epoch": 17.38000116475453,
"grad_norm": 0.3739273250102997,
"learning_rate": 0.000391672589571803,
"loss": 3.2557,
"step": 59700
},
{
"epoch": 17.394560596354317,
"grad_norm": 0.453727662563324,
"learning_rate": 0.00039149781532187586,
"loss": 3.2579,
"step": 59750
},
{
"epoch": 17.40912002795411,
"grad_norm": 0.37308287620544434,
"learning_rate": 0.0003913230410719487,
"loss": 3.249,
"step": 59800
},
{
"epoch": 17.4236794595539,
"grad_norm": 0.40976881980895996,
"learning_rate": 0.0003911482668220215,
"loss": 3.2503,
"step": 59850
},
{
"epoch": 17.438238891153688,
"grad_norm": 0.4195505380630493,
"learning_rate": 0.00039097349257209436,
"loss": 3.245,
"step": 59900
},
{
"epoch": 17.45279832275348,
"grad_norm": 0.4036107361316681,
"learning_rate": 0.00039079871832216714,
"loss": 3.2565,
"step": 59950
},
{
"epoch": 17.46735775435327,
"grad_norm": 0.374733030796051,
"learning_rate": 0.00039062394407223997,
"loss": 3.2642,
"step": 60000
},
{
"epoch": 17.46735775435327,
"eval_accuracy": 0.3723462659442161,
"eval_loss": 3.5491600036621094,
"eval_runtime": 180.875,
"eval_samples_per_second": 92.014,
"eval_steps_per_second": 5.755,
"step": 60000
},
{
"epoch": 17.481917185953062,
"grad_norm": 0.38882243633270264,
"learning_rate": 0.0003904491698223128,
"loss": 3.2628,
"step": 60050
},
{
"epoch": 17.49647661755285,
"grad_norm": 0.37586531043052673,
"learning_rate": 0.00039027439557238564,
"loss": 3.2443,
"step": 60100
},
{
"epoch": 17.51103604915264,
"grad_norm": 0.379884272813797,
"learning_rate": 0.00039009962132245847,
"loss": 3.2743,
"step": 60150
},
{
"epoch": 17.525595480752433,
"grad_norm": 0.411455363035202,
"learning_rate": 0.00038992484707253125,
"loss": 3.265,
"step": 60200
},
{
"epoch": 17.54015491235222,
"grad_norm": 0.4151667058467865,
"learning_rate": 0.0003897500728226041,
"loss": 3.2711,
"step": 60250
},
{
"epoch": 17.55471434395201,
"grad_norm": 0.37580248713493347,
"learning_rate": 0.0003895752985726769,
"loss": 3.2486,
"step": 60300
},
{
"epoch": 17.569273775551803,
"grad_norm": 0.41014158725738525,
"learning_rate": 0.00038940052432274975,
"loss": 3.2664,
"step": 60350
},
{
"epoch": 17.583833207151592,
"grad_norm": 0.3746136724948883,
"learning_rate": 0.0003892257500728225,
"loss": 3.2613,
"step": 60400
},
{
"epoch": 17.598392638751385,
"grad_norm": 0.41069844365119934,
"learning_rate": 0.00038905097582289536,
"loss": 3.2718,
"step": 60450
},
{
"epoch": 17.612952070351174,
"grad_norm": 0.39283448457717896,
"learning_rate": 0.0003888762015729682,
"loss": 3.2655,
"step": 60500
},
{
"epoch": 17.627511501950963,
"grad_norm": 0.40771397948265076,
"learning_rate": 0.000388701427323041,
"loss": 3.2694,
"step": 60550
},
{
"epoch": 17.642070933550755,
"grad_norm": 0.42583101987838745,
"learning_rate": 0.00038852665307311386,
"loss": 3.2694,
"step": 60600
},
{
"epoch": 17.656630365150544,
"grad_norm": 0.38941916823387146,
"learning_rate": 0.00038835187882318664,
"loss": 3.2727,
"step": 60650
},
{
"epoch": 17.671189796750333,
"grad_norm": 0.4130733907222748,
"learning_rate": 0.00038817710457325947,
"loss": 3.271,
"step": 60700
},
{
"epoch": 17.685749228350126,
"grad_norm": 0.3779431879520416,
"learning_rate": 0.0003880023303233323,
"loss": 3.2729,
"step": 60750
},
{
"epoch": 17.700308659949915,
"grad_norm": 0.3987460434436798,
"learning_rate": 0.00038782755607340514,
"loss": 3.281,
"step": 60800
},
{
"epoch": 17.714868091549707,
"grad_norm": 0.41353845596313477,
"learning_rate": 0.00038765278182347797,
"loss": 3.2826,
"step": 60850
},
{
"epoch": 17.729427523149496,
"grad_norm": 0.3690580129623413,
"learning_rate": 0.00038747800757355075,
"loss": 3.2909,
"step": 60900
},
{
"epoch": 17.743986954749285,
"grad_norm": 0.39012983441352844,
"learning_rate": 0.00038730323332362363,
"loss": 3.289,
"step": 60950
},
{
"epoch": 17.758546386349078,
"grad_norm": 0.3791520893573761,
"learning_rate": 0.00038712845907369647,
"loss": 3.2743,
"step": 61000
},
{
"epoch": 17.758546386349078,
"eval_accuracy": 0.37214919549752823,
"eval_loss": 3.54073166847229,
"eval_runtime": 181.4366,
"eval_samples_per_second": 91.729,
"eval_steps_per_second": 5.738,
"step": 61000
},
{
"epoch": 17.773105817948867,
"grad_norm": 0.42223140597343445,
"learning_rate": 0.0003869536848237693,
"loss": 3.2712,
"step": 61050
},
{
"epoch": 17.787665249548656,
"grad_norm": 0.39836451411247253,
"learning_rate": 0.00038677891057384213,
"loss": 3.2741,
"step": 61100
},
{
"epoch": 17.80222468114845,
"grad_norm": 0.3820720314979553,
"learning_rate": 0.0003866041363239149,
"loss": 3.2734,
"step": 61150
},
{
"epoch": 17.816784112748238,
"grad_norm": 0.3743707239627838,
"learning_rate": 0.00038642936207398774,
"loss": 3.2878,
"step": 61200
},
{
"epoch": 17.83134354434803,
"grad_norm": 0.40424010157585144,
"learning_rate": 0.0003862545878240606,
"loss": 3.2707,
"step": 61250
},
{
"epoch": 17.84590297594782,
"grad_norm": 0.3789885640144348,
"learning_rate": 0.0003860798135741334,
"loss": 3.2847,
"step": 61300
},
{
"epoch": 17.860462407547608,
"grad_norm": 0.3790980875492096,
"learning_rate": 0.00038590503932420624,
"loss": 3.2678,
"step": 61350
},
{
"epoch": 17.8750218391474,
"grad_norm": 0.3928091526031494,
"learning_rate": 0.000385730265074279,
"loss": 3.2864,
"step": 61400
},
{
"epoch": 17.88958127074719,
"grad_norm": 0.43650928139686584,
"learning_rate": 0.00038555549082435186,
"loss": 3.2733,
"step": 61450
},
{
"epoch": 17.90414070234698,
"grad_norm": 0.3985980153083801,
"learning_rate": 0.0003853807165744247,
"loss": 3.2887,
"step": 61500
},
{
"epoch": 17.91870013394677,
"grad_norm": 0.38238638639450073,
"learning_rate": 0.0003852059423244975,
"loss": 3.2964,
"step": 61550
},
{
"epoch": 17.93325956554656,
"grad_norm": 0.4149417281150818,
"learning_rate": 0.00038503116807457035,
"loss": 3.2798,
"step": 61600
},
{
"epoch": 17.947818997146353,
"grad_norm": 0.3858490288257599,
"learning_rate": 0.00038485639382464313,
"loss": 3.286,
"step": 61650
},
{
"epoch": 17.962378428746142,
"grad_norm": 0.41513580083847046,
"learning_rate": 0.00038468161957471597,
"loss": 3.2882,
"step": 61700
},
{
"epoch": 17.97693786034593,
"grad_norm": 0.3717535436153412,
"learning_rate": 0.0003845068453247888,
"loss": 3.286,
"step": 61750
},
{
"epoch": 17.991497291945723,
"grad_norm": 0.4167464077472687,
"learning_rate": 0.00038433207107486163,
"loss": 3.2957,
"step": 61800
},
{
"epoch": 18.005823772639918,
"grad_norm": 0.39267608523368835,
"learning_rate": 0.0003841572968249344,
"loss": 3.2381,
"step": 61850
},
{
"epoch": 18.020383204239707,
"grad_norm": 0.3998408019542694,
"learning_rate": 0.00038398252257500724,
"loss": 3.1779,
"step": 61900
},
{
"epoch": 18.034942635839496,
"grad_norm": 0.38134950399398804,
"learning_rate": 0.0003838077483250801,
"loss": 3.1746,
"step": 61950
},
{
"epoch": 18.04950206743929,
"grad_norm": 0.38700148463249207,
"learning_rate": 0.0003836329740751529,
"loss": 3.1953,
"step": 62000
},
{
"epoch": 18.04950206743929,
"eval_accuracy": 0.3720806441428629,
"eval_loss": 3.5562655925750732,
"eval_runtime": 181.185,
"eval_samples_per_second": 91.856,
"eval_steps_per_second": 5.746,
"step": 62000
},
{
"epoch": 18.064061499039077,
"grad_norm": 0.4167320132255554,
"learning_rate": 0.00038345819982522574,
"loss": 3.1947,
"step": 62050
},
{
"epoch": 18.078620930638866,
"grad_norm": 0.4142136573791504,
"learning_rate": 0.0003832834255752985,
"loss": 3.1952,
"step": 62100
},
{
"epoch": 18.09318036223866,
"grad_norm": 0.3881548345088959,
"learning_rate": 0.00038310865132537135,
"loss": 3.2028,
"step": 62150
},
{
"epoch": 18.107739793838448,
"grad_norm": 0.41077175736427307,
"learning_rate": 0.0003829338770754442,
"loss": 3.2023,
"step": 62200
},
{
"epoch": 18.12229922543824,
"grad_norm": 0.38895899057388306,
"learning_rate": 0.000382759102825517,
"loss": 3.1963,
"step": 62250
},
{
"epoch": 18.13685865703803,
"grad_norm": 0.40407344698905945,
"learning_rate": 0.00038258432857558985,
"loss": 3.2109,
"step": 62300
},
{
"epoch": 18.15141808863782,
"grad_norm": 0.38854244351387024,
"learning_rate": 0.00038240955432566263,
"loss": 3.2107,
"step": 62350
},
{
"epoch": 18.16597752023761,
"grad_norm": 0.3986676037311554,
"learning_rate": 0.00038223478007573547,
"loss": 3.2017,
"step": 62400
},
{
"epoch": 18.1805369518374,
"grad_norm": 0.38370537757873535,
"learning_rate": 0.0003820600058258083,
"loss": 3.212,
"step": 62450
},
{
"epoch": 18.19509638343719,
"grad_norm": 0.3775344491004944,
"learning_rate": 0.00038188523157588113,
"loss": 3.2255,
"step": 62500
},
{
"epoch": 18.20965581503698,
"grad_norm": 0.4089907705783844,
"learning_rate": 0.0003817104573259539,
"loss": 3.2281,
"step": 62550
},
{
"epoch": 18.22421524663677,
"grad_norm": 0.4235895574092865,
"learning_rate": 0.00038153568307602674,
"loss": 3.2173,
"step": 62600
},
{
"epoch": 18.238774678236563,
"grad_norm": 0.41684481501579285,
"learning_rate": 0.0003813609088260996,
"loss": 3.2157,
"step": 62650
},
{
"epoch": 18.253334109836352,
"grad_norm": 0.3882802724838257,
"learning_rate": 0.0003811861345761724,
"loss": 3.2313,
"step": 62700
},
{
"epoch": 18.26789354143614,
"grad_norm": 0.4060615599155426,
"learning_rate": 0.00038101136032624524,
"loss": 3.2251,
"step": 62750
},
{
"epoch": 18.282452973035934,
"grad_norm": 0.40829920768737793,
"learning_rate": 0.000380836586076318,
"loss": 3.2242,
"step": 62800
},
{
"epoch": 18.297012404635723,
"grad_norm": 0.39912256598472595,
"learning_rate": 0.00038066181182639085,
"loss": 3.2388,
"step": 62850
},
{
"epoch": 18.31157183623551,
"grad_norm": 0.41347721219062805,
"learning_rate": 0.0003804870375764637,
"loss": 3.2414,
"step": 62900
},
{
"epoch": 18.326131267835304,
"grad_norm": 0.37032246589660645,
"learning_rate": 0.0003803122633265365,
"loss": 3.2287,
"step": 62950
},
{
"epoch": 18.340690699435093,
"grad_norm": 0.38557493686676025,
"learning_rate": 0.0003801374890766093,
"loss": 3.2446,
"step": 63000
},
{
"epoch": 18.340690699435093,
"eval_accuracy": 0.3720570097993162,
"eval_loss": 3.5540997982025146,
"eval_runtime": 181.3354,
"eval_samples_per_second": 91.78,
"eval_steps_per_second": 5.741,
"step": 63000
},
{
"epoch": 18.355250131034886,
"grad_norm": 0.4208027422428131,
"learning_rate": 0.00037996271482668213,
"loss": 3.2347,
"step": 63050
},
{
"epoch": 18.369809562634675,
"grad_norm": 0.42822667956352234,
"learning_rate": 0.00037978794057675497,
"loss": 3.2171,
"step": 63100
},
{
"epoch": 18.384368994234464,
"grad_norm": 0.39856502413749695,
"learning_rate": 0.0003796131663268278,
"loss": 3.2528,
"step": 63150
},
{
"epoch": 18.398928425834256,
"grad_norm": 0.38449880480766296,
"learning_rate": 0.00037943839207690063,
"loss": 3.2357,
"step": 63200
},
{
"epoch": 18.413487857434045,
"grad_norm": 0.3990757167339325,
"learning_rate": 0.0003792636178269734,
"loss": 3.2576,
"step": 63250
},
{
"epoch": 18.428047289033834,
"grad_norm": 0.4236275255680084,
"learning_rate": 0.00037908884357704624,
"loss": 3.2358,
"step": 63300
},
{
"epoch": 18.442606720633627,
"grad_norm": 0.39217713475227356,
"learning_rate": 0.0003789140693271191,
"loss": 3.24,
"step": 63350
},
{
"epoch": 18.457166152233416,
"grad_norm": 0.4095120131969452,
"learning_rate": 0.0003787392950771919,
"loss": 3.2689,
"step": 63400
},
{
"epoch": 18.47172558383321,
"grad_norm": 0.4244193136692047,
"learning_rate": 0.00037856452082726474,
"loss": 3.2418,
"step": 63450
},
{
"epoch": 18.486285015432998,
"grad_norm": 0.3923884332180023,
"learning_rate": 0.00037838974657733763,
"loss": 3.2559,
"step": 63500
},
{
"epoch": 18.500844447032787,
"grad_norm": 0.3991324007511139,
"learning_rate": 0.0003782149723274104,
"loss": 3.2467,
"step": 63550
},
{
"epoch": 18.51540387863258,
"grad_norm": 0.37614545226097107,
"learning_rate": 0.00037804019807748324,
"loss": 3.2485,
"step": 63600
},
{
"epoch": 18.529963310232368,
"grad_norm": 0.40092289447784424,
"learning_rate": 0.0003778654238275561,
"loss": 3.2604,
"step": 63650
},
{
"epoch": 18.544522741832157,
"grad_norm": 0.3537710905075073,
"learning_rate": 0.0003776906495776289,
"loss": 3.2492,
"step": 63700
},
{
"epoch": 18.55908217343195,
"grad_norm": 0.3787946105003357,
"learning_rate": 0.0003775158753277017,
"loss": 3.2538,
"step": 63750
},
{
"epoch": 18.57364160503174,
"grad_norm": 0.38140779733657837,
"learning_rate": 0.0003773411010777745,
"loss": 3.2461,
"step": 63800
},
{
"epoch": 18.58820103663153,
"grad_norm": 0.3862013518810272,
"learning_rate": 0.00037716632682784735,
"loss": 3.2435,
"step": 63850
},
{
"epoch": 18.60276046823132,
"grad_norm": 0.41837623715400696,
"learning_rate": 0.0003769915525779202,
"loss": 3.2602,
"step": 63900
},
{
"epoch": 18.61731989983111,
"grad_norm": 0.42126792669296265,
"learning_rate": 0.000376816778327993,
"loss": 3.2575,
"step": 63950
},
{
"epoch": 18.631879331430902,
"grad_norm": 0.38360005617141724,
"learning_rate": 0.0003766420040780658,
"loss": 3.263,
"step": 64000
},
{
"epoch": 18.631879331430902,
"eval_accuracy": 0.3726539827455182,
"eval_loss": 3.5427675247192383,
"eval_runtime": 180.7526,
"eval_samples_per_second": 92.076,
"eval_steps_per_second": 5.759,
"step": 64000
},
{
"epoch": 18.64643876303069,
"grad_norm": 0.38449525833129883,
"learning_rate": 0.00037646722982813863,
"loss": 3.2636,
"step": 64050
},
{
"epoch": 18.66099819463048,
"grad_norm": 0.39993739128112793,
"learning_rate": 0.00037629245557821146,
"loss": 3.2687,
"step": 64100
},
{
"epoch": 18.675557626230272,
"grad_norm": 0.44774314761161804,
"learning_rate": 0.0003761176813282843,
"loss": 3.264,
"step": 64150
},
{
"epoch": 18.69011705783006,
"grad_norm": 0.4117605686187744,
"learning_rate": 0.00037594290707835713,
"loss": 3.2523,
"step": 64200
},
{
"epoch": 18.704676489429854,
"grad_norm": 0.41900861263275146,
"learning_rate": 0.0003757681328284299,
"loss": 3.2652,
"step": 64250
},
{
"epoch": 18.719235921029643,
"grad_norm": 0.3894200921058655,
"learning_rate": 0.00037559335857850274,
"loss": 3.2627,
"step": 64300
},
{
"epoch": 18.733795352629432,
"grad_norm": 0.3939887583255768,
"learning_rate": 0.0003754185843285756,
"loss": 3.2714,
"step": 64350
},
{
"epoch": 18.748354784229225,
"grad_norm": 0.3855105936527252,
"learning_rate": 0.0003752438100786484,
"loss": 3.2576,
"step": 64400
},
{
"epoch": 18.762914215829014,
"grad_norm": 0.40547657012939453,
"learning_rate": 0.0003750690358287212,
"loss": 3.2643,
"step": 64450
},
{
"epoch": 18.777473647428806,
"grad_norm": 0.3899041414260864,
"learning_rate": 0.000374894261578794,
"loss": 3.2565,
"step": 64500
},
{
"epoch": 18.792033079028595,
"grad_norm": 0.3877187669277191,
"learning_rate": 0.00037471948732886685,
"loss": 3.2734,
"step": 64550
},
{
"epoch": 18.806592510628384,
"grad_norm": 0.4046708047389984,
"learning_rate": 0.0003745447130789397,
"loss": 3.2639,
"step": 64600
},
{
"epoch": 18.821151942228177,
"grad_norm": 0.3916832208633423,
"learning_rate": 0.0003743699388290125,
"loss": 3.2698,
"step": 64650
},
{
"epoch": 18.835711373827966,
"grad_norm": 0.3999544382095337,
"learning_rate": 0.0003741951645790853,
"loss": 3.2824,
"step": 64700
},
{
"epoch": 18.850270805427755,
"grad_norm": 0.4189528524875641,
"learning_rate": 0.00037402039032915813,
"loss": 3.2592,
"step": 64750
},
{
"epoch": 18.864830237027547,
"grad_norm": 0.39608845114707947,
"learning_rate": 0.00037384561607923096,
"loss": 3.2647,
"step": 64800
},
{
"epoch": 18.879389668627336,
"grad_norm": 0.36444324254989624,
"learning_rate": 0.0003736708418293038,
"loss": 3.2628,
"step": 64850
},
{
"epoch": 18.893949100227125,
"grad_norm": 0.39453035593032837,
"learning_rate": 0.00037349606757937663,
"loss": 3.2773,
"step": 64900
},
{
"epoch": 18.908508531826918,
"grad_norm": 0.4110398590564728,
"learning_rate": 0.0003733212933294494,
"loss": 3.2737,
"step": 64950
},
{
"epoch": 18.923067963426707,
"grad_norm": 0.37432223558425903,
"learning_rate": 0.00037314651907952224,
"loss": 3.2807,
"step": 65000
},
{
"epoch": 18.923067963426707,
"eval_accuracy": 0.3731833450074448,
"eval_loss": 3.5371851921081543,
"eval_runtime": 181.3913,
"eval_samples_per_second": 91.752,
"eval_steps_per_second": 5.739,
"step": 65000
},
{
"epoch": 18.9376273950265,
"grad_norm": 0.38230255246162415,
"learning_rate": 0.00037297174482959507,
"loss": 3.2801,
"step": 65050
},
{
"epoch": 18.95218682662629,
"grad_norm": 0.4243530035018921,
"learning_rate": 0.0003727969705796679,
"loss": 3.2761,
"step": 65100
},
{
"epoch": 18.966746258226078,
"grad_norm": 0.4133412539958954,
"learning_rate": 0.0003726221963297407,
"loss": 3.2716,
"step": 65150
},
{
"epoch": 18.98130568982587,
"grad_norm": 0.3846762180328369,
"learning_rate": 0.0003724474220798135,
"loss": 3.2876,
"step": 65200
},
{
"epoch": 18.99586512142566,
"grad_norm": 0.422029972076416,
"learning_rate": 0.00037227264782988635,
"loss": 3.275,
"step": 65250
},
{
"epoch": 19.010191602119853,
"grad_norm": 0.40092238783836365,
"learning_rate": 0.0003720978735799592,
"loss": 3.2038,
"step": 65300
},
{
"epoch": 19.024751033719642,
"grad_norm": 0.356789767742157,
"learning_rate": 0.000371923099330032,
"loss": 3.1628,
"step": 65350
},
{
"epoch": 19.039310465319435,
"grad_norm": 0.3983159363269806,
"learning_rate": 0.0003717483250801048,
"loss": 3.1639,
"step": 65400
},
{
"epoch": 19.053869896919224,
"grad_norm": 0.4232736825942993,
"learning_rate": 0.00037157355083017763,
"loss": 3.1838,
"step": 65450
},
{
"epoch": 19.068429328519013,
"grad_norm": 0.41093751788139343,
"learning_rate": 0.00037139877658025046,
"loss": 3.1817,
"step": 65500
},
{
"epoch": 19.082988760118806,
"grad_norm": 0.40463700890541077,
"learning_rate": 0.0003712240023303233,
"loss": 3.1925,
"step": 65550
},
{
"epoch": 19.097548191718595,
"grad_norm": 0.403666228055954,
"learning_rate": 0.00037104922808039607,
"loss": 3.1855,
"step": 65600
},
{
"epoch": 19.112107623318387,
"grad_norm": 0.4537275731563568,
"learning_rate": 0.0003708744538304689,
"loss": 3.1927,
"step": 65650
},
{
"epoch": 19.126667054918176,
"grad_norm": 0.401798814535141,
"learning_rate": 0.00037069967958054174,
"loss": 3.1889,
"step": 65700
},
{
"epoch": 19.141226486517965,
"grad_norm": 0.40314981341362,
"learning_rate": 0.00037052490533061457,
"loss": 3.1879,
"step": 65750
},
{
"epoch": 19.155785918117758,
"grad_norm": 0.3906761705875397,
"learning_rate": 0.0003703501310806874,
"loss": 3.2167,
"step": 65800
},
{
"epoch": 19.170345349717547,
"grad_norm": 0.39637866616249084,
"learning_rate": 0.0003701753568307602,
"loss": 3.1943,
"step": 65850
},
{
"epoch": 19.184904781317336,
"grad_norm": 0.40415269136428833,
"learning_rate": 0.000370000582580833,
"loss": 3.1988,
"step": 65900
},
{
"epoch": 19.19946421291713,
"grad_norm": 0.41272252798080444,
"learning_rate": 0.00036982580833090585,
"loss": 3.1979,
"step": 65950
},
{
"epoch": 19.214023644516917,
"grad_norm": 0.4298579692840576,
"learning_rate": 0.00036965103408097874,
"loss": 3.2138,
"step": 66000
},
{
"epoch": 19.214023644516917,
"eval_accuracy": 0.3722166885980049,
"eval_loss": 3.5523126125335693,
"eval_runtime": 180.988,
"eval_samples_per_second": 91.956,
"eval_steps_per_second": 5.752,
"step": 66000
},
{
"epoch": 19.22858307611671,
"grad_norm": 0.3859882652759552,
"learning_rate": 0.00036947625983105157,
"loss": 3.2164,
"step": 66050
},
{
"epoch": 19.2431425077165,
"grad_norm": 0.38444507122039795,
"learning_rate": 0.0003693014855811244,
"loss": 3.222,
"step": 66100
},
{
"epoch": 19.257701939316288,
"grad_norm": 0.3962699770927429,
"learning_rate": 0.0003691267113311972,
"loss": 3.2082,
"step": 66150
},
{
"epoch": 19.27226137091608,
"grad_norm": 0.389569491147995,
"learning_rate": 0.00036895193708127,
"loss": 3.2159,
"step": 66200
},
{
"epoch": 19.28682080251587,
"grad_norm": 0.4205169081687927,
"learning_rate": 0.00036877716283134285,
"loss": 3.2215,
"step": 66250
},
{
"epoch": 19.30138023411566,
"grad_norm": 0.4000534415245056,
"learning_rate": 0.0003686023885814157,
"loss": 3.2277,
"step": 66300
},
{
"epoch": 19.31593966571545,
"grad_norm": 0.40057796239852905,
"learning_rate": 0.00036842761433148846,
"loss": 3.2363,
"step": 66350
},
{
"epoch": 19.33049909731524,
"grad_norm": 0.44507649540901184,
"learning_rate": 0.0003682528400815613,
"loss": 3.2307,
"step": 66400
},
{
"epoch": 19.345058528915033,
"grad_norm": 0.4065307378768921,
"learning_rate": 0.0003680780658316341,
"loss": 3.2169,
"step": 66450
},
{
"epoch": 19.35961796051482,
"grad_norm": 0.41337087750434875,
"learning_rate": 0.00036790329158170696,
"loss": 3.2245,
"step": 66500
},
{
"epoch": 19.37417739211461,
"grad_norm": 0.4373385012149811,
"learning_rate": 0.0003677285173317798,
"loss": 3.2265,
"step": 66550
},
{
"epoch": 19.388736823714403,
"grad_norm": 0.41589170694351196,
"learning_rate": 0.00036755374308185257,
"loss": 3.2428,
"step": 66600
},
{
"epoch": 19.403296255314192,
"grad_norm": 0.4082939922809601,
"learning_rate": 0.0003673789688319254,
"loss": 3.2366,
"step": 66650
},
{
"epoch": 19.41785568691398,
"grad_norm": 0.3798679709434509,
"learning_rate": 0.00036720419458199824,
"loss": 3.2216,
"step": 66700
},
{
"epoch": 19.432415118513774,
"grad_norm": 0.3959912359714508,
"learning_rate": 0.00036702942033207107,
"loss": 3.2231,
"step": 66750
},
{
"epoch": 19.446974550113563,
"grad_norm": 0.42029306292533875,
"learning_rate": 0.0003668546460821439,
"loss": 3.2401,
"step": 66800
},
{
"epoch": 19.461533981713355,
"grad_norm": 0.400526225566864,
"learning_rate": 0.0003666798718322167,
"loss": 3.2324,
"step": 66850
},
{
"epoch": 19.476093413313144,
"grad_norm": 0.4720441997051239,
"learning_rate": 0.0003665050975822895,
"loss": 3.2414,
"step": 66900
},
{
"epoch": 19.490652844912933,
"grad_norm": 0.40877315402030945,
"learning_rate": 0.00036633032333236235,
"loss": 3.2441,
"step": 66950
},
{
"epoch": 19.505212276512726,
"grad_norm": 0.3751513659954071,
"learning_rate": 0.0003661555490824352,
"loss": 3.242,
"step": 67000
},
{
"epoch": 19.505212276512726,
"eval_accuracy": 0.37279508330400596,
"eval_loss": 3.547092914581299,
"eval_runtime": 181.1392,
"eval_samples_per_second": 91.88,
"eval_steps_per_second": 5.747,
"step": 67000
},
{
"epoch": 19.519771708112515,
"grad_norm": 0.39515799283981323,
"learning_rate": 0.00036598077483250796,
"loss": 3.2452,
"step": 67050
},
{
"epoch": 19.534331139712307,
"grad_norm": 0.41457104682922363,
"learning_rate": 0.0003658060005825808,
"loss": 3.2361,
"step": 67100
},
{
"epoch": 19.548890571312096,
"grad_norm": 0.43827950954437256,
"learning_rate": 0.0003656312263326536,
"loss": 3.2423,
"step": 67150
},
{
"epoch": 19.563450002911885,
"grad_norm": 0.3718095123767853,
"learning_rate": 0.00036545645208272646,
"loss": 3.2491,
"step": 67200
},
{
"epoch": 19.578009434511678,
"grad_norm": 0.3950149714946747,
"learning_rate": 0.0003652816778327993,
"loss": 3.2372,
"step": 67250
},
{
"epoch": 19.592568866111467,
"grad_norm": 0.39129117131233215,
"learning_rate": 0.00036510690358287207,
"loss": 3.2336,
"step": 67300
},
{
"epoch": 19.607128297711256,
"grad_norm": 0.39655396342277527,
"learning_rate": 0.0003649321293329449,
"loss": 3.2283,
"step": 67350
},
{
"epoch": 19.62168772931105,
"grad_norm": 0.43846869468688965,
"learning_rate": 0.00036475735508301774,
"loss": 3.2469,
"step": 67400
},
{
"epoch": 19.636247160910838,
"grad_norm": 0.4126003682613373,
"learning_rate": 0.00036458258083309057,
"loss": 3.2552,
"step": 67450
},
{
"epoch": 19.650806592510627,
"grad_norm": 0.4312933385372162,
"learning_rate": 0.0003644078065831634,
"loss": 3.2403,
"step": 67500
},
{
"epoch": 19.66536602411042,
"grad_norm": 0.4025894105434418,
"learning_rate": 0.0003642330323332362,
"loss": 3.2417,
"step": 67550
},
{
"epoch": 19.679925455710208,
"grad_norm": 0.3867190480232239,
"learning_rate": 0.000364058258083309,
"loss": 3.2577,
"step": 67600
},
{
"epoch": 19.69448488731,
"grad_norm": 0.3977561593055725,
"learning_rate": 0.00036388348383338185,
"loss": 3.2423,
"step": 67650
},
{
"epoch": 19.70904431890979,
"grad_norm": 0.4209028482437134,
"learning_rate": 0.0003637087095834547,
"loss": 3.2664,
"step": 67700
},
{
"epoch": 19.72360375050958,
"grad_norm": 0.39911961555480957,
"learning_rate": 0.00036353393533352746,
"loss": 3.2596,
"step": 67750
},
{
"epoch": 19.73816318210937,
"grad_norm": 0.43913912773132324,
"learning_rate": 0.0003633591610836003,
"loss": 3.2618,
"step": 67800
},
{
"epoch": 19.75272261370916,
"grad_norm": 0.40178295969963074,
"learning_rate": 0.0003631843868336731,
"loss": 3.2504,
"step": 67850
},
{
"epoch": 19.767282045308953,
"grad_norm": 0.3968295454978943,
"learning_rate": 0.00036300961258374596,
"loss": 3.2515,
"step": 67900
},
{
"epoch": 19.781841476908742,
"grad_norm": 0.41022300720214844,
"learning_rate": 0.0003628348383338188,
"loss": 3.2553,
"step": 67950
},
{
"epoch": 19.79640090850853,
"grad_norm": 0.40481844544410706,
"learning_rate": 0.00036266006408389157,
"loss": 3.253,
"step": 68000
},
{
"epoch": 19.79640090850853,
"eval_accuracy": 0.37316241509126913,
"eval_loss": 3.537109613418579,
"eval_runtime": 181.0978,
"eval_samples_per_second": 91.901,
"eval_steps_per_second": 5.748,
"step": 68000
},
{
"epoch": 19.810960340108323,
"grad_norm": 0.384703665971756,
"learning_rate": 0.0003624852898339644,
"loss": 3.256,
"step": 68050
},
{
"epoch": 19.825519771708112,
"grad_norm": 0.3775125741958618,
"learning_rate": 0.00036231051558403723,
"loss": 3.2636,
"step": 68100
},
{
"epoch": 19.8400792033079,
"grad_norm": 0.41662731766700745,
"learning_rate": 0.00036213574133411007,
"loss": 3.2597,
"step": 68150
},
{
"epoch": 19.854638634907694,
"grad_norm": 0.40957972407341003,
"learning_rate": 0.0003619609670841829,
"loss": 3.2596,
"step": 68200
},
{
"epoch": 19.869198066507483,
"grad_norm": 0.38087132573127747,
"learning_rate": 0.0003617861928342557,
"loss": 3.2742,
"step": 68250
},
{
"epoch": 19.883757498107276,
"grad_norm": 0.4036879241466522,
"learning_rate": 0.0003616114185843285,
"loss": 3.2549,
"step": 68300
},
{
"epoch": 19.898316929707065,
"grad_norm": 0.4047390818595886,
"learning_rate": 0.00036143664433440135,
"loss": 3.2638,
"step": 68350
},
{
"epoch": 19.912876361306854,
"grad_norm": 0.41273418068885803,
"learning_rate": 0.0003612618700844742,
"loss": 3.2695,
"step": 68400
},
{
"epoch": 19.927435792906646,
"grad_norm": 0.40376052260398865,
"learning_rate": 0.00036108709583454696,
"loss": 3.277,
"step": 68450
},
{
"epoch": 19.941995224506435,
"grad_norm": 0.372547447681427,
"learning_rate": 0.0003609123215846198,
"loss": 3.2634,
"step": 68500
},
{
"epoch": 19.956554656106224,
"grad_norm": 0.41053515672683716,
"learning_rate": 0.0003607375473346927,
"loss": 3.2729,
"step": 68550
},
{
"epoch": 19.971114087706017,
"grad_norm": 0.40540847182273865,
"learning_rate": 0.0003605627730847655,
"loss": 3.2581,
"step": 68600
},
{
"epoch": 19.985673519305806,
"grad_norm": 0.412577360868454,
"learning_rate": 0.00036038799883483834,
"loss": 3.2543,
"step": 68650
},
{
"epoch": 20.0,
"grad_norm": 1.0419994592666626,
"learning_rate": 0.0003602132245849112,
"loss": 3.2561,
"step": 68700
},
{
"epoch": 20.01455943159979,
"grad_norm": 0.44616925716400146,
"learning_rate": 0.00036003845033498395,
"loss": 3.1465,
"step": 68750
},
{
"epoch": 20.02911886319958,
"grad_norm": 0.4104091227054596,
"learning_rate": 0.0003598636760850568,
"loss": 3.173,
"step": 68800
},
{
"epoch": 20.04367829479937,
"grad_norm": 0.39892300963401794,
"learning_rate": 0.0003596889018351296,
"loss": 3.1733,
"step": 68850
},
{
"epoch": 20.058237726399163,
"grad_norm": 0.39944881200790405,
"learning_rate": 0.00035951412758520245,
"loss": 3.1739,
"step": 68900
},
{
"epoch": 20.072797157998952,
"grad_norm": 0.4186191260814667,
"learning_rate": 0.00035933935333527523,
"loss": 3.1701,
"step": 68950
},
{
"epoch": 20.08735658959874,
"grad_norm": 0.43529069423675537,
"learning_rate": 0.00035916457908534807,
"loss": 3.17,
"step": 69000
},
{
"epoch": 20.08735658959874,
"eval_accuracy": 0.37242316574859197,
"eval_loss": 3.556870698928833,
"eval_runtime": 180.4494,
"eval_samples_per_second": 92.231,
"eval_steps_per_second": 5.769,
"step": 69000
},
{
"epoch": 20.101916021198534,
"grad_norm": 0.43885257840156555,
"learning_rate": 0.0003589898048354209,
"loss": 3.1713,
"step": 69050
},
{
"epoch": 20.116475452798323,
"grad_norm": 0.3850330412387848,
"learning_rate": 0.00035881503058549373,
"loss": 3.1862,
"step": 69100
},
{
"epoch": 20.13103488439811,
"grad_norm": 0.38994720578193665,
"learning_rate": 0.00035864025633556656,
"loss": 3.185,
"step": 69150
},
{
"epoch": 20.145594315997904,
"grad_norm": 0.3972725570201874,
"learning_rate": 0.00035846548208563934,
"loss": 3.1977,
"step": 69200
},
{
"epoch": 20.160153747597693,
"grad_norm": 0.41154587268829346,
"learning_rate": 0.0003582907078357122,
"loss": 3.1837,
"step": 69250
},
{
"epoch": 20.174713179197486,
"grad_norm": 0.41088539361953735,
"learning_rate": 0.000358115933585785,
"loss": 3.1911,
"step": 69300
},
{
"epoch": 20.189272610797275,
"grad_norm": 0.4196443259716034,
"learning_rate": 0.00035794115933585784,
"loss": 3.1966,
"step": 69350
},
{
"epoch": 20.203832042397064,
"grad_norm": 0.40109729766845703,
"learning_rate": 0.0003577663850859307,
"loss": 3.2108,
"step": 69400
},
{
"epoch": 20.218391473996856,
"grad_norm": 0.3926057815551758,
"learning_rate": 0.00035759161083600345,
"loss": 3.1926,
"step": 69450
},
{
"epoch": 20.232950905596645,
"grad_norm": 0.4119376540184021,
"learning_rate": 0.0003574168365860763,
"loss": 3.2071,
"step": 69500
},
{
"epoch": 20.247510337196434,
"grad_norm": 0.39600470662117004,
"learning_rate": 0.0003572420623361491,
"loss": 3.2095,
"step": 69550
},
{
"epoch": 20.262069768796227,
"grad_norm": 0.40082696080207825,
"learning_rate": 0.00035706728808622195,
"loss": 3.1961,
"step": 69600
},
{
"epoch": 20.276629200396016,
"grad_norm": 0.40221554040908813,
"learning_rate": 0.00035689251383629473,
"loss": 3.2103,
"step": 69650
},
{
"epoch": 20.29118863199581,
"grad_norm": 0.38792362809181213,
"learning_rate": 0.00035671773958636757,
"loss": 3.2052,
"step": 69700
},
{
"epoch": 20.305748063595598,
"grad_norm": 0.3968353569507599,
"learning_rate": 0.0003565429653364404,
"loss": 3.2091,
"step": 69750
},
{
"epoch": 20.320307495195387,
"grad_norm": 0.3859781324863434,
"learning_rate": 0.00035636819108651323,
"loss": 3.2032,
"step": 69800
},
{
"epoch": 20.33486692679518,
"grad_norm": 0.4044977128505707,
"learning_rate": 0.00035619341683658606,
"loss": 3.2157,
"step": 69850
},
{
"epoch": 20.349426358394968,
"grad_norm": 0.41051003336906433,
"learning_rate": 0.00035601864258665884,
"loss": 3.2151,
"step": 69900
},
{
"epoch": 20.363985789994757,
"grad_norm": 0.4241284728050232,
"learning_rate": 0.0003558438683367317,
"loss": 3.2137,
"step": 69950
},
{
"epoch": 20.37854522159455,
"grad_norm": 0.40410953760147095,
"learning_rate": 0.0003556690940868045,
"loss": 3.2225,
"step": 70000
},
{
"epoch": 20.37854522159455,
"eval_accuracy": 0.37306611396010125,
"eval_loss": 3.548208713531494,
"eval_runtime": 180.5894,
"eval_samples_per_second": 92.159,
"eval_steps_per_second": 5.764,
"step": 70000
},
{
"epoch": 20.39310465319434,
"grad_norm": 0.3968863785266876,
"learning_rate": 0.00035549431983687734,
"loss": 3.2168,
"step": 70050
},
{
"epoch": 20.40766408479413,
"grad_norm": 0.4126570224761963,
"learning_rate": 0.0003553195455869502,
"loss": 3.2173,
"step": 70100
},
{
"epoch": 20.42222351639392,
"grad_norm": 0.388510137796402,
"learning_rate": 0.00035514477133702295,
"loss": 3.2307,
"step": 70150
},
{
"epoch": 20.43678294799371,
"grad_norm": 0.38976356387138367,
"learning_rate": 0.0003549699970870958,
"loss": 3.224,
"step": 70200
},
{
"epoch": 20.451342379593502,
"grad_norm": 0.4209003448486328,
"learning_rate": 0.0003547952228371686,
"loss": 3.2257,
"step": 70250
},
{
"epoch": 20.46590181119329,
"grad_norm": 0.40154707431793213,
"learning_rate": 0.00035462044858724145,
"loss": 3.2186,
"step": 70300
},
{
"epoch": 20.48046124279308,
"grad_norm": 0.3838447332382202,
"learning_rate": 0.00035444567433731423,
"loss": 3.2274,
"step": 70350
},
{
"epoch": 20.495020674392872,
"grad_norm": 0.39228400588035583,
"learning_rate": 0.00035427090008738706,
"loss": 3.2203,
"step": 70400
},
{
"epoch": 20.50958010599266,
"grad_norm": 0.41162481904029846,
"learning_rate": 0.0003540961258374599,
"loss": 3.2273,
"step": 70450
},
{
"epoch": 20.524139537592454,
"grad_norm": 0.4010622501373291,
"learning_rate": 0.00035392135158753273,
"loss": 3.2287,
"step": 70500
},
{
"epoch": 20.538698969192243,
"grad_norm": 0.3961891233921051,
"learning_rate": 0.00035374657733760556,
"loss": 3.2322,
"step": 70550
},
{
"epoch": 20.553258400792032,
"grad_norm": 0.42891672253608704,
"learning_rate": 0.00035357180308767834,
"loss": 3.232,
"step": 70600
},
{
"epoch": 20.567817832391825,
"grad_norm": 0.41287800669670105,
"learning_rate": 0.0003533970288377512,
"loss": 3.2326,
"step": 70650
},
{
"epoch": 20.582377263991614,
"grad_norm": 0.4244118928909302,
"learning_rate": 0.000353222254587824,
"loss": 3.2308,
"step": 70700
},
{
"epoch": 20.596936695591403,
"grad_norm": 0.4150402247905731,
"learning_rate": 0.00035304748033789684,
"loss": 3.2365,
"step": 70750
},
{
"epoch": 20.611496127191195,
"grad_norm": 0.41076770424842834,
"learning_rate": 0.0003528727060879697,
"loss": 3.2409,
"step": 70800
},
{
"epoch": 20.626055558790984,
"grad_norm": 0.4489847719669342,
"learning_rate": 0.00035269793183804245,
"loss": 3.2417,
"step": 70850
},
{
"epoch": 20.640614990390777,
"grad_norm": 0.41337257623672485,
"learning_rate": 0.0003525231575881153,
"loss": 3.2453,
"step": 70900
},
{
"epoch": 20.655174421990566,
"grad_norm": 0.3830353319644928,
"learning_rate": 0.0003523483833381881,
"loss": 3.2388,
"step": 70950
},
{
"epoch": 20.669733853590355,
"grad_norm": 0.4028426706790924,
"learning_rate": 0.00035217360908826095,
"loss": 3.237,
"step": 71000
},
{
"epoch": 20.669733853590355,
"eval_accuracy": 0.3736510933588318,
"eval_loss": 3.543719530105591,
"eval_runtime": 180.4495,
"eval_samples_per_second": 92.231,
"eval_steps_per_second": 5.769,
"step": 71000
},
{
"epoch": 20.684293285190147,
"grad_norm": 0.4202480614185333,
"learning_rate": 0.00035199883483833384,
"loss": 3.2331,
"step": 71050
},
{
"epoch": 20.698852716789936,
"grad_norm": 0.4162975549697876,
"learning_rate": 0.0003518240605884066,
"loss": 3.242,
"step": 71100
},
{
"epoch": 20.713412148389725,
"grad_norm": 0.3857875466346741,
"learning_rate": 0.00035164928633847945,
"loss": 3.2475,
"step": 71150
},
{
"epoch": 20.727971579989518,
"grad_norm": 0.3922126591205597,
"learning_rate": 0.0003514745120885523,
"loss": 3.2386,
"step": 71200
},
{
"epoch": 20.742531011589307,
"grad_norm": 0.4351789653301239,
"learning_rate": 0.0003512997378386251,
"loss": 3.2494,
"step": 71250
},
{
"epoch": 20.7570904431891,
"grad_norm": 0.40401214361190796,
"learning_rate": 0.00035112496358869795,
"loss": 3.2468,
"step": 71300
},
{
"epoch": 20.77164987478889,
"grad_norm": 0.4370771050453186,
"learning_rate": 0.00035095018933877073,
"loss": 3.2306,
"step": 71350
},
{
"epoch": 20.786209306388677,
"grad_norm": 0.42607468366622925,
"learning_rate": 0.00035077541508884356,
"loss": 3.2454,
"step": 71400
},
{
"epoch": 20.80076873798847,
"grad_norm": 0.42872190475463867,
"learning_rate": 0.0003506006408389164,
"loss": 3.2379,
"step": 71450
},
{
"epoch": 20.81532816958826,
"grad_norm": 0.4074784815311432,
"learning_rate": 0.00035042586658898923,
"loss": 3.2438,
"step": 71500
},
{
"epoch": 20.829887601188048,
"grad_norm": 0.4053134322166443,
"learning_rate": 0.00035025109233906206,
"loss": 3.2373,
"step": 71550
},
{
"epoch": 20.84444703278784,
"grad_norm": 0.4389936923980713,
"learning_rate": 0.00035007631808913484,
"loss": 3.2509,
"step": 71600
},
{
"epoch": 20.85900646438763,
"grad_norm": 0.3987066149711609,
"learning_rate": 0.00034990154383920767,
"loss": 3.2476,
"step": 71650
},
{
"epoch": 20.873565895987422,
"grad_norm": 0.403385728597641,
"learning_rate": 0.0003497267695892805,
"loss": 3.2615,
"step": 71700
},
{
"epoch": 20.88812532758721,
"grad_norm": 0.3917509913444519,
"learning_rate": 0.00034955199533935334,
"loss": 3.2535,
"step": 71750
},
{
"epoch": 20.902684759187,
"grad_norm": 0.3859056234359741,
"learning_rate": 0.0003493772210894261,
"loss": 3.2504,
"step": 71800
},
{
"epoch": 20.917244190786793,
"grad_norm": 0.381082683801651,
"learning_rate": 0.00034920244683949895,
"loss": 3.2605,
"step": 71850
},
{
"epoch": 20.93180362238658,
"grad_norm": 0.4057201147079468,
"learning_rate": 0.0003490276725895718,
"loss": 3.2611,
"step": 71900
},
{
"epoch": 20.94636305398637,
"grad_norm": 0.4282720983028412,
"learning_rate": 0.0003488528983396446,
"loss": 3.2509,
"step": 71950
},
{
"epoch": 20.960922485586163,
"grad_norm": 0.43336808681488037,
"learning_rate": 0.00034867812408971745,
"loss": 3.263,
"step": 72000
},
{
"epoch": 20.960922485586163,
"eval_accuracy": 0.37352304460200414,
"eval_loss": 3.5321924686431885,
"eval_runtime": 181.1295,
"eval_samples_per_second": 91.885,
"eval_steps_per_second": 5.747,
"step": 72000
},
{
"epoch": 20.975481917185952,
"grad_norm": 0.38790565729141235,
"learning_rate": 0.00034850334983979023,
"loss": 3.2522,
"step": 72050
},
{
"epoch": 20.990041348785745,
"grad_norm": 0.3824382424354553,
"learning_rate": 0.00034832857558986306,
"loss": 3.2437,
"step": 72100
},
{
"epoch": 21.004367829479936,
"grad_norm": 0.4122096598148346,
"learning_rate": 0.0003481538013399359,
"loss": 3.2138,
"step": 72150
},
{
"epoch": 21.018927261079728,
"grad_norm": 0.39322277903556824,
"learning_rate": 0.0003479790270900087,
"loss": 3.1639,
"step": 72200
},
{
"epoch": 21.033486692679517,
"grad_norm": 0.417479544878006,
"learning_rate": 0.0003478042528400815,
"loss": 3.1547,
"step": 72250
},
{
"epoch": 21.04804612427931,
"grad_norm": 0.40138930082321167,
"learning_rate": 0.00034762947859015434,
"loss": 3.1497,
"step": 72300
},
{
"epoch": 21.0626055558791,
"grad_norm": 0.4107111394405365,
"learning_rate": 0.00034745470434022717,
"loss": 3.1665,
"step": 72350
},
{
"epoch": 21.077164987478888,
"grad_norm": 0.4117753803730011,
"learning_rate": 0.0003472799300903,
"loss": 3.1677,
"step": 72400
},
{
"epoch": 21.09172441907868,
"grad_norm": 0.4315739870071411,
"learning_rate": 0.00034710515584037284,
"loss": 3.1559,
"step": 72450
},
{
"epoch": 21.10628385067847,
"grad_norm": 0.4121326506137848,
"learning_rate": 0.0003469303815904456,
"loss": 3.1717,
"step": 72500
},
{
"epoch": 21.12084328227826,
"grad_norm": 0.3968290388584137,
"learning_rate": 0.00034675560734051845,
"loss": 3.1652,
"step": 72550
},
{
"epoch": 21.13540271387805,
"grad_norm": 0.4244532287120819,
"learning_rate": 0.0003465808330905913,
"loss": 3.1783,
"step": 72600
},
{
"epoch": 21.14996214547784,
"grad_norm": 0.4135434329509735,
"learning_rate": 0.0003464060588406641,
"loss": 3.1797,
"step": 72650
},
{
"epoch": 21.164521577077632,
"grad_norm": 0.38270533084869385,
"learning_rate": 0.00034623128459073695,
"loss": 3.1792,
"step": 72700
},
{
"epoch": 21.17908100867742,
"grad_norm": 0.433463990688324,
"learning_rate": 0.00034605651034080973,
"loss": 3.17,
"step": 72750
},
{
"epoch": 21.19364044027721,
"grad_norm": 0.3898848295211792,
"learning_rate": 0.00034588173609088256,
"loss": 3.1825,
"step": 72800
},
{
"epoch": 21.208199871877003,
"grad_norm": 0.4255882203578949,
"learning_rate": 0.0003457069618409554,
"loss": 3.1903,
"step": 72850
},
{
"epoch": 21.222759303476792,
"grad_norm": 0.47647958993911743,
"learning_rate": 0.0003455321875910282,
"loss": 3.188,
"step": 72900
},
{
"epoch": 21.23731873507658,
"grad_norm": 0.4151340126991272,
"learning_rate": 0.000345357413341101,
"loss": 3.1946,
"step": 72950
},
{
"epoch": 21.251878166676374,
"grad_norm": 0.4168964922428131,
"learning_rate": 0.00034518263909117384,
"loss": 3.2052,
"step": 73000
},
{
"epoch": 21.251878166676374,
"eval_accuracy": 0.37267949842984477,
"eval_loss": 3.5532169342041016,
"eval_runtime": 181.4222,
"eval_samples_per_second": 91.736,
"eval_steps_per_second": 5.738,
"step": 73000
},
{
"epoch": 21.266437598276163,
"grad_norm": 0.39896926283836365,
"learning_rate": 0.00034500786484124667,
"loss": 3.194,
"step": 73050
},
{
"epoch": 21.280997029875955,
"grad_norm": 0.4467354416847229,
"learning_rate": 0.0003448330905913195,
"loss": 3.1996,
"step": 73100
},
{
"epoch": 21.295556461475744,
"grad_norm": 0.40804705023765564,
"learning_rate": 0.00034465831634139234,
"loss": 3.212,
"step": 73150
},
{
"epoch": 21.310115893075533,
"grad_norm": 0.41930943727493286,
"learning_rate": 0.0003444835420914651,
"loss": 3.2012,
"step": 73200
},
{
"epoch": 21.324675324675326,
"grad_norm": 0.4978606700897217,
"learning_rate": 0.00034430876784153795,
"loss": 3.2046,
"step": 73250
},
{
"epoch": 21.339234756275115,
"grad_norm": 0.4318115711212158,
"learning_rate": 0.0003441339935916108,
"loss": 3.1995,
"step": 73300
},
{
"epoch": 21.353794187874904,
"grad_norm": 0.4151814579963684,
"learning_rate": 0.0003439592193416836,
"loss": 3.2036,
"step": 73350
},
{
"epoch": 21.368353619474696,
"grad_norm": 0.4480758309364319,
"learning_rate": 0.00034378444509175645,
"loss": 3.2069,
"step": 73400
},
{
"epoch": 21.382913051074485,
"grad_norm": 0.40307602286338806,
"learning_rate": 0.0003436096708418292,
"loss": 3.1965,
"step": 73450
},
{
"epoch": 21.397472482674278,
"grad_norm": 0.4136466979980469,
"learning_rate": 0.00034343489659190206,
"loss": 3.2067,
"step": 73500
},
{
"epoch": 21.412031914274067,
"grad_norm": 0.41154733300209045,
"learning_rate": 0.0003432601223419749,
"loss": 3.1987,
"step": 73550
},
{
"epoch": 21.426591345873856,
"grad_norm": 0.3847964107990265,
"learning_rate": 0.0003430853480920478,
"loss": 3.2141,
"step": 73600
},
{
"epoch": 21.44115077747365,
"grad_norm": 0.3878454566001892,
"learning_rate": 0.0003429105738421206,
"loss": 3.2038,
"step": 73650
},
{
"epoch": 21.455710209073438,
"grad_norm": 0.4452696442604065,
"learning_rate": 0.0003427357995921934,
"loss": 3.2162,
"step": 73700
},
{
"epoch": 21.470269640673227,
"grad_norm": 0.39309459924697876,
"learning_rate": 0.0003425610253422662,
"loss": 3.2198,
"step": 73750
},
{
"epoch": 21.48482907227302,
"grad_norm": 0.4251159429550171,
"learning_rate": 0.00034238625109233906,
"loss": 3.2216,
"step": 73800
},
{
"epoch": 21.499388503872808,
"grad_norm": 0.41249531507492065,
"learning_rate": 0.0003422114768424119,
"loss": 3.2121,
"step": 73850
},
{
"epoch": 21.5139479354726,
"grad_norm": 0.43736204504966736,
"learning_rate": 0.0003420367025924847,
"loss": 3.2238,
"step": 73900
},
{
"epoch": 21.52850736707239,
"grad_norm": 0.4327532649040222,
"learning_rate": 0.0003418619283425575,
"loss": 3.2167,
"step": 73950
},
{
"epoch": 21.54306679867218,
"grad_norm": 0.4144987165927887,
"learning_rate": 0.00034168715409263034,
"loss": 3.2214,
"step": 74000
},
{
"epoch": 21.54306679867218,
"eval_accuracy": 0.37350446636180323,
"eval_loss": 3.544901132583618,
"eval_runtime": 181.1526,
"eval_samples_per_second": 91.873,
"eval_steps_per_second": 5.747,
"step": 74000
},
{
"epoch": 21.55762623027197,
"grad_norm": 0.4094686210155487,
"learning_rate": 0.00034151237984270317,
"loss": 3.2336,
"step": 74050
},
{
"epoch": 21.57218566187176,
"grad_norm": 0.3899657726287842,
"learning_rate": 0.000341337605592776,
"loss": 3.23,
"step": 74100
},
{
"epoch": 21.58674509347155,
"grad_norm": 0.38108712434768677,
"learning_rate": 0.00034116283134284883,
"loss": 3.2234,
"step": 74150
},
{
"epoch": 21.601304525071342,
"grad_norm": 0.4188117980957031,
"learning_rate": 0.0003409880570929216,
"loss": 3.2367,
"step": 74200
},
{
"epoch": 21.61586395667113,
"grad_norm": 0.4578975439071655,
"learning_rate": 0.00034081328284299445,
"loss": 3.2332,
"step": 74250
},
{
"epoch": 21.630423388270923,
"grad_norm": 0.411504864692688,
"learning_rate": 0.0003406385085930673,
"loss": 3.2302,
"step": 74300
},
{
"epoch": 21.644982819870712,
"grad_norm": 0.3843030035495758,
"learning_rate": 0.0003404637343431401,
"loss": 3.2405,
"step": 74350
},
{
"epoch": 21.6595422514705,
"grad_norm": 0.40367045998573303,
"learning_rate": 0.0003402889600932129,
"loss": 3.2326,
"step": 74400
},
{
"epoch": 21.674101683070294,
"grad_norm": 0.41419780254364014,
"learning_rate": 0.0003401141858432857,
"loss": 3.226,
"step": 74450
},
{
"epoch": 21.688661114670083,
"grad_norm": 0.4172533452510834,
"learning_rate": 0.00033993941159335856,
"loss": 3.2313,
"step": 74500
},
{
"epoch": 21.703220546269872,
"grad_norm": 0.42541390657424927,
"learning_rate": 0.0003397646373434314,
"loss": 3.2203,
"step": 74550
},
{
"epoch": 21.717779977869665,
"grad_norm": 0.430610716342926,
"learning_rate": 0.0003395898630935042,
"loss": 3.2308,
"step": 74600
},
{
"epoch": 21.732339409469454,
"grad_norm": 0.39513736963272095,
"learning_rate": 0.000339415088843577,
"loss": 3.231,
"step": 74650
},
{
"epoch": 21.746898841069246,
"grad_norm": 0.43501347303390503,
"learning_rate": 0.00033924031459364983,
"loss": 3.2268,
"step": 74700
},
{
"epoch": 21.761458272669035,
"grad_norm": 0.4237409830093384,
"learning_rate": 0.00033906554034372267,
"loss": 3.2324,
"step": 74750
},
{
"epoch": 21.776017704268824,
"grad_norm": 0.41927024722099304,
"learning_rate": 0.0003388907660937955,
"loss": 3.2298,
"step": 74800
},
{
"epoch": 21.790577135868617,
"grad_norm": 0.4185371696949005,
"learning_rate": 0.0003387159918438683,
"loss": 3.2384,
"step": 74850
},
{
"epoch": 21.805136567468406,
"grad_norm": 0.4015279710292816,
"learning_rate": 0.0003385412175939411,
"loss": 3.2271,
"step": 74900
},
{
"epoch": 21.819695999068195,
"grad_norm": 0.4096405804157257,
"learning_rate": 0.00033836644334401395,
"loss": 3.2573,
"step": 74950
},
{
"epoch": 21.834255430667987,
"grad_norm": 0.40210607647895813,
"learning_rate": 0.0003381916690940868,
"loss": 3.2313,
"step": 75000
},
{
"epoch": 21.834255430667987,
"eval_accuracy": 0.37395798707354266,
"eval_loss": 3.53704833984375,
"eval_runtime": 181.2122,
"eval_samples_per_second": 91.843,
"eval_steps_per_second": 5.745,
"step": 75000
},
{
"epoch": 21.848814862267776,
"grad_norm": 0.39823204278945923,
"learning_rate": 0.0003380168948441596,
"loss": 3.2451,
"step": 75050
},
{
"epoch": 21.86337429386757,
"grad_norm": 0.3969886600971222,
"learning_rate": 0.0003378421205942324,
"loss": 3.2378,
"step": 75100
},
{
"epoch": 21.877933725467358,
"grad_norm": 0.4100249409675598,
"learning_rate": 0.0003376673463443052,
"loss": 3.2505,
"step": 75150
},
{
"epoch": 21.892493157067147,
"grad_norm": 0.405699223279953,
"learning_rate": 0.00033749257209437806,
"loss": 3.2278,
"step": 75200
},
{
"epoch": 21.90705258866694,
"grad_norm": 0.39252954721450806,
"learning_rate": 0.0003373177978444509,
"loss": 3.2441,
"step": 75250
},
{
"epoch": 21.92161202026673,
"grad_norm": 0.38878968358039856,
"learning_rate": 0.0003371430235945237,
"loss": 3.2341,
"step": 75300
},
{
"epoch": 21.93617145186652,
"grad_norm": 0.3854546844959259,
"learning_rate": 0.0003369682493445965,
"loss": 3.2396,
"step": 75350
},
{
"epoch": 21.95073088346631,
"grad_norm": 0.421974241733551,
"learning_rate": 0.00033679347509466933,
"loss": 3.2496,
"step": 75400
},
{
"epoch": 21.9652903150661,
"grad_norm": 0.3947569727897644,
"learning_rate": 0.00033661870084474217,
"loss": 3.2349,
"step": 75450
},
{
"epoch": 21.97984974666589,
"grad_norm": 0.40448886156082153,
"learning_rate": 0.000336443926594815,
"loss": 3.2366,
"step": 75500
},
{
"epoch": 21.99440917826568,
"grad_norm": 0.40597161650657654,
"learning_rate": 0.0003362691523448878,
"loss": 3.2422,
"step": 75550
},
{
"epoch": 22.008735658959875,
"grad_norm": 0.38081154227256775,
"learning_rate": 0.0003360943780949606,
"loss": 3.1834,
"step": 75600
},
{
"epoch": 22.023295090559664,
"grad_norm": 0.4210168719291687,
"learning_rate": 0.00033591960384503344,
"loss": 3.1409,
"step": 75650
},
{
"epoch": 22.037854522159456,
"grad_norm": 0.40493515133857727,
"learning_rate": 0.0003357448295951063,
"loss": 3.1449,
"step": 75700
},
{
"epoch": 22.052413953759245,
"grad_norm": 0.4119669795036316,
"learning_rate": 0.0003355700553451791,
"loss": 3.1488,
"step": 75750
},
{
"epoch": 22.066973385359034,
"grad_norm": 0.44360145926475525,
"learning_rate": 0.0003353952810952519,
"loss": 3.1563,
"step": 75800
},
{
"epoch": 22.081532816958827,
"grad_norm": 0.4260116517543793,
"learning_rate": 0.0003352205068453247,
"loss": 3.1691,
"step": 75850
},
{
"epoch": 22.096092248558616,
"grad_norm": 0.42039230465888977,
"learning_rate": 0.00033504573259539756,
"loss": 3.1466,
"step": 75900
},
{
"epoch": 22.110651680158405,
"grad_norm": 0.4209480285644531,
"learning_rate": 0.0003348709583454704,
"loss": 3.1731,
"step": 75950
},
{
"epoch": 22.125211111758198,
"grad_norm": 0.4045847952365875,
"learning_rate": 0.0003346961840955432,
"loss": 3.1508,
"step": 76000
},
{
"epoch": 22.125211111758198,
"eval_accuracy": 0.37304953464447893,
"eval_loss": 3.552255868911743,
"eval_runtime": 181.5419,
"eval_samples_per_second": 91.676,
"eval_steps_per_second": 5.734,
"step": 76000
},
{
"epoch": 22.139770543357987,
"grad_norm": 0.42066532373428345,
"learning_rate": 0.000334521409845616,
"loss": 3.18,
"step": 76050
},
{
"epoch": 22.15432997495778,
"grad_norm": 0.40776318311691284,
"learning_rate": 0.0003343466355956889,
"loss": 3.1774,
"step": 76100
},
{
"epoch": 22.168889406557568,
"grad_norm": 0.40850281715393066,
"learning_rate": 0.0003341718613457617,
"loss": 3.1786,
"step": 76150
},
{
"epoch": 22.183448838157357,
"grad_norm": 0.42042505741119385,
"learning_rate": 0.00033399708709583455,
"loss": 3.1611,
"step": 76200
},
{
"epoch": 22.19800826975715,
"grad_norm": 0.43354547023773193,
"learning_rate": 0.0003338223128459074,
"loss": 3.162,
"step": 76250
},
{
"epoch": 22.21256770135694,
"grad_norm": 0.4136315882205963,
"learning_rate": 0.00033364753859598016,
"loss": 3.1812,
"step": 76300
},
{
"epoch": 22.227127132956728,
"grad_norm": 0.4253956377506256,
"learning_rate": 0.000333472764346053,
"loss": 3.1806,
"step": 76350
},
{
"epoch": 22.24168656455652,
"grad_norm": 0.41711297631263733,
"learning_rate": 0.00033329799009612583,
"loss": 3.1771,
"step": 76400
},
{
"epoch": 22.25624599615631,
"grad_norm": 0.40799766778945923,
"learning_rate": 0.00033312321584619866,
"loss": 3.1944,
"step": 76450
},
{
"epoch": 22.270805427756102,
"grad_norm": 0.4232812821865082,
"learning_rate": 0.0003329484415962715,
"loss": 3.1952,
"step": 76500
},
{
"epoch": 22.28536485935589,
"grad_norm": 0.40353062748908997,
"learning_rate": 0.0003327736673463443,
"loss": 3.171,
"step": 76550
},
{
"epoch": 22.29992429095568,
"grad_norm": 0.39604508876800537,
"learning_rate": 0.0003325988930964171,
"loss": 3.1883,
"step": 76600
},
{
"epoch": 22.314483722555472,
"grad_norm": 0.4719644784927368,
"learning_rate": 0.00033242411884648994,
"loss": 3.1888,
"step": 76650
},
{
"epoch": 22.32904315415526,
"grad_norm": 0.45007991790771484,
"learning_rate": 0.0003322493445965628,
"loss": 3.178,
"step": 76700
},
{
"epoch": 22.34360258575505,
"grad_norm": 0.40224868059158325,
"learning_rate": 0.0003320745703466356,
"loss": 3.1944,
"step": 76750
},
{
"epoch": 22.358162017354843,
"grad_norm": 0.4655930995941162,
"learning_rate": 0.0003318997960967084,
"loss": 3.1932,
"step": 76800
},
{
"epoch": 22.372721448954632,
"grad_norm": 0.4105687141418457,
"learning_rate": 0.0003317250218467812,
"loss": 3.2026,
"step": 76850
},
{
"epoch": 22.387280880554425,
"grad_norm": 0.42557498812675476,
"learning_rate": 0.00033155024759685405,
"loss": 3.193,
"step": 76900
},
{
"epoch": 22.401840312154214,
"grad_norm": 0.4060722589492798,
"learning_rate": 0.0003313754733469269,
"loss": 3.1982,
"step": 76950
},
{
"epoch": 22.416399743754003,
"grad_norm": 0.4282877445220947,
"learning_rate": 0.00033120069909699966,
"loss": 3.2084,
"step": 77000
},
{
"epoch": 22.416399743754003,
"eval_accuracy": 0.3734410886942825,
"eval_loss": 3.547260046005249,
"eval_runtime": 181.4848,
"eval_samples_per_second": 91.705,
"eval_steps_per_second": 5.736,
"step": 77000
},
{
"epoch": 22.430959175353795,
"grad_norm": 0.40111586451530457,
"learning_rate": 0.0003310259248470725,
"loss": 3.2089,
"step": 77050
},
{
"epoch": 22.445518606953584,
"grad_norm": 0.4090210199356079,
"learning_rate": 0.00033085115059714533,
"loss": 3.2076,
"step": 77100
},
{
"epoch": 22.460078038553373,
"grad_norm": 0.40906432271003723,
"learning_rate": 0.00033067637634721816,
"loss": 3.2039,
"step": 77150
},
{
"epoch": 22.474637470153166,
"grad_norm": 0.40148457884788513,
"learning_rate": 0.000330501602097291,
"loss": 3.2014,
"step": 77200
},
{
"epoch": 22.489196901752955,
"grad_norm": 0.427418053150177,
"learning_rate": 0.0003303268278473638,
"loss": 3.2069,
"step": 77250
},
{
"epoch": 22.503756333352747,
"grad_norm": 0.4312450587749481,
"learning_rate": 0.0003301520535974366,
"loss": 3.2027,
"step": 77300
},
{
"epoch": 22.518315764952536,
"grad_norm": 0.40587109327316284,
"learning_rate": 0.00032997727934750944,
"loss": 3.2128,
"step": 77350
},
{
"epoch": 22.532875196552325,
"grad_norm": 0.4101792871952057,
"learning_rate": 0.0003298025050975823,
"loss": 3.2068,
"step": 77400
},
{
"epoch": 22.547434628152118,
"grad_norm": 0.40882381796836853,
"learning_rate": 0.0003296277308476551,
"loss": 3.2115,
"step": 77450
},
{
"epoch": 22.561994059751907,
"grad_norm": 0.44296374917030334,
"learning_rate": 0.0003294529565977279,
"loss": 3.208,
"step": 77500
},
{
"epoch": 22.576553491351696,
"grad_norm": 0.40894755721092224,
"learning_rate": 0.0003292781823478007,
"loss": 3.209,
"step": 77550
},
{
"epoch": 22.59111292295149,
"grad_norm": 0.4192773997783661,
"learning_rate": 0.00032910340809787355,
"loss": 3.2099,
"step": 77600
},
{
"epoch": 22.605672354551277,
"grad_norm": 0.40448158979415894,
"learning_rate": 0.0003289286338479464,
"loss": 3.2154,
"step": 77650
},
{
"epoch": 22.62023178615107,
"grad_norm": 0.4075862765312195,
"learning_rate": 0.00032875385959801916,
"loss": 3.2135,
"step": 77700
},
{
"epoch": 22.63479121775086,
"grad_norm": 0.41879263520240784,
"learning_rate": 0.000328579085348092,
"loss": 3.2297,
"step": 77750
},
{
"epoch": 22.649350649350648,
"grad_norm": 0.3986618220806122,
"learning_rate": 0.00032840431109816483,
"loss": 3.2217,
"step": 77800
},
{
"epoch": 22.66391008095044,
"grad_norm": 0.42791303992271423,
"learning_rate": 0.00032822953684823766,
"loss": 3.2292,
"step": 77850
},
{
"epoch": 22.67846951255023,
"grad_norm": 0.3977803885936737,
"learning_rate": 0.0003280547625983105,
"loss": 3.2199,
"step": 77900
},
{
"epoch": 22.693028944150022,
"grad_norm": 0.402538537979126,
"learning_rate": 0.0003278799883483833,
"loss": 3.2187,
"step": 77950
},
{
"epoch": 22.70758837574981,
"grad_norm": 0.4115932285785675,
"learning_rate": 0.0003277052140984561,
"loss": 3.2192,
"step": 78000
},
{
"epoch": 22.70758837574981,
"eval_accuracy": 0.37358524643153745,
"eval_loss": 3.5417346954345703,
"eval_runtime": 180.8954,
"eval_samples_per_second": 92.003,
"eval_steps_per_second": 5.755,
"step": 78000
},
{
"epoch": 22.7221478073496,
"grad_norm": 0.42461565136909485,
"learning_rate": 0.00032753043984852894,
"loss": 3.2215,
"step": 78050
},
{
"epoch": 22.736707238949393,
"grad_norm": 0.3989030420780182,
"learning_rate": 0.0003273556655986018,
"loss": 3.2121,
"step": 78100
},
{
"epoch": 22.75126667054918,
"grad_norm": 0.40380460023880005,
"learning_rate": 0.00032718089134867455,
"loss": 3.2228,
"step": 78150
},
{
"epoch": 22.76582610214897,
"grad_norm": 0.4479510486125946,
"learning_rate": 0.0003270061170987474,
"loss": 3.219,
"step": 78200
},
{
"epoch": 22.780385533748763,
"grad_norm": 0.4035521149635315,
"learning_rate": 0.0003268313428488202,
"loss": 3.2273,
"step": 78250
},
{
"epoch": 22.794944965348552,
"grad_norm": 0.41515037417411804,
"learning_rate": 0.00032665656859889305,
"loss": 3.2283,
"step": 78300
},
{
"epoch": 22.80950439694834,
"grad_norm": 0.39006373286247253,
"learning_rate": 0.0003264817943489659,
"loss": 3.2261,
"step": 78350
},
{
"epoch": 22.824063828548134,
"grad_norm": 0.44352978467941284,
"learning_rate": 0.00032630702009903866,
"loss": 3.2247,
"step": 78400
},
{
"epoch": 22.838623260147923,
"grad_norm": 0.3941769599914551,
"learning_rate": 0.0003261322458491115,
"loss": 3.2325,
"step": 78450
},
{
"epoch": 22.853182691747715,
"grad_norm": 0.41656285524368286,
"learning_rate": 0.00032595747159918433,
"loss": 3.2328,
"step": 78500
},
{
"epoch": 22.867742123347504,
"grad_norm": 0.43540436029434204,
"learning_rate": 0.00032578269734925716,
"loss": 3.2362,
"step": 78550
},
{
"epoch": 22.882301554947293,
"grad_norm": 0.42130741477012634,
"learning_rate": 0.00032560792309933,
"loss": 3.231,
"step": 78600
},
{
"epoch": 22.896860986547086,
"grad_norm": 0.39623120427131653,
"learning_rate": 0.0003254331488494029,
"loss": 3.2289,
"step": 78650
},
{
"epoch": 22.911420418146875,
"grad_norm": 0.41021716594696045,
"learning_rate": 0.00032525837459947566,
"loss": 3.2312,
"step": 78700
},
{
"epoch": 22.925979849746668,
"grad_norm": 0.3942141532897949,
"learning_rate": 0.0003250836003495485,
"loss": 3.2271,
"step": 78750
},
{
"epoch": 22.940539281346457,
"grad_norm": 0.4345264136791229,
"learning_rate": 0.0003249088260996213,
"loss": 3.2332,
"step": 78800
},
{
"epoch": 22.955098712946246,
"grad_norm": 0.423775315284729,
"learning_rate": 0.00032473405184969416,
"loss": 3.2217,
"step": 78850
},
{
"epoch": 22.969658144546038,
"grad_norm": 0.4158564805984497,
"learning_rate": 0.00032455927759976694,
"loss": 3.2264,
"step": 78900
},
{
"epoch": 22.984217576145827,
"grad_norm": 0.4096801280975342,
"learning_rate": 0.00032438450334983977,
"loss": 3.2465,
"step": 78950
},
{
"epoch": 22.998777007745616,
"grad_norm": 0.38529083132743835,
"learning_rate": 0.0003242097290999126,
"loss": 3.2454,
"step": 79000
},
{
"epoch": 22.998777007745616,
"eval_accuracy": 0.3740442935818177,
"eval_loss": 3.533736228942871,
"eval_runtime": 181.0451,
"eval_samples_per_second": 91.927,
"eval_steps_per_second": 5.75,
"step": 79000
},
{
"epoch": 23.01310348843981,
"grad_norm": 0.4034730792045593,
"learning_rate": 0.00032403495484998544,
"loss": 3.1405,
"step": 79050
},
{
"epoch": 23.027662920039603,
"grad_norm": 0.4115341007709503,
"learning_rate": 0.00032386018060005827,
"loss": 3.1426,
"step": 79100
},
{
"epoch": 23.042222351639392,
"grad_norm": 0.4165702760219574,
"learning_rate": 0.00032368540635013105,
"loss": 3.1417,
"step": 79150
},
{
"epoch": 23.05678178323918,
"grad_norm": 0.42881783843040466,
"learning_rate": 0.0003235106321002039,
"loss": 3.1477,
"step": 79200
},
{
"epoch": 23.071341214838974,
"grad_norm": 0.4214617609977722,
"learning_rate": 0.0003233358578502767,
"loss": 3.137,
"step": 79250
},
{
"epoch": 23.085900646438763,
"grad_norm": 0.4396951198577881,
"learning_rate": 0.00032316108360034955,
"loss": 3.1507,
"step": 79300
},
{
"epoch": 23.10046007803855,
"grad_norm": 0.42560476064682007,
"learning_rate": 0.0003229863093504224,
"loss": 3.1489,
"step": 79350
},
{
"epoch": 23.115019509638344,
"grad_norm": 0.4523085653781891,
"learning_rate": 0.00032281153510049516,
"loss": 3.1484,
"step": 79400
},
{
"epoch": 23.129578941238133,
"grad_norm": 0.4005928337574005,
"learning_rate": 0.000322636760850568,
"loss": 3.162,
"step": 79450
},
{
"epoch": 23.144138372837926,
"grad_norm": 0.4236561357975006,
"learning_rate": 0.0003224619866006408,
"loss": 3.167,
"step": 79500
},
{
"epoch": 23.158697804437715,
"grad_norm": 0.4328435957431793,
"learning_rate": 0.00032228721235071366,
"loss": 3.1655,
"step": 79550
},
{
"epoch": 23.173257236037504,
"grad_norm": 0.41501981019973755,
"learning_rate": 0.00032211243810078644,
"loss": 3.1654,
"step": 79600
},
{
"epoch": 23.187816667637296,
"grad_norm": 0.43546444177627563,
"learning_rate": 0.00032193766385085927,
"loss": 3.1676,
"step": 79650
},
{
"epoch": 23.202376099237085,
"grad_norm": 0.45414286851882935,
"learning_rate": 0.0003217628896009321,
"loss": 3.1616,
"step": 79700
},
{
"epoch": 23.216935530836878,
"grad_norm": 0.4444766044616699,
"learning_rate": 0.00032158811535100494,
"loss": 3.1677,
"step": 79750
},
{
"epoch": 23.231494962436667,
"grad_norm": 0.43765148520469666,
"learning_rate": 0.00032141334110107777,
"loss": 3.165,
"step": 79800
},
{
"epoch": 23.246054394036456,
"grad_norm": 0.4241009056568146,
"learning_rate": 0.00032123856685115055,
"loss": 3.165,
"step": 79850
},
{
"epoch": 23.26061382563625,
"grad_norm": 0.41852983832359314,
"learning_rate": 0.0003210637926012234,
"loss": 3.1684,
"step": 79900
},
{
"epoch": 23.275173257236037,
"grad_norm": 0.4211058020591736,
"learning_rate": 0.0003208890183512962,
"loss": 3.1892,
"step": 79950
},
{
"epoch": 23.289732688835826,
"grad_norm": 0.41997700929641724,
"learning_rate": 0.00032071424410136905,
"loss": 3.1837,
"step": 80000
},
{
"epoch": 23.289732688835826,
"eval_accuracy": 0.3734963530796902,
"eval_loss": 3.5481793880462646,
"eval_runtime": 180.9399,
"eval_samples_per_second": 91.981,
"eval_steps_per_second": 5.753,
"step": 80000
},
{
"epoch": 23.30429212043562,
"grad_norm": 0.41827014088630676,
"learning_rate": 0.0003205394698514419,
"loss": 3.1311,
"step": 80050
},
{
"epoch": 23.318851552035408,
"grad_norm": 0.41703981161117554,
"learning_rate": 0.00032036469560151466,
"loss": 3.1356,
"step": 80100
},
{
"epoch": 23.3334109836352,
"grad_norm": 0.4520207941532135,
"learning_rate": 0.0003201899213515875,
"loss": 3.143,
"step": 80150
},
{
"epoch": 23.34797041523499,
"grad_norm": 0.43337830901145935,
"learning_rate": 0.0003200151471016603,
"loss": 3.1458,
"step": 80200
},
{
"epoch": 23.36252984683478,
"grad_norm": 0.4231213331222534,
"learning_rate": 0.00031984037285173316,
"loss": 3.1387,
"step": 80250
},
{
"epoch": 23.37708927843457,
"grad_norm": 0.44809332489967346,
"learning_rate": 0.00031966559860180594,
"loss": 3.1528,
"step": 80300
},
{
"epoch": 23.39164871003436,
"grad_norm": 0.43833646178245544,
"learning_rate": 0.00031949082435187877,
"loss": 3.1466,
"step": 80350
},
{
"epoch": 23.40620814163415,
"grad_norm": 0.4350813329219818,
"learning_rate": 0.0003193160501019516,
"loss": 3.1661,
"step": 80400
},
{
"epoch": 23.42076757323394,
"grad_norm": 0.4260636270046234,
"learning_rate": 0.00031914127585202444,
"loss": 3.1666,
"step": 80450
},
{
"epoch": 23.43532700483373,
"grad_norm": 0.44178247451782227,
"learning_rate": 0.00031896650160209727,
"loss": 3.1526,
"step": 80500
},
{
"epoch": 23.449886436433523,
"grad_norm": 0.4201173484325409,
"learning_rate": 0.00031879172735217005,
"loss": 3.1709,
"step": 80550
},
{
"epoch": 23.464445868033312,
"grad_norm": 0.4208093285560608,
"learning_rate": 0.0003186169531022429,
"loss": 3.1733,
"step": 80600
},
{
"epoch": 23.4790052996331,
"grad_norm": 0.44302788376808167,
"learning_rate": 0.0003184421788523157,
"loss": 3.1758,
"step": 80650
},
{
"epoch": 23.493564731232894,
"grad_norm": 0.42564231157302856,
"learning_rate": 0.00031826740460238855,
"loss": 3.1739,
"step": 80700
},
{
"epoch": 23.508124162832683,
"grad_norm": 0.4493538439273834,
"learning_rate": 0.0003180926303524614,
"loss": 3.1696,
"step": 80750
},
{
"epoch": 23.522683594432472,
"grad_norm": 0.445277601480484,
"learning_rate": 0.00031791785610253416,
"loss": 3.1706,
"step": 80800
},
{
"epoch": 23.537243026032264,
"grad_norm": 0.4273313581943512,
"learning_rate": 0.000317743081852607,
"loss": 3.1791,
"step": 80850
},
{
"epoch": 23.551802457632053,
"grad_norm": 0.4198790490627289,
"learning_rate": 0.0003175683076026798,
"loss": 3.1725,
"step": 80900
},
{
"epoch": 23.566361889231846,
"grad_norm": 0.4127906858921051,
"learning_rate": 0.00031739353335275266,
"loss": 3.1838,
"step": 80950
},
{
"epoch": 23.580921320831635,
"grad_norm": 0.4176872968673706,
"learning_rate": 0.00031721875910282544,
"loss": 3.1719,
"step": 81000
},
{
"epoch": 23.580921320831635,
"eval_accuracy": 0.3729034955664441,
"eval_loss": 3.5576372146606445,
"eval_runtime": 180.9809,
"eval_samples_per_second": 91.96,
"eval_steps_per_second": 5.752,
"step": 81000
},
{
"epoch": 23.595480752431424,
"grad_norm": 0.44739457964897156,
"learning_rate": 0.00031704398485289827,
"loss": 3.1871,
"step": 81050
},
{
"epoch": 23.610040184031217,
"grad_norm": 0.4258241355419159,
"learning_rate": 0.0003168692106029711,
"loss": 3.1846,
"step": 81100
},
{
"epoch": 23.624599615631006,
"grad_norm": 0.44633159041404724,
"learning_rate": 0.000316694436353044,
"loss": 3.1742,
"step": 81150
},
{
"epoch": 23.639159047230795,
"grad_norm": 0.4098469913005829,
"learning_rate": 0.0003165196621031168,
"loss": 3.1868,
"step": 81200
},
{
"epoch": 23.653718478830587,
"grad_norm": 0.42541512846946716,
"learning_rate": 0.00031634488785318966,
"loss": 3.1906,
"step": 81250
},
{
"epoch": 23.668277910430376,
"grad_norm": 0.4811411499977112,
"learning_rate": 0.00031617011360326243,
"loss": 3.1903,
"step": 81300
},
{
"epoch": 23.68283734203017,
"grad_norm": 0.42175766825675964,
"learning_rate": 0.00031599533935333527,
"loss": 3.1967,
"step": 81350
},
{
"epoch": 23.697396773629958,
"grad_norm": 0.4402812421321869,
"learning_rate": 0.0003158205651034081,
"loss": 3.1909,
"step": 81400
},
{
"epoch": 23.711956205229747,
"grad_norm": 0.4189620018005371,
"learning_rate": 0.00031564579085348093,
"loss": 3.1839,
"step": 81450
},
{
"epoch": 23.72651563682954,
"grad_norm": 0.4007689654827118,
"learning_rate": 0.0003154710166035537,
"loss": 3.1906,
"step": 81500
},
{
"epoch": 23.74107506842933,
"grad_norm": 0.42000943422317505,
"learning_rate": 0.00031529624235362655,
"loss": 3.2023,
"step": 81550
},
{
"epoch": 23.755634500029117,
"grad_norm": 0.4228629469871521,
"learning_rate": 0.0003151214681036994,
"loss": 3.1989,
"step": 81600
},
{
"epoch": 23.77019393162891,
"grad_norm": 0.4203532338142395,
"learning_rate": 0.0003149466938537722,
"loss": 3.1811,
"step": 81650
},
{
"epoch": 23.7847533632287,
"grad_norm": 0.4034475088119507,
"learning_rate": 0.00031477191960384504,
"loss": 3.1947,
"step": 81700
},
{
"epoch": 23.79931279482849,
"grad_norm": 0.4441761374473572,
"learning_rate": 0.0003145971453539178,
"loss": 3.1996,
"step": 81750
},
{
"epoch": 23.81387222642828,
"grad_norm": 0.40938514471054077,
"learning_rate": 0.00031442237110399066,
"loss": 3.1999,
"step": 81800
},
{
"epoch": 23.82843165802807,
"grad_norm": 0.42533165216445923,
"learning_rate": 0.0003142475968540635,
"loss": 3.2039,
"step": 81850
},
{
"epoch": 23.842991089627862,
"grad_norm": 0.40785083174705505,
"learning_rate": 0.0003140728226041363,
"loss": 3.1983,
"step": 81900
},
{
"epoch": 23.85755052122765,
"grad_norm": 0.40363097190856934,
"learning_rate": 0.00031389804835420915,
"loss": 3.206,
"step": 81950
},
{
"epoch": 23.87210995282744,
"grad_norm": 0.43303757905960083,
"learning_rate": 0.00031372327410428193,
"loss": 3.2133,
"step": 82000
},
{
"epoch": 23.87210995282744,
"eval_accuracy": 0.37346883847078505,
"eval_loss": 3.5454885959625244,
"eval_runtime": 179.9471,
"eval_samples_per_second": 92.488,
"eval_steps_per_second": 5.785,
"step": 82000
},
{
"epoch": 23.886669384427233,
"grad_norm": 0.42279261350631714,
"learning_rate": 0.00031354849985435477,
"loss": 3.2198,
"step": 82050
},
{
"epoch": 23.90122881602702,
"grad_norm": 0.3982255160808563,
"learning_rate": 0.0003133737256044276,
"loss": 3.1981,
"step": 82100
},
{
"epoch": 23.915788247626814,
"grad_norm": 0.4409744441509247,
"learning_rate": 0.00031319895135450043,
"loss": 3.2056,
"step": 82150
},
{
"epoch": 23.930347679226603,
"grad_norm": 0.4399181604385376,
"learning_rate": 0.0003130241771045732,
"loss": 3.1991,
"step": 82200
},
{
"epoch": 23.944907110826392,
"grad_norm": 0.4288428723812103,
"learning_rate": 0.00031284940285464604,
"loss": 3.1975,
"step": 82250
},
{
"epoch": 23.959466542426185,
"grad_norm": 0.4355453550815582,
"learning_rate": 0.0003126746286047189,
"loss": 3.2068,
"step": 82300
},
{
"epoch": 23.974025974025974,
"grad_norm": 0.4542391300201416,
"learning_rate": 0.0003124998543547917,
"loss": 3.2008,
"step": 82350
},
{
"epoch": 23.988585405625763,
"grad_norm": 0.41459161043167114,
"learning_rate": 0.00031232508010486454,
"loss": 3.2094,
"step": 82400
},
{
"epoch": 24.003203074951955,
"grad_norm": 0.4521249830722809,
"learning_rate": 0.0003121503058549373,
"loss": 3.2476,
"step": 82450
},
{
"epoch": 24.017762506551744,
"grad_norm": 0.43049049377441406,
"learning_rate": 0.00031197553160501016,
"loss": 3.1265,
"step": 82500
},
{
"epoch": 24.032321938151533,
"grad_norm": 0.45564672350883484,
"learning_rate": 0.000311800757355083,
"loss": 3.1277,
"step": 82550
},
{
"epoch": 24.046881369751326,
"grad_norm": 0.4392612874507904,
"learning_rate": 0.0003116259831051558,
"loss": 3.1344,
"step": 82600
},
{
"epoch": 24.061440801351115,
"grad_norm": 0.4402536153793335,
"learning_rate": 0.00031145120885522865,
"loss": 3.1388,
"step": 82650
},
{
"epoch": 24.076000232950907,
"grad_norm": 0.428521990776062,
"learning_rate": 0.00031127643460530143,
"loss": 3.1424,
"step": 82700
},
{
"epoch": 24.090559664550696,
"grad_norm": 0.42514893412590027,
"learning_rate": 0.00031110166035537427,
"loss": 3.1492,
"step": 82750
},
{
"epoch": 24.105119096150485,
"grad_norm": 0.44392284750938416,
"learning_rate": 0.0003109268861054471,
"loss": 3.154,
"step": 82800
},
{
"epoch": 24.119678527750278,
"grad_norm": 0.41279974579811096,
"learning_rate": 0.00031075211185551993,
"loss": 3.1598,
"step": 82850
},
{
"epoch": 24.134237959350067,
"grad_norm": 0.46087998151779175,
"learning_rate": 0.0003105773376055927,
"loss": 3.1509,
"step": 82900
},
{
"epoch": 24.148797390949856,
"grad_norm": 0.4077779948711395,
"learning_rate": 0.00031040256335566554,
"loss": 3.1598,
"step": 82950
},
{
"epoch": 24.16335682254965,
"grad_norm": 0.404313862323761,
"learning_rate": 0.0003102277891057384,
"loss": 3.1573,
"step": 83000
},
{
"epoch": 24.16335682254965,
"eval_accuracy": 0.37331809604080063,
"eval_loss": 3.554750919342041,
"eval_runtime": 179.8077,
"eval_samples_per_second": 92.56,
"eval_steps_per_second": 5.79,
"step": 83000
},
{
"epoch": 24.177916254149437,
"grad_norm": 0.4314119219779968,
"learning_rate": 0.0003100530148558112,
"loss": 3.1623,
"step": 83050
},
{
"epoch": 24.19247568574923,
"grad_norm": 0.44017013907432556,
"learning_rate": 0.00030987824060588404,
"loss": 3.1604,
"step": 83100
},
{
"epoch": 24.20703511734902,
"grad_norm": 0.4304588735103607,
"learning_rate": 0.0003097034663559568,
"loss": 3.1786,
"step": 83150
},
{
"epoch": 24.221594548948808,
"grad_norm": 0.4203026294708252,
"learning_rate": 0.00030952869210602965,
"loss": 3.1765,
"step": 83200
},
{
"epoch": 24.2361539805486,
"grad_norm": 0.4360384941101074,
"learning_rate": 0.0003093539178561025,
"loss": 3.1594,
"step": 83250
},
{
"epoch": 24.25071341214839,
"grad_norm": 0.419367253780365,
"learning_rate": 0.0003091791436061753,
"loss": 3.1581,
"step": 83300
},
{
"epoch": 24.26527284374818,
"grad_norm": 0.43426862359046936,
"learning_rate": 0.00030900436935624815,
"loss": 3.1767,
"step": 83350
},
{
"epoch": 24.27983227534797,
"grad_norm": 0.4190501868724823,
"learning_rate": 0.00030882959510632093,
"loss": 3.1773,
"step": 83400
},
{
"epoch": 24.29439170694776,
"grad_norm": 0.4230547845363617,
"learning_rate": 0.00030865482085639377,
"loss": 3.1788,
"step": 83450
},
{
"epoch": 24.308951138547553,
"grad_norm": 0.4331873059272766,
"learning_rate": 0.0003084800466064666,
"loss": 3.1688,
"step": 83500
},
{
"epoch": 24.32351057014734,
"grad_norm": 0.4186467230319977,
"learning_rate": 0.00030830527235653943,
"loss": 3.1707,
"step": 83550
},
{
"epoch": 24.33807000174713,
"grad_norm": 0.41275304555892944,
"learning_rate": 0.0003081304981066122,
"loss": 3.1758,
"step": 83600
},
{
"epoch": 24.352629433346923,
"grad_norm": 0.5706424713134766,
"learning_rate": 0.00030795572385668504,
"loss": 3.1845,
"step": 83650
},
{
"epoch": 24.367188864946712,
"grad_norm": 0.43165168166160583,
"learning_rate": 0.00030778094960675793,
"loss": 3.1623,
"step": 83700
},
{
"epoch": 24.3817482965465,
"grad_norm": 0.44549575448036194,
"learning_rate": 0.00030760617535683076,
"loss": 3.1842,
"step": 83750
},
{
"epoch": 24.396307728146294,
"grad_norm": 0.42982882261276245,
"learning_rate": 0.0003074314011069036,
"loss": 3.1825,
"step": 83800
},
{
"epoch": 24.410867159746083,
"grad_norm": 0.42767763137817383,
"learning_rate": 0.00030725662685697643,
"loss": 3.1915,
"step": 83850
},
{
"epoch": 24.425426591345875,
"grad_norm": 0.42826566100120544,
"learning_rate": 0.0003070818526070492,
"loss": 3.1861,
"step": 83900
},
{
"epoch": 24.439986022945664,
"grad_norm": 0.4173426926136017,
"learning_rate": 0.00030690707835712204,
"loss": 3.1829,
"step": 83950
},
{
"epoch": 24.454545454545453,
"grad_norm": 0.4142906069755554,
"learning_rate": 0.0003067323041071949,
"loss": 3.1941,
"step": 84000
},
{
"epoch": 24.454545454545453,
"eval_accuracy": 0.37343744359652153,
"eval_loss": 3.5522103309631348,
"eval_runtime": 179.9407,
"eval_samples_per_second": 92.492,
"eval_steps_per_second": 5.785,
"step": 84000
},
{
"epoch": 24.469104886145246,
"grad_norm": 0.42911434173583984,
"learning_rate": 0.0003065575298572677,
"loss": 3.1824,
"step": 84050
},
{
"epoch": 24.483664317745035,
"grad_norm": 0.4673752784729004,
"learning_rate": 0.00030638275560734054,
"loss": 3.1878,
"step": 84100
},
{
"epoch": 24.498223749344824,
"grad_norm": 0.4118911325931549,
"learning_rate": 0.0003062079813574133,
"loss": 3.1945,
"step": 84150
},
{
"epoch": 24.512783180944616,
"grad_norm": 0.46239766478538513,
"learning_rate": 0.00030603320710748615,
"loss": 3.1837,
"step": 84200
},
{
"epoch": 24.527342612544405,
"grad_norm": 0.4571888744831085,
"learning_rate": 0.000305858432857559,
"loss": 3.1924,
"step": 84250
},
{
"epoch": 24.541902044144198,
"grad_norm": 0.4163052439689636,
"learning_rate": 0.0003056836586076318,
"loss": 3.1966,
"step": 84300
},
{
"epoch": 24.556461475743987,
"grad_norm": 0.4218531847000122,
"learning_rate": 0.0003055088843577046,
"loss": 3.2015,
"step": 84350
},
{
"epoch": 24.571020907343776,
"grad_norm": 0.4395730197429657,
"learning_rate": 0.00030533411010777743,
"loss": 3.2011,
"step": 84400
},
{
"epoch": 24.58558033894357,
"grad_norm": 0.43827715516090393,
"learning_rate": 0.00030515933585785026,
"loss": 3.2071,
"step": 84450
},
{
"epoch": 24.600139770543358,
"grad_norm": 0.4391777813434601,
"learning_rate": 0.0003049845616079231,
"loss": 3.199,
"step": 84500
},
{
"epoch": 24.61469920214315,
"grad_norm": 0.45976823568344116,
"learning_rate": 0.00030480978735799593,
"loss": 3.2007,
"step": 84550
},
{
"epoch": 24.62925863374294,
"grad_norm": 0.43706169724464417,
"learning_rate": 0.0003046350131080687,
"loss": 3.2013,
"step": 84600
},
{
"epoch": 24.643818065342728,
"grad_norm": 0.43872877955436707,
"learning_rate": 0.00030446023885814154,
"loss": 3.2041,
"step": 84650
},
{
"epoch": 24.65837749694252,
"grad_norm": 0.42261114716529846,
"learning_rate": 0.0003042854646082144,
"loss": 3.2004,
"step": 84700
},
{
"epoch": 24.67293692854231,
"grad_norm": 0.43338093161582947,
"learning_rate": 0.0003041106903582872,
"loss": 3.1977,
"step": 84750
},
{
"epoch": 24.6874963601421,
"grad_norm": 0.41846537590026855,
"learning_rate": 0.00030393591610836,
"loss": 3.1972,
"step": 84800
},
{
"epoch": 24.70205579174189,
"grad_norm": 0.42820459604263306,
"learning_rate": 0.0003037611418584328,
"loss": 3.2049,
"step": 84850
},
{
"epoch": 24.71661522334168,
"grad_norm": 0.43307816982269287,
"learning_rate": 0.00030358636760850565,
"loss": 3.2179,
"step": 84900
},
{
"epoch": 24.73117465494147,
"grad_norm": 0.4248763918876648,
"learning_rate": 0.0003034115933585785,
"loss": 3.1926,
"step": 84950
},
{
"epoch": 24.745734086541262,
"grad_norm": 0.41839364171028137,
"learning_rate": 0.0003032368191086513,
"loss": 3.2039,
"step": 85000
},
{
"epoch": 24.745734086541262,
"eval_accuracy": 0.37404652767399377,
"eval_loss": 3.5435619354248047,
"eval_runtime": 177.4415,
"eval_samples_per_second": 93.794,
"eval_steps_per_second": 5.867,
"step": 85000
},
{
"epoch": 24.76029351814105,
"grad_norm": 0.4315228760242462,
"learning_rate": 0.0003030620448587241,
"loss": 3.1948,
"step": 85050
},
{
"epoch": 24.774852949740843,
"grad_norm": 0.43137073516845703,
"learning_rate": 0.00030288727060879693,
"loss": 3.2095,
"step": 85100
},
{
"epoch": 24.789412381340632,
"grad_norm": 0.43723025918006897,
"learning_rate": 0.00030271249635886976,
"loss": 3.209,
"step": 85150
},
{
"epoch": 24.80397181294042,
"grad_norm": 0.4241926372051239,
"learning_rate": 0.0003025377221089426,
"loss": 3.214,
"step": 85200
},
{
"epoch": 24.818531244540214,
"grad_norm": 0.41172343492507935,
"learning_rate": 0.00030236294785901543,
"loss": 3.2183,
"step": 85250
},
{
"epoch": 24.833090676140003,
"grad_norm": 0.4208565652370453,
"learning_rate": 0.0003021881736090882,
"loss": 3.2044,
"step": 85300
},
{
"epoch": 24.847650107739796,
"grad_norm": 0.41848379373550415,
"learning_rate": 0.00030201339935916104,
"loss": 3.2094,
"step": 85350
},
{
"epoch": 24.862209539339585,
"grad_norm": 0.41701391339302063,
"learning_rate": 0.00030183862510923387,
"loss": 3.1972,
"step": 85400
},
{
"epoch": 24.876768970939374,
"grad_norm": 0.4314718246459961,
"learning_rate": 0.0003016638508593067,
"loss": 3.205,
"step": 85450
},
{
"epoch": 24.891328402539166,
"grad_norm": 0.41032907366752625,
"learning_rate": 0.0003014890766093795,
"loss": 3.2128,
"step": 85500
},
{
"epoch": 24.905887834138955,
"grad_norm": 0.4259736239910126,
"learning_rate": 0.0003013143023594523,
"loss": 3.2088,
"step": 85550
},
{
"epoch": 24.920447265738744,
"grad_norm": 0.39667725563049316,
"learning_rate": 0.00030113952810952515,
"loss": 3.2289,
"step": 85600
},
{
"epoch": 24.935006697338537,
"grad_norm": 0.41058072447776794,
"learning_rate": 0.000300964753859598,
"loss": 3.2142,
"step": 85650
},
{
"epoch": 24.949566128938326,
"grad_norm": 0.42639005184173584,
"learning_rate": 0.0003007899796096708,
"loss": 3.2211,
"step": 85700
},
{
"epoch": 24.96412556053812,
"grad_norm": 0.41749000549316406,
"learning_rate": 0.0003006152053597436,
"loss": 3.213,
"step": 85750
},
{
"epoch": 24.978684992137907,
"grad_norm": 0.4088965058326721,
"learning_rate": 0.00030044043110981643,
"loss": 3.2202,
"step": 85800
},
{
"epoch": 24.993244423737696,
"grad_norm": 0.4255065619945526,
"learning_rate": 0.00030026565685988926,
"loss": 3.2187,
"step": 85850
},
{
"epoch": 25.00757090443189,
"grad_norm": 0.4710450768470764,
"learning_rate": 0.0003000908826099621,
"loss": 3.1597,
"step": 85900
},
{
"epoch": 25.02213033603168,
"grad_norm": 0.43516671657562256,
"learning_rate": 0.0002999161083600349,
"loss": 3.1244,
"step": 85950
},
{
"epoch": 25.036689767631472,
"grad_norm": 0.46017566323280334,
"learning_rate": 0.00029974133411010776,
"loss": 3.1082,
"step": 86000
},
{
"epoch": 25.036689767631472,
"eval_accuracy": 0.3735376249930479,
"eval_loss": 3.5517325401306152,
"eval_runtime": 178.0523,
"eval_samples_per_second": 93.473,
"eval_steps_per_second": 5.847,
"step": 86000
},
{
"epoch": 25.05124919923126,
"grad_norm": 0.44637131690979004,
"learning_rate": 0.0002995665598601806,
"loss": 3.1258,
"step": 86050
},
{
"epoch": 25.065808630831054,
"grad_norm": 0.39747855067253113,
"learning_rate": 0.00029939178561025337,
"loss": 3.1318,
"step": 86100
},
{
"epoch": 25.080368062430843,
"grad_norm": 0.4319270849227905,
"learning_rate": 0.0002992170113603262,
"loss": 3.1203,
"step": 86150
},
{
"epoch": 25.09492749403063,
"grad_norm": 0.41497504711151123,
"learning_rate": 0.00029904223711039904,
"loss": 3.1417,
"step": 86200
},
{
"epoch": 25.109486925630424,
"grad_norm": 0.4178447127342224,
"learning_rate": 0.00029886746286047187,
"loss": 3.1361,
"step": 86250
},
{
"epoch": 25.124046357230213,
"grad_norm": 0.41532012820243835,
"learning_rate": 0.0002986926886105447,
"loss": 3.1451,
"step": 86300
},
{
"epoch": 25.138605788830006,
"grad_norm": 0.4553651809692383,
"learning_rate": 0.0002985179143606175,
"loss": 3.1575,
"step": 86350
},
{
"epoch": 25.153165220429795,
"grad_norm": 0.4206736087799072,
"learning_rate": 0.0002983431401106903,
"loss": 3.1446,
"step": 86400
},
{
"epoch": 25.167724652029584,
"grad_norm": 0.46819040179252625,
"learning_rate": 0.00029816836586076315,
"loss": 3.1393,
"step": 86450
},
{
"epoch": 25.182284083629376,
"grad_norm": 0.4540737271308899,
"learning_rate": 0.000297993591610836,
"loss": 3.1469,
"step": 86500
},
{
"epoch": 25.196843515229165,
"grad_norm": 0.44796428084373474,
"learning_rate": 0.0002978188173609088,
"loss": 3.1496,
"step": 86550
},
{
"epoch": 25.211402946828954,
"grad_norm": 0.4612496495246887,
"learning_rate": 0.0002976440431109816,
"loss": 3.1554,
"step": 86600
},
{
"epoch": 25.225962378428747,
"grad_norm": 0.44751015305519104,
"learning_rate": 0.0002974692688610544,
"loss": 3.1544,
"step": 86650
},
{
"epoch": 25.240521810028536,
"grad_norm": 0.452826589345932,
"learning_rate": 0.00029729449461112726,
"loss": 3.1547,
"step": 86700
},
{
"epoch": 25.25508124162833,
"grad_norm": 0.4215683937072754,
"learning_rate": 0.0002971197203612001,
"loss": 3.1616,
"step": 86750
},
{
"epoch": 25.269640673228118,
"grad_norm": 0.484609454870224,
"learning_rate": 0.00029694494611127287,
"loss": 3.1572,
"step": 86800
},
{
"epoch": 25.284200104827907,
"grad_norm": 0.420391321182251,
"learning_rate": 0.00029677017186134576,
"loss": 3.165,
"step": 86850
},
{
"epoch": 25.2987595364277,
"grad_norm": 0.42065128684043884,
"learning_rate": 0.0002965953976114186,
"loss": 3.1636,
"step": 86900
},
{
"epoch": 25.313318968027488,
"grad_norm": 0.43046483397483826,
"learning_rate": 0.00029642062336149137,
"loss": 3.1632,
"step": 86950
},
{
"epoch": 25.327878399627277,
"grad_norm": 0.4139987528324127,
"learning_rate": 0.0002962458491115642,
"loss": 3.1654,
"step": 87000
},
{
"epoch": 25.327878399627277,
"eval_accuracy": 0.3735500888757143,
"eval_loss": 3.551439046859741,
"eval_runtime": 178.5097,
"eval_samples_per_second": 93.233,
"eval_steps_per_second": 5.832,
"step": 87000
},
{
"epoch": 25.34243783122707,
"grad_norm": 0.42929497361183167,
"learning_rate": 0.00029607107486163704,
"loss": 3.1601,
"step": 87050
},
{
"epoch": 25.35699726282686,
"grad_norm": 0.45584776997566223,
"learning_rate": 0.00029589630061170987,
"loss": 3.1801,
"step": 87100
},
{
"epoch": 25.37155669442665,
"grad_norm": 0.43226608633995056,
"learning_rate": 0.0002957215263617827,
"loss": 3.1721,
"step": 87150
},
{
"epoch": 25.38611612602644,
"grad_norm": 0.4360572099685669,
"learning_rate": 0.0002955467521118555,
"loss": 3.1776,
"step": 87200
},
{
"epoch": 25.40067555762623,
"grad_norm": 0.44590285420417786,
"learning_rate": 0.0002953719778619283,
"loss": 3.1612,
"step": 87250
},
{
"epoch": 25.415234989226022,
"grad_norm": 0.4651472866535187,
"learning_rate": 0.00029519720361200115,
"loss": 3.1681,
"step": 87300
},
{
"epoch": 25.42979442082581,
"grad_norm": 0.4429469704627991,
"learning_rate": 0.000295022429362074,
"loss": 3.1682,
"step": 87350
},
{
"epoch": 25.4443538524256,
"grad_norm": 0.4391014575958252,
"learning_rate": 0.00029484765511214676,
"loss": 3.1772,
"step": 87400
},
{
"epoch": 25.458913284025392,
"grad_norm": 0.4634419083595276,
"learning_rate": 0.0002946728808622196,
"loss": 3.1747,
"step": 87450
},
{
"epoch": 25.47347271562518,
"grad_norm": 0.43895062804222107,
"learning_rate": 0.0002944981066122924,
"loss": 3.1781,
"step": 87500
},
{
"epoch": 25.488032147224974,
"grad_norm": 0.46622321009635925,
"learning_rate": 0.00029432333236236526,
"loss": 3.1714,
"step": 87550
},
{
"epoch": 25.502591578824763,
"grad_norm": 0.4347987771034241,
"learning_rate": 0.0002941485581124381,
"loss": 3.1755,
"step": 87600
},
{
"epoch": 25.517151010424552,
"grad_norm": 0.4216921925544739,
"learning_rate": 0.00029397378386251087,
"loss": 3.1928,
"step": 87650
},
{
"epoch": 25.531710442024345,
"grad_norm": 0.426127552986145,
"learning_rate": 0.0002937990096125837,
"loss": 3.1887,
"step": 87700
},
{
"epoch": 25.546269873624134,
"grad_norm": 0.4356168508529663,
"learning_rate": 0.00029362423536265654,
"loss": 3.1794,
"step": 87750
},
{
"epoch": 25.560829305223923,
"grad_norm": 0.43473145365715027,
"learning_rate": 0.00029344946111272937,
"loss": 3.1756,
"step": 87800
},
{
"epoch": 25.575388736823715,
"grad_norm": 0.42265087366104126,
"learning_rate": 0.0002932746868628022,
"loss": 3.1773,
"step": 87850
},
{
"epoch": 25.589948168423504,
"grad_norm": 0.46903398633003235,
"learning_rate": 0.000293099912612875,
"loss": 3.1875,
"step": 87900
},
{
"epoch": 25.604507600023297,
"grad_norm": 0.46841877698898315,
"learning_rate": 0.0002929251383629478,
"loss": 3.1823,
"step": 87950
},
{
"epoch": 25.619067031623086,
"grad_norm": 0.41188380122184753,
"learning_rate": 0.00029275036411302065,
"loss": 3.1816,
"step": 88000
},
{
"epoch": 25.619067031623086,
"eval_accuracy": 0.3739879709422213,
"eval_loss": 3.542097806930542,
"eval_runtime": 176.7868,
"eval_samples_per_second": 94.142,
"eval_steps_per_second": 5.888,
"step": 88000
},
{
"epoch": 25.633626463222875,
"grad_norm": 0.4317525625228882,
"learning_rate": 0.0002925755898630935,
"loss": 3.1857,
"step": 88050
},
{
"epoch": 25.648185894822667,
"grad_norm": 0.43668872117996216,
"learning_rate": 0.0002924008156131663,
"loss": 3.2019,
"step": 88100
},
{
"epoch": 25.662745326422456,
"grad_norm": 0.5344383120536804,
"learning_rate": 0.00029222604136323915,
"loss": 3.1902,
"step": 88150
},
{
"epoch": 25.677304758022245,
"grad_norm": 0.44829392433166504,
"learning_rate": 0.000292051267113312,
"loss": 3.1828,
"step": 88200
},
{
"epoch": 25.691864189622038,
"grad_norm": 0.45893147587776184,
"learning_rate": 0.00029187649286338476,
"loss": 3.1951,
"step": 88250
},
{
"epoch": 25.706423621221827,
"grad_norm": 0.4264863133430481,
"learning_rate": 0.0002917017186134576,
"loss": 3.186,
"step": 88300
},
{
"epoch": 25.72098305282162,
"grad_norm": 0.4173523187637329,
"learning_rate": 0.0002915269443635304,
"loss": 3.2085,
"step": 88350
},
{
"epoch": 25.73554248442141,
"grad_norm": 0.4248258173465729,
"learning_rate": 0.00029135217011360326,
"loss": 3.2001,
"step": 88400
},
{
"epoch": 25.750101916021197,
"grad_norm": 0.41881099343299866,
"learning_rate": 0.0002911773958636761,
"loss": 3.1997,
"step": 88450
},
{
"epoch": 25.76466134762099,
"grad_norm": 0.4275151789188385,
"learning_rate": 0.00029100262161374887,
"loss": 3.2037,
"step": 88500
},
{
"epoch": 25.77922077922078,
"grad_norm": 0.473474383354187,
"learning_rate": 0.0002908278473638217,
"loss": 3.2004,
"step": 88550
},
{
"epoch": 25.793780210820568,
"grad_norm": 0.41812214255332947,
"learning_rate": 0.00029065307311389453,
"loss": 3.1992,
"step": 88600
},
{
"epoch": 25.80833964242036,
"grad_norm": 0.4513116180896759,
"learning_rate": 0.00029047829886396737,
"loss": 3.2088,
"step": 88650
},
{
"epoch": 25.82289907402015,
"grad_norm": 0.4494040012359619,
"learning_rate": 0.0002903035246140402,
"loss": 3.2008,
"step": 88700
},
{
"epoch": 25.837458505619942,
"grad_norm": 0.4608217477798462,
"learning_rate": 0.000290128750364113,
"loss": 3.2038,
"step": 88750
},
{
"epoch": 25.85201793721973,
"grad_norm": 0.4516802132129669,
"learning_rate": 0.0002899539761141858,
"loss": 3.2057,
"step": 88800
},
{
"epoch": 25.86657736881952,
"grad_norm": 0.41412249207496643,
"learning_rate": 0.00028977920186425864,
"loss": 3.204,
"step": 88850
},
{
"epoch": 25.881136800419313,
"grad_norm": 0.44088810682296753,
"learning_rate": 0.0002896044276143315,
"loss": 3.2054,
"step": 88900
},
{
"epoch": 25.8956962320191,
"grad_norm": 0.42800846695899963,
"learning_rate": 0.00028942965336440426,
"loss": 3.1936,
"step": 88950
},
{
"epoch": 25.91025566361889,
"grad_norm": 0.47048747539520264,
"learning_rate": 0.0002892548791144771,
"loss": 3.2067,
"step": 89000
},
{
"epoch": 25.91025566361889,
"eval_accuracy": 0.3746011704526494,
"eval_loss": 3.5390212535858154,
"eval_runtime": 176.1021,
"eval_samples_per_second": 94.508,
"eval_steps_per_second": 5.911,
"step": 89000
},
{
"epoch": 25.924815095218683,
"grad_norm": 0.4377363324165344,
"learning_rate": 0.0002890801048645499,
"loss": 3.2089,
"step": 89050
},
{
"epoch": 25.939374526818472,
"grad_norm": 0.43033215403556824,
"learning_rate": 0.00028890533061462276,
"loss": 3.2125,
"step": 89100
},
{
"epoch": 25.953933958418265,
"grad_norm": 0.4349314570426941,
"learning_rate": 0.0002887305563646956,
"loss": 3.201,
"step": 89150
},
{
"epoch": 25.968493390018054,
"grad_norm": 0.45417773723602295,
"learning_rate": 0.00028855578211476837,
"loss": 3.2129,
"step": 89200
},
{
"epoch": 25.983052821617843,
"grad_norm": 0.4514182507991791,
"learning_rate": 0.0002883810078648412,
"loss": 3.2026,
"step": 89250
},
{
"epoch": 25.997612253217635,
"grad_norm": 0.4432578980922699,
"learning_rate": 0.00028820623361491403,
"loss": 3.2073,
"step": 89300
},
{
"epoch": 26.01193873391183,
"grad_norm": 0.43464499711990356,
"learning_rate": 0.00028803145936498687,
"loss": 3.1196,
"step": 89350
},
{
"epoch": 26.02649816551162,
"grad_norm": 0.427076518535614,
"learning_rate": 0.0002878566851150597,
"loss": 3.1105,
"step": 89400
},
{
"epoch": 26.041057597111408,
"grad_norm": 0.44585657119750977,
"learning_rate": 0.00028768191086513253,
"loss": 3.115,
"step": 89450
},
{
"epoch": 26.0556170287112,
"grad_norm": 0.45118430256843567,
"learning_rate": 0.00028750713661520536,
"loss": 3.1275,
"step": 89500
},
{
"epoch": 26.07017646031099,
"grad_norm": 0.41694751381874084,
"learning_rate": 0.00028733236236527814,
"loss": 3.1038,
"step": 89550
},
{
"epoch": 26.08473589191078,
"grad_norm": 0.4492938816547394,
"learning_rate": 0.000287157588115351,
"loss": 3.1343,
"step": 89600
},
{
"epoch": 26.09929532351057,
"grad_norm": 0.44321227073669434,
"learning_rate": 0.0002869828138654238,
"loss": 3.1221,
"step": 89650
},
{
"epoch": 26.11385475511036,
"grad_norm": 0.42474403977394104,
"learning_rate": 0.00028680803961549664,
"loss": 3.1323,
"step": 89700
},
{
"epoch": 26.128414186710152,
"grad_norm": 0.42992469668388367,
"learning_rate": 0.0002866332653655695,
"loss": 3.1243,
"step": 89750
},
{
"epoch": 26.14297361830994,
"grad_norm": 0.4463481903076172,
"learning_rate": 0.00028645849111564225,
"loss": 3.1386,
"step": 89800
},
{
"epoch": 26.15753304990973,
"grad_norm": 0.42387306690216064,
"learning_rate": 0.0002862837168657151,
"loss": 3.1402,
"step": 89850
},
{
"epoch": 26.172092481509523,
"grad_norm": 0.44750088453292847,
"learning_rate": 0.0002861089426157879,
"loss": 3.1307,
"step": 89900
},
{
"epoch": 26.186651913109312,
"grad_norm": 0.49076640605926514,
"learning_rate": 0.00028593416836586075,
"loss": 3.1315,
"step": 89950
},
{
"epoch": 26.2012113447091,
"grad_norm": 0.43750905990600586,
"learning_rate": 0.0002857593941159336,
"loss": 3.1412,
"step": 90000
},
{
"epoch": 26.2012113447091,
"eval_accuracy": 0.37368295856829026,
"eval_loss": 3.552704095840454,
"eval_runtime": 175.9971,
"eval_samples_per_second": 94.564,
"eval_steps_per_second": 5.915,
"step": 90000
},
{
"epoch": 26.215770776308894,
"grad_norm": 0.43243998289108276,
"learning_rate": 0.00028558461986600637,
"loss": 3.1477,
"step": 90050
},
{
"epoch": 26.230330207908683,
"grad_norm": 0.44490525126457214,
"learning_rate": 0.0002854098456160792,
"loss": 3.1395,
"step": 90100
},
{
"epoch": 26.244889639508475,
"grad_norm": 0.4202319383621216,
"learning_rate": 0.00028523507136615203,
"loss": 3.1557,
"step": 90150
},
{
"epoch": 26.259449071108264,
"grad_norm": 0.4608379006385803,
"learning_rate": 0.00028506029711622486,
"loss": 3.1526,
"step": 90200
},
{
"epoch": 26.274008502708053,
"grad_norm": 0.473849356174469,
"learning_rate": 0.00028488552286629764,
"loss": 3.1539,
"step": 90250
},
{
"epoch": 26.288567934307846,
"grad_norm": 0.42520883679389954,
"learning_rate": 0.0002847107486163705,
"loss": 3.1494,
"step": 90300
},
{
"epoch": 26.303127365907635,
"grad_norm": 0.4529514014720917,
"learning_rate": 0.0002845359743664433,
"loss": 3.1534,
"step": 90350
},
{
"epoch": 26.317686797507424,
"grad_norm": 0.4225722551345825,
"learning_rate": 0.00028436120011651614,
"loss": 3.1542,
"step": 90400
},
{
"epoch": 26.332246229107216,
"grad_norm": 0.4789310693740845,
"learning_rate": 0.000284186425866589,
"loss": 3.1621,
"step": 90450
},
{
"epoch": 26.346805660707005,
"grad_norm": 0.42410004138946533,
"learning_rate": 0.00028401165161666175,
"loss": 3.1612,
"step": 90500
},
{
"epoch": 26.361365092306798,
"grad_norm": 0.43350374698638916,
"learning_rate": 0.0002838368773667346,
"loss": 3.1668,
"step": 90550
},
{
"epoch": 26.375924523906587,
"grad_norm": 0.4327101409435272,
"learning_rate": 0.0002836621031168075,
"loss": 3.1691,
"step": 90600
},
{
"epoch": 26.390483955506376,
"grad_norm": 0.4356554448604584,
"learning_rate": 0.00028348732886688025,
"loss": 3.162,
"step": 90650
},
{
"epoch": 26.40504338710617,
"grad_norm": 0.44769665598869324,
"learning_rate": 0.0002833125546169531,
"loss": 3.1721,
"step": 90700
},
{
"epoch": 26.419602818705958,
"grad_norm": 0.43828025460243225,
"learning_rate": 0.0002831377803670259,
"loss": 3.1689,
"step": 90750
},
{
"epoch": 26.434162250305747,
"grad_norm": 0.4498804807662964,
"learning_rate": 0.00028296300611709875,
"loss": 3.1753,
"step": 90800
},
{
"epoch": 26.44872168190554,
"grad_norm": 0.4299849569797516,
"learning_rate": 0.00028278823186717153,
"loss": 3.1675,
"step": 90850
},
{
"epoch": 26.463281113505328,
"grad_norm": 0.42671430110931396,
"learning_rate": 0.00028261345761724436,
"loss": 3.1616,
"step": 90900
},
{
"epoch": 26.47784054510512,
"grad_norm": 0.4571177661418915,
"learning_rate": 0.0002824386833673172,
"loss": 3.1643,
"step": 90950
},
{
"epoch": 26.49239997670491,
"grad_norm": 0.4367692172527313,
"learning_rate": 0.00028226390911739003,
"loss": 3.1852,
"step": 91000
},
{
"epoch": 26.49239997670491,
"eval_accuracy": 0.3740624014868236,
"eval_loss": 3.545224666595459,
"eval_runtime": 176.1401,
"eval_samples_per_second": 94.487,
"eval_steps_per_second": 5.91,
"step": 91000
},
{
"epoch": 26.5069594083047,
"grad_norm": 0.45861831307411194,
"learning_rate": 0.00028208913486746286,
"loss": 3.1676,
"step": 91050
},
{
"epoch": 26.52151883990449,
"grad_norm": 0.42132991552352905,
"learning_rate": 0.00028191436061753564,
"loss": 3.1729,
"step": 91100
},
{
"epoch": 26.53607827150428,
"grad_norm": 0.4799935817718506,
"learning_rate": 0.0002817395863676085,
"loss": 3.1639,
"step": 91150
},
{
"epoch": 26.55063770310407,
"grad_norm": 0.467133492231369,
"learning_rate": 0.0002815648121176813,
"loss": 3.1814,
"step": 91200
},
{
"epoch": 26.565197134703862,
"grad_norm": 0.40786775946617126,
"learning_rate": 0.00028139003786775414,
"loss": 3.1825,
"step": 91250
},
{
"epoch": 26.57975656630365,
"grad_norm": 0.46661651134490967,
"learning_rate": 0.000281215263617827,
"loss": 3.1724,
"step": 91300
},
{
"epoch": 26.594315997903443,
"grad_norm": 0.43742454051971436,
"learning_rate": 0.00028104048936789975,
"loss": 3.1886,
"step": 91350
},
{
"epoch": 26.608875429503232,
"grad_norm": 0.43313705921173096,
"learning_rate": 0.0002808657151179726,
"loss": 3.1689,
"step": 91400
},
{
"epoch": 26.62343486110302,
"grad_norm": 0.42333483695983887,
"learning_rate": 0.0002806909408680454,
"loss": 3.1738,
"step": 91450
},
{
"epoch": 26.637994292702814,
"grad_norm": 0.42646193504333496,
"learning_rate": 0.00028051616661811825,
"loss": 3.1797,
"step": 91500
},
{
"epoch": 26.652553724302603,
"grad_norm": 0.4323969781398773,
"learning_rate": 0.00028034139236819103,
"loss": 3.1817,
"step": 91550
},
{
"epoch": 26.667113155902392,
"grad_norm": 0.4830801784992218,
"learning_rate": 0.00028016661811826386,
"loss": 3.169,
"step": 91600
},
{
"epoch": 26.681672587502185,
"grad_norm": 0.44306960701942444,
"learning_rate": 0.0002799918438683367,
"loss": 3.1853,
"step": 91650
},
{
"epoch": 26.696232019101974,
"grad_norm": 0.438679963350296,
"learning_rate": 0.00027981706961840953,
"loss": 3.1856,
"step": 91700
},
{
"epoch": 26.710791450701766,
"grad_norm": 0.4450984299182892,
"learning_rate": 0.00027964229536848236,
"loss": 3.1767,
"step": 91750
},
{
"epoch": 26.725350882301555,
"grad_norm": 0.46220827102661133,
"learning_rate": 0.00027946752111855514,
"loss": 3.1854,
"step": 91800
},
{
"epoch": 26.739910313901344,
"grad_norm": 0.4325886070728302,
"learning_rate": 0.000279292746868628,
"loss": 3.1755,
"step": 91850
},
{
"epoch": 26.754469745501137,
"grad_norm": 0.45203420519828796,
"learning_rate": 0.00027911797261870086,
"loss": 3.1833,
"step": 91900
},
{
"epoch": 26.769029177100926,
"grad_norm": 0.4569106698036194,
"learning_rate": 0.00027894319836877364,
"loss": 3.1961,
"step": 91950
},
{
"epoch": 26.783588608700715,
"grad_norm": 0.4249524772167206,
"learning_rate": 0.00027876842411884647,
"loss": 3.1961,
"step": 92000
},
{
"epoch": 26.783588608700715,
"eval_accuracy": 0.3745718920867632,
"eval_loss": 3.5404043197631836,
"eval_runtime": 176.1461,
"eval_samples_per_second": 94.484,
"eval_steps_per_second": 5.91,
"step": 92000
},
{
"epoch": 26.798148040300507,
"grad_norm": 0.4360384941101074,
"learning_rate": 0.0002785936498689193,
"loss": 3.1865,
"step": 92050
},
{
"epoch": 26.812707471900296,
"grad_norm": 0.4332277178764343,
"learning_rate": 0.00027841887561899214,
"loss": 3.1902,
"step": 92100
},
{
"epoch": 26.82726690350009,
"grad_norm": 0.46646252274513245,
"learning_rate": 0.0002782441013690649,
"loss": 3.1949,
"step": 92150
},
{
"epoch": 26.841826335099878,
"grad_norm": 0.4422335922718048,
"learning_rate": 0.00027806932711913775,
"loss": 3.1888,
"step": 92200
},
{
"epoch": 26.856385766699667,
"grad_norm": 0.4528057873249054,
"learning_rate": 0.0002778945528692106,
"loss": 3.2017,
"step": 92250
},
{
"epoch": 26.87094519829946,
"grad_norm": 0.4345687925815582,
"learning_rate": 0.0002777197786192834,
"loss": 3.1882,
"step": 92300
},
{
"epoch": 26.88550462989925,
"grad_norm": 0.4311511218547821,
"learning_rate": 0.00027754500436935625,
"loss": 3.1851,
"step": 92350
},
{
"epoch": 26.900064061499037,
"grad_norm": 0.44769158959388733,
"learning_rate": 0.00027737023011942903,
"loss": 3.1828,
"step": 92400
},
{
"epoch": 26.91462349309883,
"grad_norm": 0.45371246337890625,
"learning_rate": 0.00027719545586950186,
"loss": 3.1854,
"step": 92450
},
{
"epoch": 26.92918292469862,
"grad_norm": 0.4307236671447754,
"learning_rate": 0.0002770206816195747,
"loss": 3.2048,
"step": 92500
},
{
"epoch": 26.94374235629841,
"grad_norm": 0.5101444125175476,
"learning_rate": 0.0002768459073696475,
"loss": 3.1934,
"step": 92550
},
{
"epoch": 26.9583017878982,
"grad_norm": 0.4513118863105774,
"learning_rate": 0.00027667113311972036,
"loss": 3.1901,
"step": 92600
},
{
"epoch": 26.97286121949799,
"grad_norm": 0.4673958420753479,
"learning_rate": 0.00027649635886979314,
"loss": 3.2089,
"step": 92650
},
{
"epoch": 26.987420651097782,
"grad_norm": 0.4448801279067993,
"learning_rate": 0.00027632158461986597,
"loss": 3.2034,
"step": 92700
},
{
"epoch": 27.001747131791976,
"grad_norm": 0.46219342947006226,
"learning_rate": 0.0002761468103699388,
"loss": 3.1836,
"step": 92750
},
{
"epoch": 27.016306563391765,
"grad_norm": 0.431754469871521,
"learning_rate": 0.00027597203612001164,
"loss": 3.0981,
"step": 92800
},
{
"epoch": 27.030865994991554,
"grad_norm": 0.4268021583557129,
"learning_rate": 0.0002757972618700844,
"loss": 3.0964,
"step": 92850
},
{
"epoch": 27.045425426591347,
"grad_norm": 0.44790345430374146,
"learning_rate": 0.00027562248762015725,
"loss": 3.094,
"step": 92900
},
{
"epoch": 27.059984858191136,
"grad_norm": 0.43892034888267517,
"learning_rate": 0.0002754477133702301,
"loss": 3.1152,
"step": 92950
},
{
"epoch": 27.074544289790925,
"grad_norm": 0.446199893951416,
"learning_rate": 0.0002752729391203029,
"loss": 3.1188,
"step": 93000
},
{
"epoch": 27.074544289790925,
"eval_accuracy": 0.3737672661519867,
"eval_loss": 3.5553925037384033,
"eval_runtime": 176.0476,
"eval_samples_per_second": 94.537,
"eval_steps_per_second": 5.913,
"step": 93000
},
{
"epoch": 27.089103721390718,
"grad_norm": 0.4196743667125702,
"learning_rate": 0.00027509816487037575,
"loss": 3.1054,
"step": 93050
},
{
"epoch": 27.103663152990507,
"grad_norm": 0.43984171748161316,
"learning_rate": 0.00027492339062044853,
"loss": 3.1183,
"step": 93100
},
{
"epoch": 27.1182225845903,
"grad_norm": 0.44343408942222595,
"learning_rate": 0.0002747486163705214,
"loss": 3.1244,
"step": 93150
},
{
"epoch": 27.132782016190088,
"grad_norm": 0.4573042690753937,
"learning_rate": 0.00027457384212059425,
"loss": 3.1338,
"step": 93200
},
{
"epoch": 27.147341447789877,
"grad_norm": 0.435369074344635,
"learning_rate": 0.000274399067870667,
"loss": 3.1289,
"step": 93250
},
{
"epoch": 27.16190087938967,
"grad_norm": 0.4890010356903076,
"learning_rate": 0.00027422429362073986,
"loss": 3.1225,
"step": 93300
},
{
"epoch": 27.17646031098946,
"grad_norm": 0.44470691680908203,
"learning_rate": 0.0002740495193708127,
"loss": 3.1252,
"step": 93350
},
{
"epoch": 27.191019742589248,
"grad_norm": 0.4353671371936798,
"learning_rate": 0.0002738747451208855,
"loss": 3.1254,
"step": 93400
},
{
"epoch": 27.20557917418904,
"grad_norm": 0.44027820229530334,
"learning_rate": 0.0002736999708709583,
"loss": 3.1343,
"step": 93450
},
{
"epoch": 27.22013860578883,
"grad_norm": 0.44768866896629333,
"learning_rate": 0.00027352519662103114,
"loss": 3.1497,
"step": 93500
},
{
"epoch": 27.234698037388622,
"grad_norm": 0.4610934853553772,
"learning_rate": 0.00027335042237110397,
"loss": 3.1332,
"step": 93550
},
{
"epoch": 27.24925746898841,
"grad_norm": 0.4198959171772003,
"learning_rate": 0.0002731756481211768,
"loss": 3.1289,
"step": 93600
},
{
"epoch": 27.2638169005882,
"grad_norm": 0.4309977889060974,
"learning_rate": 0.00027300087387124964,
"loss": 3.1357,
"step": 93650
},
{
"epoch": 27.278376332187992,
"grad_norm": 0.4930909276008606,
"learning_rate": 0.0002728260996213224,
"loss": 3.1572,
"step": 93700
},
{
"epoch": 27.29293576378778,
"grad_norm": 0.4195786118507385,
"learning_rate": 0.00027265132537139525,
"loss": 3.1518,
"step": 93750
},
{
"epoch": 27.30749519538757,
"grad_norm": 0.4539899230003357,
"learning_rate": 0.0002724765511214681,
"loss": 3.1462,
"step": 93800
},
{
"epoch": 27.322054626987363,
"grad_norm": 0.4597534239292145,
"learning_rate": 0.0002723017768715409,
"loss": 3.1491,
"step": 93850
},
{
"epoch": 27.336614058587152,
"grad_norm": 0.42478206753730774,
"learning_rate": 0.00027212700262161375,
"loss": 3.1528,
"step": 93900
},
{
"epoch": 27.351173490186945,
"grad_norm": 0.42217180132865906,
"learning_rate": 0.0002719522283716865,
"loss": 3.1603,
"step": 93950
},
{
"epoch": 27.365732921786734,
"grad_norm": 0.4504246711730957,
"learning_rate": 0.00027177745412175936,
"loss": 3.1557,
"step": 94000
},
{
"epoch": 27.365732921786734,
"eval_accuracy": 0.3741429463889604,
"eval_loss": 3.549989938735962,
"eval_runtime": 176.2106,
"eval_samples_per_second": 94.449,
"eval_steps_per_second": 5.908,
"step": 94000
},
{
"epoch": 27.380292353386523,
"grad_norm": 0.43801188468933105,
"learning_rate": 0.0002716026798718322,
"loss": 3.147,
"step": 94050
},
{
"epoch": 27.394851784986315,
"grad_norm": 0.4779060482978821,
"learning_rate": 0.000271427905621905,
"loss": 3.1564,
"step": 94100
},
{
"epoch": 27.409411216586104,
"grad_norm": 0.4386543035507202,
"learning_rate": 0.0002712531313719778,
"loss": 3.172,
"step": 94150
},
{
"epoch": 27.423970648185893,
"grad_norm": 0.4766581058502197,
"learning_rate": 0.00027107835712205064,
"loss": 3.1584,
"step": 94200
},
{
"epoch": 27.438530079785686,
"grad_norm": 0.4249516427516937,
"learning_rate": 0.00027090358287212347,
"loss": 3.1512,
"step": 94250
},
{
"epoch": 27.453089511385475,
"grad_norm": 0.44262126088142395,
"learning_rate": 0.0002707288086221963,
"loss": 3.1583,
"step": 94300
},
{
"epoch": 27.467648942985267,
"grad_norm": 0.4553981125354767,
"learning_rate": 0.00027055403437226914,
"loss": 3.1597,
"step": 94350
},
{
"epoch": 27.482208374585056,
"grad_norm": 0.4854367673397064,
"learning_rate": 0.00027037926012234197,
"loss": 3.1701,
"step": 94400
},
{
"epoch": 27.496767806184845,
"grad_norm": 0.4336088001728058,
"learning_rate": 0.0002702044858724148,
"loss": 3.16,
"step": 94450
},
{
"epoch": 27.511327237784638,
"grad_norm": 0.4473203718662262,
"learning_rate": 0.00027002971162248763,
"loss": 3.1578,
"step": 94500
},
{
"epoch": 27.525886669384427,
"grad_norm": 0.43575260043144226,
"learning_rate": 0.0002698549373725604,
"loss": 3.1591,
"step": 94550
},
{
"epoch": 27.540446100984216,
"grad_norm": 0.45486170053482056,
"learning_rate": 0.00026968016312263325,
"loss": 3.1661,
"step": 94600
},
{
"epoch": 27.55500553258401,
"grad_norm": 0.4264790713787079,
"learning_rate": 0.0002695053888727061,
"loss": 3.168,
"step": 94650
},
{
"epoch": 27.569564964183797,
"grad_norm": 0.41783830523490906,
"learning_rate": 0.0002693306146227789,
"loss": 3.1697,
"step": 94700
},
{
"epoch": 27.58412439578359,
"grad_norm": 0.46509838104248047,
"learning_rate": 0.0002691558403728517,
"loss": 3.1784,
"step": 94750
},
{
"epoch": 27.59868382738338,
"grad_norm": 0.46188145875930786,
"learning_rate": 0.0002689810661229245,
"loss": 3.1765,
"step": 94800
},
{
"epoch": 27.613243258983168,
"grad_norm": 0.43464067578315735,
"learning_rate": 0.00026880629187299736,
"loss": 3.1751,
"step": 94850
},
{
"epoch": 27.62780269058296,
"grad_norm": 0.4667948782444,
"learning_rate": 0.0002686315176230702,
"loss": 3.1575,
"step": 94900
},
{
"epoch": 27.64236212218275,
"grad_norm": 0.4603780210018158,
"learning_rate": 0.000268456743373143,
"loss": 3.1652,
"step": 94950
},
{
"epoch": 27.65692155378254,
"grad_norm": 0.4413023591041565,
"learning_rate": 0.0002682819691232158,
"loss": 3.1792,
"step": 95000
},
{
"epoch": 27.65692155378254,
"eval_accuracy": 0.37462045419564277,
"eval_loss": 3.538667917251587,
"eval_runtime": 176.0826,
"eval_samples_per_second": 94.518,
"eval_steps_per_second": 5.912,
"step": 95000
},
{
"epoch": 27.67148098538233,
"grad_norm": 0.48090237379074097,
"learning_rate": 0.00026810719487328863,
"loss": 3.1765,
"step": 95050
},
{
"epoch": 27.68604041698212,
"grad_norm": 0.43271276354789734,
"learning_rate": 0.00026793242062336147,
"loss": 3.172,
"step": 95100
},
{
"epoch": 27.700599848581913,
"grad_norm": 0.45895567536354065,
"learning_rate": 0.0002677576463734343,
"loss": 3.1673,
"step": 95150
},
{
"epoch": 27.7151592801817,
"grad_norm": 0.4838683605194092,
"learning_rate": 0.00026758287212350713,
"loss": 3.1804,
"step": 95200
},
{
"epoch": 27.72971871178149,
"grad_norm": 0.4482438266277313,
"learning_rate": 0.0002674080978735799,
"loss": 3.177,
"step": 95250
},
{
"epoch": 27.744278143381283,
"grad_norm": 0.4664458930492401,
"learning_rate": 0.00026723332362365275,
"loss": 3.1773,
"step": 95300
},
{
"epoch": 27.758837574981072,
"grad_norm": 0.44228246808052063,
"learning_rate": 0.0002670585493737256,
"loss": 3.1871,
"step": 95350
},
{
"epoch": 27.773397006580865,
"grad_norm": 0.4452652633190155,
"learning_rate": 0.0002668837751237984,
"loss": 3.1735,
"step": 95400
},
{
"epoch": 27.787956438180654,
"grad_norm": 0.4514998495578766,
"learning_rate": 0.0002667090008738712,
"loss": 3.1836,
"step": 95450
},
{
"epoch": 27.802515869780443,
"grad_norm": 0.45780277252197266,
"learning_rate": 0.000266534226623944,
"loss": 3.1752,
"step": 95500
},
{
"epoch": 27.817075301380235,
"grad_norm": 0.42713457345962524,
"learning_rate": 0.00026635945237401686,
"loss": 3.1747,
"step": 95550
},
{
"epoch": 27.831634732980024,
"grad_norm": 0.4621380865573883,
"learning_rate": 0.0002661846781240897,
"loss": 3.1898,
"step": 95600
},
{
"epoch": 27.846194164579813,
"grad_norm": 0.44072896242141724,
"learning_rate": 0.0002660099038741625,
"loss": 3.1854,
"step": 95650
},
{
"epoch": 27.860753596179606,
"grad_norm": 0.4288334548473358,
"learning_rate": 0.00026583512962423536,
"loss": 3.1805,
"step": 95700
},
{
"epoch": 27.875313027779395,
"grad_norm": 0.42848989367485046,
"learning_rate": 0.0002656603553743082,
"loss": 3.1719,
"step": 95750
},
{
"epoch": 27.889872459379184,
"grad_norm": 0.4722457230091095,
"learning_rate": 0.000265485581124381,
"loss": 3.1861,
"step": 95800
},
{
"epoch": 27.904431890978977,
"grad_norm": 0.4696759283542633,
"learning_rate": 0.0002653108068744538,
"loss": 3.187,
"step": 95850
},
{
"epoch": 27.918991322578766,
"grad_norm": 0.4808979034423828,
"learning_rate": 0.00026513603262452663,
"loss": 3.1933,
"step": 95900
},
{
"epoch": 27.933550754178558,
"grad_norm": 0.4903891682624817,
"learning_rate": 0.00026496125837459947,
"loss": 3.1746,
"step": 95950
},
{
"epoch": 27.948110185778347,
"grad_norm": 0.44483309984207153,
"learning_rate": 0.0002647864841246723,
"loss": 3.1917,
"step": 96000
},
{
"epoch": 27.948110185778347,
"eval_accuracy": 0.374729689544672,
"eval_loss": 3.535334348678589,
"eval_runtime": 175.9673,
"eval_samples_per_second": 94.58,
"eval_steps_per_second": 5.916,
"step": 96000
},
{
"epoch": 27.962669617378136,
"grad_norm": 0.4571549892425537,
"learning_rate": 0.0002646117098747451,
"loss": 3.1916,
"step": 96050
},
{
"epoch": 27.97722904897793,
"grad_norm": 0.44937238097190857,
"learning_rate": 0.0002644369356248179,
"loss": 3.176,
"step": 96100
},
{
"epoch": 27.991788480577718,
"grad_norm": 0.4279572367668152,
"learning_rate": 0.00026426216137489074,
"loss": 3.1906,
"step": 96150
},
{
"epoch": 28.006114961271912,
"grad_norm": 0.4721224904060364,
"learning_rate": 0.0002640873871249636,
"loss": 3.1421,
"step": 96200
},
{
"epoch": 28.0206743928717,
"grad_norm": 0.4491647779941559,
"learning_rate": 0.0002639126128750364,
"loss": 3.0955,
"step": 96250
},
{
"epoch": 28.035233824471494,
"grad_norm": 0.436516672372818,
"learning_rate": 0.0002637378386251092,
"loss": 3.0938,
"step": 96300
},
{
"epoch": 28.049793256071283,
"grad_norm": 0.4377647936344147,
"learning_rate": 0.000263563064375182,
"loss": 3.1034,
"step": 96350
},
{
"epoch": 28.06435268767107,
"grad_norm": 0.4661620557308197,
"learning_rate": 0.00026338829012525485,
"loss": 3.1043,
"step": 96400
},
{
"epoch": 28.078912119270864,
"grad_norm": 0.4748687744140625,
"learning_rate": 0.0002632135158753277,
"loss": 3.1055,
"step": 96450
},
{
"epoch": 28.093471550870653,
"grad_norm": 0.4351736903190613,
"learning_rate": 0.0002630387416254005,
"loss": 3.1103,
"step": 96500
},
{
"epoch": 28.108030982470446,
"grad_norm": 0.4778978228569031,
"learning_rate": 0.0002628639673754733,
"loss": 3.1122,
"step": 96550
},
{
"epoch": 28.122590414070235,
"grad_norm": 0.47012126445770264,
"learning_rate": 0.00026268919312554613,
"loss": 3.1148,
"step": 96600
},
{
"epoch": 28.137149845670024,
"grad_norm": 0.4676751494407654,
"learning_rate": 0.00026251441887561897,
"loss": 3.1213,
"step": 96650
},
{
"epoch": 28.151709277269816,
"grad_norm": 0.46864810585975647,
"learning_rate": 0.0002623396446256918,
"loss": 3.1236,
"step": 96700
},
{
"epoch": 28.166268708869605,
"grad_norm": 0.4691295921802521,
"learning_rate": 0.0002621648703757646,
"loss": 3.1346,
"step": 96750
},
{
"epoch": 28.180828140469394,
"grad_norm": 0.4796832501888275,
"learning_rate": 0.0002619900961258374,
"loss": 3.1204,
"step": 96800
},
{
"epoch": 28.195387572069187,
"grad_norm": 0.47000518441200256,
"learning_rate": 0.00026181532187591024,
"loss": 3.1227,
"step": 96850
},
{
"epoch": 28.209947003668976,
"grad_norm": 0.4579707682132721,
"learning_rate": 0.0002616405476259831,
"loss": 3.1165,
"step": 96900
},
{
"epoch": 28.22450643526877,
"grad_norm": 0.4562205672264099,
"learning_rate": 0.0002614657733760559,
"loss": 3.1214,
"step": 96950
},
{
"epoch": 28.239065866868557,
"grad_norm": 0.4554135501384735,
"learning_rate": 0.00026129099912612874,
"loss": 3.1361,
"step": 97000
},
{
"epoch": 28.239065866868557,
"eval_accuracy": 0.37413753753421836,
"eval_loss": 3.554201364517212,
"eval_runtime": 176.0871,
"eval_samples_per_second": 94.516,
"eval_steps_per_second": 5.912,
"step": 97000
},
{
"epoch": 28.253625298468346,
"grad_norm": 0.4856763184070587,
"learning_rate": 0.0002611162248762016,
"loss": 3.1141,
"step": 97050
},
{
"epoch": 28.26818473006814,
"grad_norm": 0.43940815329551697,
"learning_rate": 0.0002609414506262744,
"loss": 3.1263,
"step": 97100
},
{
"epoch": 28.282744161667928,
"grad_norm": 0.4479314088821411,
"learning_rate": 0.0002607666763763472,
"loss": 3.1356,
"step": 97150
},
{
"epoch": 28.29730359326772,
"grad_norm": 0.4651418924331665,
"learning_rate": 0.00026059190212642,
"loss": 3.1287,
"step": 97200
},
{
"epoch": 28.31186302486751,
"grad_norm": 0.433713436126709,
"learning_rate": 0.00026041712787649285,
"loss": 3.1419,
"step": 97250
},
{
"epoch": 28.3264224564673,
"grad_norm": 0.44931450486183167,
"learning_rate": 0.0002602423536265657,
"loss": 3.1499,
"step": 97300
},
{
"epoch": 28.34098188806709,
"grad_norm": 0.4344748258590698,
"learning_rate": 0.00026006757937663846,
"loss": 3.1301,
"step": 97350
},
{
"epoch": 28.35554131966688,
"grad_norm": 0.4586000144481659,
"learning_rate": 0.0002598928051267113,
"loss": 3.1288,
"step": 97400
},
{
"epoch": 28.37010075126667,
"grad_norm": 0.4540034234523773,
"learning_rate": 0.00025971803087678413,
"loss": 3.1552,
"step": 97450
},
{
"epoch": 28.38466018286646,
"grad_norm": 0.4512953758239746,
"learning_rate": 0.00025954325662685696,
"loss": 3.1397,
"step": 97500
},
{
"epoch": 28.39921961446625,
"grad_norm": 0.42919859290122986,
"learning_rate": 0.0002593684823769298,
"loss": 3.1338,
"step": 97550
},
{
"epoch": 28.413779046066043,
"grad_norm": 0.4367040693759918,
"learning_rate": 0.0002591937081270026,
"loss": 3.1489,
"step": 97600
},
{
"epoch": 28.428338477665832,
"grad_norm": 0.4803733229637146,
"learning_rate": 0.0002590189338770754,
"loss": 3.1471,
"step": 97650
},
{
"epoch": 28.44289790926562,
"grad_norm": 0.45755213499069214,
"learning_rate": 0.00025884415962714824,
"loss": 3.1535,
"step": 97700
},
{
"epoch": 28.457457340865414,
"grad_norm": 0.4427001178264618,
"learning_rate": 0.0002586693853772211,
"loss": 3.1458,
"step": 97750
},
{
"epoch": 28.472016772465203,
"grad_norm": 0.46226221323013306,
"learning_rate": 0.0002584946111272939,
"loss": 3.1571,
"step": 97800
},
{
"epoch": 28.486576204064992,
"grad_norm": 0.4624316990375519,
"learning_rate": 0.0002583198368773667,
"loss": 3.1577,
"step": 97850
},
{
"epoch": 28.501135635664784,
"grad_norm": 0.4385261535644531,
"learning_rate": 0.0002581450626274395,
"loss": 3.1523,
"step": 97900
},
{
"epoch": 28.515695067264573,
"grad_norm": 0.4555385112762451,
"learning_rate": 0.00025797028837751235,
"loss": 3.1588,
"step": 97950
},
{
"epoch": 28.530254498864366,
"grad_norm": 0.4434893727302551,
"learning_rate": 0.0002577955141275852,
"loss": 3.1525,
"step": 98000
},
{
"epoch": 28.530254498864366,
"eval_accuracy": 0.37440010215680436,
"eval_loss": 3.545118570327759,
"eval_runtime": 176.0773,
"eval_samples_per_second": 94.521,
"eval_steps_per_second": 5.912,
"step": 98000
},
{
"epoch": 28.544813930464155,
"grad_norm": 0.4388698637485504,
"learning_rate": 0.00025762073987765796,
"loss": 3.1592,
"step": 98050
},
{
"epoch": 28.559373362063944,
"grad_norm": 0.5155254602432251,
"learning_rate": 0.0002574459656277308,
"loss": 3.1566,
"step": 98100
},
{
"epoch": 28.573932793663737,
"grad_norm": 0.44777607917785645,
"learning_rate": 0.00025727119137780363,
"loss": 3.1591,
"step": 98150
},
{
"epoch": 28.588492225263526,
"grad_norm": 0.47019049525260925,
"learning_rate": 0.00025709641712787646,
"loss": 3.1679,
"step": 98200
},
{
"epoch": 28.603051656863315,
"grad_norm": 0.4456532895565033,
"learning_rate": 0.0002569216428779493,
"loss": 3.1684,
"step": 98250
},
{
"epoch": 28.617611088463107,
"grad_norm": 0.46583524346351624,
"learning_rate": 0.00025674686862802213,
"loss": 3.1535,
"step": 98300
},
{
"epoch": 28.632170520062896,
"grad_norm": 0.4479556381702423,
"learning_rate": 0.00025657209437809496,
"loss": 3.1657,
"step": 98350
},
{
"epoch": 28.64672995166269,
"grad_norm": 0.4612073004245758,
"learning_rate": 0.0002563973201281678,
"loss": 3.1703,
"step": 98400
},
{
"epoch": 28.661289383262478,
"grad_norm": 0.4215568006038666,
"learning_rate": 0.0002562225458782406,
"loss": 3.1538,
"step": 98450
},
{
"epoch": 28.675848814862267,
"grad_norm": 0.44902729988098145,
"learning_rate": 0.0002560477716283134,
"loss": 3.1591,
"step": 98500
},
{
"epoch": 28.69040824646206,
"grad_norm": 0.42525118589401245,
"learning_rate": 0.00025587299737838624,
"loss": 3.1546,
"step": 98550
},
{
"epoch": 28.70496767806185,
"grad_norm": 0.47658178210258484,
"learning_rate": 0.00025569822312845907,
"loss": 3.1737,
"step": 98600
},
{
"epoch": 28.719527109661637,
"grad_norm": 0.4418516457080841,
"learning_rate": 0.00025552344887853185,
"loss": 3.1702,
"step": 98650
},
{
"epoch": 28.73408654126143,
"grad_norm": 0.4458206295967102,
"learning_rate": 0.0002553486746286047,
"loss": 3.1577,
"step": 98700
},
{
"epoch": 28.74864597286122,
"grad_norm": 0.44089481234550476,
"learning_rate": 0.0002551739003786775,
"loss": 3.1698,
"step": 98750
},
{
"epoch": 28.76320540446101,
"grad_norm": 0.4285340905189514,
"learning_rate": 0.00025499912612875035,
"loss": 3.1718,
"step": 98800
},
{
"epoch": 28.7777648360608,
"grad_norm": 0.435953825712204,
"learning_rate": 0.0002548243518788232,
"loss": 3.1659,
"step": 98850
},
{
"epoch": 28.79232426766059,
"grad_norm": 0.43512865900993347,
"learning_rate": 0.00025464957762889596,
"loss": 3.1716,
"step": 98900
},
{
"epoch": 28.806883699260382,
"grad_norm": 0.4703885316848755,
"learning_rate": 0.0002544748033789688,
"loss": 3.1799,
"step": 98950
},
{
"epoch": 28.82144313086017,
"grad_norm": 0.4259127974510193,
"learning_rate": 0.00025430002912904163,
"loss": 3.1863,
"step": 99000
},
{
"epoch": 28.82144313086017,
"eval_accuracy": 0.37484139415347484,
"eval_loss": 3.539607048034668,
"eval_runtime": 176.2346,
"eval_samples_per_second": 94.437,
"eval_steps_per_second": 5.907,
"step": 99000
},
{
"epoch": 28.83600256245996,
"grad_norm": 0.4716574251651764,
"learning_rate": 0.00025412525487911446,
"loss": 3.1813,
"step": 99050
},
{
"epoch": 28.850561994059753,
"grad_norm": 0.4820871353149414,
"learning_rate": 0.0002539504806291873,
"loss": 3.1705,
"step": 99100
},
{
"epoch": 28.86512142565954,
"grad_norm": 0.4526395797729492,
"learning_rate": 0.00025377570637926007,
"loss": 3.1719,
"step": 99150
},
{
"epoch": 28.879680857259334,
"grad_norm": 0.4278837740421295,
"learning_rate": 0.0002536009321293329,
"loss": 3.1699,
"step": 99200
},
{
"epoch": 28.894240288859123,
"grad_norm": 0.4406258761882782,
"learning_rate": 0.00025342615787940574,
"loss": 3.1713,
"step": 99250
},
{
"epoch": 28.908799720458912,
"grad_norm": 0.4723201394081116,
"learning_rate": 0.00025325138362947857,
"loss": 3.1784,
"step": 99300
},
{
"epoch": 28.923359152058705,
"grad_norm": 0.4460213780403137,
"learning_rate": 0.00025307660937955135,
"loss": 3.1768,
"step": 99350
},
{
"epoch": 28.937918583658494,
"grad_norm": 0.44851601123809814,
"learning_rate": 0.0002529018351296242,
"loss": 3.1791,
"step": 99400
},
{
"epoch": 28.952478015258283,
"grad_norm": 0.45233815908432007,
"learning_rate": 0.00025272706087969707,
"loss": 3.1863,
"step": 99450
},
{
"epoch": 28.967037446858075,
"grad_norm": 0.4619120955467224,
"learning_rate": 0.00025255228662976985,
"loss": 3.1833,
"step": 99500
},
{
"epoch": 28.981596878457864,
"grad_norm": 0.44890516996383667,
"learning_rate": 0.0002523775123798427,
"loss": 3.1821,
"step": 99550
},
{
"epoch": 28.996156310057657,
"grad_norm": 0.4457606077194214,
"learning_rate": 0.0002522027381299155,
"loss": 3.1888,
"step": 99600
},
{
"epoch": 29.010482790751848,
"grad_norm": 0.46348798274993896,
"learning_rate": 0.00025202796387998835,
"loss": 3.1317,
"step": 99650
},
{
"epoch": 29.02504222235164,
"grad_norm": 0.45175158977508545,
"learning_rate": 0.0002518531896300612,
"loss": 3.0791,
"step": 99700
},
{
"epoch": 29.03960165395143,
"grad_norm": 0.492388516664505,
"learning_rate": 0.00025167841538013396,
"loss": 3.0982,
"step": 99750
},
{
"epoch": 29.054161085551222,
"grad_norm": 0.4821106195449829,
"learning_rate": 0.0002515036411302068,
"loss": 3.0746,
"step": 99800
},
{
"epoch": 29.06872051715101,
"grad_norm": 0.4595145285129547,
"learning_rate": 0.0002513288668802796,
"loss": 3.1005,
"step": 99850
},
{
"epoch": 29.0832799487508,
"grad_norm": 0.4326048195362091,
"learning_rate": 0.00025115409263035246,
"loss": 3.0932,
"step": 99900
},
{
"epoch": 29.097839380350592,
"grad_norm": 0.4561901092529297,
"learning_rate": 0.00025097931838042524,
"loss": 3.0916,
"step": 99950
},
{
"epoch": 29.11239881195038,
"grad_norm": 0.470004677772522,
"learning_rate": 0.00025080454413049807,
"loss": 3.0999,
"step": 100000
},
{
"epoch": 29.11239881195038,
"eval_accuracy": 0.3739163624087888,
"eval_loss": 3.552521228790283,
"eval_runtime": 178.7823,
"eval_samples_per_second": 93.091,
"eval_steps_per_second": 5.823,
"step": 100000
},
{
"epoch": 29.11239881195038,
"step": 100000,
"total_flos": 2.089834314006528e+18,
"train_loss": 0.6335119366455079,
"train_runtime": 39859.0956,
"train_samples_per_second": 344.628,
"train_steps_per_second": 4.309
}
],
"logging_steps": 50,
"max_steps": 171750,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 10000,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 20,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 20
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.089834314006528e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}