{
"best_global_step": 65000,
"best_metric": 3.520042657852173,
"best_model_checkpoint": "/scratch/cl5625/exceptions/models/cost_to_drop_frequency_3591/checkpoint-40000",
"epoch": 20.0,
"eval_steps": 1000,
"global_step": 68700,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014559431599790344,
"grad_norm": 1.4049561023712158,
"learning_rate": 0.000294,
"loss": 8.4124,
"step": 50
},
{
"epoch": 0.029118863199580687,
"grad_norm": 0.8360756635665894,
"learning_rate": 0.0005939999999999999,
"loss": 6.7276,
"step": 100
},
{
"epoch": 0.043678294799371034,
"grad_norm": 0.42227354645729065,
"learning_rate": 0.0005995714285714286,
"loss": 6.3402,
"step": 150
},
{
"epoch": 0.058237726399161374,
"grad_norm": 0.9324970841407776,
"learning_rate": 0.0005991341107871719,
"loss": 6.1609,
"step": 200
},
{
"epoch": 0.07279715799895171,
"grad_norm": 0.4407173991203308,
"learning_rate": 0.0005986967930029154,
"loss": 6.0089,
"step": 250
},
{
"epoch": 0.08735658959874207,
"grad_norm": 0.4540535807609558,
"learning_rate": 0.0005982594752186589,
"loss": 5.8627,
"step": 300
},
{
"epoch": 0.10191602119853241,
"grad_norm": 0.4887666404247284,
"learning_rate": 0.0005978221574344022,
"loss": 5.74,
"step": 350
},
{
"epoch": 0.11647545279832275,
"grad_norm": 0.5809242129325867,
"learning_rate": 0.0005973848396501457,
"loss": 5.6281,
"step": 400
},
{
"epoch": 0.1310348843981131,
"grad_norm": 0.4683547914028168,
"learning_rate": 0.0005969475218658892,
"loss": 5.5265,
"step": 450
},
{
"epoch": 0.14559431599790343,
"grad_norm": 0.4089968502521515,
"learning_rate": 0.0005965102040816326,
"loss": 5.4214,
"step": 500
},
{
"epoch": 0.1601537475976938,
"grad_norm": 0.5466117858886719,
"learning_rate": 0.000596072886297376,
"loss": 5.3411,
"step": 550
},
{
"epoch": 0.17471317919748414,
"grad_norm": 0.3846788704395294,
"learning_rate": 0.0005956355685131195,
"loss": 5.2665,
"step": 600
},
{
"epoch": 0.18927261079727448,
"grad_norm": 0.4610619843006134,
"learning_rate": 0.0005951982507288629,
"loss": 5.2078,
"step": 650
},
{
"epoch": 0.20383204239706482,
"grad_norm": 0.41991209983825684,
"learning_rate": 0.0005947609329446064,
"loss": 5.1301,
"step": 700
},
{
"epoch": 0.21839147399685516,
"grad_norm": 0.4753279685974121,
"learning_rate": 0.0005943236151603498,
"loss": 5.0702,
"step": 750
},
{
"epoch": 0.2329509055966455,
"grad_norm": 0.4781185984611511,
"learning_rate": 0.0005938862973760932,
"loss": 5.0195,
"step": 800
},
{
"epoch": 0.24751033719643586,
"grad_norm": 0.41803014278411865,
"learning_rate": 0.0005934489795918367,
"loss": 4.971,
"step": 850
},
{
"epoch": 0.2620697687962262,
"grad_norm": 0.444289892911911,
"learning_rate": 0.0005930116618075802,
"loss": 4.9305,
"step": 900
},
{
"epoch": 0.2766292003960165,
"grad_norm": 0.4531804025173187,
"learning_rate": 0.0005925743440233235,
"loss": 4.8862,
"step": 950
},
{
"epoch": 0.29118863199580686,
"grad_norm": 0.4998404085636139,
"learning_rate": 0.000592137026239067,
"loss": 4.8266,
"step": 1000
},
{
"epoch": 0.29118863199580686,
"eval_accuracy": 0.25396983481710367,
"eval_loss": 4.760892868041992,
"eval_runtime": 179.1934,
"eval_samples_per_second": 92.877,
"eval_steps_per_second": 5.809,
"step": 1000
},
{
"epoch": 0.30574806359559725,
"grad_norm": 0.44976159930229187,
"learning_rate": 0.0005916997084548104,
"loss": 4.7891,
"step": 1050
},
{
"epoch": 0.3203074951953876,
"grad_norm": 0.38453996181488037,
"learning_rate": 0.0005912623906705539,
"loss": 4.7294,
"step": 1100
},
{
"epoch": 0.33486692679517793,
"grad_norm": 0.4332706928253174,
"learning_rate": 0.0005908250728862974,
"loss": 4.7002,
"step": 1150
},
{
"epoch": 0.3494263583949683,
"grad_norm": 0.42371395230293274,
"learning_rate": 0.0005903877551020407,
"loss": 4.6808,
"step": 1200
},
{
"epoch": 0.3639857899947586,
"grad_norm": 0.45705753564834595,
"learning_rate": 0.0005899504373177842,
"loss": 4.6327,
"step": 1250
},
{
"epoch": 0.37854522159454895,
"grad_norm": 0.42063650488853455,
"learning_rate": 0.0005895131195335277,
"loss": 4.6117,
"step": 1300
},
{
"epoch": 0.3931046531943393,
"grad_norm": 0.43308427929878235,
"learning_rate": 0.0005890758017492711,
"loss": 4.5751,
"step": 1350
},
{
"epoch": 0.40766408479412963,
"grad_norm": 0.43480074405670166,
"learning_rate": 0.0005886384839650145,
"loss": 4.5591,
"step": 1400
},
{
"epoch": 0.42222351639392,
"grad_norm": 0.45868223905563354,
"learning_rate": 0.000588201166180758,
"loss": 4.5263,
"step": 1450
},
{
"epoch": 0.4367829479937103,
"grad_norm": 0.41984814405441284,
"learning_rate": 0.0005877638483965014,
"loss": 4.5044,
"step": 1500
},
{
"epoch": 0.45134237959350065,
"grad_norm": 0.4139959216117859,
"learning_rate": 0.0005873265306122449,
"loss": 4.4968,
"step": 1550
},
{
"epoch": 0.465901811193291,
"grad_norm": 0.38750138878822327,
"learning_rate": 0.0005868892128279882,
"loss": 4.4646,
"step": 1600
},
{
"epoch": 0.48046124279308133,
"grad_norm": 0.41930243372917175,
"learning_rate": 0.0005864518950437317,
"loss": 4.4529,
"step": 1650
},
{
"epoch": 0.49502067439287173,
"grad_norm": 0.41106143593788147,
"learning_rate": 0.0005860145772594752,
"loss": 4.4362,
"step": 1700
},
{
"epoch": 0.509580105992662,
"grad_norm": 0.39897602796554565,
"learning_rate": 0.0005855772594752186,
"loss": 4.4112,
"step": 1750
},
{
"epoch": 0.5241395375924524,
"grad_norm": 0.4214461147785187,
"learning_rate": 0.000585139941690962,
"loss": 4.404,
"step": 1800
},
{
"epoch": 0.5386989691922427,
"grad_norm": 0.3887820541858673,
"learning_rate": 0.0005847026239067055,
"loss": 4.3787,
"step": 1850
},
{
"epoch": 0.553258400792033,
"grad_norm": 0.3768806755542755,
"learning_rate": 0.0005842653061224489,
"loss": 4.3711,
"step": 1900
},
{
"epoch": 0.5678178323918234,
"grad_norm": 0.3779532313346863,
"learning_rate": 0.0005838279883381924,
"loss": 4.3456,
"step": 1950
},
{
"epoch": 0.5823772639916137,
"grad_norm": 0.3921726942062378,
"learning_rate": 0.0005833906705539359,
"loss": 4.3399,
"step": 2000
},
{
"epoch": 0.5823772639916137,
"eval_accuracy": 0.2996934707950652,
"eval_loss": 4.28386926651001,
"eval_runtime": 179.6428,
"eval_samples_per_second": 92.645,
"eval_steps_per_second": 5.795,
"step": 2000
},
{
"epoch": 0.5969366955914042,
"grad_norm": 0.38071900606155396,
"learning_rate": 0.0005829533527696792,
"loss": 4.3206,
"step": 2050
},
{
"epoch": 0.6114961271911945,
"grad_norm": 0.4333866536617279,
"learning_rate": 0.0005825160349854227,
"loss": 4.316,
"step": 2100
},
{
"epoch": 0.6260555587909848,
"grad_norm": 0.3910558223724365,
"learning_rate": 0.0005820787172011661,
"loss": 4.2961,
"step": 2150
},
{
"epoch": 0.6406149903907752,
"grad_norm": 0.3819257318973541,
"learning_rate": 0.0005816413994169096,
"loss": 4.2951,
"step": 2200
},
{
"epoch": 0.6551744219905655,
"grad_norm": 0.4080394506454468,
"learning_rate": 0.000581204081632653,
"loss": 4.2756,
"step": 2250
},
{
"epoch": 0.6697338535903559,
"grad_norm": 0.37072518467903137,
"learning_rate": 0.0005807667638483965,
"loss": 4.2638,
"step": 2300
},
{
"epoch": 0.6842932851901462,
"grad_norm": 0.3981825113296509,
"learning_rate": 0.0005803294460641399,
"loss": 4.2662,
"step": 2350
},
{
"epoch": 0.6988527167899365,
"grad_norm": 0.384818971157074,
"learning_rate": 0.0005798921282798834,
"loss": 4.2509,
"step": 2400
},
{
"epoch": 0.7134121483897269,
"grad_norm": 0.43530362844467163,
"learning_rate": 0.0005794548104956267,
"loss": 4.2352,
"step": 2450
},
{
"epoch": 0.7279715799895172,
"grad_norm": 0.3544856607913971,
"learning_rate": 0.0005790174927113702,
"loss": 4.2268,
"step": 2500
},
{
"epoch": 0.7425310115893076,
"grad_norm": 0.38703247904777527,
"learning_rate": 0.0005785801749271137,
"loss": 4.2107,
"step": 2550
},
{
"epoch": 0.7570904431890979,
"grad_norm": 0.37904635071754456,
"learning_rate": 0.000578142857142857,
"loss": 4.1982,
"step": 2600
},
{
"epoch": 0.7716498747888882,
"grad_norm": 0.41309526562690735,
"learning_rate": 0.0005777055393586005,
"loss": 4.1833,
"step": 2650
},
{
"epoch": 0.7862093063886786,
"grad_norm": 0.42821475863456726,
"learning_rate": 0.000577268221574344,
"loss": 4.1892,
"step": 2700
},
{
"epoch": 0.8007687379884689,
"grad_norm": 0.4209707975387573,
"learning_rate": 0.0005768309037900874,
"loss": 4.1834,
"step": 2750
},
{
"epoch": 0.8153281695882593,
"grad_norm": 0.3531130254268646,
"learning_rate": 0.0005763935860058308,
"loss": 4.1801,
"step": 2800
},
{
"epoch": 0.8298876011880496,
"grad_norm": 0.34633395075798035,
"learning_rate": 0.0005759562682215744,
"loss": 4.1681,
"step": 2850
},
{
"epoch": 0.84444703278784,
"grad_norm": 0.3938649892807007,
"learning_rate": 0.0005755189504373177,
"loss": 4.1636,
"step": 2900
},
{
"epoch": 0.8590064643876303,
"grad_norm": 0.3613823652267456,
"learning_rate": 0.0005750816326530612,
"loss": 4.1578,
"step": 2950
},
{
"epoch": 0.8735658959874206,
"grad_norm": 0.3491958677768707,
"learning_rate": 0.0005746443148688046,
"loss": 4.1452,
"step": 3000
},
{
"epoch": 0.8735658959874206,
"eval_accuracy": 0.31544864157201075,
"eval_loss": 4.095163822174072,
"eval_runtime": 179.6171,
"eval_samples_per_second": 92.658,
"eval_steps_per_second": 5.796,
"step": 3000
},
{
"epoch": 0.888125327587211,
"grad_norm": 0.3552567958831787,
"learning_rate": 0.000574206997084548,
"loss": 4.1285,
"step": 3050
},
{
"epoch": 0.9026847591870013,
"grad_norm": 0.35991519689559937,
"learning_rate": 0.0005737696793002915,
"loss": 4.132,
"step": 3100
},
{
"epoch": 0.9172441907867916,
"grad_norm": 0.3861224949359894,
"learning_rate": 0.000573332361516035,
"loss": 4.1214,
"step": 3150
},
{
"epoch": 0.931803622386582,
"grad_norm": 0.3921383023262024,
"learning_rate": 0.0005728950437317784,
"loss": 4.1157,
"step": 3200
},
{
"epoch": 0.9463630539863723,
"grad_norm": 0.3566656708717346,
"learning_rate": 0.0005724577259475218,
"loss": 4.1088,
"step": 3250
},
{
"epoch": 0.9609224855861627,
"grad_norm": 0.3769164979457855,
"learning_rate": 0.0005720204081632652,
"loss": 4.0963,
"step": 3300
},
{
"epoch": 0.975481917185953,
"grad_norm": 0.3577769100666046,
"learning_rate": 0.0005715830903790087,
"loss": 4.1067,
"step": 3350
},
{
"epoch": 0.9900413487857435,
"grad_norm": 0.35588538646698,
"learning_rate": 0.0005711457725947522,
"loss": 4.0912,
"step": 3400
},
{
"epoch": 1.0043678294799372,
"grad_norm": 0.3580274283885956,
"learning_rate": 0.0005707084548104955,
"loss": 4.0849,
"step": 3450
},
{
"epoch": 1.0189272610797275,
"grad_norm": 0.3520485758781433,
"learning_rate": 0.000570271137026239,
"loss": 4.0188,
"step": 3500
},
{
"epoch": 1.0334866926795179,
"grad_norm": 0.3421690762042999,
"learning_rate": 0.0005698338192419825,
"loss": 4.0128,
"step": 3550
},
{
"epoch": 1.0480461242793082,
"grad_norm": 0.3418625593185425,
"learning_rate": 0.0005693965014577259,
"loss": 4.0056,
"step": 3600
},
{
"epoch": 1.0626055558790986,
"grad_norm": 0.34526926279067993,
"learning_rate": 0.0005689591836734693,
"loss": 4.0084,
"step": 3650
},
{
"epoch": 1.077164987478889,
"grad_norm": 0.35390642285346985,
"learning_rate": 0.0005685218658892128,
"loss": 4.0061,
"step": 3700
},
{
"epoch": 1.0917244190786792,
"grad_norm": 0.3431430459022522,
"learning_rate": 0.0005680845481049562,
"loss": 3.9994,
"step": 3750
},
{
"epoch": 1.1062838506784696,
"grad_norm": 0.357334166765213,
"learning_rate": 0.0005676472303206997,
"loss": 4.0071,
"step": 3800
},
{
"epoch": 1.12084328227826,
"grad_norm": 0.3587090075016022,
"learning_rate": 0.000567209912536443,
"loss": 3.985,
"step": 3850
},
{
"epoch": 1.1354027138780503,
"grad_norm": 0.3586151599884033,
"learning_rate": 0.0005667725947521865,
"loss": 4.0047,
"step": 3900
},
{
"epoch": 1.1499621454778406,
"grad_norm": 0.37636685371398926,
"learning_rate": 0.00056633527696793,
"loss": 3.9987,
"step": 3950
},
{
"epoch": 1.164521577077631,
"grad_norm": 0.35518568754196167,
"learning_rate": 0.0005658979591836735,
"loss": 3.9904,
"step": 4000
},
{
"epoch": 1.164521577077631,
"eval_accuracy": 0.32538388464653073,
"eval_loss": 3.9894351959228516,
"eval_runtime": 179.567,
"eval_samples_per_second": 92.684,
"eval_steps_per_second": 5.797,
"step": 4000
},
{
"epoch": 1.1790810086774213,
"grad_norm": 0.3445068895816803,
"learning_rate": 0.0005654606413994169,
"loss": 3.9831,
"step": 4050
},
{
"epoch": 1.1936404402772116,
"grad_norm": 0.3411754369735718,
"learning_rate": 0.0005650233236151603,
"loss": 3.9741,
"step": 4100
},
{
"epoch": 1.208199871877002,
"grad_norm": 0.3622643053531647,
"learning_rate": 0.0005645860058309037,
"loss": 3.9812,
"step": 4150
},
{
"epoch": 1.2227593034767923,
"grad_norm": 0.35340210795402527,
"learning_rate": 0.0005641486880466472,
"loss": 3.9853,
"step": 4200
},
{
"epoch": 1.2373187350765826,
"grad_norm": 0.34644776582717896,
"learning_rate": 0.0005637113702623907,
"loss": 3.9733,
"step": 4250
},
{
"epoch": 1.251878166676373,
"grad_norm": 0.33221983909606934,
"learning_rate": 0.000563274052478134,
"loss": 3.9601,
"step": 4300
},
{
"epoch": 1.2664375982761633,
"grad_norm": 0.3372167646884918,
"learning_rate": 0.0005628367346938775,
"loss": 3.9708,
"step": 4350
},
{
"epoch": 1.2809970298759537,
"grad_norm": 0.3629266321659088,
"learning_rate": 0.0005623994169096209,
"loss": 3.9556,
"step": 4400
},
{
"epoch": 1.295556461475744,
"grad_norm": 0.31815558671951294,
"learning_rate": 0.0005619620991253644,
"loss": 3.9644,
"step": 4450
},
{
"epoch": 1.3101158930755343,
"grad_norm": 0.3518199622631073,
"learning_rate": 0.0005615247813411078,
"loss": 3.9551,
"step": 4500
},
{
"epoch": 1.3246753246753247,
"grad_norm": 0.3197888135910034,
"learning_rate": 0.0005610874635568513,
"loss": 3.9556,
"step": 4550
},
{
"epoch": 1.339234756275115,
"grad_norm": 0.35236433148384094,
"learning_rate": 0.0005606501457725947,
"loss": 3.9573,
"step": 4600
},
{
"epoch": 1.3537941878749054,
"grad_norm": 0.3366566002368927,
"learning_rate": 0.0005602128279883382,
"loss": 3.9619,
"step": 4650
},
{
"epoch": 1.3683536194746957,
"grad_norm": 0.3635067939758301,
"learning_rate": 0.0005597755102040816,
"loss": 3.9568,
"step": 4700
},
{
"epoch": 1.382913051074486,
"grad_norm": 0.3495481610298157,
"learning_rate": 0.000559338192419825,
"loss": 3.935,
"step": 4750
},
{
"epoch": 1.3974724826742764,
"grad_norm": 0.34598347544670105,
"learning_rate": 0.0005589008746355685,
"loss": 3.9463,
"step": 4800
},
{
"epoch": 1.4120319142740667,
"grad_norm": 0.32707110047340393,
"learning_rate": 0.0005584635568513118,
"loss": 3.9388,
"step": 4850
},
{
"epoch": 1.426591345873857,
"grad_norm": 0.35207509994506836,
"learning_rate": 0.0005580262390670554,
"loss": 3.9363,
"step": 4900
},
{
"epoch": 1.4411507774736474,
"grad_norm": 0.33082953095436096,
"learning_rate": 0.0005575889212827988,
"loss": 3.9443,
"step": 4950
},
{
"epoch": 1.4557102090734377,
"grad_norm": 0.36195048689842224,
"learning_rate": 0.0005571516034985422,
"loss": 3.934,
"step": 5000
},
{
"epoch": 1.4557102090734377,
"eval_accuracy": 0.3320231362585752,
"eval_loss": 3.9117023944854736,
"eval_runtime": 179.5118,
"eval_samples_per_second": 92.713,
"eval_steps_per_second": 5.799,
"step": 5000
},
{
"epoch": 1.470269640673228,
"grad_norm": 0.3603370487689972,
"learning_rate": 0.0005567142857142856,
"loss": 3.9232,
"step": 5050
},
{
"epoch": 1.4848290722730184,
"grad_norm": 0.3303501307964325,
"learning_rate": 0.0005562769679300292,
"loss": 3.929,
"step": 5100
},
{
"epoch": 1.4993885038728088,
"grad_norm": 0.34812071919441223,
"learning_rate": 0.0005558396501457725,
"loss": 3.9186,
"step": 5150
},
{
"epoch": 1.5139479354725993,
"grad_norm": 0.3245297372341156,
"learning_rate": 0.000555402332361516,
"loss": 3.9281,
"step": 5200
},
{
"epoch": 1.5285073670723897,
"grad_norm": 0.32848072052001953,
"learning_rate": 0.0005549650145772595,
"loss": 3.9081,
"step": 5250
},
{
"epoch": 1.54306679867218,
"grad_norm": 0.3524268865585327,
"learning_rate": 0.0005545276967930028,
"loss": 3.9169,
"step": 5300
},
{
"epoch": 1.5576262302719703,
"grad_norm": 0.3273775279521942,
"learning_rate": 0.0005540903790087463,
"loss": 3.9057,
"step": 5350
},
{
"epoch": 1.5721856618717607,
"grad_norm": 0.33142444491386414,
"learning_rate": 0.0005536530612244898,
"loss": 3.9117,
"step": 5400
},
{
"epoch": 1.586745093471551,
"grad_norm": 0.35404613614082336,
"learning_rate": 0.0005532157434402332,
"loss": 3.9,
"step": 5450
},
{
"epoch": 1.6013045250713414,
"grad_norm": 0.3326050341129303,
"learning_rate": 0.0005527784256559766,
"loss": 3.9023,
"step": 5500
},
{
"epoch": 1.6158639566711317,
"grad_norm": 0.32253944873809814,
"learning_rate": 0.00055234110787172,
"loss": 3.9036,
"step": 5550
},
{
"epoch": 1.630423388270922,
"grad_norm": 0.40896502137184143,
"learning_rate": 0.0005519037900874635,
"loss": 3.892,
"step": 5600
},
{
"epoch": 1.6449828198707124,
"grad_norm": 0.33099985122680664,
"learning_rate": 0.000551466472303207,
"loss": 3.8921,
"step": 5650
},
{
"epoch": 1.6595422514705027,
"grad_norm": 0.3134934902191162,
"learning_rate": 0.0005510291545189503,
"loss": 3.8986,
"step": 5700
},
{
"epoch": 1.674101683070293,
"grad_norm": 0.32286426424980164,
"learning_rate": 0.0005505918367346938,
"loss": 3.8705,
"step": 5750
},
{
"epoch": 1.6886611146700834,
"grad_norm": 0.3152390122413635,
"learning_rate": 0.0005501545189504373,
"loss": 3.8843,
"step": 5800
},
{
"epoch": 1.7032205462698737,
"grad_norm": 0.3241208493709564,
"learning_rate": 0.0005497172011661807,
"loss": 3.8915,
"step": 5850
},
{
"epoch": 1.717779977869664,
"grad_norm": 0.3297117054462433,
"learning_rate": 0.0005492798833819241,
"loss": 3.8959,
"step": 5900
},
{
"epoch": 1.7323394094694544,
"grad_norm": 0.34585368633270264,
"learning_rate": 0.0005488425655976676,
"loss": 3.8631,
"step": 5950
},
{
"epoch": 1.7468988410692448,
"grad_norm": 0.32093173265457153,
"learning_rate": 0.000548405247813411,
"loss": 3.8774,
"step": 6000
},
{
"epoch": 1.7468988410692448,
"eval_accuracy": 0.3372265721042079,
"eval_loss": 3.8566107749938965,
"eval_runtime": 179.5862,
"eval_samples_per_second": 92.674,
"eval_steps_per_second": 5.797,
"step": 6000
},
{
"epoch": 1.761458272669035,
"grad_norm": 0.3342028260231018,
"learning_rate": 0.0005479679300291545,
"loss": 3.8767,
"step": 6050
},
{
"epoch": 1.7760177042688254,
"grad_norm": 0.331476628780365,
"learning_rate": 0.000547530612244898,
"loss": 3.8741,
"step": 6100
},
{
"epoch": 1.7905771358686158,
"grad_norm": 0.3178947865962982,
"learning_rate": 0.0005470932944606413,
"loss": 3.8753,
"step": 6150
},
{
"epoch": 1.8051365674684061,
"grad_norm": 0.33139607310295105,
"learning_rate": 0.0005466559766763848,
"loss": 3.8686,
"step": 6200
},
{
"epoch": 1.8196959990681965,
"grad_norm": 0.35270482301712036,
"learning_rate": 0.0005462186588921283,
"loss": 3.8577,
"step": 6250
},
{
"epoch": 1.8342554306679868,
"grad_norm": 0.3247964382171631,
"learning_rate": 0.0005457813411078717,
"loss": 3.8574,
"step": 6300
},
{
"epoch": 1.8488148622677771,
"grad_norm": 0.33985435962677,
"learning_rate": 0.0005453440233236151,
"loss": 3.8546,
"step": 6350
},
{
"epoch": 1.8633742938675675,
"grad_norm": 0.33400237560272217,
"learning_rate": 0.0005449067055393585,
"loss": 3.8636,
"step": 6400
},
{
"epoch": 1.8779337254673578,
"grad_norm": 0.3367692232131958,
"learning_rate": 0.0005444693877551019,
"loss": 3.8718,
"step": 6450
},
{
"epoch": 1.8924931570671482,
"grad_norm": 0.3267197608947754,
"learning_rate": 0.0005440320699708455,
"loss": 3.8507,
"step": 6500
},
{
"epoch": 1.9070525886669385,
"grad_norm": 0.3389538824558258,
"learning_rate": 0.0005435947521865888,
"loss": 3.8546,
"step": 6550
},
{
"epoch": 1.9216120202667288,
"grad_norm": 0.32694804668426514,
"learning_rate": 0.0005431574344023323,
"loss": 3.8391,
"step": 6600
},
{
"epoch": 1.9361714518665192,
"grad_norm": 0.3353123366832733,
"learning_rate": 0.0005427201166180758,
"loss": 3.8435,
"step": 6650
},
{
"epoch": 1.9507308834663095,
"grad_norm": 0.32406482100486755,
"learning_rate": 0.0005422827988338192,
"loss": 3.8409,
"step": 6700
},
{
"epoch": 1.9652903150660999,
"grad_norm": 0.3334747850894928,
"learning_rate": 0.0005418454810495626,
"loss": 3.8506,
"step": 6750
},
{
"epoch": 1.9798497466658902,
"grad_norm": 0.33217740058898926,
"learning_rate": 0.0005414081632653061,
"loss": 3.8396,
"step": 6800
},
{
"epoch": 1.9944091782656805,
"grad_norm": 0.33468008041381836,
"learning_rate": 0.0005409708454810495,
"loss": 3.8407,
"step": 6850
},
{
"epoch": 2.0087356589598744,
"grad_norm": 0.3196060359477997,
"learning_rate": 0.0005405335276967929,
"loss": 3.7913,
"step": 6900
},
{
"epoch": 2.0232950905596647,
"grad_norm": 0.3573300540447235,
"learning_rate": 0.0005400962099125365,
"loss": 3.7409,
"step": 6950
},
{
"epoch": 2.037854522159455,
"grad_norm": 0.3402981460094452,
"learning_rate": 0.0005396588921282798,
"loss": 3.7556,
"step": 7000
},
{
"epoch": 2.037854522159455,
"eval_accuracy": 0.34190017535271905,
"eval_loss": 3.809979200363159,
"eval_runtime": 179.6501,
"eval_samples_per_second": 92.641,
"eval_steps_per_second": 5.795,
"step": 7000
},
{
"epoch": 2.0524139537592454,
"grad_norm": 0.3510541319847107,
"learning_rate": 0.0005392215743440233,
"loss": 3.7422,
"step": 7050
},
{
"epoch": 2.0669733853590357,
"grad_norm": 0.31116750836372375,
"learning_rate": 0.0005387842565597666,
"loss": 3.7475,
"step": 7100
},
{
"epoch": 2.081532816958826,
"grad_norm": 0.3254874050617218,
"learning_rate": 0.0005383469387755102,
"loss": 3.7546,
"step": 7150
},
{
"epoch": 2.0960922485586164,
"grad_norm": 0.3147241771221161,
"learning_rate": 0.0005379096209912536,
"loss": 3.7518,
"step": 7200
},
{
"epoch": 2.1106516801584068,
"grad_norm": 0.3199782073497772,
"learning_rate": 0.000537472303206997,
"loss": 3.7659,
"step": 7250
},
{
"epoch": 2.125211111758197,
"grad_norm": 0.3094785809516907,
"learning_rate": 0.0005370349854227405,
"loss": 3.7481,
"step": 7300
},
{
"epoch": 2.1397705433579874,
"grad_norm": 0.3172190189361572,
"learning_rate": 0.0005365976676384839,
"loss": 3.7408,
"step": 7350
},
{
"epoch": 2.154329974957778,
"grad_norm": 0.3381129801273346,
"learning_rate": 0.0005361603498542273,
"loss": 3.7448,
"step": 7400
},
{
"epoch": 2.168889406557568,
"grad_norm": 0.3302014470100403,
"learning_rate": 0.0005357230320699708,
"loss": 3.7451,
"step": 7450
},
{
"epoch": 2.1834488381573585,
"grad_norm": 0.34532982110977173,
"learning_rate": 0.0005352857142857143,
"loss": 3.7459,
"step": 7500
},
{
"epoch": 2.198008269757149,
"grad_norm": 0.3262939751148224,
"learning_rate": 0.0005348483965014576,
"loss": 3.7466,
"step": 7550
},
{
"epoch": 2.212567701356939,
"grad_norm": 0.33892711997032166,
"learning_rate": 0.0005344110787172011,
"loss": 3.7505,
"step": 7600
},
{
"epoch": 2.2271271329567295,
"grad_norm": 0.3445602059364319,
"learning_rate": 0.0005339737609329446,
"loss": 3.7429,
"step": 7650
},
{
"epoch": 2.24168656455652,
"grad_norm": 0.3161507248878479,
"learning_rate": 0.000533536443148688,
"loss": 3.7541,
"step": 7700
},
{
"epoch": 2.25624599615631,
"grad_norm": 0.31178775429725647,
"learning_rate": 0.0005330991253644314,
"loss": 3.7447,
"step": 7750
},
{
"epoch": 2.2708054277561005,
"grad_norm": 0.3178870975971222,
"learning_rate": 0.0005326618075801749,
"loss": 3.7506,
"step": 7800
},
{
"epoch": 2.285364859355891,
"grad_norm": 0.3333457112312317,
"learning_rate": 0.0005322244897959183,
"loss": 3.7494,
"step": 7850
},
{
"epoch": 2.299924290955681,
"grad_norm": 0.3204410672187805,
"learning_rate": 0.0005317871720116618,
"loss": 3.7474,
"step": 7900
},
{
"epoch": 2.3144837225554715,
"grad_norm": 0.31767410039901733,
"learning_rate": 0.0005313498542274051,
"loss": 3.7368,
"step": 7950
},
{
"epoch": 2.329043154155262,
"grad_norm": 0.33374062180519104,
"learning_rate": 0.0005309125364431486,
"loss": 3.7524,
"step": 8000
},
{
"epoch": 2.329043154155262,
"eval_accuracy": 0.34463388108962084,
"eval_loss": 3.7798807621002197,
"eval_runtime": 179.8167,
"eval_samples_per_second": 92.555,
"eval_steps_per_second": 5.789,
"step": 8000
},
{
"epoch": 2.343602585755052,
"grad_norm": 0.32286617159843445,
"learning_rate": 0.0005304752186588921,
"loss": 3.7492,
"step": 8050
},
{
"epoch": 2.3581620173548425,
"grad_norm": 0.33228906989097595,
"learning_rate": 0.0005300379008746355,
"loss": 3.764,
"step": 8100
},
{
"epoch": 2.372721448954633,
"grad_norm": 0.33857783675193787,
"learning_rate": 0.000529600583090379,
"loss": 3.759,
"step": 8150
},
{
"epoch": 2.3872808805544232,
"grad_norm": 0.3177933394908905,
"learning_rate": 0.0005291632653061224,
"loss": 3.7536,
"step": 8200
},
{
"epoch": 2.4018403121542136,
"grad_norm": 0.3171054720878601,
"learning_rate": 0.0005287259475218658,
"loss": 3.7403,
"step": 8250
},
{
"epoch": 2.416399743754004,
"grad_norm": 0.32724741101264954,
"learning_rate": 0.0005282886297376093,
"loss": 3.7446,
"step": 8300
},
{
"epoch": 2.4309591753537942,
"grad_norm": 0.3406330347061157,
"learning_rate": 0.0005278513119533528,
"loss": 3.7441,
"step": 8350
},
{
"epoch": 2.4455186069535846,
"grad_norm": 0.3245644271373749,
"learning_rate": 0.0005274139941690961,
"loss": 3.7317,
"step": 8400
},
{
"epoch": 2.460078038553375,
"grad_norm": 0.3408276438713074,
"learning_rate": 0.0005269766763848396,
"loss": 3.7373,
"step": 8450
},
{
"epoch": 2.4746374701531653,
"grad_norm": 0.31394264101982117,
"learning_rate": 0.0005265393586005831,
"loss": 3.732,
"step": 8500
},
{
"epoch": 2.4891969017529556,
"grad_norm": 0.3347412645816803,
"learning_rate": 0.0005261020408163265,
"loss": 3.7266,
"step": 8550
},
{
"epoch": 2.503756333352746,
"grad_norm": 0.32223114371299744,
"learning_rate": 0.0005256647230320699,
"loss": 3.7293,
"step": 8600
},
{
"epoch": 2.5183157649525363,
"grad_norm": 0.3145173490047455,
"learning_rate": 0.0005252274052478134,
"loss": 3.7471,
"step": 8650
},
{
"epoch": 2.5328751965523266,
"grad_norm": 0.31143006682395935,
"learning_rate": 0.0005247900874635568,
"loss": 3.7394,
"step": 8700
},
{
"epoch": 2.547434628152117,
"grad_norm": 0.3238007724285126,
"learning_rate": 0.0005243527696793003,
"loss": 3.7283,
"step": 8750
},
{
"epoch": 2.5619940597519073,
"grad_norm": 0.3301667869091034,
"learning_rate": 0.0005239154518950436,
"loss": 3.7463,
"step": 8800
},
{
"epoch": 2.5765534913516976,
"grad_norm": 0.32847797870635986,
"learning_rate": 0.0005234781341107871,
"loss": 3.7397,
"step": 8850
},
{
"epoch": 2.591112922951488,
"grad_norm": 0.32561489939689636,
"learning_rate": 0.0005230408163265306,
"loss": 3.7437,
"step": 8900
},
{
"epoch": 2.6056723545512783,
"grad_norm": 0.30937111377716064,
"learning_rate": 0.000522603498542274,
"loss": 3.7399,
"step": 8950
},
{
"epoch": 2.6202317861510687,
"grad_norm": 0.32154905796051025,
"learning_rate": 0.0005221661807580175,
"loss": 3.7339,
"step": 9000
},
{
"epoch": 2.6202317861510687,
"eval_accuracy": 0.3472925683629266,
"eval_loss": 3.7506699562072754,
"eval_runtime": 179.8206,
"eval_samples_per_second": 92.553,
"eval_steps_per_second": 5.789,
"step": 9000
},
{
"epoch": 2.634791217750859,
"grad_norm": 0.317490816116333,
"learning_rate": 0.0005217288629737609,
"loss": 3.7263,
"step": 9050
},
{
"epoch": 2.6493506493506493,
"grad_norm": 0.32750970125198364,
"learning_rate": 0.0005212915451895043,
"loss": 3.7324,
"step": 9100
},
{
"epoch": 2.6639100809504397,
"grad_norm": 0.3290070593357086,
"learning_rate": 0.0005208542274052477,
"loss": 3.7314,
"step": 9150
},
{
"epoch": 2.67846951255023,
"grad_norm": 0.34482887387275696,
"learning_rate": 0.0005204169096209913,
"loss": 3.7192,
"step": 9200
},
{
"epoch": 2.6930289441500204,
"grad_norm": 0.31812381744384766,
"learning_rate": 0.0005199795918367346,
"loss": 3.7308,
"step": 9250
},
{
"epoch": 2.7075883757498107,
"grad_norm": 0.33570706844329834,
"learning_rate": 0.0005195422740524781,
"loss": 3.7338,
"step": 9300
},
{
"epoch": 2.722147807349601,
"grad_norm": 0.3004995584487915,
"learning_rate": 0.0005191049562682216,
"loss": 3.7224,
"step": 9350
},
{
"epoch": 2.7367072389493914,
"grad_norm": 0.3277261555194855,
"learning_rate": 0.000518667638483965,
"loss": 3.7313,
"step": 9400
},
{
"epoch": 2.7512666705491817,
"grad_norm": 0.3260866701602936,
"learning_rate": 0.0005182303206997084,
"loss": 3.7252,
"step": 9450
},
{
"epoch": 2.765826102148972,
"grad_norm": 0.30772513151168823,
"learning_rate": 0.0005177930029154519,
"loss": 3.7263,
"step": 9500
},
{
"epoch": 2.7803855337487624,
"grad_norm": 0.3158465027809143,
"learning_rate": 0.0005173556851311953,
"loss": 3.728,
"step": 9550
},
{
"epoch": 2.7949449653485527,
"grad_norm": 0.31197673082351685,
"learning_rate": 0.0005169183673469387,
"loss": 3.7135,
"step": 9600
},
{
"epoch": 2.809504396948343,
"grad_norm": 0.33720263838768005,
"learning_rate": 0.0005164810495626821,
"loss": 3.7205,
"step": 9650
},
{
"epoch": 2.8240638285481334,
"grad_norm": 0.3222922086715698,
"learning_rate": 0.0005160437317784256,
"loss": 3.7212,
"step": 9700
},
{
"epoch": 2.8386232601479238,
"grad_norm": 0.32163000106811523,
"learning_rate": 0.0005156064139941691,
"loss": 3.7303,
"step": 9750
},
{
"epoch": 2.853182691747714,
"grad_norm": 0.29815468192100525,
"learning_rate": 0.0005151690962099124,
"loss": 3.7143,
"step": 9800
},
{
"epoch": 2.8677421233475044,
"grad_norm": 0.3258896768093109,
"learning_rate": 0.000514731778425656,
"loss": 3.7076,
"step": 9850
},
{
"epoch": 2.882301554947295,
"grad_norm": 0.32969552278518677,
"learning_rate": 0.0005142944606413994,
"loss": 3.7269,
"step": 9900
},
{
"epoch": 2.896860986547085,
"grad_norm": 0.31835922598838806,
"learning_rate": 0.0005138571428571428,
"loss": 3.7207,
"step": 9950
},
{
"epoch": 2.9114204181468755,
"grad_norm": 0.3245142102241516,
"learning_rate": 0.0005134198250728862,
"loss": 3.7167,
"step": 10000
},
{
"epoch": 2.9114204181468755,
"eval_accuracy": 0.3495557037372717,
"eval_loss": 3.7238857746124268,
"eval_runtime": 179.8397,
"eval_samples_per_second": 92.544,
"eval_steps_per_second": 5.788,
"step": 10000
},
{
"epoch": 2.925979849746666,
"grad_norm": 0.32630476355552673,
"learning_rate": 0.0005129825072886297,
"loss": 3.7083,
"step": 10050
},
{
"epoch": 2.940539281346456,
"grad_norm": 0.3315964341163635,
"learning_rate": 0.0005125451895043731,
"loss": 3.7064,
"step": 10100
},
{
"epoch": 2.9550987129462465,
"grad_norm": 0.31410086154937744,
"learning_rate": 0.0005121078717201166,
"loss": 3.7236,
"step": 10150
},
{
"epoch": 2.969658144546037,
"grad_norm": 0.33839717507362366,
"learning_rate": 0.0005116705539358601,
"loss": 3.7078,
"step": 10200
},
{
"epoch": 2.984217576145827,
"grad_norm": 0.32319313287734985,
"learning_rate": 0.0005112332361516034,
"loss": 3.7141,
"step": 10250
},
{
"epoch": 2.9987770077456175,
"grad_norm": 0.3235074579715729,
"learning_rate": 0.0005107959183673469,
"loss": 3.7063,
"step": 10300
},
{
"epoch": 3.0131034884398114,
"grad_norm": 0.314828097820282,
"learning_rate": 0.0005103586005830903,
"loss": 3.6245,
"step": 10350
},
{
"epoch": 3.0276629200396017,
"grad_norm": 0.31607604026794434,
"learning_rate": 0.0005099212827988338,
"loss": 3.6112,
"step": 10400
},
{
"epoch": 3.042222351639392,
"grad_norm": 0.35359737277030945,
"learning_rate": 0.0005094839650145772,
"loss": 3.609,
"step": 10450
},
{
"epoch": 3.0567817832391824,
"grad_norm": 0.32654085755348206,
"learning_rate": 0.0005090466472303206,
"loss": 3.6166,
"step": 10500
},
{
"epoch": 3.0713412148389727,
"grad_norm": 0.3420456051826477,
"learning_rate": 0.0005086093294460641,
"loss": 3.6039,
"step": 10550
},
{
"epoch": 3.085900646438763,
"grad_norm": 0.32927215099334717,
"learning_rate": 0.0005081720116618076,
"loss": 3.6076,
"step": 10600
},
{
"epoch": 3.1004600780385534,
"grad_norm": 0.32174116373062134,
"learning_rate": 0.0005077346938775509,
"loss": 3.62,
"step": 10650
},
{
"epoch": 3.1150195096383437,
"grad_norm": 0.32081031799316406,
"learning_rate": 0.0005072973760932944,
"loss": 3.6198,
"step": 10700
},
{
"epoch": 3.129578941238134,
"grad_norm": 0.3233294188976288,
"learning_rate": 0.0005068600583090379,
"loss": 3.6221,
"step": 10750
},
{
"epoch": 3.1441383728379244,
"grad_norm": 0.3179484009742737,
"learning_rate": 0.0005064227405247813,
"loss": 3.6265,
"step": 10800
},
{
"epoch": 3.1586978044377148,
"grad_norm": 0.3125128746032715,
"learning_rate": 0.0005059854227405247,
"loss": 3.6316,
"step": 10850
},
{
"epoch": 3.173257236037505,
"grad_norm": 0.32463568449020386,
"learning_rate": 0.0005055481049562682,
"loss": 3.6245,
"step": 10900
},
{
"epoch": 3.1878166676372954,
"grad_norm": 0.31310543417930603,
"learning_rate": 0.0005051107871720116,
"loss": 3.6185,
"step": 10950
},
{
"epoch": 3.2023760992370858,
"grad_norm": 0.3464823067188263,
"learning_rate": 0.0005046734693877551,
"loss": 3.6204,
"step": 11000
},
{
"epoch": 3.2023760992370858,
"eval_accuracy": 0.3516197697403503,
"eval_loss": 3.7100203037261963,
"eval_runtime": 180.2504,
"eval_samples_per_second": 92.333,
"eval_steps_per_second": 5.775,
"step": 11000
},
{
"epoch": 3.216935530836876,
"grad_norm": 0.3277588486671448,
"learning_rate": 0.0005042361516034986,
"loss": 3.6268,
"step": 11050
},
{
"epoch": 3.2314949624366665,
"grad_norm": 0.32706061005592346,
"learning_rate": 0.0005037988338192419,
"loss": 3.6153,
"step": 11100
},
{
"epoch": 3.246054394036457,
"grad_norm": 0.31766435503959656,
"learning_rate": 0.0005033615160349854,
"loss": 3.6336,
"step": 11150
},
{
"epoch": 3.260613825636247,
"grad_norm": 0.3006264269351959,
"learning_rate": 0.0005029241982507288,
"loss": 3.6275,
"step": 11200
},
{
"epoch": 3.2751732572360375,
"grad_norm": 0.32919037342071533,
"learning_rate": 0.0005024868804664723,
"loss": 3.6301,
"step": 11250
},
{
"epoch": 3.289732688835828,
"grad_norm": 0.3155740797519684,
"learning_rate": 0.0005020495626822157,
"loss": 3.6203,
"step": 11300
},
{
"epoch": 3.304292120435618,
"grad_norm": 0.3527681529521942,
"learning_rate": 0.0005016122448979591,
"loss": 3.6288,
"step": 11350
},
{
"epoch": 3.3188515520354085,
"grad_norm": 0.3135804831981659,
"learning_rate": 0.0005011749271137026,
"loss": 3.6415,
"step": 11400
},
{
"epoch": 3.333410983635199,
"grad_norm": 0.3078667223453522,
"learning_rate": 0.0005007376093294461,
"loss": 3.6284,
"step": 11450
},
{
"epoch": 3.347970415234989,
"grad_norm": 0.319755494594574,
"learning_rate": 0.0005003002915451894,
"loss": 3.6314,
"step": 11500
},
{
"epoch": 3.3625298468347795,
"grad_norm": 0.32641854882240295,
"learning_rate": 0.0004998629737609329,
"loss": 3.629,
"step": 11550
},
{
"epoch": 3.37708927843457,
"grad_norm": 0.3268803060054779,
"learning_rate": 0.0004994256559766764,
"loss": 3.6372,
"step": 11600
},
{
"epoch": 3.39164871003436,
"grad_norm": 0.32382065057754517,
"learning_rate": 0.0004989883381924198,
"loss": 3.6286,
"step": 11650
},
{
"epoch": 3.4062081416341505,
"grad_norm": 0.3158361613750458,
"learning_rate": 0.0004985510204081632,
"loss": 3.6329,
"step": 11700
},
{
"epoch": 3.420767573233941,
"grad_norm": 0.31245240569114685,
"learning_rate": 0.0004981137026239067,
"loss": 3.6428,
"step": 11750
},
{
"epoch": 3.435327004833731,
"grad_norm": 0.3362303078174591,
"learning_rate": 0.0004976763848396501,
"loss": 3.6369,
"step": 11800
},
{
"epoch": 3.4498864364335216,
"grad_norm": 0.3208737373352051,
"learning_rate": 0.0004972390670553935,
"loss": 3.6428,
"step": 11850
},
{
"epoch": 3.464445868033312,
"grad_norm": 0.3163570761680603,
"learning_rate": 0.000496801749271137,
"loss": 3.6239,
"step": 11900
},
{
"epoch": 3.4790052996331022,
"grad_norm": 0.3181529641151428,
"learning_rate": 0.0004963644314868804,
"loss": 3.6303,
"step": 11950
},
{
"epoch": 3.4935647312328926,
"grad_norm": 0.33231833577156067,
"learning_rate": 0.0004959271137026239,
"loss": 3.6358,
"step": 12000
},
{
"epoch": 3.4935647312328926,
"eval_accuracy": 0.35356460577150667,
"eval_loss": 3.6901774406433105,
"eval_runtime": 180.1137,
"eval_samples_per_second": 92.403,
"eval_steps_per_second": 5.78,
"step": 12000
},
{
"epoch": 3.508124162832683,
"grad_norm": 0.3368173837661743,
"learning_rate": 0.0004954897959183672,
"loss": 3.6389,
"step": 12050
},
{
"epoch": 3.5226835944324733,
"grad_norm": 0.33402830362319946,
"learning_rate": 0.0004950524781341108,
"loss": 3.645,
"step": 12100
},
{
"epoch": 3.5372430260322636,
"grad_norm": 0.33064502477645874,
"learning_rate": 0.0004946151603498542,
"loss": 3.6336,
"step": 12150
},
{
"epoch": 3.551802457632054,
"grad_norm": 0.31694450974464417,
"learning_rate": 0.0004941778425655976,
"loss": 3.6325,
"step": 12200
},
{
"epoch": 3.5663618892318443,
"grad_norm": 0.3069068193435669,
"learning_rate": 0.0004937405247813411,
"loss": 3.6246,
"step": 12250
},
{
"epoch": 3.5809213208316346,
"grad_norm": 0.3142222464084625,
"learning_rate": 0.0004933032069970845,
"loss": 3.6453,
"step": 12300
},
{
"epoch": 3.595480752431425,
"grad_norm": 0.3237994909286499,
"learning_rate": 0.0004928658892128279,
"loss": 3.6295,
"step": 12350
},
{
"epoch": 3.6100401840312153,
"grad_norm": 0.30255311727523804,
"learning_rate": 0.0004924285714285714,
"loss": 3.6468,
"step": 12400
},
{
"epoch": 3.6245996156310056,
"grad_norm": 0.3128635883331299,
"learning_rate": 0.0004919912536443149,
"loss": 3.6346,
"step": 12450
},
{
"epoch": 3.639159047230796,
"grad_norm": 0.31057000160217285,
"learning_rate": 0.0004915539358600582,
"loss": 3.622,
"step": 12500
},
{
"epoch": 3.6537184788305863,
"grad_norm": 0.3453236520290375,
"learning_rate": 0.0004911166180758017,
"loss": 3.6354,
"step": 12550
},
{
"epoch": 3.6682779104303767,
"grad_norm": 0.3358878493309021,
"learning_rate": 0.0004906793002915452,
"loss": 3.6276,
"step": 12600
},
{
"epoch": 3.682837342030167,
"grad_norm": 0.3207370638847351,
"learning_rate": 0.0004902419825072886,
"loss": 3.6358,
"step": 12650
},
{
"epoch": 3.6973967736299573,
"grad_norm": 0.31057843565940857,
"learning_rate": 0.000489804664723032,
"loss": 3.6433,
"step": 12700
},
{
"epoch": 3.7119562052297477,
"grad_norm": 0.32829779386520386,
"learning_rate": 0.0004893673469387754,
"loss": 3.6282,
"step": 12750
},
{
"epoch": 3.726515636829538,
"grad_norm": 0.32469305396080017,
"learning_rate": 0.0004889300291545189,
"loss": 3.6353,
"step": 12800
},
{
"epoch": 3.7410750684293284,
"grad_norm": 0.32968953251838684,
"learning_rate": 0.0004884927113702624,
"loss": 3.6307,
"step": 12850
},
{
"epoch": 3.755634500029119,
"grad_norm": 0.3125181198120117,
"learning_rate": 0.0004880553935860058,
"loss": 3.6321,
"step": 12900
},
{
"epoch": 3.770193931628909,
"grad_norm": 0.31494152545928955,
"learning_rate": 0.0004876180758017492,
"loss": 3.6359,
"step": 12950
},
{
"epoch": 3.7847533632287,
"grad_norm": 0.32235443592071533,
"learning_rate": 0.0004871807580174927,
"loss": 3.6319,
"step": 13000
},
{
"epoch": 3.7847533632287,
"eval_accuracy": 0.35539115250113085,
"eval_loss": 3.6755480766296387,
"eval_runtime": 179.9398,
"eval_samples_per_second": 92.492,
"eval_steps_per_second": 5.785,
"step": 13000
},
{
"epoch": 3.7993127948284897,
"grad_norm": 0.3097991645336151,
"learning_rate": 0.00048674344023323613,
"loss": 3.6393,
"step": 13050
},
{
"epoch": 3.8138722264282805,
"grad_norm": 0.3186699450016022,
"learning_rate": 0.00048630612244897955,
"loss": 3.6318,
"step": 13100
},
{
"epoch": 3.8284316580280704,
"grad_norm": 0.3037383556365967,
"learning_rate": 0.00048586880466472296,
"loss": 3.6293,
"step": 13150
},
{
"epoch": 3.842991089627861,
"grad_norm": 0.32788893580436707,
"learning_rate": 0.0004854314868804664,
"loss": 3.6152,
"step": 13200
},
{
"epoch": 3.857550521227651,
"grad_norm": 0.3229829967021942,
"learning_rate": 0.0004849941690962099,
"loss": 3.6483,
"step": 13250
},
{
"epoch": 3.872109952827442,
"grad_norm": 0.3292683959007263,
"learning_rate": 0.0004845568513119533,
"loss": 3.6381,
"step": 13300
},
{
"epoch": 3.8866693844272318,
"grad_norm": 0.3210625648498535,
"learning_rate": 0.00048411953352769677,
"loss": 3.6269,
"step": 13350
},
{
"epoch": 3.9012288160270225,
"grad_norm": 0.31549862027168274,
"learning_rate": 0.0004836822157434402,
"loss": 3.6213,
"step": 13400
},
{
"epoch": 3.9157882476268124,
"grad_norm": 0.30793866515159607,
"learning_rate": 0.00048324489795918365,
"loss": 3.6309,
"step": 13450
},
{
"epoch": 3.930347679226603,
"grad_norm": 0.3356075882911682,
"learning_rate": 0.00048280758017492706,
"loss": 3.6262,
"step": 13500
},
{
"epoch": 3.944907110826393,
"grad_norm": 0.32913827896118164,
"learning_rate": 0.00048237026239067053,
"loss": 3.6213,
"step": 13550
},
{
"epoch": 3.959466542426184,
"grad_norm": 0.3327690362930298,
"learning_rate": 0.00048193294460641394,
"loss": 3.6438,
"step": 13600
},
{
"epoch": 3.974025974025974,
"grad_norm": 0.3101835250854492,
"learning_rate": 0.00048149562682215735,
"loss": 3.6296,
"step": 13650
},
{
"epoch": 3.9885854056257646,
"grad_norm": 0.327761709690094,
"learning_rate": 0.0004810583090379009,
"loss": 3.6235,
"step": 13700
},
{
"epoch": 4.002911886319958,
"grad_norm": 0.32982325553894043,
"learning_rate": 0.0004806209912536443,
"loss": 3.6143,
"step": 13750
},
{
"epoch": 4.017471317919749,
"grad_norm": 0.32604551315307617,
"learning_rate": 0.0004801836734693877,
"loss": 3.511,
"step": 13800
},
{
"epoch": 4.032030749519539,
"grad_norm": 0.3375633955001831,
"learning_rate": 0.0004797463556851311,
"loss": 3.5208,
"step": 13850
},
{
"epoch": 4.046590181119329,
"grad_norm": 0.3174295127391815,
"learning_rate": 0.00047930903790087463,
"loss": 3.5217,
"step": 13900
},
{
"epoch": 4.061149612719119,
"grad_norm": 0.3297431170940399,
"learning_rate": 0.00047887172011661805,
"loss": 3.5218,
"step": 13950
},
{
"epoch": 4.07570904431891,
"grad_norm": 0.33495378494262695,
"learning_rate": 0.00047843440233236146,
"loss": 3.5213,
"step": 14000
},
{
"epoch": 4.07570904431891,
"eval_accuracy": 0.35668316328168387,
"eval_loss": 3.6667861938476562,
"eval_runtime": 179.8727,
"eval_samples_per_second": 92.527,
"eval_steps_per_second": 5.787,
"step": 14000
},
{
"epoch": 4.0902684759187,
"grad_norm": 0.3054860234260559,
"learning_rate": 0.0004779970845481049,
"loss": 3.5277,
"step": 14050
},
{
"epoch": 4.104827907518491,
"grad_norm": 0.3263727128505707,
"learning_rate": 0.00047755976676384834,
"loss": 3.5327,
"step": 14100
},
{
"epoch": 4.119387339118281,
"grad_norm": 0.3170093894004822,
"learning_rate": 0.0004771224489795918,
"loss": 3.5418,
"step": 14150
},
{
"epoch": 4.1339467707180715,
"grad_norm": 0.33194735646247864,
"learning_rate": 0.00047668513119533527,
"loss": 3.5364,
"step": 14200
},
{
"epoch": 4.148506202317861,
"grad_norm": 0.32043886184692383,
"learning_rate": 0.0004762478134110787,
"loss": 3.544,
"step": 14250
},
{
"epoch": 4.163065633917652,
"grad_norm": 0.32483235001564026,
"learning_rate": 0.0004758104956268221,
"loss": 3.5385,
"step": 14300
},
{
"epoch": 4.177625065517442,
"grad_norm": 0.3203752934932709,
"learning_rate": 0.0004753731778425656,
"loss": 3.5636,
"step": 14350
},
{
"epoch": 4.192184497117233,
"grad_norm": 0.3080170452594757,
"learning_rate": 0.00047493586005830903,
"loss": 3.5453,
"step": 14400
},
{
"epoch": 4.206743928717023,
"grad_norm": 0.3298153877258301,
"learning_rate": 0.00047449854227405244,
"loss": 3.5491,
"step": 14450
},
{
"epoch": 4.2213033603168135,
"grad_norm": 0.3529611825942993,
"learning_rate": 0.00047406122448979585,
"loss": 3.544,
"step": 14500
},
{
"epoch": 4.235862791916603,
"grad_norm": 0.3273563086986542,
"learning_rate": 0.00047362390670553926,
"loss": 3.5498,
"step": 14550
},
{
"epoch": 4.250422223516394,
"grad_norm": 0.313999205827713,
"learning_rate": 0.0004731865889212828,
"loss": 3.5526,
"step": 14600
},
{
"epoch": 4.264981655116184,
"grad_norm": 0.30790430307388306,
"learning_rate": 0.0004727492711370262,
"loss": 3.5544,
"step": 14650
},
{
"epoch": 4.279541086715975,
"grad_norm": 0.33186236023902893,
"learning_rate": 0.0004723119533527696,
"loss": 3.5506,
"step": 14700
},
{
"epoch": 4.294100518315765,
"grad_norm": 0.32786890864372253,
"learning_rate": 0.0004718746355685131,
"loss": 3.5475,
"step": 14750
},
{
"epoch": 4.308659949915556,
"grad_norm": 0.3234544098377228,
"learning_rate": 0.0004714373177842565,
"loss": 3.5549,
"step": 14800
},
{
"epoch": 4.3232193815153455,
"grad_norm": 0.31056949496269226,
"learning_rate": 0.00047099999999999996,
"loss": 3.5447,
"step": 14850
},
{
"epoch": 4.337778813115136,
"grad_norm": 0.3284071087837219,
"learning_rate": 0.0004705626822157434,
"loss": 3.5585,
"step": 14900
},
{
"epoch": 4.352338244714926,
"grad_norm": 0.32166486978530884,
"learning_rate": 0.00047012536443148683,
"loss": 3.5546,
"step": 14950
},
{
"epoch": 4.366897676314717,
"grad_norm": 0.3296414613723755,
"learning_rate": 0.00046968804664723025,
"loss": 3.5562,
"step": 15000
},
{
"epoch": 4.366897676314717,
"eval_accuracy": 0.3575879706129867,
"eval_loss": 3.6574151515960693,
"eval_runtime": 179.9318,
"eval_samples_per_second": 92.496,
"eval_steps_per_second": 5.786,
"step": 15000
},
{
"epoch": 4.381457107914507,
"grad_norm": 0.31862205266952515,
"learning_rate": 0.00046925072886297377,
"loss": 3.5609,
"step": 15050
},
{
"epoch": 4.396016539514298,
"grad_norm": 0.321135938167572,
"learning_rate": 0.0004688134110787172,
"loss": 3.5592,
"step": 15100
},
{
"epoch": 4.4105759711140875,
"grad_norm": 0.34049704670906067,
"learning_rate": 0.0004683760932944606,
"loss": 3.5666,
"step": 15150
},
{
"epoch": 4.425135402713878,
"grad_norm": 0.32759514451026917,
"learning_rate": 0.000467938775510204,
"loss": 3.5645,
"step": 15200
},
{
"epoch": 4.439694834313668,
"grad_norm": 0.31559038162231445,
"learning_rate": 0.00046750145772594747,
"loss": 3.5424,
"step": 15250
},
{
"epoch": 4.454254265913459,
"grad_norm": 0.31429657340049744,
"learning_rate": 0.00046706413994169094,
"loss": 3.5577,
"step": 15300
},
{
"epoch": 4.468813697513249,
"grad_norm": 0.32119688391685486,
"learning_rate": 0.00046662682215743435,
"loss": 3.5645,
"step": 15350
},
{
"epoch": 4.48337312911304,
"grad_norm": 0.32725510001182556,
"learning_rate": 0.0004661895043731778,
"loss": 3.558,
"step": 15400
},
{
"epoch": 4.4979325607128295,
"grad_norm": 0.3302425742149353,
"learning_rate": 0.00046575218658892123,
"loss": 3.5645,
"step": 15450
},
{
"epoch": 4.51249199231262,
"grad_norm": 0.33752188086509705,
"learning_rate": 0.0004653148688046647,
"loss": 3.5654,
"step": 15500
},
{
"epoch": 4.52705142391241,
"grad_norm": 0.3348866105079651,
"learning_rate": 0.0004648775510204081,
"loss": 3.5587,
"step": 15550
},
{
"epoch": 4.541610855512201,
"grad_norm": 0.33069008588790894,
"learning_rate": 0.0004644402332361516,
"loss": 3.5564,
"step": 15600
},
{
"epoch": 4.556170287111991,
"grad_norm": 0.36258620023727417,
"learning_rate": 0.000464002915451895,
"loss": 3.5586,
"step": 15650
},
{
"epoch": 4.570729718711782,
"grad_norm": 0.3146510422229767,
"learning_rate": 0.0004635655976676384,
"loss": 3.5612,
"step": 15700
},
{
"epoch": 4.585289150311572,
"grad_norm": 0.3268812298774719,
"learning_rate": 0.0004631282798833819,
"loss": 3.5536,
"step": 15750
},
{
"epoch": 4.599848581911362,
"grad_norm": 0.31493905186653137,
"learning_rate": 0.00046269096209912533,
"loss": 3.5717,
"step": 15800
},
{
"epoch": 4.614408013511152,
"grad_norm": 0.3173486590385437,
"learning_rate": 0.00046225364431486875,
"loss": 3.5678,
"step": 15850
},
{
"epoch": 4.628967445110943,
"grad_norm": 0.32398083806037903,
"learning_rate": 0.00046181632653061216,
"loss": 3.557,
"step": 15900
},
{
"epoch": 4.643526876710733,
"grad_norm": 0.31683549284935,
"learning_rate": 0.0004613790087463557,
"loss": 3.5652,
"step": 15950
},
{
"epoch": 4.658086308310524,
"grad_norm": 0.3226284682750702,
"learning_rate": 0.0004609416909620991,
"loss": 3.5583,
"step": 16000
},
{
"epoch": 4.658086308310524,
"eval_accuracy": 0.35886363724551484,
"eval_loss": 3.641108989715576,
"eval_runtime": 179.9045,
"eval_samples_per_second": 92.51,
"eval_steps_per_second": 5.786,
"step": 16000
},
{
"epoch": 4.672645739910314,
"grad_norm": 0.3244362771511078,
"learning_rate": 0.0004605043731778425,
"loss": 3.5653,
"step": 16050
},
{
"epoch": 4.687205171510104,
"grad_norm": 0.3218280076980591,
"learning_rate": 0.00046006705539358597,
"loss": 3.5573,
"step": 16100
},
{
"epoch": 4.701764603109894,
"grad_norm": 0.31557270884513855,
"learning_rate": 0.0004596297376093294,
"loss": 3.5697,
"step": 16150
},
{
"epoch": 4.716324034709685,
"grad_norm": 0.32409724593162537,
"learning_rate": 0.00045919241982507285,
"loss": 3.5727,
"step": 16200
},
{
"epoch": 4.730883466309475,
"grad_norm": 0.32196715474128723,
"learning_rate": 0.0004587551020408163,
"loss": 3.5696,
"step": 16250
},
{
"epoch": 4.745442897909266,
"grad_norm": 0.3190127909183502,
"learning_rate": 0.00045831778425655973,
"loss": 3.5589,
"step": 16300
},
{
"epoch": 4.760002329509056,
"grad_norm": 0.3492906391620636,
"learning_rate": 0.00045788046647230314,
"loss": 3.576,
"step": 16350
},
{
"epoch": 4.7745617611088464,
"grad_norm": 0.3227944076061249,
"learning_rate": 0.00045744314868804666,
"loss": 3.5511,
"step": 16400
},
{
"epoch": 4.789121192708636,
"grad_norm": 0.3169122040271759,
"learning_rate": 0.0004570058309037901,
"loss": 3.5684,
"step": 16450
},
{
"epoch": 4.803680624308427,
"grad_norm": 0.31213343143463135,
"learning_rate": 0.0004565685131195335,
"loss": 3.5667,
"step": 16500
},
{
"epoch": 4.818240055908217,
"grad_norm": 0.32593971490859985,
"learning_rate": 0.0004561311953352769,
"loss": 3.5661,
"step": 16550
},
{
"epoch": 4.832799487508008,
"grad_norm": 0.33425310254096985,
"learning_rate": 0.0004556938775510203,
"loss": 3.5688,
"step": 16600
},
{
"epoch": 4.847358919107798,
"grad_norm": 0.32003140449523926,
"learning_rate": 0.00045525655976676383,
"loss": 3.5601,
"step": 16650
},
{
"epoch": 4.8619183507075885,
"grad_norm": 0.3596481382846832,
"learning_rate": 0.00045481924198250724,
"loss": 3.5688,
"step": 16700
},
{
"epoch": 4.876477782307378,
"grad_norm": 0.3375333547592163,
"learning_rate": 0.00045438192419825066,
"loss": 3.5685,
"step": 16750
},
{
"epoch": 4.891037213907169,
"grad_norm": 0.31676721572875977,
"learning_rate": 0.0004539446064139941,
"loss": 3.5581,
"step": 16800
},
{
"epoch": 4.905596645506959,
"grad_norm": 0.3257509469985962,
"learning_rate": 0.0004535072886297376,
"loss": 3.5537,
"step": 16850
},
{
"epoch": 4.92015607710675,
"grad_norm": 0.3176610767841339,
"learning_rate": 0.000453069970845481,
"loss": 3.5678,
"step": 16900
},
{
"epoch": 4.93471550870654,
"grad_norm": 0.3168198764324188,
"learning_rate": 0.00045263265306122447,
"loss": 3.5499,
"step": 16950
},
{
"epoch": 4.9492749403063305,
"grad_norm": 0.31883013248443604,
"learning_rate": 0.0004521953352769679,
"loss": 3.5668,
"step": 17000
},
{
"epoch": 4.9492749403063305,
"eval_accuracy": 0.360385171601208,
"eval_loss": 3.626793146133423,
"eval_runtime": 180.0874,
"eval_samples_per_second": 92.416,
"eval_steps_per_second": 5.781,
"step": 17000
},
{
"epoch": 4.96383437190612,
"grad_norm": 0.3429825007915497,
"learning_rate": 0.0004517580174927113,
"loss": 3.5693,
"step": 17050
},
{
"epoch": 4.978393803505911,
"grad_norm": 0.31468144059181213,
"learning_rate": 0.0004513206997084548,
"loss": 3.5648,
"step": 17100
},
{
"epoch": 4.992953235105701,
"grad_norm": 0.3186092972755432,
"learning_rate": 0.0004508833819241982,
"loss": 3.5611,
"step": 17150
},
{
"epoch": 5.007279715799895,
"grad_norm": 0.32911449670791626,
"learning_rate": 0.00045044606413994164,
"loss": 3.5003,
"step": 17200
},
{
"epoch": 5.021839147399685,
"grad_norm": 0.32932335138320923,
"learning_rate": 0.00045000874635568505,
"loss": 3.4462,
"step": 17250
},
{
"epoch": 5.036398578999476,
"grad_norm": 0.3199908435344696,
"learning_rate": 0.00044957142857142857,
"loss": 3.4569,
"step": 17300
},
{
"epoch": 5.050958010599266,
"grad_norm": 0.33716824650764465,
"learning_rate": 0.000449134110787172,
"loss": 3.4669,
"step": 17350
},
{
"epoch": 5.065517442199057,
"grad_norm": 0.32985949516296387,
"learning_rate": 0.0004486967930029154,
"loss": 3.4787,
"step": 17400
},
{
"epoch": 5.080076873798847,
"grad_norm": 0.3227981925010681,
"learning_rate": 0.00044825947521865886,
"loss": 3.46,
"step": 17450
},
{
"epoch": 5.094636305398637,
"grad_norm": 0.32830196619033813,
"learning_rate": 0.0004478221574344023,
"loss": 3.4714,
"step": 17500
},
{
"epoch": 5.109195736998427,
"grad_norm": 0.33184128999710083,
"learning_rate": 0.00044738483965014574,
"loss": 3.4636,
"step": 17550
},
{
"epoch": 5.123755168598218,
"grad_norm": 0.3285403251647949,
"learning_rate": 0.00044694752186588915,
"loss": 3.4711,
"step": 17600
},
{
"epoch": 5.138314600198008,
"grad_norm": 0.3541177809238434,
"learning_rate": 0.0004465102040816326,
"loss": 3.4806,
"step": 17650
},
{
"epoch": 5.152874031797799,
"grad_norm": 0.3223034143447876,
"learning_rate": 0.00044607288629737603,
"loss": 3.4813,
"step": 17700
},
{
"epoch": 5.167433463397589,
"grad_norm": 0.3218257427215576,
"learning_rate": 0.0004456355685131195,
"loss": 3.4826,
"step": 17750
},
{
"epoch": 5.1819928949973795,
"grad_norm": 0.3309643566608429,
"learning_rate": 0.00044519825072886297,
"loss": 3.4888,
"step": 17800
},
{
"epoch": 5.196552326597169,
"grad_norm": 0.3264036774635315,
"learning_rate": 0.0004447609329446064,
"loss": 3.483,
"step": 17850
},
{
"epoch": 5.21111175819696,
"grad_norm": 0.324790358543396,
"learning_rate": 0.0004443236151603498,
"loss": 3.4896,
"step": 17900
},
{
"epoch": 5.22567118979675,
"grad_norm": 0.3223564922809601,
"learning_rate": 0.0004438862973760932,
"loss": 3.4711,
"step": 17950
},
{
"epoch": 5.240230621396541,
"grad_norm": 0.33800962567329407,
"learning_rate": 0.0004434489795918367,
"loss": 3.4813,
"step": 18000
},
{
"epoch": 5.240230621396541,
"eval_accuracy": 0.36069406424049744,
"eval_loss": 3.6294045448303223,
"eval_runtime": 179.9519,
"eval_samples_per_second": 92.486,
"eval_steps_per_second": 5.785,
"step": 18000
},
{
"epoch": 5.254790052996331,
"grad_norm": 0.3145381510257721,
"learning_rate": 0.00044301166180758014,
"loss": 3.4916,
"step": 18050
},
{
"epoch": 5.2693494845961215,
"grad_norm": 0.33484500646591187,
"learning_rate": 0.00044257434402332355,
"loss": 3.5041,
"step": 18100
},
{
"epoch": 5.283908916195911,
"grad_norm": 0.3385532796382904,
"learning_rate": 0.000442137026239067,
"loss": 3.4968,
"step": 18150
},
{
"epoch": 5.298468347795702,
"grad_norm": 0.32390275597572327,
"learning_rate": 0.0004416997084548105,
"loss": 3.4945,
"step": 18200
},
{
"epoch": 5.313027779395492,
"grad_norm": 0.3384283483028412,
"learning_rate": 0.0004412623906705539,
"loss": 3.4902,
"step": 18250
},
{
"epoch": 5.327587210995283,
"grad_norm": 0.3334660530090332,
"learning_rate": 0.00044082507288629736,
"loss": 3.4961,
"step": 18300
},
{
"epoch": 5.342146642595073,
"grad_norm": 0.32754096388816833,
"learning_rate": 0.0004403877551020408,
"loss": 3.5051,
"step": 18350
},
{
"epoch": 5.3567060741948636,
"grad_norm": 0.3279802203178406,
"learning_rate": 0.0004399504373177842,
"loss": 3.5005,
"step": 18400
},
{
"epoch": 5.3712655057946534,
"grad_norm": 0.3342030346393585,
"learning_rate": 0.0004395131195335277,
"loss": 3.4931,
"step": 18450
},
{
"epoch": 5.385824937394444,
"grad_norm": 0.3162689805030823,
"learning_rate": 0.0004390758017492711,
"loss": 3.4952,
"step": 18500
},
{
"epoch": 5.400384368994234,
"grad_norm": 0.32114139199256897,
"learning_rate": 0.00043863848396501453,
"loss": 3.4999,
"step": 18550
},
{
"epoch": 5.414943800594025,
"grad_norm": 0.32540494203567505,
"learning_rate": 0.00043820116618075794,
"loss": 3.5038,
"step": 18600
},
{
"epoch": 5.429503232193815,
"grad_norm": 0.33477944135665894,
"learning_rate": 0.00043776384839650147,
"loss": 3.4941,
"step": 18650
},
{
"epoch": 5.444062663793606,
"grad_norm": 0.3286316990852356,
"learning_rate": 0.0004373265306122449,
"loss": 3.502,
"step": 18700
},
{
"epoch": 5.4586220953933955,
"grad_norm": 0.3312956690788269,
"learning_rate": 0.0004368892128279883,
"loss": 3.5054,
"step": 18750
},
{
"epoch": 5.473181526993186,
"grad_norm": 0.33420413732528687,
"learning_rate": 0.0004364518950437317,
"loss": 3.5176,
"step": 18800
},
{
"epoch": 5.487740958592976,
"grad_norm": 0.312959223985672,
"learning_rate": 0.00043601457725947517,
"loss": 3.5112,
"step": 18850
},
{
"epoch": 5.502300390192767,
"grad_norm": 0.31764182448387146,
"learning_rate": 0.00043557725947521864,
"loss": 3.4987,
"step": 18900
},
{
"epoch": 5.516859821792557,
"grad_norm": 0.3139015734195709,
"learning_rate": 0.00043513994169096205,
"loss": 3.4989,
"step": 18950
},
{
"epoch": 5.531419253392348,
"grad_norm": 0.32444003224372864,
"learning_rate": 0.0004347026239067055,
"loss": 3.4996,
"step": 19000
},
{
"epoch": 5.531419253392348,
"eval_accuracy": 0.3617237455660619,
"eval_loss": 3.6197915077209473,
"eval_runtime": 180.0468,
"eval_samples_per_second": 92.437,
"eval_steps_per_second": 5.782,
"step": 19000
},
{
"epoch": 5.5459786849921375,
"grad_norm": 0.32666343450546265,
"learning_rate": 0.0004342653061224489,
"loss": 3.5084,
"step": 19050
},
{
"epoch": 5.560538116591928,
"grad_norm": 0.32281461358070374,
"learning_rate": 0.0004338279883381924,
"loss": 3.5143,
"step": 19100
},
{
"epoch": 5.575097548191718,
"grad_norm": 0.3272330164909363,
"learning_rate": 0.00043339067055393586,
"loss": 3.5108,
"step": 19150
},
{
"epoch": 5.589656979791509,
"grad_norm": 0.31538012623786926,
"learning_rate": 0.00043295335276967927,
"loss": 3.504,
"step": 19200
},
{
"epoch": 5.604216411391299,
"grad_norm": 0.34619444608688354,
"learning_rate": 0.0004325160349854227,
"loss": 3.4962,
"step": 19250
},
{
"epoch": 5.61877584299109,
"grad_norm": 0.33601802587509155,
"learning_rate": 0.0004320787172011661,
"loss": 3.5107,
"step": 19300
},
{
"epoch": 5.6333352745908805,
"grad_norm": 0.32709893584251404,
"learning_rate": 0.0004316413994169096,
"loss": 3.5085,
"step": 19350
},
{
"epoch": 5.64789470619067,
"grad_norm": 0.332736611366272,
"learning_rate": 0.00043120408163265303,
"loss": 3.5027,
"step": 19400
},
{
"epoch": 5.66245413779046,
"grad_norm": 0.32013025879859924,
"learning_rate": 0.00043076676384839644,
"loss": 3.507,
"step": 19450
},
{
"epoch": 5.677013569390251,
"grad_norm": 0.34380871057510376,
"learning_rate": 0.0004303294460641399,
"loss": 3.5091,
"step": 19500
},
{
"epoch": 5.691573000990042,
"grad_norm": 0.3146701455116272,
"learning_rate": 0.0004298921282798834,
"loss": 3.5085,
"step": 19550
},
{
"epoch": 5.706132432589832,
"grad_norm": 0.3258221447467804,
"learning_rate": 0.0004294548104956268,
"loss": 3.513,
"step": 19600
},
{
"epoch": 5.720691864189622,
"grad_norm": 0.3335384726524353,
"learning_rate": 0.0004290174927113702,
"loss": 3.5065,
"step": 19650
},
{
"epoch": 5.735251295789412,
"grad_norm": 0.333322674036026,
"learning_rate": 0.00042858017492711367,
"loss": 3.5061,
"step": 19700
},
{
"epoch": 5.749810727389203,
"grad_norm": 0.3227587342262268,
"learning_rate": 0.0004281428571428571,
"loss": 3.5212,
"step": 19750
},
{
"epoch": 5.764370158988993,
"grad_norm": 0.3334672152996063,
"learning_rate": 0.00042770553935860055,
"loss": 3.4976,
"step": 19800
},
{
"epoch": 5.778929590588783,
"grad_norm": 0.3109551966190338,
"learning_rate": 0.000427268221574344,
"loss": 3.5159,
"step": 19850
},
{
"epoch": 5.793489022188574,
"grad_norm": 0.3229271471500397,
"learning_rate": 0.0004268309037900874,
"loss": 3.511,
"step": 19900
},
{
"epoch": 5.8080484537883645,
"grad_norm": 0.31595003604888916,
"learning_rate": 0.00042639358600583084,
"loss": 3.5083,
"step": 19950
},
{
"epoch": 5.822607885388154,
"grad_norm": 0.3313562273979187,
"learning_rate": 0.00042595626822157436,
"loss": 3.5032,
"step": 20000
},
{
"epoch": 5.822607885388154,
"eval_accuracy": 0.3630212827851557,
"eval_loss": 3.6066231727600098,
"eval_runtime": 180.0689,
"eval_samples_per_second": 92.426,
"eval_steps_per_second": 5.781,
"step": 20000
},
{
"epoch": 5.837167316987944,
"grad_norm": 0.32517609000205994,
"learning_rate": 0.00042551895043731777,
"loss": 3.5075,
"step": 20050
},
{
"epoch": 5.851726748587735,
"grad_norm": 0.3312103748321533,
"learning_rate": 0.0004250816326530612,
"loss": 3.5038,
"step": 20100
},
{
"epoch": 5.866286180187526,
"grad_norm": 0.3302570879459381,
"learning_rate": 0.0004246443148688046,
"loss": 3.5185,
"step": 20150
},
{
"epoch": 5.880845611787316,
"grad_norm": 0.3184104561805725,
"learning_rate": 0.00042420699708454806,
"loss": 3.5069,
"step": 20200
},
{
"epoch": 5.895405043387106,
"grad_norm": 0.31885817646980286,
"learning_rate": 0.00042376967930029153,
"loss": 3.5122,
"step": 20250
},
{
"epoch": 5.9099644749868965,
"grad_norm": 0.3231607973575592,
"learning_rate": 0.00042333236151603494,
"loss": 3.522,
"step": 20300
},
{
"epoch": 5.924523906586687,
"grad_norm": 0.3280011713504791,
"learning_rate": 0.0004228950437317784,
"loss": 3.5191,
"step": 20350
},
{
"epoch": 5.939083338186477,
"grad_norm": 0.32695943117141724,
"learning_rate": 0.0004224577259475218,
"loss": 3.5189,
"step": 20400
},
{
"epoch": 5.953642769786267,
"grad_norm": 0.31571418046951294,
"learning_rate": 0.0004220204081632653,
"loss": 3.506,
"step": 20450
},
{
"epoch": 5.968202201386058,
"grad_norm": 0.3223441243171692,
"learning_rate": 0.0004215830903790087,
"loss": 3.5298,
"step": 20500
},
{
"epoch": 5.982761632985849,
"grad_norm": 0.3090570569038391,
"learning_rate": 0.00042114577259475217,
"loss": 3.5086,
"step": 20550
},
{
"epoch": 5.9973210645856385,
"grad_norm": 0.32136136293411255,
"learning_rate": 0.0004207084548104956,
"loss": 3.5194,
"step": 20600
},
{
"epoch": 6.011647545279832,
"grad_norm": 0.33823925256729126,
"learning_rate": 0.000420271137026239,
"loss": 3.4225,
"step": 20650
},
{
"epoch": 6.026206976879623,
"grad_norm": 0.31168079376220703,
"learning_rate": 0.0004198338192419825,
"loss": 3.4097,
"step": 20700
},
{
"epoch": 6.040766408479413,
"grad_norm": 0.33379727602005005,
"learning_rate": 0.0004193965014577259,
"loss": 3.3974,
"step": 20750
},
{
"epoch": 6.055325840079203,
"grad_norm": 0.3196876645088196,
"learning_rate": 0.00041895918367346934,
"loss": 3.4086,
"step": 20800
},
{
"epoch": 6.069885271678993,
"grad_norm": 0.3263348937034607,
"learning_rate": 0.00041852186588921275,
"loss": 3.4206,
"step": 20850
},
{
"epoch": 6.084444703278784,
"grad_norm": 0.3413217067718506,
"learning_rate": 0.00041808454810495627,
"loss": 3.4143,
"step": 20900
},
{
"epoch": 6.099004134878574,
"grad_norm": 0.3205811381340027,
"learning_rate": 0.0004176472303206997,
"loss": 3.4236,
"step": 20950
},
{
"epoch": 6.113563566478365,
"grad_norm": 0.3402191996574402,
"learning_rate": 0.0004172099125364431,
"loss": 3.4196,
"step": 21000
},
{
"epoch": 6.113563566478365,
"eval_accuracy": 0.36323646113684954,
"eval_loss": 3.610100030899048,
"eval_runtime": 179.9878,
"eval_samples_per_second": 92.467,
"eval_steps_per_second": 5.784,
"step": 21000
},
{
"epoch": 6.128122998078155,
"grad_norm": 0.35015061497688293,
"learning_rate": 0.00041677259475218656,
"loss": 3.4323,
"step": 21050
},
{
"epoch": 6.142682429677945,
"grad_norm": 0.3365619480609894,
"learning_rate": 0.00041633527696792997,
"loss": 3.4316,
"step": 21100
},
{
"epoch": 6.157241861277735,
"grad_norm": 0.32558462023735046,
"learning_rate": 0.00041589795918367344,
"loss": 3.4261,
"step": 21150
},
{
"epoch": 6.171801292877526,
"grad_norm": 0.3229493498802185,
"learning_rate": 0.0004154606413994169,
"loss": 3.4273,
"step": 21200
},
{
"epoch": 6.186360724477316,
"grad_norm": 0.3373366594314575,
"learning_rate": 0.0004150233236151603,
"loss": 3.4241,
"step": 21250
},
{
"epoch": 6.200920156077107,
"grad_norm": 0.33470067381858826,
"learning_rate": 0.00041458600583090373,
"loss": 3.436,
"step": 21300
},
{
"epoch": 6.215479587676897,
"grad_norm": 0.33129194378852844,
"learning_rate": 0.00041414868804664725,
"loss": 3.4464,
"step": 21350
},
{
"epoch": 6.2300390192766875,
"grad_norm": 0.3305993676185608,
"learning_rate": 0.00041371137026239066,
"loss": 3.44,
"step": 21400
},
{
"epoch": 6.244598450876477,
"grad_norm": 0.3288079500198364,
"learning_rate": 0.0004132740524781341,
"loss": 3.4417,
"step": 21450
},
{
"epoch": 6.259157882476268,
"grad_norm": 0.33732712268829346,
"learning_rate": 0.0004128367346938775,
"loss": 3.445,
"step": 21500
},
{
"epoch": 6.273717314076059,
"grad_norm": 0.3398957848548889,
"learning_rate": 0.0004123994169096209,
"loss": 3.4473,
"step": 21550
},
{
"epoch": 6.288276745675849,
"grad_norm": 0.3353675305843353,
"learning_rate": 0.0004119620991253644,
"loss": 3.4428,
"step": 21600
},
{
"epoch": 6.302836177275639,
"grad_norm": 0.3312719464302063,
"learning_rate": 0.00041152478134110783,
"loss": 3.4346,
"step": 21650
},
{
"epoch": 6.3173956088754295,
"grad_norm": 0.32870662212371826,
"learning_rate": 0.00041108746355685125,
"loss": 3.4397,
"step": 21700
},
{
"epoch": 6.33195504047522,
"grad_norm": 0.3326077461242676,
"learning_rate": 0.0004106501457725947,
"loss": 3.4494,
"step": 21750
},
{
"epoch": 6.34651447207501,
"grad_norm": 0.32431626319885254,
"learning_rate": 0.0004102128279883382,
"loss": 3.435,
"step": 21800
},
{
"epoch": 6.3610739036748,
"grad_norm": 0.32606053352355957,
"learning_rate": 0.0004097755102040816,
"loss": 3.4515,
"step": 21850
},
{
"epoch": 6.375633335274591,
"grad_norm": 0.33837705850601196,
"learning_rate": 0.00040933819241982506,
"loss": 3.4578,
"step": 21900
},
{
"epoch": 6.390192766874382,
"grad_norm": 0.35296231508255005,
"learning_rate": 0.00040890087463556847,
"loss": 3.4563,
"step": 21950
},
{
"epoch": 6.4047521984741715,
"grad_norm": 0.3277094066143036,
"learning_rate": 0.0004084635568513119,
"loss": 3.4499,
"step": 22000
},
{
"epoch": 6.4047521984741715,
"eval_accuracy": 0.3639197405913266,
"eval_loss": 3.6030333042144775,
"eval_runtime": 180.1326,
"eval_samples_per_second": 92.393,
"eval_steps_per_second": 5.779,
"step": 22000
},
{
"epoch": 6.419311630073962,
"grad_norm": 0.3193458020687103,
"learning_rate": 0.0004080262390670554,
"loss": 3.4559,
"step": 22050
},
{
"epoch": 6.433871061673752,
"grad_norm": 0.3288237452507019,
"learning_rate": 0.0004075889212827988,
"loss": 3.4597,
"step": 22100
},
{
"epoch": 6.448430493273543,
"grad_norm": 0.3396027088165283,
"learning_rate": 0.00040715160349854223,
"loss": 3.4572,
"step": 22150
},
{
"epoch": 6.462989924873333,
"grad_norm": 0.3147648572921753,
"learning_rate": 0.00040671428571428564,
"loss": 3.4541,
"step": 22200
},
{
"epoch": 6.477549356473124,
"grad_norm": 0.31667134165763855,
"learning_rate": 0.00040627696793002916,
"loss": 3.4621,
"step": 22250
},
{
"epoch": 6.492108788072914,
"grad_norm": 0.34657663106918335,
"learning_rate": 0.0004058396501457726,
"loss": 3.4604,
"step": 22300
},
{
"epoch": 6.506668219672704,
"grad_norm": 0.32261285185813904,
"learning_rate": 0.000405402332361516,
"loss": 3.4612,
"step": 22350
},
{
"epoch": 6.521227651272494,
"grad_norm": 0.3422505855560303,
"learning_rate": 0.00040496501457725945,
"loss": 3.4559,
"step": 22400
},
{
"epoch": 6.535787082872285,
"grad_norm": 0.3316167891025543,
"learning_rate": 0.00040452769679300287,
"loss": 3.4496,
"step": 22450
},
{
"epoch": 6.550346514472075,
"grad_norm": 0.3352113962173462,
"learning_rate": 0.00040409037900874633,
"loss": 3.4656,
"step": 22500
},
{
"epoch": 6.564905946071866,
"grad_norm": 0.3302208185195923,
"learning_rate": 0.00040365306122448974,
"loss": 3.444,
"step": 22550
},
{
"epoch": 6.579465377671656,
"grad_norm": 0.3382601737976074,
"learning_rate": 0.0004032157434402332,
"loss": 3.4749,
"step": 22600
},
{
"epoch": 6.594024809271446,
"grad_norm": 0.32733502984046936,
"learning_rate": 0.0004027784256559766,
"loss": 3.4676,
"step": 22650
},
{
"epoch": 6.608584240871236,
"grad_norm": 0.3271522521972656,
"learning_rate": 0.0004023411078717201,
"loss": 3.4625,
"step": 22700
},
{
"epoch": 6.623143672471027,
"grad_norm": 0.35525378584861755,
"learning_rate": 0.00040190379008746356,
"loss": 3.47,
"step": 22750
},
{
"epoch": 6.637703104070817,
"grad_norm": 0.34130969643592834,
"learning_rate": 0.00040146647230320697,
"loss": 3.4531,
"step": 22800
},
{
"epoch": 6.652262535670608,
"grad_norm": 0.3281981647014618,
"learning_rate": 0.0004010291545189504,
"loss": 3.4675,
"step": 22850
},
{
"epoch": 6.666821967270398,
"grad_norm": 0.3403642475605011,
"learning_rate": 0.0004005918367346938,
"loss": 3.4779,
"step": 22900
},
{
"epoch": 6.6813813988701884,
"grad_norm": 0.35402730107307434,
"learning_rate": 0.0004001545189504373,
"loss": 3.4748,
"step": 22950
},
{
"epoch": 6.695940830469978,
"grad_norm": 0.3444899618625641,
"learning_rate": 0.0003997172011661807,
"loss": 3.4726,
"step": 23000
},
{
"epoch": 6.695940830469978,
"eval_accuracy": 0.36462571371896035,
"eval_loss": 3.593517541885376,
"eval_runtime": 180.0181,
"eval_samples_per_second": 92.452,
"eval_steps_per_second": 5.783,
"step": 23000
},
{
"epoch": 6.710500262069769,
"grad_norm": 0.3251613676548004,
"learning_rate": 0.00039927988338192414,
"loss": 3.4531,
"step": 23050
},
{
"epoch": 6.725059693669559,
"grad_norm": 0.3224335312843323,
"learning_rate": 0.0003988425655976676,
"loss": 3.4575,
"step": 23100
},
{
"epoch": 6.73961912526935,
"grad_norm": 0.3301301598548889,
"learning_rate": 0.00039840524781341107,
"loss": 3.4608,
"step": 23150
},
{
"epoch": 6.75417855686914,
"grad_norm": 0.3267367482185364,
"learning_rate": 0.0003979679300291545,
"loss": 3.4722,
"step": 23200
},
{
"epoch": 6.7687379884689305,
"grad_norm": 0.34791019558906555,
"learning_rate": 0.00039753061224489795,
"loss": 3.4737,
"step": 23250
},
{
"epoch": 6.78329742006872,
"grad_norm": 0.3289180099964142,
"learning_rate": 0.00039709329446064136,
"loss": 3.4727,
"step": 23300
},
{
"epoch": 6.797856851668511,
"grad_norm": 0.326933890581131,
"learning_rate": 0.0003966559766763848,
"loss": 3.4499,
"step": 23350
},
{
"epoch": 6.812416283268301,
"grad_norm": 0.3207686245441437,
"learning_rate": 0.0003962186588921283,
"loss": 3.4833,
"step": 23400
},
{
"epoch": 6.826975714868092,
"grad_norm": 0.34312567114830017,
"learning_rate": 0.0003957813411078717,
"loss": 3.4743,
"step": 23450
},
{
"epoch": 6.841535146467882,
"grad_norm": 0.32261767983436584,
"learning_rate": 0.0003953440233236151,
"loss": 3.4717,
"step": 23500
},
{
"epoch": 6.8560945780676725,
"grad_norm": 0.3239823579788208,
"learning_rate": 0.00039490670553935853,
"loss": 3.4745,
"step": 23550
},
{
"epoch": 6.870654009667462,
"grad_norm": 0.3287251889705658,
"learning_rate": 0.00039446938775510195,
"loss": 3.4706,
"step": 23600
},
{
"epoch": 6.885213441267253,
"grad_norm": 0.33744481205940247,
"learning_rate": 0.00039403206997084547,
"loss": 3.4712,
"step": 23650
},
{
"epoch": 6.899772872867043,
"grad_norm": 0.33784157037734985,
"learning_rate": 0.0003935947521865889,
"loss": 3.4744,
"step": 23700
},
{
"epoch": 6.914332304466834,
"grad_norm": 0.33039554953575134,
"learning_rate": 0.0003931574344023323,
"loss": 3.4635,
"step": 23750
},
{
"epoch": 6.928891736066624,
"grad_norm": 0.3252994418144226,
"learning_rate": 0.00039272011661807576,
"loss": 3.4699,
"step": 23800
},
{
"epoch": 6.943451167666415,
"grad_norm": 0.33139488101005554,
"learning_rate": 0.0003922827988338192,
"loss": 3.4763,
"step": 23850
},
{
"epoch": 6.9580105992662045,
"grad_norm": 0.3324434757232666,
"learning_rate": 0.00039184548104956264,
"loss": 3.467,
"step": 23900
},
{
"epoch": 6.972570030865995,
"grad_norm": 0.32796579599380493,
"learning_rate": 0.0003914081632653061,
"loss": 3.4705,
"step": 23950
},
{
"epoch": 6.987129462465785,
"grad_norm": 0.32173773646354675,
"learning_rate": 0.0003909708454810495,
"loss": 3.4734,
"step": 24000
},
{
"epoch": 6.987129462465785,
"eval_accuracy": 0.36566327315904046,
"eval_loss": 3.5826520919799805,
"eval_runtime": 179.8608,
"eval_samples_per_second": 92.533,
"eval_steps_per_second": 5.788,
"step": 24000
},
{
"epoch": 7.001455943159979,
"grad_norm": 0.3372614085674286,
"learning_rate": 0.00039053352769679293,
"loss": 3.4591,
"step": 24050
},
{
"epoch": 7.016015374759769,
"grad_norm": 0.3403565585613251,
"learning_rate": 0.00039009620991253645,
"loss": 3.3491,
"step": 24100
},
{
"epoch": 7.03057480635956,
"grad_norm": 0.34883126616477966,
"learning_rate": 0.00038965889212827986,
"loss": 3.3559,
"step": 24150
},
{
"epoch": 7.04513423795935,
"grad_norm": 0.3384236693382263,
"learning_rate": 0.0003892215743440233,
"loss": 3.3671,
"step": 24200
},
{
"epoch": 7.059693669559141,
"grad_norm": 0.33237436413764954,
"learning_rate": 0.0003887842565597667,
"loss": 3.3692,
"step": 24250
},
{
"epoch": 7.074253101158931,
"grad_norm": 0.35221633315086365,
"learning_rate": 0.0003883469387755102,
"loss": 3.3795,
"step": 24300
},
{
"epoch": 7.0888125327587215,
"grad_norm": 0.33727243542671204,
"learning_rate": 0.0003879096209912536,
"loss": 3.3848,
"step": 24350
},
{
"epoch": 7.103371964358511,
"grad_norm": 0.34708696603775024,
"learning_rate": 0.00038747230320699703,
"loss": 3.3819,
"step": 24400
},
{
"epoch": 7.117931395958302,
"grad_norm": 0.3307049572467804,
"learning_rate": 0.0003870349854227405,
"loss": 3.3914,
"step": 24450
},
{
"epoch": 7.132490827558092,
"grad_norm": 0.3246367871761322,
"learning_rate": 0.0003865976676384839,
"loss": 3.3867,
"step": 24500
},
{
"epoch": 7.147050259157883,
"grad_norm": 0.33385294675827026,
"learning_rate": 0.0003861603498542274,
"loss": 3.3878,
"step": 24550
},
{
"epoch": 7.161609690757673,
"grad_norm": 0.35358157753944397,
"learning_rate": 0.0003857230320699708,
"loss": 3.3848,
"step": 24600
},
{
"epoch": 7.1761691223574635,
"grad_norm": 0.3381134271621704,
"learning_rate": 0.00038528571428571426,
"loss": 3.381,
"step": 24650
},
{
"epoch": 7.190728553957253,
"grad_norm": 0.33539456129074097,
"learning_rate": 0.00038484839650145767,
"loss": 3.391,
"step": 24700
},
{
"epoch": 7.205287985557044,
"grad_norm": 0.3288535475730896,
"learning_rate": 0.00038441107871720114,
"loss": 3.3846,
"step": 24750
},
{
"epoch": 7.219847417156834,
"grad_norm": 0.3503969609737396,
"learning_rate": 0.0003839737609329446,
"loss": 3.3942,
"step": 24800
},
{
"epoch": 7.234406848756625,
"grad_norm": 0.34089216589927673,
"learning_rate": 0.000383536443148688,
"loss": 3.4039,
"step": 24850
},
{
"epoch": 7.248966280356415,
"grad_norm": 0.33822911977767944,
"learning_rate": 0.00038309912536443143,
"loss": 3.3938,
"step": 24900
},
{
"epoch": 7.2635257119562056,
"grad_norm": 0.34553372859954834,
"learning_rate": 0.00038266180758017484,
"loss": 3.4007,
"step": 24950
},
{
"epoch": 7.2780851435559955,
"grad_norm": 0.34171566367149353,
"learning_rate": 0.00038222448979591836,
"loss": 3.4074,
"step": 25000
},
{
"epoch": 7.2780851435559955,
"eval_accuracy": 0.3654761973352454,
"eval_loss": 3.590606212615967,
"eval_runtime": 179.9086,
"eval_samples_per_second": 92.508,
"eval_steps_per_second": 5.786,
"step": 25000
},
{
"epoch": 7.292644575155786,
"grad_norm": 0.33524560928344727,
"learning_rate": 0.00038178717201166177,
"loss": 3.4011,
"step": 25050
},
{
"epoch": 7.307204006755576,
"grad_norm": 0.3223832845687866,
"learning_rate": 0.0003813498542274052,
"loss": 3.4039,
"step": 25100
},
{
"epoch": 7.321763438355367,
"grad_norm": 0.33097726106643677,
"learning_rate": 0.00038091253644314865,
"loss": 3.4045,
"step": 25150
},
{
"epoch": 7.336322869955157,
"grad_norm": 0.3478914201259613,
"learning_rate": 0.0003804752186588921,
"loss": 3.4058,
"step": 25200
},
{
"epoch": 7.350882301554948,
"grad_norm": 0.36241742968559265,
"learning_rate": 0.00038003790087463553,
"loss": 3.4074,
"step": 25250
},
{
"epoch": 7.3654417331547375,
"grad_norm": 0.3258671164512634,
"learning_rate": 0.000379600583090379,
"loss": 3.4111,
"step": 25300
},
{
"epoch": 7.380001164754528,
"grad_norm": 0.34953588247299194,
"learning_rate": 0.0003791632653061224,
"loss": 3.4106,
"step": 25350
},
{
"epoch": 7.394560596354318,
"grad_norm": 0.33248066902160645,
"learning_rate": 0.0003787259475218658,
"loss": 3.4044,
"step": 25400
},
{
"epoch": 7.409120027954109,
"grad_norm": 0.3584959805011749,
"learning_rate": 0.00037828862973760934,
"loss": 3.4181,
"step": 25450
},
{
"epoch": 7.423679459553899,
"grad_norm": 0.3388174772262573,
"learning_rate": 0.00037785131195335276,
"loss": 3.4183,
"step": 25500
},
{
"epoch": 7.43823889115369,
"grad_norm": 0.3261428773403168,
"learning_rate": 0.00037741399416909617,
"loss": 3.421,
"step": 25550
},
{
"epoch": 7.4527983227534795,
"grad_norm": 0.33287757635116577,
"learning_rate": 0.0003769766763848396,
"loss": 3.4161,
"step": 25600
},
{
"epoch": 7.46735775435327,
"grad_norm": 0.3487517237663269,
"learning_rate": 0.0003765393586005831,
"loss": 3.4124,
"step": 25650
},
{
"epoch": 7.48191718595306,
"grad_norm": 0.3360964357852936,
"learning_rate": 0.0003761020408163265,
"loss": 3.4114,
"step": 25700
},
{
"epoch": 7.496476617552851,
"grad_norm": 0.339276522397995,
"learning_rate": 0.0003756647230320699,
"loss": 3.4243,
"step": 25750
},
{
"epoch": 7.511036049152641,
"grad_norm": 0.34367257356643677,
"learning_rate": 0.00037522740524781334,
"loss": 3.422,
"step": 25800
},
{
"epoch": 7.525595480752432,
"grad_norm": 0.33407968282699585,
"learning_rate": 0.0003747900874635568,
"loss": 3.4223,
"step": 25850
},
{
"epoch": 7.540154912352222,
"grad_norm": 0.3418211042881012,
"learning_rate": 0.00037435276967930027,
"loss": 3.4312,
"step": 25900
},
{
"epoch": 7.554714343952012,
"grad_norm": 0.3525235950946808,
"learning_rate": 0.0003739154518950437,
"loss": 3.436,
"step": 25950
},
{
"epoch": 7.569273775551802,
"grad_norm": 0.35437750816345215,
"learning_rate": 0.00037347813411078715,
"loss": 3.4173,
"step": 26000
},
{
"epoch": 7.569273775551802,
"eval_accuracy": 0.36623061498796,
"eval_loss": 3.5847508907318115,
"eval_runtime": 179.8238,
"eval_samples_per_second": 92.552,
"eval_steps_per_second": 5.789,
"step": 26000
},
{
"epoch": 7.583833207151593,
"grad_norm": 0.36680832505226135,
"learning_rate": 0.00037304081632653056,
"loss": 3.4282,
"step": 26050
},
{
"epoch": 7.598392638751383,
"grad_norm": 0.32975292205810547,
"learning_rate": 0.00037260349854227403,
"loss": 3.4187,
"step": 26100
},
{
"epoch": 7.612952070351174,
"grad_norm": 0.33258336782455444,
"learning_rate": 0.0003721661807580175,
"loss": 3.419,
"step": 26150
},
{
"epoch": 7.627511501950964,
"grad_norm": 0.3520626723766327,
"learning_rate": 0.0003717288629737609,
"loss": 3.4317,
"step": 26200
},
{
"epoch": 7.642070933550754,
"grad_norm": 0.34615185856819153,
"learning_rate": 0.0003712915451895043,
"loss": 3.4397,
"step": 26250
},
{
"epoch": 7.656630365150544,
"grad_norm": 0.3472108542919159,
"learning_rate": 0.00037085422740524773,
"loss": 3.4183,
"step": 26300
},
{
"epoch": 7.671189796750335,
"grad_norm": 0.3401790261268616,
"learning_rate": 0.00037041690962099125,
"loss": 3.4274,
"step": 26350
},
{
"epoch": 7.685749228350125,
"grad_norm": 0.34616005420684814,
"learning_rate": 0.00036997959183673467,
"loss": 3.432,
"step": 26400
},
{
"epoch": 7.700308659949916,
"grad_norm": 0.35238298773765564,
"learning_rate": 0.0003695422740524781,
"loss": 3.4308,
"step": 26450
},
{
"epoch": 7.714868091549706,
"grad_norm": 0.3595859408378601,
"learning_rate": 0.00036910495626822154,
"loss": 3.4368,
"step": 26500
},
{
"epoch": 7.729427523149496,
"grad_norm": 0.3455177843570709,
"learning_rate": 0.000368667638483965,
"loss": 3.4389,
"step": 26550
},
{
"epoch": 7.743986954749286,
"grad_norm": 0.3548458516597748,
"learning_rate": 0.0003682303206997084,
"loss": 3.427,
"step": 26600
},
{
"epoch": 7.758546386349077,
"grad_norm": 0.34287944436073303,
"learning_rate": 0.00036779300291545184,
"loss": 3.4268,
"step": 26650
},
{
"epoch": 7.773105817948867,
"grad_norm": 0.3392084836959839,
"learning_rate": 0.0003673556851311953,
"loss": 3.4348,
"step": 26700
},
{
"epoch": 7.787665249548658,
"grad_norm": 0.3404330015182495,
"learning_rate": 0.0003669183673469387,
"loss": 3.4162,
"step": 26750
},
{
"epoch": 7.802224681148448,
"grad_norm": 0.34505772590637207,
"learning_rate": 0.0003664810495626822,
"loss": 3.4267,
"step": 26800
},
{
"epoch": 7.8167841127482385,
"grad_norm": 0.34470146894454956,
"learning_rate": 0.00036604373177842565,
"loss": 3.444,
"step": 26850
},
{
"epoch": 7.831343544348028,
"grad_norm": 0.33952096104621887,
"learning_rate": 0.00036560641399416906,
"loss": 3.4317,
"step": 26900
},
{
"epoch": 7.845902975947819,
"grad_norm": 0.35519036650657654,
"learning_rate": 0.0003651690962099125,
"loss": 3.4335,
"step": 26950
},
{
"epoch": 7.860462407547609,
"grad_norm": 0.3539314568042755,
"learning_rate": 0.000364731778425656,
"loss": 3.4318,
"step": 27000
},
{
"epoch": 7.860462407547609,
"eval_accuracy": 0.36703947393949116,
"eval_loss": 3.572652578353882,
"eval_runtime": 179.9477,
"eval_samples_per_second": 92.488,
"eval_steps_per_second": 5.785,
"step": 27000
},
{
"epoch": 7.8750218391474,
"grad_norm": 0.3434327244758606,
"learning_rate": 0.0003642944606413994,
"loss": 3.4372,
"step": 27050
},
{
"epoch": 7.88958127074719,
"grad_norm": 0.3285406231880188,
"learning_rate": 0.0003638571428571428,
"loss": 3.4238,
"step": 27100
},
{
"epoch": 7.9041407023469805,
"grad_norm": 0.3453764021396637,
"learning_rate": 0.00036341982507288623,
"loss": 3.4369,
"step": 27150
},
{
"epoch": 7.91870013394677,
"grad_norm": 0.32807591557502747,
"learning_rate": 0.0003629825072886297,
"loss": 3.4333,
"step": 27200
},
{
"epoch": 7.933259565546561,
"grad_norm": 0.33627721667289734,
"learning_rate": 0.00036254518950437316,
"loss": 3.4378,
"step": 27250
},
{
"epoch": 7.947818997146351,
"grad_norm": 0.3372686207294464,
"learning_rate": 0.0003621078717201166,
"loss": 3.4311,
"step": 27300
},
{
"epoch": 7.962378428746142,
"grad_norm": 0.3440007269382477,
"learning_rate": 0.00036167055393586004,
"loss": 3.4387,
"step": 27350
},
{
"epoch": 7.976937860345932,
"grad_norm": 0.3503531813621521,
"learning_rate": 0.00036123323615160346,
"loss": 3.435,
"step": 27400
},
{
"epoch": 7.991497291945723,
"grad_norm": 0.32801827788352966,
"learning_rate": 0.0003607959183673469,
"loss": 3.4472,
"step": 27450
},
{
"epoch": 8.005823772639916,
"grad_norm": 0.35048505663871765,
"learning_rate": 0.0003603586005830904,
"loss": 3.3812,
"step": 27500
},
{
"epoch": 8.020383204239707,
"grad_norm": 0.3340410888195038,
"learning_rate": 0.0003599212827988338,
"loss": 3.3375,
"step": 27550
},
{
"epoch": 8.034942635839498,
"grad_norm": 0.34667739272117615,
"learning_rate": 0.0003594839650145772,
"loss": 3.3225,
"step": 27600
},
{
"epoch": 8.049502067439287,
"grad_norm": 0.3594329059123993,
"learning_rate": 0.0003590466472303206,
"loss": 3.335,
"step": 27650
},
{
"epoch": 8.064061499039077,
"grad_norm": 0.3574943244457245,
"learning_rate": 0.00035860932944606415,
"loss": 3.3357,
"step": 27700
},
{
"epoch": 8.078620930638868,
"grad_norm": 0.3481893539428711,
"learning_rate": 0.00035817201166180756,
"loss": 3.3318,
"step": 27750
},
{
"epoch": 8.093180362238659,
"grad_norm": 0.3395717144012451,
"learning_rate": 0.00035773469387755097,
"loss": 3.3532,
"step": 27800
},
{
"epoch": 8.107739793838448,
"grad_norm": 0.33678963780403137,
"learning_rate": 0.0003572973760932944,
"loss": 3.3457,
"step": 27850
},
{
"epoch": 8.122299225438239,
"grad_norm": 0.3517054617404938,
"learning_rate": 0.0003568600583090379,
"loss": 3.3469,
"step": 27900
},
{
"epoch": 8.13685865703803,
"grad_norm": 0.3345564603805542,
"learning_rate": 0.0003564227405247813,
"loss": 3.3386,
"step": 27950
},
{
"epoch": 8.15141808863782,
"grad_norm": 0.3494488596916199,
"learning_rate": 0.00035598542274052473,
"loss": 3.3639,
"step": 28000
},
{
"epoch": 8.15141808863782,
"eval_accuracy": 0.3670094900708125,
"eval_loss": 3.580695867538452,
"eval_runtime": 179.627,
"eval_samples_per_second": 92.653,
"eval_steps_per_second": 5.795,
"step": 28000
},
{
"epoch": 8.16597752023761,
"grad_norm": 0.33706334233283997,
"learning_rate": 0.0003555481049562682,
"loss": 3.369,
"step": 28050
},
{
"epoch": 8.1805369518374,
"grad_norm": 0.34689971804618835,
"learning_rate": 0.0003551107871720116,
"loss": 3.3533,
"step": 28100
},
{
"epoch": 8.19509638343719,
"grad_norm": 0.34161534905433655,
"learning_rate": 0.0003546734693877551,
"loss": 3.375,
"step": 28150
},
{
"epoch": 8.209655815036982,
"grad_norm": 0.36119794845581055,
"learning_rate": 0.00035423615160349854,
"loss": 3.3611,
"step": 28200
},
{
"epoch": 8.22421524663677,
"grad_norm": 0.3473355174064636,
"learning_rate": 0.00035379883381924195,
"loss": 3.3731,
"step": 28250
},
{
"epoch": 8.238774678236561,
"grad_norm": 0.33798742294311523,
"learning_rate": 0.00035336151603498537,
"loss": 3.3692,
"step": 28300
},
{
"epoch": 8.253334109836352,
"grad_norm": 0.3432019352912903,
"learning_rate": 0.0003529241982507289,
"loss": 3.3687,
"step": 28350
},
{
"epoch": 8.267893541436143,
"grad_norm": 0.35700732469558716,
"learning_rate": 0.0003524868804664723,
"loss": 3.3723,
"step": 28400
},
{
"epoch": 8.282452973035932,
"grad_norm": 0.348431795835495,
"learning_rate": 0.0003520495626822157,
"loss": 3.3593,
"step": 28450
},
{
"epoch": 8.297012404635723,
"grad_norm": 0.34419500827789307,
"learning_rate": 0.0003516122448979591,
"loss": 3.3778,
"step": 28500
},
{
"epoch": 8.311571836235514,
"grad_norm": 0.34864479303359985,
"learning_rate": 0.0003511749271137026,
"loss": 3.3809,
"step": 28550
},
{
"epoch": 8.326131267835304,
"grad_norm": 0.35667717456817627,
"learning_rate": 0.00035073760932944606,
"loss": 3.363,
"step": 28600
},
{
"epoch": 8.340690699435093,
"grad_norm": 0.3501654863357544,
"learning_rate": 0.00035030029154518947,
"loss": 3.378,
"step": 28650
},
{
"epoch": 8.355250131034884,
"grad_norm": 0.3490404486656189,
"learning_rate": 0.0003498629737609329,
"loss": 3.3855,
"step": 28700
},
{
"epoch": 8.369809562634675,
"grad_norm": 0.358019083738327,
"learning_rate": 0.00034942565597667635,
"loss": 3.3784,
"step": 28750
},
{
"epoch": 8.384368994234466,
"grad_norm": 0.33226025104522705,
"learning_rate": 0.0003489883381924198,
"loss": 3.3714,
"step": 28800
},
{
"epoch": 8.398928425834255,
"grad_norm": 0.3402322828769684,
"learning_rate": 0.00034855102040816323,
"loss": 3.3785,
"step": 28850
},
{
"epoch": 8.413487857434045,
"grad_norm": 0.36141642928123474,
"learning_rate": 0.0003481137026239067,
"loss": 3.3745,
"step": 28900
},
{
"epoch": 8.428047289033836,
"grad_norm": 0.36371850967407227,
"learning_rate": 0.0003476763848396501,
"loss": 3.3874,
"step": 28950
},
{
"epoch": 8.442606720633627,
"grad_norm": 0.3497146666049957,
"learning_rate": 0.0003472390670553935,
"loss": 3.3846,
"step": 29000
},
{
"epoch": 8.442606720633627,
"eval_accuracy": 0.3678150566759789,
"eval_loss": 3.572382688522339,
"eval_runtime": 179.6973,
"eval_samples_per_second": 92.617,
"eval_steps_per_second": 5.793,
"step": 29000
},
{
"epoch": 8.457166152233416,
"grad_norm": 0.3523021936416626,
"learning_rate": 0.00034680174927113704,
"loss": 3.3833,
"step": 29050
},
{
"epoch": 8.471725583833207,
"grad_norm": 0.3318672180175781,
"learning_rate": 0.00034636443148688045,
"loss": 3.3856,
"step": 29100
},
{
"epoch": 8.486285015432998,
"grad_norm": 0.34436580538749695,
"learning_rate": 0.00034592711370262386,
"loss": 3.392,
"step": 29150
},
{
"epoch": 8.500844447032788,
"grad_norm": 0.3374488651752472,
"learning_rate": 0.0003454897959183673,
"loss": 3.3784,
"step": 29200
},
{
"epoch": 8.515403878632577,
"grad_norm": 0.3651833236217499,
"learning_rate": 0.0003450524781341108,
"loss": 3.3695,
"step": 29250
},
{
"epoch": 8.529963310232368,
"grad_norm": 0.33650752902030945,
"learning_rate": 0.0003446151603498542,
"loss": 3.3818,
"step": 29300
},
{
"epoch": 8.544522741832159,
"grad_norm": 0.3391404449939728,
"learning_rate": 0.0003441778425655976,
"loss": 3.3997,
"step": 29350
},
{
"epoch": 8.55908217343195,
"grad_norm": 0.3535376787185669,
"learning_rate": 0.0003437405247813411,
"loss": 3.3867,
"step": 29400
},
{
"epoch": 8.573641605031739,
"grad_norm": 0.3420208692550659,
"learning_rate": 0.0003433032069970845,
"loss": 3.3834,
"step": 29450
},
{
"epoch": 8.58820103663153,
"grad_norm": 0.3331069052219391,
"learning_rate": 0.00034286588921282797,
"loss": 3.3903,
"step": 29500
},
{
"epoch": 8.60276046823132,
"grad_norm": 0.3587231934070587,
"learning_rate": 0.00034242857142857143,
"loss": 3.3888,
"step": 29550
},
{
"epoch": 8.617319899831111,
"grad_norm": 0.35839417576789856,
"learning_rate": 0.00034199125364431485,
"loss": 3.404,
"step": 29600
},
{
"epoch": 8.6318793314309,
"grad_norm": 0.3896600902080536,
"learning_rate": 0.00034155393586005826,
"loss": 3.3906,
"step": 29650
},
{
"epoch": 8.646438763030691,
"grad_norm": 0.35471850633621216,
"learning_rate": 0.0003411166180758017,
"loss": 3.3922,
"step": 29700
},
{
"epoch": 8.660998194630482,
"grad_norm": 0.3513423800468445,
"learning_rate": 0.0003406793002915452,
"loss": 3.3873,
"step": 29750
},
{
"epoch": 8.675557626230272,
"grad_norm": 0.34752732515335083,
"learning_rate": 0.0003402419825072886,
"loss": 3.3855,
"step": 29800
},
{
"epoch": 8.690117057830061,
"grad_norm": 0.32745492458343506,
"learning_rate": 0.000339804664723032,
"loss": 3.3934,
"step": 29850
},
{
"epoch": 8.704676489429852,
"grad_norm": 0.3485073745250702,
"learning_rate": 0.00033936734693877543,
"loss": 3.3878,
"step": 29900
},
{
"epoch": 8.719235921029643,
"grad_norm": 0.3374342620372772,
"learning_rate": 0.00033893002915451895,
"loss": 3.388,
"step": 29950
},
{
"epoch": 8.733795352629434,
"grad_norm": 0.3508179187774658,
"learning_rate": 0.00033849271137026236,
"loss": 3.3893,
"step": 30000
},
{
"epoch": 8.733795352629434,
"eval_accuracy": 0.3683345418988114,
"eval_loss": 3.5642998218536377,
"eval_runtime": 179.3326,
"eval_samples_per_second": 92.805,
"eval_steps_per_second": 5.805,
"step": 30000
},
{
"epoch": 8.748354784229225,
"grad_norm": 0.3522128760814667,
"learning_rate": 0.0003380553935860058,
"loss": 3.4013,
"step": 30050
},
{
"epoch": 8.762914215829014,
"grad_norm": 0.3406279385089874,
"learning_rate": 0.00033761807580174924,
"loss": 3.4014,
"step": 30100
},
{
"epoch": 8.777473647428804,
"grad_norm": 0.33040550351142883,
"learning_rate": 0.0003371807580174927,
"loss": 3.392,
"step": 30150
},
{
"epoch": 8.792033079028595,
"grad_norm": 0.35470637679100037,
"learning_rate": 0.0003367434402332361,
"loss": 3.3986,
"step": 30200
},
{
"epoch": 8.806592510628384,
"grad_norm": 0.35664665699005127,
"learning_rate": 0.0003363061224489796,
"loss": 3.4054,
"step": 30250
},
{
"epoch": 8.821151942228175,
"grad_norm": 0.35443365573883057,
"learning_rate": 0.000335868804664723,
"loss": 3.3916,
"step": 30300
},
{
"epoch": 8.835711373827966,
"grad_norm": 0.3552112579345703,
"learning_rate": 0.0003354314868804664,
"loss": 3.4106,
"step": 30350
},
{
"epoch": 8.850270805427757,
"grad_norm": 0.3517363667488098,
"learning_rate": 0.00033499416909620993,
"loss": 3.3959,
"step": 30400
},
{
"epoch": 8.864830237027547,
"grad_norm": 0.3412357568740845,
"learning_rate": 0.00033455685131195335,
"loss": 3.3977,
"step": 30450
},
{
"epoch": 8.879389668627336,
"grad_norm": 0.3659086227416992,
"learning_rate": 0.00033411953352769676,
"loss": 3.4041,
"step": 30500
},
{
"epoch": 8.893949100227127,
"grad_norm": 0.3394777476787567,
"learning_rate": 0.00033368221574344017,
"loss": 3.3925,
"step": 30550
},
{
"epoch": 8.908508531826918,
"grad_norm": 0.3358438014984131,
"learning_rate": 0.0003332448979591837,
"loss": 3.3954,
"step": 30600
},
{
"epoch": 8.923067963426707,
"grad_norm": 0.3618221879005432,
"learning_rate": 0.0003328075801749271,
"loss": 3.3963,
"step": 30650
},
{
"epoch": 8.937627395026498,
"grad_norm": 0.35156282782554626,
"learning_rate": 0.0003323702623906705,
"loss": 3.3973,
"step": 30700
},
{
"epoch": 8.952186826626289,
"grad_norm": 0.3404799997806549,
"learning_rate": 0.00033193294460641393,
"loss": 3.4064,
"step": 30750
},
{
"epoch": 8.96674625822608,
"grad_norm": 0.3573434352874756,
"learning_rate": 0.0003314956268221574,
"loss": 3.3962,
"step": 30800
},
{
"epoch": 8.98130568982587,
"grad_norm": 0.3326402008533478,
"learning_rate": 0.00033105830903790086,
"loss": 3.3904,
"step": 30850
},
{
"epoch": 8.995865121425659,
"grad_norm": 0.3333438038825989,
"learning_rate": 0.0003306209912536443,
"loss": 3.4072,
"step": 30900
},
{
"epoch": 9.010191602119853,
"grad_norm": 0.35445913672447205,
"learning_rate": 0.00033018367346938774,
"loss": 3.3354,
"step": 30950
},
{
"epoch": 9.024751033719644,
"grad_norm": 0.3602832555770874,
"learning_rate": 0.00032974635568513115,
"loss": 3.2997,
"step": 31000
},
{
"epoch": 9.024751033719644,
"eval_accuracy": 0.3683871018568481,
"eval_loss": 3.56929087638855,
"eval_runtime": 180.7113,
"eval_samples_per_second": 92.097,
"eval_steps_per_second": 5.761,
"step": 31000
},
{
"epoch": 9.039310465319433,
"grad_norm": 0.33829203248023987,
"learning_rate": 0.0003293090379008746,
"loss": 3.2994,
"step": 31050
},
{
"epoch": 9.053869896919224,
"grad_norm": 0.36634117364883423,
"learning_rate": 0.0003288717201166181,
"loss": 3.3039,
"step": 31100
},
{
"epoch": 9.068429328519015,
"grad_norm": 0.34743866324424744,
"learning_rate": 0.0003284344023323615,
"loss": 3.3133,
"step": 31150
},
{
"epoch": 9.082988760118806,
"grad_norm": 0.3573026657104492,
"learning_rate": 0.0003279970845481049,
"loss": 3.3067,
"step": 31200
},
{
"epoch": 9.097548191718595,
"grad_norm": 0.3499259650707245,
"learning_rate": 0.0003275597667638483,
"loss": 3.311,
"step": 31250
},
{
"epoch": 9.112107623318385,
"grad_norm": 0.3550528287887573,
"learning_rate": 0.00032712244897959184,
"loss": 3.3146,
"step": 31300
},
{
"epoch": 9.126667054918176,
"grad_norm": 0.3766951262950897,
"learning_rate": 0.00032668513119533526,
"loss": 3.3203,
"step": 31350
},
{
"epoch": 9.141226486517967,
"grad_norm": 0.3506350517272949,
"learning_rate": 0.00032624781341107867,
"loss": 3.3383,
"step": 31400
},
{
"epoch": 9.155785918117756,
"grad_norm": 0.36587440967559814,
"learning_rate": 0.00032581049562682213,
"loss": 3.3249,
"step": 31450
},
{
"epoch": 9.170345349717547,
"grad_norm": 0.3548264503479004,
"learning_rate": 0.0003253731778425656,
"loss": 3.3173,
"step": 31500
},
{
"epoch": 9.184904781317337,
"grad_norm": 0.3574599325656891,
"learning_rate": 0.000324935860058309,
"loss": 3.3277,
"step": 31550
},
{
"epoch": 9.199464212917128,
"grad_norm": 0.3559187948703766,
"learning_rate": 0.0003244985422740524,
"loss": 3.3302,
"step": 31600
},
{
"epoch": 9.214023644516917,
"grad_norm": 0.3626471757888794,
"learning_rate": 0.0003240612244897959,
"loss": 3.3249,
"step": 31650
},
{
"epoch": 9.228583076116708,
"grad_norm": 0.34642550349235535,
"learning_rate": 0.0003236239067055393,
"loss": 3.3288,
"step": 31700
},
{
"epoch": 9.243142507716499,
"grad_norm": 0.3562052249908447,
"learning_rate": 0.00032318658892128277,
"loss": 3.3381,
"step": 31750
},
{
"epoch": 9.25770193931629,
"grad_norm": 0.35299643874168396,
"learning_rate": 0.00032274927113702624,
"loss": 3.3398,
"step": 31800
},
{
"epoch": 9.272261370916079,
"grad_norm": 0.3579034209251404,
"learning_rate": 0.00032231195335276965,
"loss": 3.3376,
"step": 31850
},
{
"epoch": 9.28682080251587,
"grad_norm": 0.3582768738269806,
"learning_rate": 0.00032187463556851306,
"loss": 3.3417,
"step": 31900
},
{
"epoch": 9.30138023411566,
"grad_norm": 0.3462630808353424,
"learning_rate": 0.0003214373177842565,
"loss": 3.3388,
"step": 31950
},
{
"epoch": 9.315939665715451,
"grad_norm": 0.35994312167167664,
"learning_rate": 0.000321,
"loss": 3.3417,
"step": 32000
},
{
"epoch": 9.315939665715451,
"eval_accuracy": 0.3688413280713799,
"eval_loss": 3.5689337253570557,
"eval_runtime": 181.1092,
"eval_samples_per_second": 91.895,
"eval_steps_per_second": 5.748,
"step": 32000
},
{
"epoch": 9.33049909731524,
"grad_norm": 0.34968388080596924,
"learning_rate": 0.0003205626822157434,
"loss": 3.3376,
"step": 32050
},
{
"epoch": 9.34505852891503,
"grad_norm": 0.35291755199432373,
"learning_rate": 0.0003201253644314868,
"loss": 3.3412,
"step": 32100
},
{
"epoch": 9.359617960514822,
"grad_norm": 0.3643549978733063,
"learning_rate": 0.0003196880466472303,
"loss": 3.3411,
"step": 32150
},
{
"epoch": 9.374177392114612,
"grad_norm": 0.3537770211696625,
"learning_rate": 0.00031925072886297375,
"loss": 3.3642,
"step": 32200
},
{
"epoch": 9.388736823714403,
"grad_norm": 0.3553234338760376,
"learning_rate": 0.00031881341107871717,
"loss": 3.353,
"step": 32250
},
{
"epoch": 9.403296255314192,
"grad_norm": 0.35173216462135315,
"learning_rate": 0.00031837609329446063,
"loss": 3.3433,
"step": 32300
},
{
"epoch": 9.417855686913983,
"grad_norm": 0.3561984598636627,
"learning_rate": 0.00031793877551020405,
"loss": 3.3459,
"step": 32350
},
{
"epoch": 9.432415118513774,
"grad_norm": 0.3734908699989319,
"learning_rate": 0.00031750145772594746,
"loss": 3.3495,
"step": 32400
},
{
"epoch": 9.446974550113563,
"grad_norm": 0.3848966658115387,
"learning_rate": 0.000317064139941691,
"loss": 3.3486,
"step": 32450
},
{
"epoch": 9.461533981713353,
"grad_norm": 0.36939284205436707,
"learning_rate": 0.0003166268221574344,
"loss": 3.3527,
"step": 32500
},
{
"epoch": 9.476093413313144,
"grad_norm": 0.3429546654224396,
"learning_rate": 0.0003161895043731778,
"loss": 3.3402,
"step": 32550
},
{
"epoch": 9.490652844912935,
"grad_norm": 0.34233972430229187,
"learning_rate": 0.0003157521865889212,
"loss": 3.3508,
"step": 32600
},
{
"epoch": 9.505212276512726,
"grad_norm": 0.3572950065135956,
"learning_rate": 0.00031531486880466474,
"loss": 3.3599,
"step": 32650
},
{
"epoch": 9.519771708112515,
"grad_norm": 0.34846094250679016,
"learning_rate": 0.00031487755102040815,
"loss": 3.3585,
"step": 32700
},
{
"epoch": 9.534331139712306,
"grad_norm": 0.3666765093803406,
"learning_rate": 0.00031444023323615156,
"loss": 3.3535,
"step": 32750
},
{
"epoch": 9.548890571312096,
"grad_norm": 0.3483474850654602,
"learning_rate": 0.000314002915451895,
"loss": 3.3492,
"step": 32800
},
{
"epoch": 9.563450002911885,
"grad_norm": 0.3478499948978424,
"learning_rate": 0.00031356559766763844,
"loss": 3.3664,
"step": 32850
},
{
"epoch": 9.578009434511676,
"grad_norm": 0.3615437150001526,
"learning_rate": 0.0003131282798833819,
"loss": 3.3739,
"step": 32900
},
{
"epoch": 9.592568866111467,
"grad_norm": 0.35250964760780334,
"learning_rate": 0.0003126909620991253,
"loss": 3.3745,
"step": 32950
},
{
"epoch": 9.607128297711258,
"grad_norm": 0.35164180397987366,
"learning_rate": 0.0003122536443148688,
"loss": 3.3562,
"step": 33000
},
{
"epoch": 9.607128297711258,
"eval_accuracy": 0.3692384085597243,
"eval_loss": 3.5602471828460693,
"eval_runtime": 180.7731,
"eval_samples_per_second": 92.066,
"eval_steps_per_second": 5.759,
"step": 33000
},
{
"epoch": 9.621687729311049,
"grad_norm": 0.37672215700149536,
"learning_rate": 0.0003118163265306122,
"loss": 3.3735,
"step": 33050
},
{
"epoch": 9.636247160910838,
"grad_norm": 0.36633849143981934,
"learning_rate": 0.00031137900874635566,
"loss": 3.3498,
"step": 33100
},
{
"epoch": 9.650806592510628,
"grad_norm": 0.3514011800289154,
"learning_rate": 0.00031094169096209913,
"loss": 3.3604,
"step": 33150
},
{
"epoch": 9.66536602411042,
"grad_norm": 0.35586225986480713,
"learning_rate": 0.00031050437317784254,
"loss": 3.3574,
"step": 33200
},
{
"epoch": 9.67992545571021,
"grad_norm": 0.33317190408706665,
"learning_rate": 0.00031006705539358596,
"loss": 3.3546,
"step": 33250
},
{
"epoch": 9.694484887309999,
"grad_norm": 0.35271352529525757,
"learning_rate": 0.00030962973760932937,
"loss": 3.3632,
"step": 33300
},
{
"epoch": 9.70904431890979,
"grad_norm": 0.3521358370780945,
"learning_rate": 0.0003091924198250729,
"loss": 3.3584,
"step": 33350
},
{
"epoch": 9.72360375050958,
"grad_norm": 0.3574683666229248,
"learning_rate": 0.0003087551020408163,
"loss": 3.3581,
"step": 33400
},
{
"epoch": 9.738163182109371,
"grad_norm": 0.3643791377544403,
"learning_rate": 0.0003083177842565597,
"loss": 3.3691,
"step": 33450
},
{
"epoch": 9.75272261370916,
"grad_norm": 0.35385361313819885,
"learning_rate": 0.0003078804664723032,
"loss": 3.3547,
"step": 33500
},
{
"epoch": 9.767282045308951,
"grad_norm": 0.35955286026000977,
"learning_rate": 0.00030744314868804665,
"loss": 3.3496,
"step": 33550
},
{
"epoch": 9.781841476908742,
"grad_norm": 0.3493342697620392,
"learning_rate": 0.00030700583090379006,
"loss": 3.3629,
"step": 33600
},
{
"epoch": 9.796400908508533,
"grad_norm": 0.3883078396320343,
"learning_rate": 0.00030656851311953347,
"loss": 3.3643,
"step": 33650
},
{
"epoch": 9.810960340108322,
"grad_norm": 0.34926533699035645,
"learning_rate": 0.00030613119533527694,
"loss": 3.3662,
"step": 33700
},
{
"epoch": 9.825519771708112,
"grad_norm": 0.37770354747772217,
"learning_rate": 0.00030569387755102035,
"loss": 3.3813,
"step": 33750
},
{
"epoch": 9.840079203307903,
"grad_norm": 0.3666662275791168,
"learning_rate": 0.0003052565597667638,
"loss": 3.3669,
"step": 33800
},
{
"epoch": 9.854638634907694,
"grad_norm": 0.3690825402736664,
"learning_rate": 0.0003048192419825073,
"loss": 3.3718,
"step": 33850
},
{
"epoch": 9.869198066507483,
"grad_norm": 0.3668816387653351,
"learning_rate": 0.0003043819241982507,
"loss": 3.3572,
"step": 33900
},
{
"epoch": 9.883757498107274,
"grad_norm": 0.35026848316192627,
"learning_rate": 0.0003039446064139941,
"loss": 3.3714,
"step": 33950
},
{
"epoch": 9.898316929707065,
"grad_norm": 0.36591610312461853,
"learning_rate": 0.00030350728862973763,
"loss": 3.3759,
"step": 34000
},
{
"epoch": 9.898316929707065,
"eval_accuracy": 0.36981339333556196,
"eval_loss": 3.553095817565918,
"eval_runtime": 180.8581,
"eval_samples_per_second": 92.022,
"eval_steps_per_second": 5.756,
"step": 34000
},
{
"epoch": 9.912876361306855,
"grad_norm": 0.3765810430049896,
"learning_rate": 0.00030306997084548104,
"loss": 3.363,
"step": 34050
},
{
"epoch": 9.927435792906644,
"grad_norm": 0.3594549000263214,
"learning_rate": 0.00030263265306122445,
"loss": 3.3671,
"step": 34100
},
{
"epoch": 9.941995224506435,
"grad_norm": 0.35946381092071533,
"learning_rate": 0.00030219533527696787,
"loss": 3.3735,
"step": 34150
},
{
"epoch": 9.956554656106226,
"grad_norm": 0.37179645895957947,
"learning_rate": 0.00030175801749271133,
"loss": 3.3874,
"step": 34200
},
{
"epoch": 9.971114087706017,
"grad_norm": 0.36117124557495117,
"learning_rate": 0.0003013206997084548,
"loss": 3.3806,
"step": 34250
},
{
"epoch": 9.985673519305806,
"grad_norm": 0.34759020805358887,
"learning_rate": 0.0003008833819241982,
"loss": 3.3681,
"step": 34300
},
{
"epoch": 10.0,
"grad_norm": Infinity,
"learning_rate": 0.0003004460641399417,
"loss": 3.3643,
"step": 34350
},
{
"epoch": 10.01455943159979,
"grad_norm": 0.35527414083480835,
"learning_rate": 0.0003000087463556851,
"loss": 3.2614,
"step": 34400
},
{
"epoch": 10.029118863199582,
"grad_norm": 0.3797459304332733,
"learning_rate": 0.00029957142857142856,
"loss": 3.269,
"step": 34450
},
{
"epoch": 10.04367829479937,
"grad_norm": 0.36752596497535706,
"learning_rate": 0.000299134110787172,
"loss": 3.2838,
"step": 34500
},
{
"epoch": 10.058237726399161,
"grad_norm": 0.34516459703445435,
"learning_rate": 0.00029869679300291544,
"loss": 3.272,
"step": 34550
},
{
"epoch": 10.072797157998952,
"grad_norm": 0.3728445768356323,
"learning_rate": 0.00029825947521865885,
"loss": 3.2696,
"step": 34600
},
{
"epoch": 10.087356589598743,
"grad_norm": 0.3747389018535614,
"learning_rate": 0.0002978221574344023,
"loss": 3.2888,
"step": 34650
},
{
"epoch": 10.101916021198532,
"grad_norm": 0.34447789192199707,
"learning_rate": 0.00029738483965014573,
"loss": 3.2916,
"step": 34700
},
{
"epoch": 10.116475452798323,
"grad_norm": 0.35870856046676636,
"learning_rate": 0.0002969475218658892,
"loss": 3.2913,
"step": 34750
},
{
"epoch": 10.131034884398114,
"grad_norm": 0.35672426223754883,
"learning_rate": 0.0002965102040816326,
"loss": 3.2963,
"step": 34800
},
{
"epoch": 10.145594315997904,
"grad_norm": 0.36722877621650696,
"learning_rate": 0.0002960728862973761,
"loss": 3.2886,
"step": 34850
},
{
"epoch": 10.160153747597693,
"grad_norm": 0.3597167432308197,
"learning_rate": 0.0002956355685131195,
"loss": 3.3118,
"step": 34900
},
{
"epoch": 10.174713179197484,
"grad_norm": 0.3561251759529114,
"learning_rate": 0.00029519825072886295,
"loss": 3.2997,
"step": 34950
},
{
"epoch": 10.189272610797275,
"grad_norm": 0.37824273109436035,
"learning_rate": 0.00029476093294460637,
"loss": 3.3017,
"step": 35000
},
{
"epoch": 10.189272610797275,
"eval_accuracy": 0.3693485845791435,
"eval_loss": 3.5645644664764404,
"eval_runtime": 180.1256,
"eval_samples_per_second": 92.397,
"eval_steps_per_second": 5.779,
"step": 35000
},
{
"epoch": 10.203832042397066,
"grad_norm": 0.37217044830322266,
"learning_rate": 0.00029432361516034983,
"loss": 3.3035,
"step": 35050
},
{
"epoch": 10.218391473996855,
"grad_norm": 0.3471571207046509,
"learning_rate": 0.0002938862973760933,
"loss": 3.3116,
"step": 35100
},
{
"epoch": 10.232950905596645,
"grad_norm": 0.3539142310619354,
"learning_rate": 0.0002934489795918367,
"loss": 3.296,
"step": 35150
},
{
"epoch": 10.247510337196436,
"grad_norm": 0.36773473024368286,
"learning_rate": 0.0002930116618075802,
"loss": 3.3028,
"step": 35200
},
{
"epoch": 10.262069768796227,
"grad_norm": 0.3689476549625397,
"learning_rate": 0.0002925743440233236,
"loss": 3.3111,
"step": 35250
},
{
"epoch": 10.276629200396016,
"grad_norm": 0.3640798032283783,
"learning_rate": 0.00029213702623906706,
"loss": 3.3157,
"step": 35300
},
{
"epoch": 10.291188631995807,
"grad_norm": 0.3602818250656128,
"learning_rate": 0.00029169970845481047,
"loss": 3.3138,
"step": 35350
},
{
"epoch": 10.305748063595598,
"grad_norm": 0.38390350341796875,
"learning_rate": 0.00029126239067055394,
"loss": 3.3077,
"step": 35400
},
{
"epoch": 10.320307495195388,
"grad_norm": 0.36689597368240356,
"learning_rate": 0.00029082507288629735,
"loss": 3.309,
"step": 35450
},
{
"epoch": 10.334866926795177,
"grad_norm": 0.3611031770706177,
"learning_rate": 0.00029038775510204076,
"loss": 3.3119,
"step": 35500
},
{
"epoch": 10.349426358394968,
"grad_norm": 0.36774659156799316,
"learning_rate": 0.0002899504373177842,
"loss": 3.3189,
"step": 35550
},
{
"epoch": 10.363985789994759,
"grad_norm": 0.36395514011383057,
"learning_rate": 0.00028951311953352764,
"loss": 3.3109,
"step": 35600
},
{
"epoch": 10.37854522159455,
"grad_norm": 0.362166166305542,
"learning_rate": 0.0002890758017492711,
"loss": 3.3192,
"step": 35650
},
{
"epoch": 10.393104653194339,
"grad_norm": 0.3618522882461548,
"learning_rate": 0.0002886384839650145,
"loss": 3.3183,
"step": 35700
},
{
"epoch": 10.40766408479413,
"grad_norm": 0.3681625723838806,
"learning_rate": 0.000288201166180758,
"loss": 3.318,
"step": 35750
},
{
"epoch": 10.42222351639392,
"grad_norm": 0.3899301588535309,
"learning_rate": 0.00028776384839650145,
"loss": 3.3182,
"step": 35800
},
{
"epoch": 10.436782947993711,
"grad_norm": 0.35318905115127563,
"learning_rate": 0.00028732653061224486,
"loss": 3.3284,
"step": 35850
},
{
"epoch": 10.4513423795935,
"grad_norm": 0.38061952590942383,
"learning_rate": 0.00028688921282798833,
"loss": 3.3173,
"step": 35900
},
{
"epoch": 10.46590181119329,
"grad_norm": 0.3645211160182953,
"learning_rate": 0.00028645189504373174,
"loss": 3.3272,
"step": 35950
},
{
"epoch": 10.480461242793082,
"grad_norm": 0.36433538794517517,
"learning_rate": 0.0002860145772594752,
"loss": 3.3164,
"step": 36000
},
{
"epoch": 10.480461242793082,
"eval_accuracy": 0.37010747041621017,
"eval_loss": 3.556314706802368,
"eval_runtime": 180.1676,
"eval_samples_per_second": 92.375,
"eval_steps_per_second": 5.778,
"step": 36000
},
{
"epoch": 10.495020674392872,
"grad_norm": 0.36834511160850525,
"learning_rate": 0.0002855772594752186,
"loss": 3.3291,
"step": 36050
},
{
"epoch": 10.509580105992661,
"grad_norm": 0.3711186945438385,
"learning_rate": 0.0002851399416909621,
"loss": 3.3161,
"step": 36100
},
{
"epoch": 10.524139537592452,
"grad_norm": 0.356585294008255,
"learning_rate": 0.0002847026239067055,
"loss": 3.3329,
"step": 36150
},
{
"epoch": 10.538698969192243,
"grad_norm": 0.36765870451927185,
"learning_rate": 0.00028426530612244897,
"loss": 3.3352,
"step": 36200
},
{
"epoch": 10.553258400792034,
"grad_norm": 0.3481246531009674,
"learning_rate": 0.0002838279883381924,
"loss": 3.3271,
"step": 36250
},
{
"epoch": 10.567817832391823,
"grad_norm": 0.35420429706573486,
"learning_rate": 0.00028339067055393585,
"loss": 3.3268,
"step": 36300
},
{
"epoch": 10.582377263991614,
"grad_norm": 0.3609519302845001,
"learning_rate": 0.00028295335276967926,
"loss": 3.3295,
"step": 36350
},
{
"epoch": 10.596936695591404,
"grad_norm": 0.3677191138267517,
"learning_rate": 0.0002825160349854227,
"loss": 3.3333,
"step": 36400
},
{
"epoch": 10.611496127191195,
"grad_norm": 0.37628525495529175,
"learning_rate": 0.00028207871720116614,
"loss": 3.3399,
"step": 36450
},
{
"epoch": 10.626055558790984,
"grad_norm": 0.3637225925922394,
"learning_rate": 0.0002816413994169096,
"loss": 3.3349,
"step": 36500
},
{
"epoch": 10.640614990390775,
"grad_norm": 0.3519335091114044,
"learning_rate": 0.00028120408163265307,
"loss": 3.3346,
"step": 36550
},
{
"epoch": 10.655174421990566,
"grad_norm": 0.348203182220459,
"learning_rate": 0.0002807667638483965,
"loss": 3.3289,
"step": 36600
},
{
"epoch": 10.669733853590357,
"grad_norm": 0.36233091354370117,
"learning_rate": 0.00028032944606413995,
"loss": 3.3264,
"step": 36650
},
{
"epoch": 10.684293285190146,
"grad_norm": 0.3718380630016327,
"learning_rate": 0.00027989212827988336,
"loss": 3.3317,
"step": 36700
},
{
"epoch": 10.698852716789936,
"grad_norm": 0.35991501808166504,
"learning_rate": 0.00027945481049562683,
"loss": 3.3365,
"step": 36750
},
{
"epoch": 10.713412148389727,
"grad_norm": 0.37417152523994446,
"learning_rate": 0.00027901749271137024,
"loss": 3.3266,
"step": 36800
},
{
"epoch": 10.727971579989518,
"grad_norm": 0.3618806004524231,
"learning_rate": 0.00027858017492711365,
"loss": 3.3338,
"step": 36850
},
{
"epoch": 10.742531011589307,
"grad_norm": 0.3808761239051819,
"learning_rate": 0.0002781428571428571,
"loss": 3.3322,
"step": 36900
},
{
"epoch": 10.757090443189098,
"grad_norm": 0.35829290747642517,
"learning_rate": 0.00027770553935860053,
"loss": 3.3405,
"step": 36950
},
{
"epoch": 10.771649874788888,
"grad_norm": 0.35556626319885254,
"learning_rate": 0.000277268221574344,
"loss": 3.3349,
"step": 37000
},
{
"epoch": 10.771649874788888,
"eval_accuracy": 0.3708527165326231,
"eval_loss": 3.548063039779663,
"eval_runtime": 180.2516,
"eval_samples_per_second": 92.332,
"eval_steps_per_second": 5.775,
"step": 37000
},
{
"epoch": 10.78620930638868,
"grad_norm": 0.36781635880470276,
"learning_rate": 0.0002768309037900874,
"loss": 3.3393,
"step": 37050
},
{
"epoch": 10.800768737988468,
"grad_norm": 0.3739968538284302,
"learning_rate": 0.0002763935860058309,
"loss": 3.3362,
"step": 37100
},
{
"epoch": 10.815328169588259,
"grad_norm": 0.37725409865379333,
"learning_rate": 0.0002759562682215743,
"loss": 3.343,
"step": 37150
},
{
"epoch": 10.82988760118805,
"grad_norm": 0.3467895984649658,
"learning_rate": 0.00027551895043731776,
"loss": 3.3424,
"step": 37200
},
{
"epoch": 10.84444703278784,
"grad_norm": 0.3589009940624237,
"learning_rate": 0.0002750816326530612,
"loss": 3.3149,
"step": 37250
},
{
"epoch": 10.85900646438763,
"grad_norm": 0.36413517594337463,
"learning_rate": 0.00027464431486880464,
"loss": 3.3448,
"step": 37300
},
{
"epoch": 10.87356589598742,
"grad_norm": 0.3594954311847687,
"learning_rate": 0.0002742069970845481,
"loss": 3.3452,
"step": 37350
},
{
"epoch": 10.888125327587211,
"grad_norm": 0.36977705359458923,
"learning_rate": 0.0002737696793002915,
"loss": 3.3387,
"step": 37400
},
{
"epoch": 10.902684759187002,
"grad_norm": 0.3728332817554474,
"learning_rate": 0.000273332361516035,
"loss": 3.3554,
"step": 37450
},
{
"epoch": 10.917244190786791,
"grad_norm": 0.3603312075138092,
"learning_rate": 0.0002728950437317784,
"loss": 3.3495,
"step": 37500
},
{
"epoch": 10.931803622386582,
"grad_norm": 0.37357112765312195,
"learning_rate": 0.00027245772594752186,
"loss": 3.3509,
"step": 37550
},
{
"epoch": 10.946363053986373,
"grad_norm": 0.3870396316051483,
"learning_rate": 0.00027202040816326527,
"loss": 3.3451,
"step": 37600
},
{
"epoch": 10.960922485586163,
"grad_norm": 0.36924847960472107,
"learning_rate": 0.00027158309037900874,
"loss": 3.3482,
"step": 37650
},
{
"epoch": 10.975481917185952,
"grad_norm": 0.3659966289997101,
"learning_rate": 0.00027114577259475215,
"loss": 3.3429,
"step": 37700
},
{
"epoch": 10.990041348785743,
"grad_norm": 0.3750581741333008,
"learning_rate": 0.00027070845481049556,
"loss": 3.3467,
"step": 37750
},
{
"epoch": 11.004367829479937,
"grad_norm": 0.3540584444999695,
"learning_rate": 0.00027027113702623903,
"loss": 3.3111,
"step": 37800
},
{
"epoch": 11.018927261079728,
"grad_norm": 0.36422842741012573,
"learning_rate": 0.0002698338192419825,
"loss": 3.2475,
"step": 37850
},
{
"epoch": 11.033486692679517,
"grad_norm": 0.36595383286476135,
"learning_rate": 0.0002693965014577259,
"loss": 3.2508,
"step": 37900
},
{
"epoch": 11.048046124279308,
"grad_norm": 0.3714156448841095,
"learning_rate": 0.0002689591836734694,
"loss": 3.2548,
"step": 37950
},
{
"epoch": 11.062605555879099,
"grad_norm": 0.38618841767311096,
"learning_rate": 0.00026852186588921284,
"loss": 3.2621,
"step": 38000
},
{
"epoch": 11.062605555879099,
"eval_accuracy": 0.370781343166788,
"eval_loss": 3.5563323497772217,
"eval_runtime": 179.9746,
"eval_samples_per_second": 92.474,
"eval_steps_per_second": 5.784,
"step": 38000
},
{
"epoch": 11.07716498747889,
"grad_norm": 0.3659396469593048,
"learning_rate": 0.00026808454810495625,
"loss": 3.2429,
"step": 38050
},
{
"epoch": 11.091724419078679,
"grad_norm": 0.3699627220630646,
"learning_rate": 0.0002676472303206997,
"loss": 3.2574,
"step": 38100
},
{
"epoch": 11.10628385067847,
"grad_norm": 0.371509313583374,
"learning_rate": 0.00026720991253644313,
"loss": 3.2581,
"step": 38150
},
{
"epoch": 11.12084328227826,
"grad_norm": 0.3545081317424774,
"learning_rate": 0.00026677259475218655,
"loss": 3.2631,
"step": 38200
},
{
"epoch": 11.135402713878051,
"grad_norm": 0.36414968967437744,
"learning_rate": 0.00026633527696793,
"loss": 3.2715,
"step": 38250
},
{
"epoch": 11.14996214547784,
"grad_norm": 0.36221858859062195,
"learning_rate": 0.0002658979591836734,
"loss": 3.2702,
"step": 38300
},
{
"epoch": 11.16452157707763,
"grad_norm": 0.35454094409942627,
"learning_rate": 0.0002654606413994169,
"loss": 3.2636,
"step": 38350
},
{
"epoch": 11.179081008677421,
"grad_norm": 0.38314637541770935,
"learning_rate": 0.0002650233236151603,
"loss": 3.2647,
"step": 38400
},
{
"epoch": 11.193640440277212,
"grad_norm": 0.36567234992980957,
"learning_rate": 0.00026458600583090377,
"loss": 3.2764,
"step": 38450
},
{
"epoch": 11.208199871877001,
"grad_norm": 0.36688846349716187,
"learning_rate": 0.0002641486880466472,
"loss": 3.2722,
"step": 38500
},
{
"epoch": 11.222759303476792,
"grad_norm": 0.38438311219215393,
"learning_rate": 0.00026371137026239065,
"loss": 3.2675,
"step": 38550
},
{
"epoch": 11.237318735076583,
"grad_norm": 0.3896602392196655,
"learning_rate": 0.0002632740524781341,
"loss": 3.2684,
"step": 38600
},
{
"epoch": 11.251878166676374,
"grad_norm": 0.3787361681461334,
"learning_rate": 0.00026283673469387753,
"loss": 3.2774,
"step": 38650
},
{
"epoch": 11.266437598276163,
"grad_norm": 0.36523544788360596,
"learning_rate": 0.000262399416909621,
"loss": 3.2731,
"step": 38700
},
{
"epoch": 11.280997029875953,
"grad_norm": 0.38595035672187805,
"learning_rate": 0.0002619620991253644,
"loss": 3.2802,
"step": 38750
},
{
"epoch": 11.295556461475744,
"grad_norm": 0.3597980737686157,
"learning_rate": 0.0002615247813411079,
"loss": 3.2836,
"step": 38800
},
{
"epoch": 11.310115893075535,
"grad_norm": 0.375411719083786,
"learning_rate": 0.0002610874635568513,
"loss": 3.2809,
"step": 38850
},
{
"epoch": 11.324675324675324,
"grad_norm": 0.3683791756629944,
"learning_rate": 0.00026065014577259475,
"loss": 3.2832,
"step": 38900
},
{
"epoch": 11.339234756275115,
"grad_norm": 0.36232179403305054,
"learning_rate": 0.00026021282798833817,
"loss": 3.279,
"step": 38950
},
{
"epoch": 11.353794187874906,
"grad_norm": 0.3584194779396057,
"learning_rate": 0.0002597755102040816,
"loss": 3.2832,
"step": 39000
},
{
"epoch": 11.353794187874906,
"eval_accuracy": 0.37091233151858416,
"eval_loss": 3.553581953048706,
"eval_runtime": 179.822,
"eval_samples_per_second": 92.553,
"eval_steps_per_second": 5.789,
"step": 39000
},
{
"epoch": 11.368353619474696,
"grad_norm": 0.36148691177368164,
"learning_rate": 0.00025933819241982504,
"loss": 3.2926,
"step": 39050
},
{
"epoch": 11.382913051074485,
"grad_norm": 0.36825209856033325,
"learning_rate": 0.00025890087463556846,
"loss": 3.2974,
"step": 39100
},
{
"epoch": 11.397472482674276,
"grad_norm": 0.3690287470817566,
"learning_rate": 0.0002584635568513119,
"loss": 3.285,
"step": 39150
},
{
"epoch": 11.412031914274067,
"grad_norm": 0.37193694710731506,
"learning_rate": 0.00025802623906705534,
"loss": 3.3057,
"step": 39200
},
{
"epoch": 11.426591345873858,
"grad_norm": 0.3798997700214386,
"learning_rate": 0.0002575889212827988,
"loss": 3.2901,
"step": 39250
},
{
"epoch": 11.441150777473647,
"grad_norm": 0.3867810368537903,
"learning_rate": 0.00025715160349854227,
"loss": 3.2994,
"step": 39300
},
{
"epoch": 11.455710209073438,
"grad_norm": 0.3750901520252228,
"learning_rate": 0.0002567142857142857,
"loss": 3.2932,
"step": 39350
},
{
"epoch": 11.470269640673228,
"grad_norm": 0.35880762338638306,
"learning_rate": 0.00025627696793002915,
"loss": 3.2873,
"step": 39400
},
{
"epoch": 11.484829072273019,
"grad_norm": 0.3917964994907379,
"learning_rate": 0.00025583965014577256,
"loss": 3.2952,
"step": 39450
},
{
"epoch": 11.499388503872808,
"grad_norm": 0.3772904574871063,
"learning_rate": 0.000255402332361516,
"loss": 3.3044,
"step": 39500
},
{
"epoch": 11.513947935472599,
"grad_norm": 0.3691461980342865,
"learning_rate": 0.00025496501457725944,
"loss": 3.3156,
"step": 39550
},
{
"epoch": 11.52850736707239,
"grad_norm": 0.36424410343170166,
"learning_rate": 0.0002545276967930029,
"loss": 3.3019,
"step": 39600
},
{
"epoch": 11.54306679867218,
"grad_norm": 0.3689974844455719,
"learning_rate": 0.0002540903790087463,
"loss": 3.3016,
"step": 39650
},
{
"epoch": 11.55762623027197,
"grad_norm": 0.38458317518234253,
"learning_rate": 0.0002536530612244898,
"loss": 3.3063,
"step": 39700
},
{
"epoch": 11.57218566187176,
"grad_norm": 0.3871372640132904,
"learning_rate": 0.0002532157434402332,
"loss": 3.3034,
"step": 39750
},
{
"epoch": 11.586745093471551,
"grad_norm": 0.3936833143234253,
"learning_rate": 0.00025277842565597666,
"loss": 3.3101,
"step": 39800
},
{
"epoch": 11.601304525071342,
"grad_norm": 0.3917473256587982,
"learning_rate": 0.0002523411078717201,
"loss": 3.3225,
"step": 39850
},
{
"epoch": 11.61586395667113,
"grad_norm": 0.3640928864479065,
"learning_rate": 0.00025190379008746354,
"loss": 3.304,
"step": 39900
},
{
"epoch": 11.630423388270922,
"grad_norm": 0.4092429578304291,
"learning_rate": 0.00025146647230320696,
"loss": 3.3123,
"step": 39950
},
{
"epoch": 11.644982819870712,
"grad_norm": 0.3751949071884155,
"learning_rate": 0.0002510291545189504,
"loss": 3.3111,
"step": 40000
},
{
"epoch": 11.644982819870712,
"eval_accuracy": 0.371191240289195,
"eval_loss": 3.5466434955596924,
"eval_runtime": 179.6345,
"eval_samples_per_second": 92.649,
"eval_steps_per_second": 5.795,
"step": 40000
},
{
"epoch": 11.659542251470503,
"grad_norm": 0.3745579719543457,
"learning_rate": 0.0002505918367346939,
"loss": 3.3036,
"step": 40050
},
{
"epoch": 11.674101683070292,
"grad_norm": 0.3651117980480194,
"learning_rate": 0.0002501545189504373,
"loss": 3.3111,
"step": 40100
},
{
"epoch": 11.688661114670083,
"grad_norm": 0.3845028579235077,
"learning_rate": 0.00024971720116618077,
"loss": 3.3049,
"step": 40150
},
{
"epoch": 11.703220546269874,
"grad_norm": 0.3631397783756256,
"learning_rate": 0.0002492798833819242,
"loss": 3.3031,
"step": 40200
},
{
"epoch": 11.717779977869665,
"grad_norm": 0.37985455989837646,
"learning_rate": 0.00024884256559766765,
"loss": 3.3111,
"step": 40250
},
{
"epoch": 11.732339409469454,
"grad_norm": 0.3810136616230011,
"learning_rate": 0.00024840524781341106,
"loss": 3.304,
"step": 40300
},
{
"epoch": 11.746898841069244,
"grad_norm": 0.37321019172668457,
"learning_rate": 0.00024796793002915447,
"loss": 3.3037,
"step": 40350
},
{
"epoch": 11.761458272669035,
"grad_norm": 0.37630370259284973,
"learning_rate": 0.00024753061224489794,
"loss": 3.3086,
"step": 40400
},
{
"epoch": 11.776017704268826,
"grad_norm": 0.36770492792129517,
"learning_rate": 0.00024709329446064135,
"loss": 3.3181,
"step": 40450
},
{
"epoch": 11.790577135868615,
"grad_norm": 0.3986623287200928,
"learning_rate": 0.0002466559766763848,
"loss": 3.3203,
"step": 40500
},
{
"epoch": 11.805136567468406,
"grad_norm": 0.3720547556877136,
"learning_rate": 0.00024621865889212823,
"loss": 3.3183,
"step": 40550
},
{
"epoch": 11.819695999068196,
"grad_norm": 0.37636956572532654,
"learning_rate": 0.0002457813411078717,
"loss": 3.3105,
"step": 40600
},
{
"epoch": 11.834255430667987,
"grad_norm": 0.3576502501964569,
"learning_rate": 0.00024534402332361516,
"loss": 3.3248,
"step": 40650
},
{
"epoch": 11.848814862267776,
"grad_norm": 0.3808489739894867,
"learning_rate": 0.0002449067055393586,
"loss": 3.3177,
"step": 40700
},
{
"epoch": 11.863374293867567,
"grad_norm": 0.3698633313179016,
"learning_rate": 0.00024446938775510204,
"loss": 3.3259,
"step": 40750
},
{
"epoch": 11.877933725467358,
"grad_norm": 0.3820943534374237,
"learning_rate": 0.00024403206997084545,
"loss": 3.3181,
"step": 40800
},
{
"epoch": 11.892493157067149,
"grad_norm": 0.385206401348114,
"learning_rate": 0.0002435947521865889,
"loss": 3.3164,
"step": 40850
},
{
"epoch": 11.90705258866694,
"grad_norm": 0.3971438705921173,
"learning_rate": 0.00024315743440233233,
"loss": 3.3099,
"step": 40900
},
{
"epoch": 11.921612020266728,
"grad_norm": 0.38719749450683594,
"learning_rate": 0.0002427201166180758,
"loss": 3.3164,
"step": 40950
},
{
"epoch": 11.93617145186652,
"grad_norm": 0.3685580790042877,
"learning_rate": 0.0002422827988338192,
"loss": 3.326,
"step": 41000
},
{
"epoch": 11.93617145186652,
"eval_accuracy": 0.37202020607031067,
"eval_loss": 3.5386884212493896,
"eval_runtime": 180.0955,
"eval_samples_per_second": 92.412,
"eval_steps_per_second": 5.78,
"step": 41000
},
{
"epoch": 11.95073088346631,
"grad_norm": 0.37299463152885437,
"learning_rate": 0.00024184548104956268,
"loss": 3.3102,
"step": 41050
},
{
"epoch": 11.965290315066099,
"grad_norm": 0.40013447403907776,
"learning_rate": 0.0002414081632653061,
"loss": 3.3126,
"step": 41100
},
{
"epoch": 11.97984974666589,
"grad_norm": 0.3682084083557129,
"learning_rate": 0.00024097084548104956,
"loss": 3.3236,
"step": 41150
},
{
"epoch": 11.99440917826568,
"grad_norm": 0.37663114070892334,
"learning_rate": 0.00024053352769679297,
"loss": 3.3187,
"step": 41200
},
{
"epoch": 12.008735658959875,
"grad_norm": 0.3868860900402069,
"learning_rate": 0.0002400962099125364,
"loss": 3.2557,
"step": 41250
},
{
"epoch": 12.023295090559664,
"grad_norm": 0.39282509684562683,
"learning_rate": 0.00023965889212827988,
"loss": 3.2263,
"step": 41300
},
{
"epoch": 12.037854522159455,
"grad_norm": 0.38918325304985046,
"learning_rate": 0.0002392215743440233,
"loss": 3.2284,
"step": 41350
},
{
"epoch": 12.052413953759245,
"grad_norm": 0.37995877861976624,
"learning_rate": 0.00023878425655976675,
"loss": 3.232,
"step": 41400
},
{
"epoch": 12.066973385359036,
"grad_norm": 0.38068705797195435,
"learning_rate": 0.00023834693877551017,
"loss": 3.2244,
"step": 41450
},
{
"epoch": 12.081532816958825,
"grad_norm": 0.3786507844924927,
"learning_rate": 0.00023790962099125363,
"loss": 3.2388,
"step": 41500
},
{
"epoch": 12.096092248558616,
"grad_norm": 0.39703622460365295,
"learning_rate": 0.00023747230320699707,
"loss": 3.2361,
"step": 41550
},
{
"epoch": 12.110651680158407,
"grad_norm": 0.3827173709869385,
"learning_rate": 0.0002370349854227405,
"loss": 3.2446,
"step": 41600
},
{
"epoch": 12.125211111758198,
"grad_norm": 0.38849908113479614,
"learning_rate": 0.00023659766763848395,
"loss": 3.2449,
"step": 41650
},
{
"epoch": 12.139770543357987,
"grad_norm": 0.3723032772541046,
"learning_rate": 0.00023616034985422736,
"loss": 3.2473,
"step": 41700
},
{
"epoch": 12.154329974957777,
"grad_norm": 0.3933330774307251,
"learning_rate": 0.00023572303206997083,
"loss": 3.2458,
"step": 41750
},
{
"epoch": 12.168889406557568,
"grad_norm": 0.3799208998680115,
"learning_rate": 0.00023528571428571424,
"loss": 3.2397,
"step": 41800
},
{
"epoch": 12.183448838157359,
"grad_norm": 0.37628769874572754,
"learning_rate": 0.0002348483965014577,
"loss": 3.2561,
"step": 41850
},
{
"epoch": 12.198008269757148,
"grad_norm": 0.39289426803588867,
"learning_rate": 0.00023441107871720115,
"loss": 3.2559,
"step": 41900
},
{
"epoch": 12.212567701356939,
"grad_norm": 0.38283148407936096,
"learning_rate": 0.0002339737609329446,
"loss": 3.2436,
"step": 41950
},
{
"epoch": 12.22712713295673,
"grad_norm": 0.3877449929714203,
"learning_rate": 0.00023353644314868803,
"loss": 3.2475,
"step": 42000
},
{
"epoch": 12.22712713295673,
"eval_accuracy": 0.3715757393110742,
"eval_loss": 3.5504579544067383,
"eval_runtime": 180.0981,
"eval_samples_per_second": 92.411,
"eval_steps_per_second": 5.78,
"step": 42000
},
{
"epoch": 12.24168656455652,
"grad_norm": 0.382993221282959,
"learning_rate": 0.0002330991253644315,
"loss": 3.2507,
"step": 42050
},
{
"epoch": 12.25624599615631,
"grad_norm": 0.3783535063266754,
"learning_rate": 0.0002326618075801749,
"loss": 3.2515,
"step": 42100
},
{
"epoch": 12.2708054277561,
"grad_norm": 0.3950106203556061,
"learning_rate": 0.00023222448979591835,
"loss": 3.2695,
"step": 42150
},
{
"epoch": 12.28536485935589,
"grad_norm": 0.368351548910141,
"learning_rate": 0.00023178717201166179,
"loss": 3.264,
"step": 42200
},
{
"epoch": 12.299924290955682,
"grad_norm": 0.39170610904693604,
"learning_rate": 0.00023134985422740523,
"loss": 3.2618,
"step": 42250
},
{
"epoch": 12.31448372255547,
"grad_norm": 0.3892439901828766,
"learning_rate": 0.00023091253644314866,
"loss": 3.2575,
"step": 42300
},
{
"epoch": 12.329043154155261,
"grad_norm": 0.40385738015174866,
"learning_rate": 0.0002304752186588921,
"loss": 3.2627,
"step": 42350
},
{
"epoch": 12.343602585755052,
"grad_norm": 0.3920726776123047,
"learning_rate": 0.00023003790087463557,
"loss": 3.2548,
"step": 42400
},
{
"epoch": 12.358162017354843,
"grad_norm": 0.389912873506546,
"learning_rate": 0.00022960058309037898,
"loss": 3.2589,
"step": 42450
},
{
"epoch": 12.372721448954632,
"grad_norm": 0.3855544924736023,
"learning_rate": 0.00022916326530612245,
"loss": 3.277,
"step": 42500
},
{
"epoch": 12.387280880554423,
"grad_norm": 0.4056563973426819,
"learning_rate": 0.00022872594752186586,
"loss": 3.2563,
"step": 42550
},
{
"epoch": 12.401840312154214,
"grad_norm": 0.3610406816005707,
"learning_rate": 0.0002282886297376093,
"loss": 3.2695,
"step": 42600
},
{
"epoch": 12.416399743754004,
"grad_norm": 0.3934992253780365,
"learning_rate": 0.00022785131195335277,
"loss": 3.2654,
"step": 42650
},
{
"epoch": 12.430959175353793,
"grad_norm": 0.3924916386604309,
"learning_rate": 0.00022741399416909618,
"loss": 3.2659,
"step": 42700
},
{
"epoch": 12.445518606953584,
"grad_norm": 0.3711550533771515,
"learning_rate": 0.00022697667638483965,
"loss": 3.2705,
"step": 42750
},
{
"epoch": 12.460078038553375,
"grad_norm": 0.38350147008895874,
"learning_rate": 0.00022653935860058306,
"loss": 3.2869,
"step": 42800
},
{
"epoch": 12.474637470153166,
"grad_norm": 0.38396674394607544,
"learning_rate": 0.00022610204081632653,
"loss": 3.2721,
"step": 42850
},
{
"epoch": 12.489196901752955,
"grad_norm": 0.41561633348464966,
"learning_rate": 0.00022566472303206994,
"loss": 3.2791,
"step": 42900
},
{
"epoch": 12.503756333352746,
"grad_norm": 0.384716659784317,
"learning_rate": 0.00022522740524781338,
"loss": 3.259,
"step": 42950
},
{
"epoch": 12.518315764952536,
"grad_norm": 0.36449480056762695,
"learning_rate": 0.00022479008746355684,
"loss": 3.2651,
"step": 43000
},
{
"epoch": 12.518315764952536,
"eval_accuracy": 0.3719537712240227,
"eval_loss": 3.545558214187622,
"eval_runtime": 179.9007,
"eval_samples_per_second": 92.512,
"eval_steps_per_second": 5.787,
"step": 43000
},
{
"epoch": 12.532875196552327,
"grad_norm": 0.36965641379356384,
"learning_rate": 0.00022435276967930026,
"loss": 3.2749,
"step": 43050
},
{
"epoch": 12.547434628152118,
"grad_norm": 0.3777758777141571,
"learning_rate": 0.00022391545189504372,
"loss": 3.2745,
"step": 43100
},
{
"epoch": 12.561994059751907,
"grad_norm": 0.37569695711135864,
"learning_rate": 0.00022347813411078714,
"loss": 3.2759,
"step": 43150
},
{
"epoch": 12.576553491351698,
"grad_norm": 0.40388092398643494,
"learning_rate": 0.0002230408163265306,
"loss": 3.2815,
"step": 43200
},
{
"epoch": 12.591112922951488,
"grad_norm": 0.3894331455230713,
"learning_rate": 0.00022260349854227402,
"loss": 3.2916,
"step": 43250
},
{
"epoch": 12.605672354551277,
"grad_norm": 0.4040619432926178,
"learning_rate": 0.00022216618075801748,
"loss": 3.2824,
"step": 43300
},
{
"epoch": 12.620231786151068,
"grad_norm": 0.3677726089954376,
"learning_rate": 0.00022172886297376092,
"loss": 3.2909,
"step": 43350
},
{
"epoch": 12.634791217750859,
"grad_norm": 0.4069276750087738,
"learning_rate": 0.00022129154518950433,
"loss": 3.2797,
"step": 43400
},
{
"epoch": 12.64935064935065,
"grad_norm": 0.37287694215774536,
"learning_rate": 0.0002208542274052478,
"loss": 3.2765,
"step": 43450
},
{
"epoch": 12.66391008095044,
"grad_norm": 0.3790263831615448,
"learning_rate": 0.0002204169096209912,
"loss": 3.2883,
"step": 43500
},
{
"epoch": 12.67846951255023,
"grad_norm": 0.3825741708278656,
"learning_rate": 0.00021997959183673468,
"loss": 3.2682,
"step": 43550
},
{
"epoch": 12.69302894415002,
"grad_norm": 0.3945907652378082,
"learning_rate": 0.00021954227405247812,
"loss": 3.2781,
"step": 43600
},
{
"epoch": 12.707588375749811,
"grad_norm": 0.3808131515979767,
"learning_rate": 0.00021910495626822156,
"loss": 3.28,
"step": 43650
},
{
"epoch": 12.7221478073496,
"grad_norm": 0.3769155442714691,
"learning_rate": 0.000218667638483965,
"loss": 3.2763,
"step": 43700
},
{
"epoch": 12.736707238949391,
"grad_norm": 0.39110711216926575,
"learning_rate": 0.00021823032069970844,
"loss": 3.288,
"step": 43750
},
{
"epoch": 12.751266670549182,
"grad_norm": 0.3744281828403473,
"learning_rate": 0.00021779300291545188,
"loss": 3.2741,
"step": 43800
},
{
"epoch": 12.765826102148973,
"grad_norm": 0.3719866871833801,
"learning_rate": 0.0002173556851311953,
"loss": 3.2686,
"step": 43850
},
{
"epoch": 12.780385533748763,
"grad_norm": 0.39358916878700256,
"learning_rate": 0.00021691836734693876,
"loss": 3.2931,
"step": 43900
},
{
"epoch": 12.794944965348552,
"grad_norm": 0.3743979334831238,
"learning_rate": 0.0002164810495626822,
"loss": 3.2879,
"step": 43950
},
{
"epoch": 12.809504396948343,
"grad_norm": 0.3975497782230377,
"learning_rate": 0.00021604373177842563,
"loss": 3.2914,
"step": 44000
},
{
"epoch": 12.809504396948343,
"eval_accuracy": 0.3726891403013414,
"eval_loss": 3.5382912158966064,
"eval_runtime": 179.7659,
"eval_samples_per_second": 92.582,
"eval_steps_per_second": 5.791,
"step": 44000
},
{
"epoch": 12.824063828548134,
"grad_norm": 0.3927364647388458,
"learning_rate": 0.00021560641399416907,
"loss": 3.2988,
"step": 44050
},
{
"epoch": 12.838623260147925,
"grad_norm": 0.38748300075531006,
"learning_rate": 0.00021516909620991254,
"loss": 3.2933,
"step": 44100
},
{
"epoch": 12.853182691747714,
"grad_norm": 0.3822428584098816,
"learning_rate": 0.00021473177842565595,
"loss": 3.2882,
"step": 44150
},
{
"epoch": 12.867742123347504,
"grad_norm": 0.3862573504447937,
"learning_rate": 0.00021429446064139942,
"loss": 3.2869,
"step": 44200
},
{
"epoch": 12.882301554947295,
"grad_norm": 0.37098950147628784,
"learning_rate": 0.00021385714285714283,
"loss": 3.2866,
"step": 44250
},
{
"epoch": 12.896860986547086,
"grad_norm": 0.3941670358181,
"learning_rate": 0.00021341982507288627,
"loss": 3.293,
"step": 44300
},
{
"epoch": 12.911420418146875,
"grad_norm": 0.36743009090423584,
"learning_rate": 0.0002129825072886297,
"loss": 3.296,
"step": 44350
},
{
"epoch": 12.925979849746666,
"grad_norm": 0.38341784477233887,
"learning_rate": 0.00021254518950437315,
"loss": 3.2897,
"step": 44400
},
{
"epoch": 12.940539281346457,
"grad_norm": 0.36301809549331665,
"learning_rate": 0.00021210787172011662,
"loss": 3.3029,
"step": 44450
},
{
"epoch": 12.955098712946247,
"grad_norm": 0.38180243968963623,
"learning_rate": 0.00021167055393586003,
"loss": 3.286,
"step": 44500
},
{
"epoch": 12.969658144546036,
"grad_norm": 0.3855466842651367,
"learning_rate": 0.0002112332361516035,
"loss": 3.2899,
"step": 44550
},
{
"epoch": 12.984217576145827,
"grad_norm": 0.38620680570602417,
"learning_rate": 0.0002107959183673469,
"loss": 3.3077,
"step": 44600
},
{
"epoch": 12.998777007745618,
"grad_norm": 0.38125503063201904,
"learning_rate": 0.00021035860058309037,
"loss": 3.2907,
"step": 44650
},
{
"epoch": 13.01310348843981,
"grad_norm": 0.3893320858478546,
"learning_rate": 0.00020992128279883381,
"loss": 3.199,
"step": 44700
},
{
"epoch": 13.027662920039601,
"grad_norm": 0.4010103642940521,
"learning_rate": 0.00020948396501457723,
"loss": 3.2042,
"step": 44750
},
{
"epoch": 13.042222351639392,
"grad_norm": 0.40282300114631653,
"learning_rate": 0.0002090466472303207,
"loss": 3.212,
"step": 44800
},
{
"epoch": 13.056781783239183,
"grad_norm": 0.39462268352508545,
"learning_rate": 0.0002086093294460641,
"loss": 3.2093,
"step": 44850
},
{
"epoch": 13.071341214838972,
"grad_norm": 0.4072793126106262,
"learning_rate": 0.00020817201166180757,
"loss": 3.209,
"step": 44900
},
{
"epoch": 13.085900646438763,
"grad_norm": 0.40736645460128784,
"learning_rate": 0.00020773469387755098,
"loss": 3.2128,
"step": 44950
},
{
"epoch": 13.100460078038553,
"grad_norm": 0.4051344096660614,
"learning_rate": 0.00020729737609329445,
"loss": 3.2055,
"step": 45000
},
{
"epoch": 13.100460078038553,
"eval_accuracy": 0.37210968734115163,
"eval_loss": 3.5462467670440674,
"eval_runtime": 179.64,
"eval_samples_per_second": 92.646,
"eval_steps_per_second": 5.795,
"step": 45000
},
{
"epoch": 13.115019509638344,
"grad_norm": 0.3923526406288147,
"learning_rate": 0.0002068600583090379,
"loss": 3.2176,
"step": 45050
},
{
"epoch": 13.129578941238133,
"grad_norm": 0.4024291932582855,
"learning_rate": 0.00020642274052478133,
"loss": 3.2117,
"step": 45100
},
{
"epoch": 13.144138372837924,
"grad_norm": 0.3926296532154083,
"learning_rate": 0.00020598542274052477,
"loss": 3.2315,
"step": 45150
},
{
"epoch": 13.158697804437715,
"grad_norm": 0.39263248443603516,
"learning_rate": 0.00020554810495626818,
"loss": 3.2206,
"step": 45200
},
{
"epoch": 13.173257236037506,
"grad_norm": 0.389493852853775,
"learning_rate": 0.00020511078717201165,
"loss": 3.2258,
"step": 45250
},
{
"epoch": 13.187816667637296,
"grad_norm": 0.41162851452827454,
"learning_rate": 0.00020467346938775506,
"loss": 3.2259,
"step": 45300
},
{
"epoch": 13.202376099237085,
"grad_norm": 0.42013758420944214,
"learning_rate": 0.00020423615160349853,
"loss": 3.2225,
"step": 45350
},
{
"epoch": 13.216935530836876,
"grad_norm": 0.37507620453834534,
"learning_rate": 0.00020379883381924197,
"loss": 3.2382,
"step": 45400
},
{
"epoch": 13.231494962436667,
"grad_norm": 0.39401939511299133,
"learning_rate": 0.0002033615160349854,
"loss": 3.2174,
"step": 45450
},
{
"epoch": 13.246054394036458,
"grad_norm": 0.39347732067108154,
"learning_rate": 0.00020292419825072885,
"loss": 3.2407,
"step": 45500
},
{
"epoch": 13.260613825636247,
"grad_norm": 0.41897672414779663,
"learning_rate": 0.0002024868804664723,
"loss": 3.239,
"step": 45550
},
{
"epoch": 13.275173257236037,
"grad_norm": 0.3950546681880951,
"learning_rate": 0.00020204956268221572,
"loss": 3.236,
"step": 45600
},
{
"epoch": 13.289732688835828,
"grad_norm": 0.38286805152893066,
"learning_rate": 0.00020161224489795916,
"loss": 3.2309,
"step": 45650
},
{
"epoch": 13.304292120435619,
"grad_norm": 0.39312657713890076,
"learning_rate": 0.0002011749271137026,
"loss": 3.2248,
"step": 45700
},
{
"epoch": 13.318851552035408,
"grad_norm": 0.4044516980648041,
"learning_rate": 0.00020073760932944604,
"loss": 3.2353,
"step": 45750
},
{
"epoch": 13.333410983635199,
"grad_norm": 0.3935719132423401,
"learning_rate": 0.00020030029154518948,
"loss": 3.2494,
"step": 45800
},
{
"epoch": 13.34797041523499,
"grad_norm": 0.39138996601104736,
"learning_rate": 0.00019986297376093292,
"loss": 3.2334,
"step": 45850
},
{
"epoch": 13.36252984683478,
"grad_norm": 0.40938061475753784,
"learning_rate": 0.0001994256559766764,
"loss": 3.2537,
"step": 45900
},
{
"epoch": 13.37708927843457,
"grad_norm": 0.3899918496608734,
"learning_rate": 0.0001989883381924198,
"loss": 3.2348,
"step": 45950
},
{
"epoch": 13.39164871003436,
"grad_norm": 0.39088475704193115,
"learning_rate": 0.00019855102040816327,
"loss": 3.2443,
"step": 46000
},
{
"epoch": 13.39164871003436,
"eval_accuracy": 0.3726506903991535,
"eval_loss": 3.5426511764526367,
"eval_runtime": 179.8521,
"eval_samples_per_second": 92.537,
"eval_steps_per_second": 5.788,
"step": 46000
},
{
"epoch": 13.406208141634151,
"grad_norm": 0.3731467127799988,
"learning_rate": 0.00019811370262390668,
"loss": 3.2468,
"step": 46050
},
{
"epoch": 13.420767573233942,
"grad_norm": 0.4023798704147339,
"learning_rate": 0.00019767638483965012,
"loss": 3.2414,
"step": 46100
},
{
"epoch": 13.43532700483373,
"grad_norm": 0.40062659978866577,
"learning_rate": 0.00019723906705539359,
"loss": 3.2514,
"step": 46150
},
{
"epoch": 13.449886436433522,
"grad_norm": 0.41188135743141174,
"learning_rate": 0.000196801749271137,
"loss": 3.2714,
"step": 46200
},
{
"epoch": 13.464445868033312,
"grad_norm": 0.3918535113334656,
"learning_rate": 0.00019636443148688047,
"loss": 3.2557,
"step": 46250
},
{
"epoch": 13.479005299633103,
"grad_norm": 0.39971768856048584,
"learning_rate": 0.00019592711370262388,
"loss": 3.2398,
"step": 46300
},
{
"epoch": 13.493564731232892,
"grad_norm": 0.39411237835884094,
"learning_rate": 0.00019548979591836734,
"loss": 3.2663,
"step": 46350
},
{
"epoch": 13.508124162832683,
"grad_norm": 0.38199934363365173,
"learning_rate": 0.00019505247813411076,
"loss": 3.2564,
"step": 46400
},
{
"epoch": 13.522683594432474,
"grad_norm": 0.3960552513599396,
"learning_rate": 0.00019461516034985422,
"loss": 3.2518,
"step": 46450
},
{
"epoch": 13.537243026032264,
"grad_norm": 0.397707462310791,
"learning_rate": 0.00019417784256559766,
"loss": 3.2318,
"step": 46500
},
{
"epoch": 13.551802457632053,
"grad_norm": 0.38950562477111816,
"learning_rate": 0.00019374052478134108,
"loss": 3.2526,
"step": 46550
},
{
"epoch": 13.566361889231844,
"grad_norm": 0.40381306409835815,
"learning_rate": 0.00019330320699708454,
"loss": 3.2461,
"step": 46600
},
{
"epoch": 13.580921320831635,
"grad_norm": 0.3885638117790222,
"learning_rate": 0.00019286588921282795,
"loss": 3.2479,
"step": 46650
},
{
"epoch": 13.595480752431426,
"grad_norm": 0.38914021849632263,
"learning_rate": 0.00019242857142857142,
"loss": 3.2615,
"step": 46700
},
{
"epoch": 13.610040184031215,
"grad_norm": 0.38882437348365784,
"learning_rate": 0.00019199125364431483,
"loss": 3.2557,
"step": 46750
},
{
"epoch": 13.624599615631006,
"grad_norm": 0.4011668264865875,
"learning_rate": 0.0001915539358600583,
"loss": 3.2575,
"step": 46800
},
{
"epoch": 13.639159047230796,
"grad_norm": 0.38090625405311584,
"learning_rate": 0.00019111661807580174,
"loss": 3.2482,
"step": 46850
},
{
"epoch": 13.653718478830587,
"grad_norm": 0.3797299861907959,
"learning_rate": 0.00019067930029154515,
"loss": 3.2633,
"step": 46900
},
{
"epoch": 13.668277910430376,
"grad_norm": 0.42316409945487976,
"learning_rate": 0.00019024198250728862,
"loss": 3.2572,
"step": 46950
},
{
"epoch": 13.682837342030167,
"grad_norm": 0.3918803930282593,
"learning_rate": 0.00018980466472303203,
"loss": 3.2584,
"step": 47000
},
{
"epoch": 13.682837342030167,
"eval_accuracy": 0.3731333718929804,
"eval_loss": 3.5378518104553223,
"eval_runtime": 179.8803,
"eval_samples_per_second": 92.523,
"eval_steps_per_second": 5.787,
"step": 47000
},
{
"epoch": 13.697396773629958,
"grad_norm": 0.4246753752231598,
"learning_rate": 0.0001893673469387755,
"loss": 3.2589,
"step": 47050
},
{
"epoch": 13.711956205229749,
"grad_norm": 0.3818982243537903,
"learning_rate": 0.00018893002915451894,
"loss": 3.2612,
"step": 47100
},
{
"epoch": 13.726515636829538,
"grad_norm": 0.3882550597190857,
"learning_rate": 0.00018849271137026238,
"loss": 3.268,
"step": 47150
},
{
"epoch": 13.741075068429328,
"grad_norm": 0.4041791260242462,
"learning_rate": 0.00018805539358600582,
"loss": 3.2534,
"step": 47200
},
{
"epoch": 13.75563450002912,
"grad_norm": 0.37502363324165344,
"learning_rate": 0.00018761807580174925,
"loss": 3.2593,
"step": 47250
},
{
"epoch": 13.77019393162891,
"grad_norm": 0.38316377997398376,
"learning_rate": 0.0001871807580174927,
"loss": 3.2434,
"step": 47300
},
{
"epoch": 13.784753363228699,
"grad_norm": 0.3869149386882782,
"learning_rate": 0.0001867434402332361,
"loss": 3.2641,
"step": 47350
},
{
"epoch": 13.79931279482849,
"grad_norm": 0.38152143359184265,
"learning_rate": 0.00018630612244897957,
"loss": 3.2716,
"step": 47400
},
{
"epoch": 13.81387222642828,
"grad_norm": 0.4002053439617157,
"learning_rate": 0.000185868804664723,
"loss": 3.2533,
"step": 47450
},
{
"epoch": 13.828431658028071,
"grad_norm": 0.3985072374343872,
"learning_rate": 0.00018543148688046645,
"loss": 3.2614,
"step": 47500
},
{
"epoch": 13.84299108962786,
"grad_norm": 0.4056275486946106,
"learning_rate": 0.0001849941690962099,
"loss": 3.2618,
"step": 47550
},
{
"epoch": 13.857550521227651,
"grad_norm": 0.3932760953903198,
"learning_rate": 0.00018455685131195336,
"loss": 3.2764,
"step": 47600
},
{
"epoch": 13.872109952827442,
"grad_norm": 0.41119423508644104,
"learning_rate": 0.00018411953352769677,
"loss": 3.2654,
"step": 47650
},
{
"epoch": 13.886669384427233,
"grad_norm": 0.40140846371650696,
"learning_rate": 0.00018368221574344024,
"loss": 3.2602,
"step": 47700
},
{
"epoch": 13.901228816027022,
"grad_norm": 0.3891817033290863,
"learning_rate": 0.00018324489795918365,
"loss": 3.2599,
"step": 47750
},
{
"epoch": 13.915788247626812,
"grad_norm": 0.39611101150512695,
"learning_rate": 0.0001828075801749271,
"loss": 3.2673,
"step": 47800
},
{
"epoch": 13.930347679226603,
"grad_norm": 0.41129791736602783,
"learning_rate": 0.00018237026239067053,
"loss": 3.2628,
"step": 47850
},
{
"epoch": 13.944907110826394,
"grad_norm": 0.39136409759521484,
"learning_rate": 0.00018193294460641397,
"loss": 3.2689,
"step": 47900
},
{
"epoch": 13.959466542426183,
"grad_norm": 0.38141852617263794,
"learning_rate": 0.00018149562682215743,
"loss": 3.2768,
"step": 47950
},
{
"epoch": 13.974025974025974,
"grad_norm": 0.4137849509716034,
"learning_rate": 0.00018105830903790085,
"loss": 3.2518,
"step": 48000
},
{
"epoch": 13.974025974025974,
"eval_accuracy": 0.37369154218559825,
"eval_loss": 3.529177665710449,
"eval_runtime": 180.2712,
"eval_samples_per_second": 92.322,
"eval_steps_per_second": 5.775,
"step": 48000
},
{
"epoch": 13.988585405625765,
"grad_norm": 0.3934214413166046,
"learning_rate": 0.00018062099125364431,
"loss": 3.2683,
"step": 48050
},
{
"epoch": 14.002911886319959,
"grad_norm": 0.4021332561969757,
"learning_rate": 0.00018018367346938773,
"loss": 3.2537,
"step": 48100
},
{
"epoch": 14.017471317919748,
"grad_norm": 0.41304075717926025,
"learning_rate": 0.0001797463556851312,
"loss": 3.186,
"step": 48150
},
{
"epoch": 14.032030749519539,
"grad_norm": 0.41855913400650024,
"learning_rate": 0.00017930903790087463,
"loss": 3.1729,
"step": 48200
},
{
"epoch": 14.04659018111933,
"grad_norm": 0.38180437684059143,
"learning_rate": 0.00017887172011661804,
"loss": 3.1865,
"step": 48250
},
{
"epoch": 14.06114961271912,
"grad_norm": 0.4037982225418091,
"learning_rate": 0.0001784344023323615,
"loss": 3.1946,
"step": 48300
},
{
"epoch": 14.07570904431891,
"grad_norm": 0.4038594663143158,
"learning_rate": 0.00017799708454810492,
"loss": 3.1867,
"step": 48350
},
{
"epoch": 14.0902684759187,
"grad_norm": 0.4171351492404938,
"learning_rate": 0.0001775597667638484,
"loss": 3.2073,
"step": 48400
},
{
"epoch": 14.10482790751849,
"grad_norm": 0.40714016556739807,
"learning_rate": 0.0001771224489795918,
"loss": 3.2059,
"step": 48450
},
{
"epoch": 14.119387339118282,
"grad_norm": 0.40129297971725464,
"learning_rate": 0.00017668513119533527,
"loss": 3.1958,
"step": 48500
},
{
"epoch": 14.13394677071807,
"grad_norm": 0.4317340850830078,
"learning_rate": 0.0001762478134110787,
"loss": 3.2103,
"step": 48550
},
{
"epoch": 14.148506202317861,
"grad_norm": 0.4058551788330078,
"learning_rate": 0.00017581049562682215,
"loss": 3.1942,
"step": 48600
},
{
"epoch": 14.163065633917652,
"grad_norm": 0.4167109429836273,
"learning_rate": 0.0001753731778425656,
"loss": 3.1981,
"step": 48650
},
{
"epoch": 14.177625065517443,
"grad_norm": 0.3935483694076538,
"learning_rate": 0.000174935860058309,
"loss": 3.202,
"step": 48700
},
{
"epoch": 14.192184497117232,
"grad_norm": 0.4171966016292572,
"learning_rate": 0.00017449854227405247,
"loss": 3.2034,
"step": 48750
},
{
"epoch": 14.206743928717023,
"grad_norm": 0.3995937705039978,
"learning_rate": 0.00017406122448979588,
"loss": 3.2011,
"step": 48800
},
{
"epoch": 14.221303360316814,
"grad_norm": 0.38564836978912354,
"learning_rate": 0.00017362390670553935,
"loss": 3.2065,
"step": 48850
},
{
"epoch": 14.235862791916604,
"grad_norm": 0.4166134297847748,
"learning_rate": 0.00017318658892128278,
"loss": 3.2106,
"step": 48900
},
{
"epoch": 14.250422223516393,
"grad_norm": 0.4112185537815094,
"learning_rate": 0.00017274927113702622,
"loss": 3.2138,
"step": 48950
},
{
"epoch": 14.264981655116184,
"grad_norm": 0.39878368377685547,
"learning_rate": 0.00017231195335276966,
"loss": 3.2007,
"step": 49000
},
{
"epoch": 14.264981655116184,
"eval_accuracy": 0.3732468402587643,
"eval_loss": 3.5399742126464844,
"eval_runtime": 179.8028,
"eval_samples_per_second": 92.562,
"eval_steps_per_second": 5.79,
"step": 49000
},
{
"epoch": 14.279541086715975,
"grad_norm": 0.39703792333602905,
"learning_rate": 0.00017187463556851313,
"loss": 3.2176,
"step": 49050
},
{
"epoch": 14.294100518315766,
"grad_norm": 0.3934576213359833,
"learning_rate": 0.00017143731778425654,
"loss": 3.2121,
"step": 49100
},
{
"epoch": 14.308659949915555,
"grad_norm": 0.3929370045661926,
"learning_rate": 0.00017099999999999998,
"loss": 3.2031,
"step": 49150
},
{
"epoch": 14.323219381515345,
"grad_norm": 0.4261437654495239,
"learning_rate": 0.00017056268221574342,
"loss": 3.2201,
"step": 49200
},
{
"epoch": 14.337778813115136,
"grad_norm": 0.3950461447238922,
"learning_rate": 0.00017012536443148686,
"loss": 3.2297,
"step": 49250
},
{
"epoch": 14.352338244714927,
"grad_norm": 0.40033966302871704,
"learning_rate": 0.0001696880466472303,
"loss": 3.2149,
"step": 49300
},
{
"epoch": 14.366897676314716,
"grad_norm": 0.4025183618068695,
"learning_rate": 0.00016925072886297374,
"loss": 3.2214,
"step": 49350
},
{
"epoch": 14.381457107914507,
"grad_norm": 0.4048652648925781,
"learning_rate": 0.0001688134110787172,
"loss": 3.224,
"step": 49400
},
{
"epoch": 14.396016539514298,
"grad_norm": 0.3995135426521301,
"learning_rate": 0.00016837609329446062,
"loss": 3.212,
"step": 49450
},
{
"epoch": 14.410575971114088,
"grad_norm": 0.40483996272087097,
"learning_rate": 0.00016793877551020409,
"loss": 3.2175,
"step": 49500
},
{
"epoch": 14.425135402713877,
"grad_norm": 0.40723147988319397,
"learning_rate": 0.0001675014577259475,
"loss": 3.2251,
"step": 49550
},
{
"epoch": 14.439694834313668,
"grad_norm": 0.41353291273117065,
"learning_rate": 0.00016706413994169094,
"loss": 3.2274,
"step": 49600
},
{
"epoch": 14.454254265913459,
"grad_norm": 0.4138233959674835,
"learning_rate": 0.0001666268221574344,
"loss": 3.2189,
"step": 49650
},
{
"epoch": 14.46881369751325,
"grad_norm": 0.41592147946357727,
"learning_rate": 0.00016618950437317782,
"loss": 3.2207,
"step": 49700
},
{
"epoch": 14.483373129113039,
"grad_norm": 0.41638514399528503,
"learning_rate": 0.00016575218658892128,
"loss": 3.2294,
"step": 49750
},
{
"epoch": 14.49793256071283,
"grad_norm": 0.40838560461997986,
"learning_rate": 0.0001653148688046647,
"loss": 3.226,
"step": 49800
},
{
"epoch": 14.51249199231262,
"grad_norm": 0.42148107290267944,
"learning_rate": 0.00016487755102040816,
"loss": 3.2231,
"step": 49850
},
{
"epoch": 14.527051423912411,
"grad_norm": 0.3983476161956787,
"learning_rate": 0.00016444023323615157,
"loss": 3.2314,
"step": 49900
},
{
"epoch": 14.5416108555122,
"grad_norm": 0.40091437101364136,
"learning_rate": 0.00016400291545189504,
"loss": 3.2257,
"step": 49950
},
{
"epoch": 14.556170287111991,
"grad_norm": 0.4068416953086853,
"learning_rate": 0.00016356559766763848,
"loss": 3.2079,
"step": 50000
},
{
"epoch": 14.556170287111991,
"eval_accuracy": 0.373755390188314,
"eval_loss": 3.536447763442993,
"eval_runtime": 179.8111,
"eval_samples_per_second": 92.558,
"eval_steps_per_second": 5.789,
"step": 50000
},
{
"epoch": 14.570729718711782,
"grad_norm": 0.400130957365036,
"learning_rate": 0.0001631282798833819,
"loss": 3.2299,
"step": 50050
},
{
"epoch": 14.585289150311572,
"grad_norm": 0.397758424282074,
"learning_rate": 0.00016269096209912536,
"loss": 3.2291,
"step": 50100
},
{
"epoch": 14.599848581911361,
"grad_norm": 0.41178637742996216,
"learning_rate": 0.00016225364431486877,
"loss": 3.23,
"step": 50150
},
{
"epoch": 14.614408013511152,
"grad_norm": 0.4252997040748596,
"learning_rate": 0.00016181632653061224,
"loss": 3.2266,
"step": 50200
},
{
"epoch": 14.628967445110943,
"grad_norm": 0.3951346278190613,
"learning_rate": 0.00016137900874635568,
"loss": 3.2399,
"step": 50250
},
{
"epoch": 14.643526876710734,
"grad_norm": 0.40295711159706116,
"learning_rate": 0.00016094169096209912,
"loss": 3.2332,
"step": 50300
},
{
"epoch": 14.658086308310523,
"grad_norm": 0.39642152190208435,
"learning_rate": 0.00016050437317784256,
"loss": 3.2316,
"step": 50350
},
{
"epoch": 14.672645739910314,
"grad_norm": 0.38787510991096497,
"learning_rate": 0.000160067055393586,
"loss": 3.2442,
"step": 50400
},
{
"epoch": 14.687205171510104,
"grad_norm": 0.4102849066257477,
"learning_rate": 0.00015962973760932944,
"loss": 3.232,
"step": 50450
},
{
"epoch": 14.701764603109895,
"grad_norm": 0.40455612540245056,
"learning_rate": 0.00015919241982507285,
"loss": 3.2336,
"step": 50500
},
{
"epoch": 14.716324034709684,
"grad_norm": 0.4081745147705078,
"learning_rate": 0.00015875510204081631,
"loss": 3.2318,
"step": 50550
},
{
"epoch": 14.730883466309475,
"grad_norm": 0.39961087703704834,
"learning_rate": 0.00015831778425655975,
"loss": 3.2325,
"step": 50600
},
{
"epoch": 14.745442897909266,
"grad_norm": 0.39770328998565674,
"learning_rate": 0.0001578804664723032,
"loss": 3.2374,
"step": 50650
},
{
"epoch": 14.760002329509057,
"grad_norm": 0.4101422131061554,
"learning_rate": 0.00015744314868804663,
"loss": 3.2367,
"step": 50700
},
{
"epoch": 14.774561761108846,
"grad_norm": 0.40297961235046387,
"learning_rate": 0.00015700583090379007,
"loss": 3.2426,
"step": 50750
},
{
"epoch": 14.789121192708636,
"grad_norm": 0.39780429005622864,
"learning_rate": 0.0001565685131195335,
"loss": 3.2377,
"step": 50800
},
{
"epoch": 14.803680624308427,
"grad_norm": 0.4247409403324127,
"learning_rate": 0.00015613119533527698,
"loss": 3.2329,
"step": 50850
},
{
"epoch": 14.818240055908218,
"grad_norm": 0.4056137800216675,
"learning_rate": 0.0001556938775510204,
"loss": 3.2377,
"step": 50900
},
{
"epoch": 14.832799487508007,
"grad_norm": 0.3977799117565155,
"learning_rate": 0.00015525655976676383,
"loss": 3.2334,
"step": 50950
},
{
"epoch": 14.847358919107798,
"grad_norm": 0.40999510884284973,
"learning_rate": 0.00015481924198250727,
"loss": 3.245,
"step": 51000
},
{
"epoch": 14.847358919107798,
"eval_accuracy": 0.37410919983872204,
"eval_loss": 3.528653860092163,
"eval_runtime": 179.8528,
"eval_samples_per_second": 92.537,
"eval_steps_per_second": 5.788,
"step": 51000
},
{
"epoch": 14.861918350707588,
"grad_norm": 0.42608842253685,
"learning_rate": 0.0001543819241982507,
"loss": 3.2402,
"step": 51050
},
{
"epoch": 14.87647778230738,
"grad_norm": 0.3977234363555908,
"learning_rate": 0.00015394460641399418,
"loss": 3.2373,
"step": 51100
},
{
"epoch": 14.891037213907168,
"grad_norm": 0.4012013077735901,
"learning_rate": 0.0001535072886297376,
"loss": 3.2312,
"step": 51150
},
{
"epoch": 14.905596645506959,
"grad_norm": 0.3960820734500885,
"learning_rate": 0.00015306997084548106,
"loss": 3.2527,
"step": 51200
},
{
"epoch": 14.92015607710675,
"grad_norm": 0.4106712341308594,
"learning_rate": 0.00015263265306122447,
"loss": 3.2379,
"step": 51250
},
{
"epoch": 14.93471550870654,
"grad_norm": 0.4187733829021454,
"learning_rate": 0.0001521953352769679,
"loss": 3.24,
"step": 51300
},
{
"epoch": 14.94927494030633,
"grad_norm": 0.4008129835128784,
"learning_rate": 0.00015175801749271135,
"loss": 3.241,
"step": 51350
},
{
"epoch": 14.96383437190612,
"grad_norm": 0.42290031909942627,
"learning_rate": 0.00015132069970845479,
"loss": 3.2498,
"step": 51400
},
{
"epoch": 14.978393803505911,
"grad_norm": 0.39067307114601135,
"learning_rate": 0.00015088338192419825,
"loss": 3.2471,
"step": 51450
},
{
"epoch": 14.992953235105702,
"grad_norm": 0.3999600112438202,
"learning_rate": 0.00015044606413994167,
"loss": 3.249,
"step": 51500
},
{
"epoch": 15.007279715799895,
"grad_norm": 0.41343411803245544,
"learning_rate": 0.00015000874635568513,
"loss": 3.2085,
"step": 51550
},
{
"epoch": 15.021839147399685,
"grad_norm": 0.429661363363266,
"learning_rate": 0.00014957142857142854,
"loss": 3.1666,
"step": 51600
},
{
"epoch": 15.036398578999476,
"grad_norm": 0.4097861647605896,
"learning_rate": 0.00014913411078717198,
"loss": 3.1655,
"step": 51650
},
{
"epoch": 15.050958010599267,
"grad_norm": 0.4214050769805908,
"learning_rate": 0.00014869679300291545,
"loss": 3.1668,
"step": 51700
},
{
"epoch": 15.065517442199056,
"grad_norm": 0.4203738868236542,
"learning_rate": 0.0001482594752186589,
"loss": 3.1648,
"step": 51750
},
{
"epoch": 15.080076873798847,
"grad_norm": 0.41821858286857605,
"learning_rate": 0.00014782215743440233,
"loss": 3.1647,
"step": 51800
},
{
"epoch": 15.094636305398637,
"grad_norm": 0.4323025941848755,
"learning_rate": 0.00014738483965014577,
"loss": 3.1814,
"step": 51850
},
{
"epoch": 15.109195736998428,
"grad_norm": 0.39484599232673645,
"learning_rate": 0.0001469475218658892,
"loss": 3.1772,
"step": 51900
},
{
"epoch": 15.123755168598217,
"grad_norm": 0.41107800602912903,
"learning_rate": 0.00014651020408163265,
"loss": 3.1784,
"step": 51950
},
{
"epoch": 15.138314600198008,
"grad_norm": 0.4029431939125061,
"learning_rate": 0.0001460728862973761,
"loss": 3.1718,
"step": 52000
},
{
"epoch": 15.138314600198008,
"eval_accuracy": 0.37368789708783734,
"eval_loss": 3.5386550426483154,
"eval_runtime": 179.8512,
"eval_samples_per_second": 92.538,
"eval_steps_per_second": 5.788,
"step": 52000
},
{
"epoch": 15.152874031797799,
"grad_norm": 0.40105974674224854,
"learning_rate": 0.00014563556851311953,
"loss": 3.1775,
"step": 52050
},
{
"epoch": 15.16743346339759,
"grad_norm": 0.42404162883758545,
"learning_rate": 0.00014519825072886297,
"loss": 3.1747,
"step": 52100
},
{
"epoch": 15.181992894997379,
"grad_norm": 0.4054589867591858,
"learning_rate": 0.0001447609329446064,
"loss": 3.1894,
"step": 52150
},
{
"epoch": 15.19655232659717,
"grad_norm": 0.41971468925476074,
"learning_rate": 0.00014432361516034984,
"loss": 3.1836,
"step": 52200
},
{
"epoch": 15.21111175819696,
"grad_norm": 0.4210222661495209,
"learning_rate": 0.00014388629737609328,
"loss": 3.1732,
"step": 52250
},
{
"epoch": 15.225671189796751,
"grad_norm": 0.42625147104263306,
"learning_rate": 0.00014344897959183672,
"loss": 3.1857,
"step": 52300
},
{
"epoch": 15.24023062139654,
"grad_norm": 0.4069248139858246,
"learning_rate": 0.00014301166180758016,
"loss": 3.1868,
"step": 52350
},
{
"epoch": 15.25479005299633,
"grad_norm": 0.4041866660118103,
"learning_rate": 0.0001425743440233236,
"loss": 3.1881,
"step": 52400
},
{
"epoch": 15.269349484596122,
"grad_norm": 0.40314981341362,
"learning_rate": 0.00014213702623906704,
"loss": 3.191,
"step": 52450
},
{
"epoch": 15.283908916195912,
"grad_norm": 0.4245761036872864,
"learning_rate": 0.00014169970845481048,
"loss": 3.1888,
"step": 52500
},
{
"epoch": 15.298468347795701,
"grad_norm": 0.40877848863601685,
"learning_rate": 0.00014126239067055392,
"loss": 3.1917,
"step": 52550
},
{
"epoch": 15.313027779395492,
"grad_norm": 0.4157589375972748,
"learning_rate": 0.00014082507288629736,
"loss": 3.2,
"step": 52600
},
{
"epoch": 15.327587210995283,
"grad_norm": 0.39100512862205505,
"learning_rate": 0.0001403877551020408,
"loss": 3.2019,
"step": 52650
},
{
"epoch": 15.342146642595074,
"grad_norm": 0.4026622474193573,
"learning_rate": 0.00013995043731778424,
"loss": 3.1937,
"step": 52700
},
{
"epoch": 15.356706074194863,
"grad_norm": 0.4168094992637634,
"learning_rate": 0.00013951311953352768,
"loss": 3.2028,
"step": 52750
},
{
"epoch": 15.371265505794653,
"grad_norm": 0.41785821318626404,
"learning_rate": 0.00013907580174927112,
"loss": 3.1985,
"step": 52800
},
{
"epoch": 15.385824937394444,
"grad_norm": 0.4225537180900574,
"learning_rate": 0.00013863848396501456,
"loss": 3.1917,
"step": 52850
},
{
"epoch": 15.400384368994235,
"grad_norm": 0.421794056892395,
"learning_rate": 0.000138201166180758,
"loss": 3.2067,
"step": 52900
},
{
"epoch": 15.414943800594024,
"grad_norm": 0.42855533957481384,
"learning_rate": 0.00013776384839650144,
"loss": 3.1923,
"step": 52950
},
{
"epoch": 15.429503232193815,
"grad_norm": 0.4089881181716919,
"learning_rate": 0.00013732653061224488,
"loss": 3.1904,
"step": 53000
},
{
"epoch": 15.429503232193815,
"eval_accuracy": 0.3743012141820642,
"eval_loss": 3.535125732421875,
"eval_runtime": 179.8043,
"eval_samples_per_second": 92.562,
"eval_steps_per_second": 5.79,
"step": 53000
},
{
"epoch": 15.444062663793606,
"grad_norm": 0.4249016046524048,
"learning_rate": 0.00013688921282798832,
"loss": 3.1893,
"step": 53050
},
{
"epoch": 15.458622095393396,
"grad_norm": 0.4056449234485626,
"learning_rate": 0.00013645189504373176,
"loss": 3.1973,
"step": 53100
},
{
"epoch": 15.473181526993185,
"grad_norm": 0.411278635263443,
"learning_rate": 0.00013601457725947522,
"loss": 3.2089,
"step": 53150
},
{
"epoch": 15.487740958592976,
"grad_norm": 0.4109366536140442,
"learning_rate": 0.00013557725947521866,
"loss": 3.2041,
"step": 53200
},
{
"epoch": 15.502300390192767,
"grad_norm": 0.42039617896080017,
"learning_rate": 0.0001351399416909621,
"loss": 3.2103,
"step": 53250
},
{
"epoch": 15.516859821792558,
"grad_norm": 0.4206897020339966,
"learning_rate": 0.0001347026239067055,
"loss": 3.2148,
"step": 53300
},
{
"epoch": 15.531419253392347,
"grad_norm": 0.4062785506248474,
"learning_rate": 0.00013426530612244895,
"loss": 3.2114,
"step": 53350
},
{
"epoch": 15.545978684992138,
"grad_norm": 0.42714810371398926,
"learning_rate": 0.0001338279883381924,
"loss": 3.1939,
"step": 53400
},
{
"epoch": 15.560538116591928,
"grad_norm": 0.39876991510391235,
"learning_rate": 0.00013339067055393586,
"loss": 3.1901,
"step": 53450
},
{
"epoch": 15.575097548191719,
"grad_norm": 0.397652804851532,
"learning_rate": 0.0001329533527696793,
"loss": 3.2233,
"step": 53500
},
{
"epoch": 15.58965697979151,
"grad_norm": 0.40526559948921204,
"learning_rate": 0.00013251603498542274,
"loss": 3.214,
"step": 53550
},
{
"epoch": 15.604216411391299,
"grad_norm": 0.40763163566589355,
"learning_rate": 0.00013207871720116618,
"loss": 3.2026,
"step": 53600
},
{
"epoch": 15.61877584299109,
"grad_norm": 0.4108482003211975,
"learning_rate": 0.00013164139941690962,
"loss": 3.1983,
"step": 53650
},
{
"epoch": 15.63333527459088,
"grad_norm": 0.42270195484161377,
"learning_rate": 0.00013120408163265306,
"loss": 3.2072,
"step": 53700
},
{
"epoch": 15.64789470619067,
"grad_norm": 0.4099654257297516,
"learning_rate": 0.0001307667638483965,
"loss": 3.2059,
"step": 53750
},
{
"epoch": 15.66245413779046,
"grad_norm": 0.416862428188324,
"learning_rate": 0.00013032944606413994,
"loss": 3.2046,
"step": 53800
},
{
"epoch": 15.677013569390251,
"grad_norm": 0.41440367698669434,
"learning_rate": 0.00012989212827988337,
"loss": 3.206,
"step": 53850
},
{
"epoch": 15.691573000990042,
"grad_norm": 0.4144529402256012,
"learning_rate": 0.00012945481049562681,
"loss": 3.206,
"step": 53900
},
{
"epoch": 15.706132432589833,
"grad_norm": 0.41030511260032654,
"learning_rate": 0.00012901749271137025,
"loss": 3.2183,
"step": 53950
},
{
"epoch": 15.720691864189622,
"grad_norm": 0.4152276813983917,
"learning_rate": 0.0001285801749271137,
"loss": 3.2151,
"step": 54000
},
{
"epoch": 15.720691864189622,
"eval_accuracy": 0.37434131025743445,
"eval_loss": 3.5293774604797363,
"eval_runtime": 179.8109,
"eval_samples_per_second": 92.558,
"eval_steps_per_second": 5.789,
"step": 54000
},
{
"epoch": 15.735251295789412,
"grad_norm": 0.40948715806007385,
"learning_rate": 0.00012814285714285713,
"loss": 3.2193,
"step": 54050
},
{
"epoch": 15.749810727389203,
"grad_norm": 0.3965863883495331,
"learning_rate": 0.00012770553935860057,
"loss": 3.2169,
"step": 54100
},
{
"epoch": 15.764370158988992,
"grad_norm": 0.4163205325603485,
"learning_rate": 0.000127268221574344,
"loss": 3.2179,
"step": 54150
},
{
"epoch": 15.778929590588783,
"grad_norm": 0.4208901524543762,
"learning_rate": 0.00012683090379008745,
"loss": 3.2119,
"step": 54200
},
{
"epoch": 15.793489022188574,
"grad_norm": 0.4129723012447357,
"learning_rate": 0.0001263935860058309,
"loss": 3.2213,
"step": 54250
},
{
"epoch": 15.808048453788365,
"grad_norm": 0.4082995653152466,
"learning_rate": 0.00012595626822157433,
"loss": 3.2165,
"step": 54300
},
{
"epoch": 15.822607885388155,
"grad_norm": 0.40925273299217224,
"learning_rate": 0.00012551895043731777,
"loss": 3.2099,
"step": 54350
},
{
"epoch": 15.837167316987944,
"grad_norm": 0.41905489563941956,
"learning_rate": 0.0001250816326530612,
"loss": 3.2053,
"step": 54400
},
{
"epoch": 15.851726748587735,
"grad_norm": 0.40665403008461,
"learning_rate": 0.00012464431486880465,
"loss": 3.2119,
"step": 54450
},
{
"epoch": 15.866286180187526,
"grad_norm": 0.41052520275115967,
"learning_rate": 0.0001242069970845481,
"loss": 3.2157,
"step": 54500
},
{
"epoch": 15.880845611787315,
"grad_norm": 0.42254316806793213,
"learning_rate": 0.00012376967930029153,
"loss": 3.223,
"step": 54550
},
{
"epoch": 15.895405043387106,
"grad_norm": 0.40757060050964355,
"learning_rate": 0.00012333236151603497,
"loss": 3.2128,
"step": 54600
},
{
"epoch": 15.909964474986896,
"grad_norm": 0.4044566750526428,
"learning_rate": 0.0001228950437317784,
"loss": 3.2277,
"step": 54650
},
{
"epoch": 15.924523906586687,
"grad_norm": 0.40794673562049866,
"learning_rate": 0.00012245772594752185,
"loss": 3.2132,
"step": 54700
},
{
"epoch": 15.939083338186478,
"grad_norm": 0.41059088706970215,
"learning_rate": 0.0001220204081632653,
"loss": 3.2337,
"step": 54750
},
{
"epoch": 15.953642769786267,
"grad_norm": 0.40359726548194885,
"learning_rate": 0.00012158309037900874,
"loss": 3.2256,
"step": 54800
},
{
"epoch": 15.968202201386058,
"grad_norm": 0.4164576828479767,
"learning_rate": 0.00012114577259475218,
"loss": 3.2193,
"step": 54850
},
{
"epoch": 15.982761632985849,
"grad_norm": 0.41666367650032043,
"learning_rate": 0.00012070845481049562,
"loss": 3.1938,
"step": 54900
},
{
"epoch": 15.99732106458564,
"grad_norm": 0.41611310839653015,
"learning_rate": 0.00012027113702623906,
"loss": 3.215,
"step": 54950
},
{
"epoch": 16.011647545279832,
"grad_norm": 0.4197295606136322,
"learning_rate": 0.00011983381924198251,
"loss": 3.1627,
"step": 55000
},
{
"epoch": 16.011647545279832,
"eval_accuracy": 0.3746472633017554,
"eval_loss": 3.531348943710327,
"eval_runtime": 179.744,
"eval_samples_per_second": 92.593,
"eval_steps_per_second": 5.792,
"step": 55000
},
{
"epoch": 16.02620697687962,
"grad_norm": 0.4169209897518158,
"learning_rate": 0.00011939650145772594,
"loss": 3.1447,
"step": 55050
},
{
"epoch": 16.040766408479413,
"grad_norm": 0.4214211702346802,
"learning_rate": 0.00011895918367346938,
"loss": 3.1481,
"step": 55100
},
{
"epoch": 16.055325840079202,
"grad_norm": 0.4274289309978485,
"learning_rate": 0.00011852186588921281,
"loss": 3.1546,
"step": 55150
},
{
"epoch": 16.069885271678995,
"grad_norm": 0.4178299009799957,
"learning_rate": 0.00011808454810495625,
"loss": 3.1519,
"step": 55200
},
{
"epoch": 16.084444703278784,
"grad_norm": 0.42355021834373474,
"learning_rate": 0.0001176472303206997,
"loss": 3.157,
"step": 55250
},
{
"epoch": 16.099004134878573,
"grad_norm": 0.40021342039108276,
"learning_rate": 0.00011720991253644315,
"loss": 3.1528,
"step": 55300
},
{
"epoch": 16.113563566478366,
"grad_norm": 0.42021602392196655,
"learning_rate": 0.00011677259475218659,
"loss": 3.1428,
"step": 55350
},
{
"epoch": 16.128122998078155,
"grad_norm": 0.42361438274383545,
"learning_rate": 0.00011633527696793003,
"loss": 3.1672,
"step": 55400
},
{
"epoch": 16.142682429677944,
"grad_norm": 0.4197485148906708,
"learning_rate": 0.00011589795918367347,
"loss": 3.1628,
"step": 55450
},
{
"epoch": 16.157241861277736,
"grad_norm": 0.4117751717567444,
"learning_rate": 0.00011546064139941689,
"loss": 3.1614,
"step": 55500
},
{
"epoch": 16.171801292877525,
"grad_norm": 0.4112408459186554,
"learning_rate": 0.00011502332361516033,
"loss": 3.1468,
"step": 55550
},
{
"epoch": 16.186360724477318,
"grad_norm": 0.4194517731666565,
"learning_rate": 0.00011458600583090377,
"loss": 3.1663,
"step": 55600
},
{
"epoch": 16.200920156077107,
"grad_norm": 0.40641582012176514,
"learning_rate": 0.00011414868804664722,
"loss": 3.1604,
"step": 55650
},
{
"epoch": 16.215479587676896,
"grad_norm": 0.41321802139282227,
"learning_rate": 0.00011371137026239066,
"loss": 3.1541,
"step": 55700
},
{
"epoch": 16.23003901927669,
"grad_norm": 0.40778738260269165,
"learning_rate": 0.0001132740524781341,
"loss": 3.1585,
"step": 55750
},
{
"epoch": 16.244598450876477,
"grad_norm": 0.41870421171188354,
"learning_rate": 0.00011283673469387754,
"loss": 3.1611,
"step": 55800
},
{
"epoch": 16.259157882476266,
"grad_norm": 0.4245312809944153,
"learning_rate": 0.00011239941690962098,
"loss": 3.1824,
"step": 55850
},
{
"epoch": 16.27371731407606,
"grad_norm": 0.4253610074520111,
"learning_rate": 0.00011196209912536443,
"loss": 3.1707,
"step": 55900
},
{
"epoch": 16.288276745675848,
"grad_norm": 0.42540794610977173,
"learning_rate": 0.00011152478134110786,
"loss": 3.164,
"step": 55950
},
{
"epoch": 16.30283617727564,
"grad_norm": 0.4161224663257599,
"learning_rate": 0.0001110874635568513,
"loss": 3.1717,
"step": 56000
},
{
"epoch": 16.30283617727564,
"eval_accuracy": 0.37458894173758045,
"eval_loss": 3.533280372619629,
"eval_runtime": 179.7993,
"eval_samples_per_second": 92.564,
"eval_steps_per_second": 5.79,
"step": 56000
},
{
"epoch": 16.31739560887543,
"grad_norm": 0.4027422368526459,
"learning_rate": 0.00011065014577259474,
"loss": 3.1687,
"step": 56050
},
{
"epoch": 16.33195504047522,
"grad_norm": 0.43346402049064636,
"learning_rate": 0.00011021282798833818,
"loss": 3.1666,
"step": 56100
},
{
"epoch": 16.34651447207501,
"grad_norm": 0.4163694381713867,
"learning_rate": 0.00010977551020408162,
"loss": 3.176,
"step": 56150
},
{
"epoch": 16.3610739036748,
"grad_norm": 0.41778096556663513,
"learning_rate": 0.00010933819241982507,
"loss": 3.1565,
"step": 56200
},
{
"epoch": 16.375633335274593,
"grad_norm": 0.43710342049598694,
"learning_rate": 0.00010890087463556851,
"loss": 3.1731,
"step": 56250
},
{
"epoch": 16.39019276687438,
"grad_norm": 0.4328019320964813,
"learning_rate": 0.00010846355685131195,
"loss": 3.1798,
"step": 56300
},
{
"epoch": 16.40475219847417,
"grad_norm": 0.41606712341308594,
"learning_rate": 0.00010802623906705539,
"loss": 3.1892,
"step": 56350
},
{
"epoch": 16.419311630073963,
"grad_norm": 0.4367011487483978,
"learning_rate": 0.00010758892128279882,
"loss": 3.1866,
"step": 56400
},
{
"epoch": 16.433871061673752,
"grad_norm": 0.41168850660324097,
"learning_rate": 0.00010715160349854226,
"loss": 3.184,
"step": 56450
},
{
"epoch": 16.44843049327354,
"grad_norm": 0.4158318042755127,
"learning_rate": 0.00010671428571428571,
"loss": 3.177,
"step": 56500
},
{
"epoch": 16.462989924873334,
"grad_norm": 0.41305047273635864,
"learning_rate": 0.00010627696793002915,
"loss": 3.1765,
"step": 56550
},
{
"epoch": 16.477549356473123,
"grad_norm": 0.4143592119216919,
"learning_rate": 0.00010583965014577259,
"loss": 3.1872,
"step": 56600
},
{
"epoch": 16.492108788072915,
"grad_norm": 0.427137553691864,
"learning_rate": 0.00010540233236151603,
"loss": 3.1752,
"step": 56650
},
{
"epoch": 16.506668219672704,
"grad_norm": 0.42718973755836487,
"learning_rate": 0.00010496501457725947,
"loss": 3.1832,
"step": 56700
},
{
"epoch": 16.521227651272493,
"grad_norm": 0.4194405972957611,
"learning_rate": 0.00010452769679300292,
"loss": 3.1749,
"step": 56750
},
{
"epoch": 16.535787082872286,
"grad_norm": 0.4275641441345215,
"learning_rate": 0.00010409037900874634,
"loss": 3.1785,
"step": 56800
},
{
"epoch": 16.550346514472075,
"grad_norm": 0.42118945717811584,
"learning_rate": 0.00010365306122448978,
"loss": 3.1807,
"step": 56850
},
{
"epoch": 16.564905946071864,
"grad_norm": 0.42112356424331665,
"learning_rate": 0.00010321574344023322,
"loss": 3.185,
"step": 56900
},
{
"epoch": 16.579465377671657,
"grad_norm": 0.40832531452178955,
"learning_rate": 0.00010277842565597666,
"loss": 3.1863,
"step": 56950
},
{
"epoch": 16.594024809271446,
"grad_norm": 0.41556891798973083,
"learning_rate": 0.0001023411078717201,
"loss": 3.1887,
"step": 57000
},
{
"epoch": 16.594024809271446,
"eval_accuracy": 0.375009891737069,
"eval_loss": 3.5277862548828125,
"eval_runtime": 179.8457,
"eval_samples_per_second": 92.54,
"eval_steps_per_second": 5.788,
"step": 57000
},
{
"epoch": 16.608584240871238,
"grad_norm": 0.4110031723976135,
"learning_rate": 0.00010190379008746356,
"loss": 3.1741,
"step": 57050
},
{
"epoch": 16.623143672471027,
"grad_norm": 0.4251270890235901,
"learning_rate": 0.000101466472303207,
"loss": 3.1853,
"step": 57100
},
{
"epoch": 16.637703104070816,
"grad_norm": 0.4084523320198059,
"learning_rate": 0.00010102915451895043,
"loss": 3.1895,
"step": 57150
},
{
"epoch": 16.65226253567061,
"grad_norm": 0.43704357743263245,
"learning_rate": 0.00010059183673469387,
"loss": 3.2,
"step": 57200
},
{
"epoch": 16.666821967270398,
"grad_norm": 0.45471593737602234,
"learning_rate": 0.0001001545189504373,
"loss": 3.1835,
"step": 57250
},
{
"epoch": 16.681381398870187,
"grad_norm": 0.41394278407096863,
"learning_rate": 9.971720116618074e-05,
"loss": 3.1846,
"step": 57300
},
{
"epoch": 16.69594083046998,
"grad_norm": 0.4269932210445404,
"learning_rate": 9.927988338192418e-05,
"loss": 3.1975,
"step": 57350
},
{
"epoch": 16.71050026206977,
"grad_norm": 0.41786158084869385,
"learning_rate": 9.884256559766763e-05,
"loss": 3.2013,
"step": 57400
},
{
"epoch": 16.72505969366956,
"grad_norm": 0.41637122631073,
"learning_rate": 9.840524781341107e-05,
"loss": 3.1928,
"step": 57450
},
{
"epoch": 16.73961912526935,
"grad_norm": 0.42277225852012634,
"learning_rate": 9.796793002915451e-05,
"loss": 3.1766,
"step": 57500
},
{
"epoch": 16.75417855686914,
"grad_norm": 0.421393483877182,
"learning_rate": 9.753061224489795e-05,
"loss": 3.1925,
"step": 57550
},
{
"epoch": 16.76873798846893,
"grad_norm": 0.4143362045288086,
"learning_rate": 9.709329446064139e-05,
"loss": 3.195,
"step": 57600
},
{
"epoch": 16.78329742006872,
"grad_norm": 0.4214780628681183,
"learning_rate": 9.665597667638484e-05,
"loss": 3.1782,
"step": 57650
},
{
"epoch": 16.79785685166851,
"grad_norm": 0.43336132168769836,
"learning_rate": 9.621865889212827e-05,
"loss": 3.1895,
"step": 57700
},
{
"epoch": 16.812416283268302,
"grad_norm": 0.4261871874332428,
"learning_rate": 9.578134110787171e-05,
"loss": 3.1901,
"step": 57750
},
{
"epoch": 16.82697571486809,
"grad_norm": 0.43283653259277344,
"learning_rate": 9.534402332361515e-05,
"loss": 3.1966,
"step": 57800
},
{
"epoch": 16.841535146467884,
"grad_norm": 0.4269659221172333,
"learning_rate": 9.490670553935859e-05,
"loss": 3.1832,
"step": 57850
},
{
"epoch": 16.856094578067673,
"grad_norm": 0.417172372341156,
"learning_rate": 9.446938775510203e-05,
"loss": 3.1962,
"step": 57900
},
{
"epoch": 16.87065400966746,
"grad_norm": 0.41687533259391785,
"learning_rate": 9.403206997084548e-05,
"loss": 3.187,
"step": 57950
},
{
"epoch": 16.885213441267254,
"grad_norm": 0.40989968180656433,
"learning_rate": 9.359475218658892e-05,
"loss": 3.1901,
"step": 58000
},
{
"epoch": 16.885213441267254,
"eval_accuracy": 0.375465411373387,
"eval_loss": 3.5243453979492188,
"eval_runtime": 179.7012,
"eval_samples_per_second": 92.615,
"eval_steps_per_second": 5.793,
"step": 58000
},
{
"epoch": 16.899772872867043,
"grad_norm": 0.42806559801101685,
"learning_rate": 9.315743440233236e-05,
"loss": 3.1907,
"step": 58050
},
{
"epoch": 16.914332304466832,
"grad_norm": 0.40956610441207886,
"learning_rate": 9.27201166180758e-05,
"loss": 3.1856,
"step": 58100
},
{
"epoch": 16.928891736066625,
"grad_norm": 0.41996699571609497,
"learning_rate": 9.228279883381922e-05,
"loss": 3.2155,
"step": 58150
},
{
"epoch": 16.943451167666414,
"grad_norm": 0.4105769693851471,
"learning_rate": 9.184548104956266e-05,
"loss": 3.1813,
"step": 58200
},
{
"epoch": 16.958010599266206,
"grad_norm": 0.4150310754776001,
"learning_rate": 9.140816326530612e-05,
"loss": 3.1908,
"step": 58250
},
{
"epoch": 16.972570030865995,
"grad_norm": 0.42143964767456055,
"learning_rate": 9.097084548104956e-05,
"loss": 3.1859,
"step": 58300
},
{
"epoch": 16.987129462465784,
"grad_norm": 0.42836061120033264,
"learning_rate": 9.0533527696793e-05,
"loss": 3.1989,
"step": 58350
},
{
"epoch": 17.00145594315998,
"grad_norm": 0.413256973028183,
"learning_rate": 9.009620991253644e-05,
"loss": 3.1796,
"step": 58400
},
{
"epoch": 17.01601537475977,
"grad_norm": 0.43588265776634216,
"learning_rate": 8.965889212827987e-05,
"loss": 3.1193,
"step": 58450
},
{
"epoch": 17.03057480635956,
"grad_norm": 0.4288026690483093,
"learning_rate": 8.922157434402333e-05,
"loss": 3.1271,
"step": 58500
},
{
"epoch": 17.04513423795935,
"grad_norm": 0.42423388361930847,
"learning_rate": 8.878425655976677e-05,
"loss": 3.1333,
"step": 58550
},
{
"epoch": 17.05969366955914,
"grad_norm": 0.41696009039878845,
"learning_rate": 8.83469387755102e-05,
"loss": 3.134,
"step": 58600
},
{
"epoch": 17.07425310115893,
"grad_norm": 0.4192046523094177,
"learning_rate": 8.790962099125363e-05,
"loss": 3.1377,
"step": 58650
},
{
"epoch": 17.08881253275872,
"grad_norm": 0.43218058347702026,
"learning_rate": 8.747230320699707e-05,
"loss": 3.1403,
"step": 58700
},
{
"epoch": 17.103371964358512,
"grad_norm": 0.4226701855659485,
"learning_rate": 8.703498542274051e-05,
"loss": 3.1419,
"step": 58750
},
{
"epoch": 17.1179313959583,
"grad_norm": 0.4426027536392212,
"learning_rate": 8.659766763848396e-05,
"loss": 3.1454,
"step": 58800
},
{
"epoch": 17.132490827558094,
"grad_norm": 0.43630558252334595,
"learning_rate": 8.61603498542274e-05,
"loss": 3.1482,
"step": 58850
},
{
"epoch": 17.147050259157883,
"grad_norm": 0.43396347761154175,
"learning_rate": 8.572303206997084e-05,
"loss": 3.1412,
"step": 58900
},
{
"epoch": 17.161609690757672,
"grad_norm": 0.42278262972831726,
"learning_rate": 8.528571428571428e-05,
"loss": 3.1413,
"step": 58950
},
{
"epoch": 17.176169122357464,
"grad_norm": 0.4185219705104828,
"learning_rate": 8.484839650145771e-05,
"loss": 3.1498,
"step": 59000
},
{
"epoch": 17.176169122357464,
"eval_accuracy": 0.37519579172287665,
"eval_loss": 3.5312256813049316,
"eval_runtime": 179.692,
"eval_samples_per_second": 92.62,
"eval_steps_per_second": 5.793,
"step": 59000
},
{
"epoch": 17.190728553957253,
"grad_norm": 0.437406063079834,
"learning_rate": 8.441107871720115e-05,
"loss": 3.1484,
"step": 59050
},
{
"epoch": 17.205287985557042,
"grad_norm": 0.41340211033821106,
"learning_rate": 8.397376093294459e-05,
"loss": 3.1348,
"step": 59100
},
{
"epoch": 17.219847417156835,
"grad_norm": 0.43353283405303955,
"learning_rate": 8.353644314868804e-05,
"loss": 3.1418,
"step": 59150
},
{
"epoch": 17.234406848756624,
"grad_norm": 0.430377721786499,
"learning_rate": 8.309912536443148e-05,
"loss": 3.1397,
"step": 59200
},
{
"epoch": 17.248966280356417,
"grad_norm": 0.4174799919128418,
"learning_rate": 8.266180758017492e-05,
"loss": 3.1419,
"step": 59250
},
{
"epoch": 17.263525711956206,
"grad_norm": 0.4215719699859619,
"learning_rate": 8.222448979591836e-05,
"loss": 3.1431,
"step": 59300
},
{
"epoch": 17.278085143555995,
"grad_norm": 0.42813974618911743,
"learning_rate": 8.17871720116618e-05,
"loss": 3.1442,
"step": 59350
},
{
"epoch": 17.292644575155787,
"grad_norm": 0.4232740104198456,
"learning_rate": 8.134985422740525e-05,
"loss": 3.1537,
"step": 59400
},
{
"epoch": 17.307204006755576,
"grad_norm": 0.42498260736465454,
"learning_rate": 8.091253644314868e-05,
"loss": 3.1549,
"step": 59450
},
{
"epoch": 17.321763438355365,
"grad_norm": 0.42179837822914124,
"learning_rate": 8.047521865889212e-05,
"loss": 3.1431,
"step": 59500
},
{
"epoch": 17.336322869955158,
"grad_norm": 0.44017553329467773,
"learning_rate": 8.003790087463556e-05,
"loss": 3.154,
"step": 59550
},
{
"epoch": 17.350882301554947,
"grad_norm": 0.41716912388801575,
"learning_rate": 7.9600583090379e-05,
"loss": 3.1716,
"step": 59600
},
{
"epoch": 17.36544173315474,
"grad_norm": 0.41841381788253784,
"learning_rate": 7.916326530612244e-05,
"loss": 3.1608,
"step": 59650
},
{
"epoch": 17.38000116475453,
"grad_norm": 0.42180711030960083,
"learning_rate": 7.872594752186589e-05,
"loss": 3.1682,
"step": 59700
},
{
"epoch": 17.394560596354317,
"grad_norm": 0.42136502265930176,
"learning_rate": 7.828862973760933e-05,
"loss": 3.152,
"step": 59750
},
{
"epoch": 17.40912002795411,
"grad_norm": 0.4209064245223999,
"learning_rate": 7.785131195335277e-05,
"loss": 3.146,
"step": 59800
},
{
"epoch": 17.4236794595539,
"grad_norm": 0.4188046157360077,
"learning_rate": 7.741399416909621e-05,
"loss": 3.1496,
"step": 59850
},
{
"epoch": 17.438238891153688,
"grad_norm": 0.4277164041996002,
"learning_rate": 7.697667638483963e-05,
"loss": 3.1616,
"step": 59900
},
{
"epoch": 17.45279832275348,
"grad_norm": 0.44717901945114136,
"learning_rate": 7.653935860058307e-05,
"loss": 3.161,
"step": 59950
},
{
"epoch": 17.46735775435327,
"grad_norm": 0.40624383091926575,
"learning_rate": 7.610204081632653e-05,
"loss": 3.153,
"step": 60000
},
{
"epoch": 17.46735775435327,
"eval_accuracy": 0.37553701990681954,
"eval_loss": 3.5280678272247314,
"eval_runtime": 179.5658,
"eval_samples_per_second": 92.685,
"eval_steps_per_second": 5.797,
"step": 60000
},
{
"epoch": 17.481917185953062,
"grad_norm": 0.44909200072288513,
"learning_rate": 7.566472303206997e-05,
"loss": 3.1741,
"step": 60050
},
{
"epoch": 17.49647661755285,
"grad_norm": 0.43057718873023987,
"learning_rate": 7.52274052478134e-05,
"loss": 3.1605,
"step": 60100
},
{
"epoch": 17.51103604915264,
"grad_norm": 0.41980892419815063,
"learning_rate": 7.479008746355684e-05,
"loss": 3.1542,
"step": 60150
},
{
"epoch": 17.525595480752433,
"grad_norm": 0.41728129982948303,
"learning_rate": 7.435276967930028e-05,
"loss": 3.1517,
"step": 60200
},
{
"epoch": 17.54015491235222,
"grad_norm": 0.41326892375946045,
"learning_rate": 7.391545189504372e-05,
"loss": 3.1509,
"step": 60250
},
{
"epoch": 17.55471434395201,
"grad_norm": 0.43065664172172546,
"learning_rate": 7.347813411078716e-05,
"loss": 3.1619,
"step": 60300
},
{
"epoch": 17.569273775551803,
"grad_norm": 0.40984541177749634,
"learning_rate": 7.30408163265306e-05,
"loss": 3.1531,
"step": 60350
},
{
"epoch": 17.583833207151592,
"grad_norm": 0.4190158247947693,
"learning_rate": 7.260349854227406e-05,
"loss": 3.1643,
"step": 60400
},
{
"epoch": 17.598392638751385,
"grad_norm": 0.4273621141910553,
"learning_rate": 7.216618075801748e-05,
"loss": 3.1627,
"step": 60450
},
{
"epoch": 17.612952070351174,
"grad_norm": 0.41709256172180176,
"learning_rate": 7.172886297376092e-05,
"loss": 3.1575,
"step": 60500
},
{
"epoch": 17.627511501950963,
"grad_norm": 0.42943283915519714,
"learning_rate": 7.129154518950437e-05,
"loss": 3.1634,
"step": 60550
},
{
"epoch": 17.642070933550755,
"grad_norm": 0.43262234330177307,
"learning_rate": 7.085422740524781e-05,
"loss": 3.1558,
"step": 60600
},
{
"epoch": 17.656630365150544,
"grad_norm": 0.4381544589996338,
"learning_rate": 7.041690962099124e-05,
"loss": 3.1505,
"step": 60650
},
{
"epoch": 17.671189796750333,
"grad_norm": 0.4137042164802551,
"learning_rate": 6.997959183673469e-05,
"loss": 3.1699,
"step": 60700
},
{
"epoch": 17.685749228350126,
"grad_norm": 0.4235377311706543,
"learning_rate": 6.954227405247813e-05,
"loss": 3.169,
"step": 60750
},
{
"epoch": 17.700308659949915,
"grad_norm": 0.4289599061012268,
"learning_rate": 6.910495626822157e-05,
"loss": 3.1644,
"step": 60800
},
{
"epoch": 17.714868091549707,
"grad_norm": 0.4362151622772217,
"learning_rate": 6.866763848396501e-05,
"loss": 3.1613,
"step": 60850
},
{
"epoch": 17.729427523149496,
"grad_norm": 0.42833998799324036,
"learning_rate": 6.823032069970845e-05,
"loss": 3.1587,
"step": 60900
},
{
"epoch": 17.743986954749285,
"grad_norm": 0.4399029314517975,
"learning_rate": 6.779300291545189e-05,
"loss": 3.1602,
"step": 60950
},
{
"epoch": 17.758546386349078,
"grad_norm": 0.41309642791748047,
"learning_rate": 6.735568513119533e-05,
"loss": 3.1645,
"step": 61000
},
{
"epoch": 17.758546386349078,
"eval_accuracy": 0.37569305360774724,
"eval_loss": 3.5239369869232178,
"eval_runtime": 179.7036,
"eval_samples_per_second": 92.614,
"eval_steps_per_second": 5.793,
"step": 61000
},
{
"epoch": 17.773105817948867,
"grad_norm": 0.4099070727825165,
"learning_rate": 6.691836734693877e-05,
"loss": 3.165,
"step": 61050
},
{
"epoch": 17.787665249548656,
"grad_norm": 0.4193074405193329,
"learning_rate": 6.648104956268221e-05,
"loss": 3.1705,
"step": 61100
},
{
"epoch": 17.80222468114845,
"grad_norm": 0.42403796315193176,
"learning_rate": 6.604373177842565e-05,
"loss": 3.1646,
"step": 61150
},
{
"epoch": 17.816784112748238,
"grad_norm": 0.4121449589729309,
"learning_rate": 6.560641399416909e-05,
"loss": 3.1642,
"step": 61200
},
{
"epoch": 17.83134354434803,
"grad_norm": 0.4212753176689148,
"learning_rate": 6.516909620991253e-05,
"loss": 3.163,
"step": 61250
},
{
"epoch": 17.84590297594782,
"grad_norm": 0.43732142448425293,
"learning_rate": 6.473177842565598e-05,
"loss": 3.1773,
"step": 61300
},
{
"epoch": 17.860462407547608,
"grad_norm": 0.4187993109226227,
"learning_rate": 6.42944606413994e-05,
"loss": 3.1628,
"step": 61350
},
{
"epoch": 17.8750218391474,
"grad_norm": 0.4270228445529938,
"learning_rate": 6.385714285714284e-05,
"loss": 3.1583,
"step": 61400
},
{
"epoch": 17.88958127074719,
"grad_norm": 0.42085975408554077,
"learning_rate": 6.34198250728863e-05,
"loss": 3.157,
"step": 61450
},
{
"epoch": 17.90414070234698,
"grad_norm": 0.4262135326862335,
"learning_rate": 6.298250728862974e-05,
"loss": 3.1748,
"step": 61500
},
{
"epoch": 17.91870013394677,
"grad_norm": 0.43082737922668457,
"learning_rate": 6.254518950437316e-05,
"loss": 3.1695,
"step": 61550
},
{
"epoch": 17.93325956554656,
"grad_norm": 0.43583792448043823,
"learning_rate": 6.210787172011662e-05,
"loss": 3.1674,
"step": 61600
},
{
"epoch": 17.947818997146353,
"grad_norm": 0.4230196177959442,
"learning_rate": 6.167055393586006e-05,
"loss": 3.1615,
"step": 61650
},
{
"epoch": 17.962378428746142,
"grad_norm": 0.41631415486335754,
"learning_rate": 6.12332361516035e-05,
"loss": 3.1596,
"step": 61700
},
{
"epoch": 17.97693786034593,
"grad_norm": 0.42340517044067383,
"learning_rate": 6.079591836734693e-05,
"loss": 3.1593,
"step": 61750
},
{
"epoch": 17.991497291945723,
"grad_norm": 0.4355524182319641,
"learning_rate": 6.0358600583090374e-05,
"loss": 3.1761,
"step": 61800
},
{
"epoch": 18.005823772639918,
"grad_norm": 0.4226396083831787,
"learning_rate": 5.9921282798833814e-05,
"loss": 3.1571,
"step": 61850
},
{
"epoch": 18.020383204239707,
"grad_norm": 0.42405375838279724,
"learning_rate": 5.948396501457725e-05,
"loss": 3.1209,
"step": 61900
},
{
"epoch": 18.034942635839496,
"grad_norm": 0.42336833477020264,
"learning_rate": 5.90466472303207e-05,
"loss": 3.1297,
"step": 61950
},
{
"epoch": 18.04950206743929,
"grad_norm": 0.43224647641181946,
"learning_rate": 5.860932944606413e-05,
"loss": 3.1118,
"step": 62000
},
{
"epoch": 18.04950206743929,
"eval_accuracy": 0.37571586486470276,
"eval_loss": 3.526365041732788,
"eval_runtime": 179.6025,
"eval_samples_per_second": 92.666,
"eval_steps_per_second": 5.796,
"step": 62000
},
{
"epoch": 18.064061499039077,
"grad_norm": 0.44077619910240173,
"learning_rate": 5.817201166180757e-05,
"loss": 3.1216,
"step": 62050
},
{
"epoch": 18.078620930638866,
"grad_norm": 0.4180721342563629,
"learning_rate": 5.773469387755102e-05,
"loss": 3.1149,
"step": 62100
},
{
"epoch": 18.09318036223866,
"grad_norm": 0.4361351430416107,
"learning_rate": 5.729737609329446e-05,
"loss": 3.1209,
"step": 62150
},
{
"epoch": 18.107739793838448,
"grad_norm": 0.4234081208705902,
"learning_rate": 5.686005830903789e-05,
"loss": 3.1161,
"step": 62200
},
{
"epoch": 18.12229922543824,
"grad_norm": 0.4476962387561798,
"learning_rate": 5.6422740524781336e-05,
"loss": 3.1178,
"step": 62250
},
{
"epoch": 18.13685865703803,
"grad_norm": 0.4251765012741089,
"learning_rate": 5.5985422740524776e-05,
"loss": 3.1241,
"step": 62300
},
{
"epoch": 18.15141808863782,
"grad_norm": 0.4326912462711334,
"learning_rate": 5.554810495626822e-05,
"loss": 3.1365,
"step": 62350
},
{
"epoch": 18.16597752023761,
"grad_norm": 0.41611161828041077,
"learning_rate": 5.5110787172011655e-05,
"loss": 3.1302,
"step": 62400
},
{
"epoch": 18.1805369518374,
"grad_norm": 0.424152672290802,
"learning_rate": 5.4673469387755094e-05,
"loss": 3.1176,
"step": 62450
},
{
"epoch": 18.19509638343719,
"grad_norm": 0.41061875224113464,
"learning_rate": 5.4236151603498534e-05,
"loss": 3.121,
"step": 62500
},
{
"epoch": 18.20965581503698,
"grad_norm": 0.4476122260093689,
"learning_rate": 5.379883381924198e-05,
"loss": 3.1174,
"step": 62550
},
{
"epoch": 18.22421524663677,
"grad_norm": 0.4128943979740143,
"learning_rate": 5.336151603498542e-05,
"loss": 3.139,
"step": 62600
},
{
"epoch": 18.238774678236563,
"grad_norm": 0.43339040875434875,
"learning_rate": 5.292419825072885e-05,
"loss": 3.1296,
"step": 62650
},
{
"epoch": 18.253334109836352,
"grad_norm": 0.4117395877838135,
"learning_rate": 5.24868804664723e-05,
"loss": 3.1261,
"step": 62700
},
{
"epoch": 18.26789354143614,
"grad_norm": 0.4270617961883545,
"learning_rate": 5.204956268221574e-05,
"loss": 3.1174,
"step": 62750
},
{
"epoch": 18.282452973035934,
"grad_norm": 0.4476640820503235,
"learning_rate": 5.1612244897959184e-05,
"loss": 3.1307,
"step": 62800
},
{
"epoch": 18.297012404635723,
"grad_norm": 0.4141584038734436,
"learning_rate": 5.117492711370262e-05,
"loss": 3.1366,
"step": 62850
},
{
"epoch": 18.31157183623551,
"grad_norm": 0.4253668189048767,
"learning_rate": 5.0737609329446057e-05,
"loss": 3.1301,
"step": 62900
},
{
"epoch": 18.326131267835304,
"grad_norm": 0.4373990595340729,
"learning_rate": 5.03002915451895e-05,
"loss": 3.1225,
"step": 62950
},
{
"epoch": 18.340690699435093,
"grad_norm": 0.41809549927711487,
"learning_rate": 4.986297376093294e-05,
"loss": 3.1376,
"step": 63000
},
{
"epoch": 18.340690699435093,
"eval_accuracy": 0.3758768370851776,
"eval_loss": 3.526252031326294,
"eval_runtime": 179.5814,
"eval_samples_per_second": 92.677,
"eval_steps_per_second": 5.797,
"step": 63000
},
{
"epoch": 18.355250131034886,
"grad_norm": 0.42304909229278564,
"learning_rate": 4.942565597667638e-05,
"loss": 3.1323,
"step": 63050
},
{
"epoch": 18.369809562634675,
"grad_norm": 0.42781421542167664,
"learning_rate": 4.898833819241982e-05,
"loss": 3.134,
"step": 63100
},
{
"epoch": 18.384368994234464,
"grad_norm": 0.42817869782447815,
"learning_rate": 4.855102040816326e-05,
"loss": 3.1311,
"step": 63150
},
{
"epoch": 18.398928425834256,
"grad_norm": 0.43528953194618225,
"learning_rate": 4.81137026239067e-05,
"loss": 3.1295,
"step": 63200
},
{
"epoch": 18.413487857434045,
"grad_norm": 0.4256283938884735,
"learning_rate": 4.7676384839650146e-05,
"loss": 3.1365,
"step": 63250
},
{
"epoch": 18.428047289033834,
"grad_norm": 0.42997053265571594,
"learning_rate": 4.723906705539358e-05,
"loss": 3.138,
"step": 63300
},
{
"epoch": 18.442606720633627,
"grad_norm": 0.4277358651161194,
"learning_rate": 4.680174927113702e-05,
"loss": 3.1319,
"step": 63350
},
{
"epoch": 18.457166152233416,
"grad_norm": 0.43694230914115906,
"learning_rate": 4.6364431486880465e-05,
"loss": 3.1376,
"step": 63400
},
{
"epoch": 18.47172558383321,
"grad_norm": 0.427422434091568,
"learning_rate": 4.5927113702623904e-05,
"loss": 3.1373,
"step": 63450
},
{
"epoch": 18.486285015432998,
"grad_norm": 0.43156328797340393,
"learning_rate": 4.548979591836734e-05,
"loss": 3.1221,
"step": 63500
},
{
"epoch": 18.500844447032787,
"grad_norm": 0.42131608724594116,
"learning_rate": 4.505247813411078e-05,
"loss": 3.1361,
"step": 63550
},
{
"epoch": 18.51540387863258,
"grad_norm": 0.44428882002830505,
"learning_rate": 4.461516034985422e-05,
"loss": 3.1381,
"step": 63600
},
{
"epoch": 18.529963310232368,
"grad_norm": 0.4337320327758789,
"learning_rate": 4.417784256559766e-05,
"loss": 3.1281,
"step": 63650
},
{
"epoch": 18.544522741832157,
"grad_norm": 0.4291313588619232,
"learning_rate": 4.374052478134111e-05,
"loss": 3.1439,
"step": 63700
},
{
"epoch": 18.55908217343195,
"grad_norm": 0.42422613501548767,
"learning_rate": 4.330320699708454e-05,
"loss": 3.1322,
"step": 63750
},
{
"epoch": 18.57364160503174,
"grad_norm": 0.4216996729373932,
"learning_rate": 4.286588921282798e-05,
"loss": 3.1422,
"step": 63800
},
{
"epoch": 18.58820103663153,
"grad_norm": 0.4323543608188629,
"learning_rate": 4.242857142857143e-05,
"loss": 3.1405,
"step": 63850
},
{
"epoch": 18.60276046823132,
"grad_norm": 0.4178192615509033,
"learning_rate": 4.1991253644314866e-05,
"loss": 3.1384,
"step": 63900
},
{
"epoch": 18.61731989983111,
"grad_norm": 0.4237135052680969,
"learning_rate": 4.15539358600583e-05,
"loss": 3.1346,
"step": 63950
},
{
"epoch": 18.631879331430902,
"grad_norm": 0.42646321654319763,
"learning_rate": 4.1116618075801745e-05,
"loss": 3.1384,
"step": 64000
},
{
"epoch": 18.631879331430902,
"eval_accuracy": 0.3760594447246205,
"eval_loss": 3.5238232612609863,
"eval_runtime": 179.56,
"eval_samples_per_second": 92.688,
"eval_steps_per_second": 5.798,
"step": 64000
},
{
"epoch": 18.64643876303069,
"grad_norm": 0.43727925419807434,
"learning_rate": 4.0679300291545185e-05,
"loss": 3.1504,
"step": 64050
},
{
"epoch": 18.66099819463048,
"grad_norm": 0.43674325942993164,
"learning_rate": 4.024198250728863e-05,
"loss": 3.1456,
"step": 64100
},
{
"epoch": 18.675557626230272,
"grad_norm": 0.4338884949684143,
"learning_rate": 3.980466472303207e-05,
"loss": 3.1343,
"step": 64150
},
{
"epoch": 18.69011705783006,
"grad_norm": 0.4316923916339874,
"learning_rate": 3.93673469387755e-05,
"loss": 3.1403,
"step": 64200
},
{
"epoch": 18.704676489429854,
"grad_norm": 0.4154166281223297,
"learning_rate": 3.893002915451895e-05,
"loss": 3.134,
"step": 64250
},
{
"epoch": 18.719235921029643,
"grad_norm": 0.43093207478523254,
"learning_rate": 3.849271137026239e-05,
"loss": 3.1324,
"step": 64300
},
{
"epoch": 18.733795352629432,
"grad_norm": 0.42517316341400146,
"learning_rate": 3.805539358600583e-05,
"loss": 3.138,
"step": 64350
},
{
"epoch": 18.748354784229225,
"grad_norm": 0.4249558448791504,
"learning_rate": 3.761807580174926e-05,
"loss": 3.1348,
"step": 64400
},
{
"epoch": 18.762914215829014,
"grad_norm": 0.4178551137447357,
"learning_rate": 3.718075801749271e-05,
"loss": 3.1347,
"step": 64450
},
{
"epoch": 18.777473647428806,
"grad_norm": 0.43049004673957825,
"learning_rate": 3.674344023323615e-05,
"loss": 3.1401,
"step": 64500
},
{
"epoch": 18.792033079028595,
"grad_norm": 0.4242251515388489,
"learning_rate": 3.6306122448979586e-05,
"loss": 3.1359,
"step": 64550
},
{
"epoch": 18.806592510628384,
"grad_norm": 0.41446393728256226,
"learning_rate": 3.5868804664723026e-05,
"loss": 3.1467,
"step": 64600
},
{
"epoch": 18.821151942228177,
"grad_norm": 0.4318779408931732,
"learning_rate": 3.543148688046647e-05,
"loss": 3.1429,
"step": 64650
},
{
"epoch": 18.835711373827966,
"grad_norm": 0.4431349039077759,
"learning_rate": 3.499416909620991e-05,
"loss": 3.1446,
"step": 64700
},
{
"epoch": 18.850270805427755,
"grad_norm": 0.44169580936431885,
"learning_rate": 3.455685131195335e-05,
"loss": 3.1393,
"step": 64750
},
{
"epoch": 18.864830237027547,
"grad_norm": 0.42799896001815796,
"learning_rate": 3.411953352769679e-05,
"loss": 3.1429,
"step": 64800
},
{
"epoch": 18.879389668627336,
"grad_norm": 0.42514893412590027,
"learning_rate": 3.368221574344023e-05,
"loss": 3.1263,
"step": 64850
},
{
"epoch": 18.893949100227125,
"grad_norm": 0.4253270626068115,
"learning_rate": 3.324489795918367e-05,
"loss": 3.1494,
"step": 64900
},
{
"epoch": 18.908508531826918,
"grad_norm": 0.4338584244251251,
"learning_rate": 3.280758017492711e-05,
"loss": 3.1512,
"step": 64950
},
{
"epoch": 18.923067963426707,
"grad_norm": 0.42179763317108154,
"learning_rate": 3.237026239067055e-05,
"loss": 3.1365,
"step": 65000
},
{
"epoch": 18.923067963426707,
"eval_accuracy": 0.3764943871961591,
"eval_loss": 3.520042657852173,
"eval_runtime": 179.62,
"eval_samples_per_second": 92.657,
"eval_steps_per_second": 5.796,
"step": 65000
},
{
"epoch": 18.9376273950265,
"grad_norm": 0.4421677589416504,
"learning_rate": 3.1932944606413995e-05,
"loss": 3.1499,
"step": 65050
},
{
"epoch": 18.95218682662629,
"grad_norm": 0.4211215674877167,
"learning_rate": 3.149562682215743e-05,
"loss": 3.1447,
"step": 65100
},
{
"epoch": 18.966746258226078,
"grad_norm": 0.42026522755622864,
"learning_rate": 3.1058309037900874e-05,
"loss": 3.1472,
"step": 65150
},
{
"epoch": 18.98130568982587,
"grad_norm": 0.42903971672058105,
"learning_rate": 3.062099125364431e-05,
"loss": 3.1352,
"step": 65200
},
{
"epoch": 18.99586512142566,
"grad_norm": 0.4268266558647156,
"learning_rate": 3.0183673469387753e-05,
"loss": 3.1464,
"step": 65250
},
{
"epoch": 19.010191602119853,
"grad_norm": 0.43615421652793884,
"learning_rate": 2.9746355685131196e-05,
"loss": 3.1255,
"step": 65300
},
{
"epoch": 19.024751033719642,
"grad_norm": 0.43458208441734314,
"learning_rate": 2.9309037900874632e-05,
"loss": 3.1095,
"step": 65350
},
{
"epoch": 19.039310465319435,
"grad_norm": 0.4229234457015991,
"learning_rate": 2.8871720116618075e-05,
"loss": 3.119,
"step": 65400
},
{
"epoch": 19.053869896919224,
"grad_norm": 0.42766526341438293,
"learning_rate": 2.8434402332361514e-05,
"loss": 3.1083,
"step": 65450
},
{
"epoch": 19.068429328519013,
"grad_norm": 0.4272925853729248,
"learning_rate": 2.7997084548104954e-05,
"loss": 3.0968,
"step": 65500
},
{
"epoch": 19.082988760118806,
"grad_norm": 0.43676096200942993,
"learning_rate": 2.7559766763848393e-05,
"loss": 3.1023,
"step": 65550
},
{
"epoch": 19.097548191718595,
"grad_norm": 0.4115598797798157,
"learning_rate": 2.7122448979591836e-05,
"loss": 3.1031,
"step": 65600
},
{
"epoch": 19.112107623318387,
"grad_norm": 0.4189133048057556,
"learning_rate": 2.6685131195335272e-05,
"loss": 3.0964,
"step": 65650
},
{
"epoch": 19.126667054918176,
"grad_norm": 0.4264732003211975,
"learning_rate": 2.6247813411078715e-05,
"loss": 3.0977,
"step": 65700
},
{
"epoch": 19.141226486517965,
"grad_norm": 0.4251263439655304,
"learning_rate": 2.5810495626822158e-05,
"loss": 3.1168,
"step": 65750
},
{
"epoch": 19.155785918117758,
"grad_norm": 0.41753003001213074,
"learning_rate": 2.5373177842565594e-05,
"loss": 3.1148,
"step": 65800
},
{
"epoch": 19.170345349717547,
"grad_norm": 0.4261728525161743,
"learning_rate": 2.4935860058309037e-05,
"loss": 3.1159,
"step": 65850
},
{
"epoch": 19.184904781317336,
"grad_norm": 0.4432784914970398,
"learning_rate": 2.4498542274052476e-05,
"loss": 3.1098,
"step": 65900
},
{
"epoch": 19.19946421291713,
"grad_norm": 0.4324243366718292,
"learning_rate": 2.406122448979592e-05,
"loss": 3.1032,
"step": 65950
},
{
"epoch": 19.214023644516917,
"grad_norm": 0.42517364025115967,
"learning_rate": 2.3623906705539355e-05,
"loss": 3.1048,
"step": 66000
},
{
"epoch": 19.214023644516917,
"eval_accuracy": 0.37641925114876434,
"eval_loss": 3.522754430770874,
"eval_runtime": 179.766,
"eval_samples_per_second": 92.581,
"eval_steps_per_second": 5.791,
"step": 66000
},
{
"epoch": 19.22858307611671,
"grad_norm": 0.43855613470077515,
"learning_rate": 2.3186588921282798e-05,
"loss": 3.1086,
"step": 66050
},
{
"epoch": 19.2431425077165,
"grad_norm": 0.42424947023391724,
"learning_rate": 2.2749271137026234e-05,
"loss": 3.1104,
"step": 66100
},
{
"epoch": 19.257701939316288,
"grad_norm": 0.4278988540172577,
"learning_rate": 2.2311953352769677e-05,
"loss": 3.1145,
"step": 66150
},
{
"epoch": 19.27226137091608,
"grad_norm": 0.4234248399734497,
"learning_rate": 2.1874635568513116e-05,
"loss": 3.1091,
"step": 66200
},
{
"epoch": 19.28682080251587,
"grad_norm": 0.42990297079086304,
"learning_rate": 2.143731778425656e-05,
"loss": 3.1092,
"step": 66250
},
{
"epoch": 19.30138023411566,
"grad_norm": 0.4154321849346161,
"learning_rate": 2.1e-05,
"loss": 3.1191,
"step": 66300
},
{
"epoch": 19.31593966571545,
"grad_norm": 0.42328861355781555,
"learning_rate": 2.056268221574344e-05,
"loss": 3.1237,
"step": 66350
},
{
"epoch": 19.33049909731524,
"grad_norm": 0.4259921610355377,
"learning_rate": 2.012536443148688e-05,
"loss": 3.1164,
"step": 66400
},
{
"epoch": 19.345058528915033,
"grad_norm": 0.4307624101638794,
"learning_rate": 1.9688046647230317e-05,
"loss": 3.1103,
"step": 66450
},
{
"epoch": 19.35961796051482,
"grad_norm": 0.43328267335891724,
"learning_rate": 1.925072886297376e-05,
"loss": 3.1119,
"step": 66500
},
{
"epoch": 19.37417739211461,
"grad_norm": 0.4142916798591614,
"learning_rate": 1.88134110787172e-05,
"loss": 3.1167,
"step": 66550
},
{
"epoch": 19.388736823714403,
"grad_norm": 0.4262329041957855,
"learning_rate": 1.837609329446064e-05,
"loss": 3.1089,
"step": 66600
},
{
"epoch": 19.403296255314192,
"grad_norm": 0.4410352110862732,
"learning_rate": 1.7938775510204082e-05,
"loss": 3.1179,
"step": 66650
},
{
"epoch": 19.41785568691398,
"grad_norm": 0.4364875853061676,
"learning_rate": 1.750145772594752e-05,
"loss": 3.1097,
"step": 66700
},
{
"epoch": 19.432415118513774,
"grad_norm": 0.4255106449127197,
"learning_rate": 1.706413994169096e-05,
"loss": 3.1238,
"step": 66750
},
{
"epoch": 19.446974550113563,
"grad_norm": 0.4275910258293152,
"learning_rate": 1.66268221574344e-05,
"loss": 3.1268,
"step": 66800
},
{
"epoch": 19.461533981713355,
"grad_norm": 0.42917412519454956,
"learning_rate": 1.618950437317784e-05,
"loss": 3.1079,
"step": 66850
},
{
"epoch": 19.476093413313144,
"grad_norm": 0.4198929965496063,
"learning_rate": 1.5752186588921283e-05,
"loss": 3.1104,
"step": 66900
},
{
"epoch": 19.490652844912933,
"grad_norm": 0.43348103761672974,
"learning_rate": 1.5314868804664722e-05,
"loss": 3.1199,
"step": 66950
},
{
"epoch": 19.505212276512726,
"grad_norm": 0.4270970821380615,
"learning_rate": 1.4877551020408162e-05,
"loss": 3.1252,
"step": 67000
},
{
"epoch": 19.505212276512726,
"eval_accuracy": 0.3765385987044852,
"eval_loss": 3.52213716506958,
"eval_runtime": 179.576,
"eval_samples_per_second": 92.679,
"eval_steps_per_second": 5.797,
"step": 67000
},
{
"epoch": 19.519771708112515,
"grad_norm": 0.42685285210609436,
"learning_rate": 1.4440233236151601e-05,
"loss": 3.112,
"step": 67050
},
{
"epoch": 19.534331139712307,
"grad_norm": 0.4240424633026123,
"learning_rate": 1.400291545189504e-05,
"loss": 3.115,
"step": 67100
},
{
"epoch": 19.548890571312096,
"grad_norm": 0.41864222288131714,
"learning_rate": 1.3565597667638484e-05,
"loss": 3.1075,
"step": 67150
},
{
"epoch": 19.563450002911885,
"grad_norm": 0.430819571018219,
"learning_rate": 1.3128279883381923e-05,
"loss": 3.102,
"step": 67200
},
{
"epoch": 19.578009434511678,
"grad_norm": 0.45127809047698975,
"learning_rate": 1.2690962099125364e-05,
"loss": 3.1151,
"step": 67250
},
{
"epoch": 19.592568866111467,
"grad_norm": 0.4324467182159424,
"learning_rate": 1.2253644314868804e-05,
"loss": 3.1218,
"step": 67300
},
{
"epoch": 19.607128297711256,
"grad_norm": 0.41157129406929016,
"learning_rate": 1.1816326530612243e-05,
"loss": 3.1168,
"step": 67350
},
{
"epoch": 19.62168772931105,
"grad_norm": 0.4352055788040161,
"learning_rate": 1.1379008746355684e-05,
"loss": 3.1135,
"step": 67400
},
{
"epoch": 19.636247160910838,
"grad_norm": 0.4236614406108856,
"learning_rate": 1.0941690962099124e-05,
"loss": 3.1209,
"step": 67450
},
{
"epoch": 19.650806592510627,
"grad_norm": 0.4268696904182434,
"learning_rate": 1.0504373177842565e-05,
"loss": 3.119,
"step": 67500
},
{
"epoch": 19.66536602411042,
"grad_norm": 0.42168357968330383,
"learning_rate": 1.0067055393586005e-05,
"loss": 3.092,
"step": 67550
},
{
"epoch": 19.679925455710208,
"grad_norm": 0.4286148250102997,
"learning_rate": 9.629737609329444e-06,
"loss": 3.1094,
"step": 67600
},
{
"epoch": 19.69448488731,
"grad_norm": 0.4331967830657959,
"learning_rate": 9.192419825072885e-06,
"loss": 3.1227,
"step": 67650
},
{
"epoch": 19.70904431890979,
"grad_norm": 0.4273418188095093,
"learning_rate": 8.755102040816326e-06,
"loss": 3.1184,
"step": 67700
},
{
"epoch": 19.72360375050958,
"grad_norm": 0.44387224316596985,
"learning_rate": 8.317784256559766e-06,
"loss": 3.1179,
"step": 67750
},
{
"epoch": 19.73816318210937,
"grad_norm": 0.4357852637767792,
"learning_rate": 7.880466472303207e-06,
"loss": 3.1163,
"step": 67800
},
{
"epoch": 19.75272261370916,
"grad_norm": 0.41910186409950256,
"learning_rate": 7.443148688046647e-06,
"loss": 3.1157,
"step": 67850
},
{
"epoch": 19.767282045308953,
"grad_norm": 0.4404396414756775,
"learning_rate": 7.005830903790087e-06,
"loss": 3.1209,
"step": 67900
},
{
"epoch": 19.781841476908742,
"grad_norm": 0.4255041182041168,
"learning_rate": 6.568513119533527e-06,
"loss": 3.1139,
"step": 67950
},
{
"epoch": 19.79640090850853,
"grad_norm": 0.42811915278434753,
"learning_rate": 6.1311953352769675e-06,
"loss": 3.1198,
"step": 68000
},
{
"epoch": 19.79640090850853,
"eval_accuracy": 0.3765822222938177,
"eval_loss": 3.520904064178467,
"eval_runtime": 179.5918,
"eval_samples_per_second": 92.671,
"eval_steps_per_second": 5.796,
"step": 68000
},
{
"epoch": 19.810960340108323,
"grad_norm": 0.43425923585891724,
"learning_rate": 5.693877551020407e-06,
"loss": 3.1135,
"step": 68050
},
{
"epoch": 19.825519771708112,
"grad_norm": 0.4284214377403259,
"learning_rate": 5.256559766763848e-06,
"loss": 3.1227,
"step": 68100
},
{
"epoch": 19.8400792033079,
"grad_norm": 0.43212711811065674,
"learning_rate": 4.8192419825072884e-06,
"loss": 3.1076,
"step": 68150
},
{
"epoch": 19.854638634907694,
"grad_norm": 0.41544848680496216,
"learning_rate": 4.381924198250729e-06,
"loss": 3.1172,
"step": 68200
},
{
"epoch": 19.869198066507483,
"grad_norm": 0.41702643036842346,
"learning_rate": 3.944606413994168e-06,
"loss": 3.1174,
"step": 68250
},
{
"epoch": 19.883757498107276,
"grad_norm": 0.432802677154541,
"learning_rate": 3.5072886297376094e-06,
"loss": 3.1218,
"step": 68300
},
{
"epoch": 19.898316929707065,
"grad_norm": 0.4204798936843872,
"learning_rate": 3.0699708454810493e-06,
"loss": 3.1128,
"step": 68350
},
{
"epoch": 19.912876361306854,
"grad_norm": 0.43337222933769226,
"learning_rate": 2.6326530612244892e-06,
"loss": 3.1171,
"step": 68400
},
{
"epoch": 19.927435792906646,
"grad_norm": 0.4287244975566864,
"learning_rate": 2.1953352769679296e-06,
"loss": 3.1083,
"step": 68450
},
{
"epoch": 19.941995224506435,
"grad_norm": 0.4293520152568817,
"learning_rate": 1.7580174927113701e-06,
"loss": 3.1071,
"step": 68500
},
{
"epoch": 19.956554656106224,
"grad_norm": 0.43255236744880676,
"learning_rate": 1.3206997084548104e-06,
"loss": 3.1085,
"step": 68550
},
{
"epoch": 19.971114087706017,
"grad_norm": 0.42116379737854004,
"learning_rate": 8.833819241982507e-07,
"loss": 3.1196,
"step": 68600
},
{
"epoch": 19.985673519305806,
"grad_norm": 0.4275839328765869,
"learning_rate": 4.4606413994169093e-07,
"loss": 3.1015,
"step": 68650
},
{
"epoch": 20.0,
"grad_norm": 1.8701350688934326,
"learning_rate": 8.746355685131195e-09,
"loss": 3.1135,
"step": 68700
},
{
"epoch": 20.0,
"step": 68700,
"total_flos": 1.43566384398336e+18,
"train_loss": 3.438475851533715,
"train_runtime": 136533.177,
"train_samples_per_second": 40.243,
"train_steps_per_second": 0.503
}
],
"logging_steps": 50,
"max_steps": 68700,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 10000,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 20,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 3
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.43566384398336e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}