{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.8068215610383563,
"eval_steps": 50000,
"global_step": 350000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0004610408920219179,
"grad_norm": 0.9611970782279968,
"learning_rate": 4.997706321562191e-05,
"loss": 0.4484,
"step": 200
},
{
"epoch": 0.0009220817840438358,
"grad_norm": 1.1472898721694946,
"learning_rate": 4.995401117102082e-05,
"loss": 0.3912,
"step": 400
},
{
"epoch": 0.0013831226760657536,
"grad_norm": 0.40446579456329346,
"learning_rate": 4.9930959126419716e-05,
"loss": 0.4384,
"step": 600
},
{
"epoch": 0.0018441635680876715,
"grad_norm": 0.32557180523872375,
"learning_rate": 4.990790708181862e-05,
"loss": 0.3821,
"step": 800
},
{
"epoch": 0.0023052044601095893,
"grad_norm": 0.5198535919189453,
"learning_rate": 4.988485503721753e-05,
"loss": 0.4328,
"step": 1000
},
{
"epoch": 0.002766245352131507,
"grad_norm": 0.8159506916999817,
"learning_rate": 4.986180299261643e-05,
"loss": 0.3947,
"step": 1200
},
{
"epoch": 0.003227286244153425,
"grad_norm": 0.18847960233688354,
"learning_rate": 4.983875094801534e-05,
"loss": 0.4029,
"step": 1400
},
{
"epoch": 0.003688327136175343,
"grad_norm": 0.5850073099136353,
"learning_rate": 4.9815698903414245e-05,
"loss": 0.4062,
"step": 1600
},
{
"epoch": 0.004149368028197261,
"grad_norm": 0.7543673515319824,
"learning_rate": 4.9792646858813143e-05,
"loss": 0.397,
"step": 1800
},
{
"epoch": 0.0046104089202191785,
"grad_norm": 0.5074718594551086,
"learning_rate": 4.976959481421205e-05,
"loss": 0.3975,
"step": 2000
},
{
"epoch": 0.005071449812241097,
"grad_norm": 0.898962140083313,
"learning_rate": 4.974665802983396e-05,
"loss": 0.3979,
"step": 2200
},
{
"epoch": 0.005532490704263014,
"grad_norm": 0.49346038699150085,
"learning_rate": 4.9723605985232864e-05,
"loss": 0.4186,
"step": 2400
},
{
"epoch": 0.005993531596284933,
"grad_norm": 0.3977065682411194,
"learning_rate": 4.970055394063177e-05,
"loss": 0.4373,
"step": 2600
},
{
"epoch": 0.00645457248830685,
"grad_norm": 0.19677992165088654,
"learning_rate": 4.967750189603067e-05,
"loss": 0.3846,
"step": 2800
},
{
"epoch": 0.006915613380328769,
"grad_norm": 0.21055851876735687,
"learning_rate": 4.9654449851429574e-05,
"loss": 0.4004,
"step": 3000
},
{
"epoch": 0.007376654272350686,
"grad_norm": 1.1038213968276978,
"learning_rate": 4.963139780682848e-05,
"loss": 0.3867,
"step": 3200
},
{
"epoch": 0.007837695164372604,
"grad_norm": 0.3879926800727844,
"learning_rate": 4.960834576222738e-05,
"loss": 0.367,
"step": 3400
},
{
"epoch": 0.008298736056394522,
"grad_norm": 0.4286792576313019,
"learning_rate": 4.9585293717626285e-05,
"loss": 0.3695,
"step": 3600
},
{
"epoch": 0.00875977694841644,
"grad_norm": 0.44477948546409607,
"learning_rate": 4.956224167302519e-05,
"loss": 0.3602,
"step": 3800
},
{
"epoch": 0.009220817840438357,
"grad_norm": 0.531840980052948,
"learning_rate": 4.9539189628424096e-05,
"loss": 0.393,
"step": 4000
},
{
"epoch": 0.009681858732460276,
"grad_norm": 0.22501394152641296,
"learning_rate": 4.9516137583822995e-05,
"loss": 0.3827,
"step": 4200
},
{
"epoch": 0.010142899624482194,
"grad_norm": 0.7713117003440857,
"learning_rate": 4.949320079944491e-05,
"loss": 0.4092,
"step": 4400
},
{
"epoch": 0.010603940516504111,
"grad_norm": 0.35417065024375916,
"learning_rate": 4.947026401506682e-05,
"loss": 0.3868,
"step": 4600
},
{
"epoch": 0.011064981408526029,
"grad_norm": 0.7144293785095215,
"learning_rate": 4.9447211970465726e-05,
"loss": 0.4243,
"step": 4800
},
{
"epoch": 0.011526022300547946,
"grad_norm": 0.438975065946579,
"learning_rate": 4.9424159925864625e-05,
"loss": 0.373,
"step": 5000
},
{
"epoch": 0.011987063192569865,
"grad_norm": 0.5525286793708801,
"learning_rate": 4.940110788126353e-05,
"loss": 0.383,
"step": 5200
},
{
"epoch": 0.012448104084591783,
"grad_norm": 0.6666736006736755,
"learning_rate": 4.9378055836662436e-05,
"loss": 0.3751,
"step": 5400
},
{
"epoch": 0.0129091449766137,
"grad_norm": 0.7882196307182312,
"learning_rate": 4.935500379206134e-05,
"loss": 0.411,
"step": 5600
},
{
"epoch": 0.013370185868635618,
"grad_norm": 0.3870885670185089,
"learning_rate": 4.933195174746024e-05,
"loss": 0.4229,
"step": 5800
},
{
"epoch": 0.013831226760657537,
"grad_norm": 0.6244848966598511,
"learning_rate": 4.9308899702859146e-05,
"loss": 0.3998,
"step": 6000
},
{
"epoch": 0.014292267652679455,
"grad_norm": 0.6144024729728699,
"learning_rate": 4.928584765825805e-05,
"loss": 0.3662,
"step": 6200
},
{
"epoch": 0.014753308544701372,
"grad_norm": 0.3026362359523773,
"learning_rate": 4.926279561365695e-05,
"loss": 0.4024,
"step": 6400
},
{
"epoch": 0.01521434943672329,
"grad_norm": 0.22851639986038208,
"learning_rate": 4.9239743569055856e-05,
"loss": 0.4056,
"step": 6600
},
{
"epoch": 0.01567539032874521,
"grad_norm": 0.44063982367515564,
"learning_rate": 4.921669152445477e-05,
"loss": 0.4035,
"step": 6800
},
{
"epoch": 0.016136431220767126,
"grad_norm": 0.3869895339012146,
"learning_rate": 4.919363947985367e-05,
"loss": 0.4232,
"step": 7000
},
{
"epoch": 0.016597472112789044,
"grad_norm": 0.5635095238685608,
"learning_rate": 4.9170587435252573e-05,
"loss": 0.3959,
"step": 7200
},
{
"epoch": 0.01705851300481096,
"grad_norm": 0.47322916984558105,
"learning_rate": 4.914753539065148e-05,
"loss": 0.4044,
"step": 7400
},
{
"epoch": 0.01751955389683288,
"grad_norm": 0.2524668276309967,
"learning_rate": 4.9124483346050385e-05,
"loss": 0.45,
"step": 7600
},
{
"epoch": 0.017980594788854797,
"grad_norm": 0.3071528375148773,
"learning_rate": 4.9101431301449284e-05,
"loss": 0.3771,
"step": 7800
},
{
"epoch": 0.018441635680876714,
"grad_norm": 0.39643633365631104,
"learning_rate": 4.907837925684819e-05,
"loss": 0.3761,
"step": 8000
},
{
"epoch": 0.01890267657289863,
"grad_norm": 0.2907356917858124,
"learning_rate": 4.9055327212247095e-05,
"loss": 0.3906,
"step": 8200
},
{
"epoch": 0.019363717464920552,
"grad_norm": 0.7327684164047241,
"learning_rate": 4.9032275167645994e-05,
"loss": 0.4103,
"step": 8400
},
{
"epoch": 0.01982475835694247,
"grad_norm": 0.6757892370223999,
"learning_rate": 4.90092231230449e-05,
"loss": 0.4212,
"step": 8600
},
{
"epoch": 0.020285799248964387,
"grad_norm": 0.40062421560287476,
"learning_rate": 4.898628633866681e-05,
"loss": 0.3433,
"step": 8800
},
{
"epoch": 0.020746840140986305,
"grad_norm": 0.9614256024360657,
"learning_rate": 4.896334955428872e-05,
"loss": 0.3736,
"step": 9000
},
{
"epoch": 0.021207881033008222,
"grad_norm": 0.18410637974739075,
"learning_rate": 4.8940297509687624e-05,
"loss": 0.3872,
"step": 9200
},
{
"epoch": 0.02166892192503014,
"grad_norm": 0.3441492021083832,
"learning_rate": 4.891724546508653e-05,
"loss": 0.3952,
"step": 9400
},
{
"epoch": 0.022129962817052058,
"grad_norm": 0.3522215783596039,
"learning_rate": 4.8894193420485435e-05,
"loss": 0.3766,
"step": 9600
},
{
"epoch": 0.022591003709073975,
"grad_norm": 0.26243916153907776,
"learning_rate": 4.887114137588434e-05,
"loss": 0.3899,
"step": 9800
},
{
"epoch": 0.023052044601095893,
"grad_norm": 0.5126281976699829,
"learning_rate": 4.884808933128324e-05,
"loss": 0.3694,
"step": 10000
},
{
"epoch": 0.023513085493117813,
"grad_norm": 0.37676921486854553,
"learning_rate": 4.8825037286682145e-05,
"loss": 0.3905,
"step": 10200
},
{
"epoch": 0.02397412638513973,
"grad_norm": 0.4603672921657562,
"learning_rate": 4.880198524208105e-05,
"loss": 0.382,
"step": 10400
},
{
"epoch": 0.02443516727716165,
"grad_norm": 0.3685164153575897,
"learning_rate": 4.877893319747996e-05,
"loss": 0.386,
"step": 10600
},
{
"epoch": 0.024896208169183566,
"grad_norm": 0.5517568588256836,
"learning_rate": 4.8755881152878856e-05,
"loss": 0.4461,
"step": 10800
},
{
"epoch": 0.025357249061205483,
"grad_norm": 0.47988224029541016,
"learning_rate": 4.873282910827776e-05,
"loss": 0.3965,
"step": 11000
},
{
"epoch": 0.0258182899532274,
"grad_norm": 0.45358026027679443,
"learning_rate": 4.870977706367667e-05,
"loss": 0.4079,
"step": 11200
},
{
"epoch": 0.02627933084524932,
"grad_norm": 0.6052194833755493,
"learning_rate": 4.8686725019075566e-05,
"loss": 0.3724,
"step": 11400
},
{
"epoch": 0.026740371737271236,
"grad_norm": 0.7866759300231934,
"learning_rate": 4.866367297447447e-05,
"loss": 0.4172,
"step": 11600
},
{
"epoch": 0.027201412629293154,
"grad_norm": 0.4371585249900818,
"learning_rate": 4.864073619009638e-05,
"loss": 0.4469,
"step": 11800
},
{
"epoch": 0.027662453521315074,
"grad_norm": 0.6400772929191589,
"learning_rate": 4.8617684145495286e-05,
"loss": 0.4097,
"step": 12000
},
{
"epoch": 0.028123494413336992,
"grad_norm": 0.7816802859306335,
"learning_rate": 4.8594632100894185e-05,
"loss": 0.4017,
"step": 12200
},
{
"epoch": 0.02858453530535891,
"grad_norm": 0.5563467144966125,
"learning_rate": 4.85715800562931e-05,
"loss": 0.3639,
"step": 12400
},
{
"epoch": 0.029045576197380827,
"grad_norm": 0.5669108033180237,
"learning_rate": 4.8548528011692003e-05,
"loss": 0.3671,
"step": 12600
},
{
"epoch": 0.029506617089402744,
"grad_norm": 0.22141028940677643,
"learning_rate": 4.85254759670909e-05,
"loss": 0.4122,
"step": 12800
},
{
"epoch": 0.029967657981424662,
"grad_norm": 0.8881646394729614,
"learning_rate": 4.850242392248981e-05,
"loss": 0.3572,
"step": 13000
},
{
"epoch": 0.03042869887344658,
"grad_norm": 0.42685621976852417,
"learning_rate": 4.8479371877888714e-05,
"loss": 0.3848,
"step": 13200
},
{
"epoch": 0.030889739765468497,
"grad_norm": 0.48679056763648987,
"learning_rate": 4.845631983328762e-05,
"loss": 0.3406,
"step": 13400
},
{
"epoch": 0.03135078065749042,
"grad_norm": 0.4828736186027527,
"learning_rate": 4.843326778868652e-05,
"loss": 0.3849,
"step": 13600
},
{
"epoch": 0.031811821549512335,
"grad_norm": 0.32578787207603455,
"learning_rate": 4.8410215744085424e-05,
"loss": 0.4313,
"step": 13800
},
{
"epoch": 0.03227286244153425,
"grad_norm": 0.3235298693180084,
"learning_rate": 4.838716369948433e-05,
"loss": 0.3981,
"step": 14000
},
{
"epoch": 0.03273390333355617,
"grad_norm": 0.5211312174797058,
"learning_rate": 4.836411165488323e-05,
"loss": 0.3652,
"step": 14200
},
{
"epoch": 0.03319494422557809,
"grad_norm": 0.3209587633609772,
"learning_rate": 4.834117487050514e-05,
"loss": 0.4348,
"step": 14400
},
{
"epoch": 0.033655985117600005,
"grad_norm": 0.8167837262153625,
"learning_rate": 4.831812282590404e-05,
"loss": 0.396,
"step": 14600
},
{
"epoch": 0.03411702600962192,
"grad_norm": 0.2770218253135681,
"learning_rate": 4.829507078130295e-05,
"loss": 0.3799,
"step": 14800
},
{
"epoch": 0.03457806690164384,
"grad_norm": 0.4959569573402405,
"learning_rate": 4.8272018736701855e-05,
"loss": 0.3862,
"step": 15000
},
{
"epoch": 0.03503910779366576,
"grad_norm": 0.21598905324935913,
"learning_rate": 4.8248966692100754e-05,
"loss": 0.4202,
"step": 15200
},
{
"epoch": 0.035500148685687676,
"grad_norm": 0.4073766767978668,
"learning_rate": 4.8225914647499666e-05,
"loss": 0.3862,
"step": 15400
},
{
"epoch": 0.03596118957770959,
"grad_norm": 0.8326546549797058,
"learning_rate": 4.820286260289857e-05,
"loss": 0.3817,
"step": 15600
},
{
"epoch": 0.03642223046973151,
"grad_norm": 0.4838143289089203,
"learning_rate": 4.817981055829747e-05,
"loss": 0.3796,
"step": 15800
},
{
"epoch": 0.03688327136175343,
"grad_norm": 1.0439170598983765,
"learning_rate": 4.8156758513696376e-05,
"loss": 0.3666,
"step": 16000
},
{
"epoch": 0.037344312253775346,
"grad_norm": 0.9597964286804199,
"learning_rate": 4.813370646909528e-05,
"loss": 0.4283,
"step": 16200
},
{
"epoch": 0.03780535314579726,
"grad_norm": 0.6745343208312988,
"learning_rate": 4.811065442449418e-05,
"loss": 0.3763,
"step": 16400
},
{
"epoch": 0.03826639403781919,
"grad_norm": 0.22818545997142792,
"learning_rate": 4.8087602379893087e-05,
"loss": 0.3925,
"step": 16600
},
{
"epoch": 0.038727434929841105,
"grad_norm": 0.15839001536369324,
"learning_rate": 4.806455033529199e-05,
"loss": 0.377,
"step": 16800
},
{
"epoch": 0.03918847582186302,
"grad_norm": 0.39590421319007874,
"learning_rate": 4.80414982906909e-05,
"loss": 0.3428,
"step": 17000
},
{
"epoch": 0.03964951671388494,
"grad_norm": 0.37461820244789124,
"learning_rate": 4.80184462460898e-05,
"loss": 0.3649,
"step": 17200
},
{
"epoch": 0.04011055760590686,
"grad_norm": 0.3822472095489502,
"learning_rate": 4.79953942014887e-05,
"loss": 0.4305,
"step": 17400
},
{
"epoch": 0.040571598497928775,
"grad_norm": 0.2986813187599182,
"learning_rate": 4.797234215688761e-05,
"loss": 0.4093,
"step": 17600
},
{
"epoch": 0.04103263938995069,
"grad_norm": 0.2903802990913391,
"learning_rate": 4.794929011228651e-05,
"loss": 0.4172,
"step": 17800
},
{
"epoch": 0.04149368028197261,
"grad_norm": 1.391071081161499,
"learning_rate": 4.792623806768541e-05,
"loss": 0.4121,
"step": 18000
},
{
"epoch": 0.04195472117399453,
"grad_norm": 0.743326723575592,
"learning_rate": 4.790318602308432e-05,
"loss": 0.3982,
"step": 18200
},
{
"epoch": 0.042415762066016445,
"grad_norm": 0.37891262769699097,
"learning_rate": 4.7880133978483224e-05,
"loss": 0.3865,
"step": 18400
},
{
"epoch": 0.04287680295803836,
"grad_norm": 0.35921722650527954,
"learning_rate": 4.785708193388212e-05,
"loss": 0.404,
"step": 18600
},
{
"epoch": 0.04333784385006028,
"grad_norm": 0.3930653929710388,
"learning_rate": 4.7834029889281035e-05,
"loss": 0.3888,
"step": 18800
},
{
"epoch": 0.0437988847420822,
"grad_norm": 0.2733406722545624,
"learning_rate": 4.781097784467994e-05,
"loss": 0.4002,
"step": 19000
},
{
"epoch": 0.044259925634104115,
"grad_norm": 0.1752399206161499,
"learning_rate": 4.778792580007884e-05,
"loss": 0.378,
"step": 19200
},
{
"epoch": 0.04472096652612603,
"grad_norm": 0.3835102617740631,
"learning_rate": 4.7764873755477746e-05,
"loss": 0.3891,
"step": 19400
},
{
"epoch": 0.04518200741814795,
"grad_norm": 0.8556287288665771,
"learning_rate": 4.774182171087665e-05,
"loss": 0.4174,
"step": 19600
},
{
"epoch": 0.04564304831016987,
"grad_norm": 0.5243550539016724,
"learning_rate": 4.771876966627555e-05,
"loss": 0.4004,
"step": 19800
},
{
"epoch": 0.046104089202191785,
"grad_norm": 0.7527849078178406,
"learning_rate": 4.769583288189746e-05,
"loss": 0.3687,
"step": 20000
},
{
"epoch": 0.04656513009421371,
"grad_norm": 0.7944478988647461,
"learning_rate": 4.7672780837296365e-05,
"loss": 0.3769,
"step": 20200
},
{
"epoch": 0.04702617098623563,
"grad_norm": 0.3618263602256775,
"learning_rate": 4.764972879269527e-05,
"loss": 0.4074,
"step": 20400
},
{
"epoch": 0.047487211878257544,
"grad_norm": 0.7329002618789673,
"learning_rate": 4.7626676748094177e-05,
"loss": 0.3885,
"step": 20600
},
{
"epoch": 0.04794825277027946,
"grad_norm": 0.43845218420028687,
"learning_rate": 4.7603624703493075e-05,
"loss": 0.4164,
"step": 20800
},
{
"epoch": 0.04840929366230138,
"grad_norm": 0.4874444007873535,
"learning_rate": 4.758057265889198e-05,
"loss": 0.4059,
"step": 21000
},
{
"epoch": 0.0488703345543233,
"grad_norm": 0.25360676646232605,
"learning_rate": 4.755752061429089e-05,
"loss": 0.3983,
"step": 21200
},
{
"epoch": 0.049331375446345214,
"grad_norm": 0.42428573966026306,
"learning_rate": 4.7534468569689786e-05,
"loss": 0.4038,
"step": 21400
},
{
"epoch": 0.04979241633836713,
"grad_norm": 0.5442131757736206,
"learning_rate": 4.75114165250887e-05,
"loss": 0.3563,
"step": 21600
},
{
"epoch": 0.05025345723038905,
"grad_norm": 0.3310032784938812,
"learning_rate": 4.7488364480487604e-05,
"loss": 0.3949,
"step": 21800
},
{
"epoch": 0.05071449812241097,
"grad_norm": 0.8897857666015625,
"learning_rate": 4.746542769610951e-05,
"loss": 0.4365,
"step": 22000
},
{
"epoch": 0.051175539014432884,
"grad_norm": 0.502668559551239,
"learning_rate": 4.744237565150841e-05,
"loss": 0.3914,
"step": 22200
},
{
"epoch": 0.0516365799064548,
"grad_norm": 0.7646440267562866,
"learning_rate": 4.741932360690732e-05,
"loss": 0.3977,
"step": 22400
},
{
"epoch": 0.05209762079847672,
"grad_norm": 0.8941252827644348,
"learning_rate": 4.739627156230622e-05,
"loss": 0.3941,
"step": 22600
},
{
"epoch": 0.05255866169049864,
"grad_norm": 0.524489164352417,
"learning_rate": 4.737321951770512e-05,
"loss": 0.3735,
"step": 22800
},
{
"epoch": 0.053019702582520555,
"grad_norm": 0.4568984806537628,
"learning_rate": 4.735016747310403e-05,
"loss": 0.3732,
"step": 23000
},
{
"epoch": 0.05348074347454247,
"grad_norm": 0.13151802122592926,
"learning_rate": 4.7327115428502933e-05,
"loss": 0.3924,
"step": 23200
},
{
"epoch": 0.05394178436656439,
"grad_norm": 0.8445279002189636,
"learning_rate": 4.730406338390184e-05,
"loss": 0.3863,
"step": 23400
},
{
"epoch": 0.05440282525858631,
"grad_norm": 0.407316654920578,
"learning_rate": 4.728101133930074e-05,
"loss": 0.4293,
"step": 23600
},
{
"epoch": 0.05486386615060823,
"grad_norm": 0.5282636880874634,
"learning_rate": 4.7257959294699644e-05,
"loss": 0.3909,
"step": 23800
},
{
"epoch": 0.05532490704263015,
"grad_norm": 0.241099551320076,
"learning_rate": 4.723490725009855e-05,
"loss": 0.4211,
"step": 24000
},
{
"epoch": 0.055785947934652066,
"grad_norm": 0.2243630737066269,
"learning_rate": 4.721197046572046e-05,
"loss": 0.4301,
"step": 24200
},
{
"epoch": 0.056246988826673984,
"grad_norm": 0.36898645758628845,
"learning_rate": 4.7188918421119364e-05,
"loss": 0.3864,
"step": 24400
},
{
"epoch": 0.0567080297186959,
"grad_norm": 0.6935632824897766,
"learning_rate": 4.716586637651827e-05,
"loss": 0.4308,
"step": 24600
},
{
"epoch": 0.05716907061071782,
"grad_norm": 0.7641319036483765,
"learning_rate": 4.7142814331917176e-05,
"loss": 0.417,
"step": 24800
},
{
"epoch": 0.057630111502739736,
"grad_norm": 0.16372926533222198,
"learning_rate": 4.7119762287316075e-05,
"loss": 0.4054,
"step": 25000
},
{
"epoch": 0.058091152394761654,
"grad_norm": 0.4964084029197693,
"learning_rate": 4.709671024271498e-05,
"loss": 0.413,
"step": 25200
},
{
"epoch": 0.05855219328678357,
"grad_norm": 0.45398572087287903,
"learning_rate": 4.7073658198113886e-05,
"loss": 0.3773,
"step": 25400
},
{
"epoch": 0.05901323417880549,
"grad_norm": 0.6680997014045715,
"learning_rate": 4.705060615351279e-05,
"loss": 0.3604,
"step": 25600
},
{
"epoch": 0.059474275070827406,
"grad_norm": 0.36069509387016296,
"learning_rate": 4.702755410891169e-05,
"loss": 0.4098,
"step": 25800
},
{
"epoch": 0.059935315962849324,
"grad_norm": 0.7913092374801636,
"learning_rate": 4.7004502064310596e-05,
"loss": 0.3655,
"step": 26000
},
{
"epoch": 0.06039635685487124,
"grad_norm": 0.4226435720920563,
"learning_rate": 4.69814500197095e-05,
"loss": 0.3764,
"step": 26200
},
{
"epoch": 0.06085739774689316,
"grad_norm": 0.5026397109031677,
"learning_rate": 4.69583979751084e-05,
"loss": 0.3815,
"step": 26400
},
{
"epoch": 0.061318438638915077,
"grad_norm": 0.767078697681427,
"learning_rate": 4.6935345930507306e-05,
"loss": 0.4031,
"step": 26600
},
{
"epoch": 0.061779479530936994,
"grad_norm": 0.5278864502906799,
"learning_rate": 4.691229388590621e-05,
"loss": 0.3465,
"step": 26800
},
{
"epoch": 0.06224052042295891,
"grad_norm": 0.3403940796852112,
"learning_rate": 4.688924184130512e-05,
"loss": 0.374,
"step": 27000
},
{
"epoch": 0.06270156131498084,
"grad_norm": 0.805055558681488,
"learning_rate": 4.686618979670402e-05,
"loss": 0.4138,
"step": 27200
},
{
"epoch": 0.06316260220700275,
"grad_norm": 0.22436587512493134,
"learning_rate": 4.684313775210292e-05,
"loss": 0.3964,
"step": 27400
},
{
"epoch": 0.06362364309902467,
"grad_norm": 0.20275616645812988,
"learning_rate": 4.6820085707501835e-05,
"loss": 0.3952,
"step": 27600
},
{
"epoch": 0.06408468399104658,
"grad_norm": 0.49204909801483154,
"learning_rate": 4.6797033662900734e-05,
"loss": 0.4232,
"step": 27800
},
{
"epoch": 0.0645457248830685,
"grad_norm": 0.364629864692688,
"learning_rate": 4.677409687852264e-05,
"loss": 0.3709,
"step": 28000
},
{
"epoch": 0.06500676577509042,
"grad_norm": 0.5196096301078796,
"learning_rate": 4.675104483392155e-05,
"loss": 0.38,
"step": 28200
},
{
"epoch": 0.06546780666711234,
"grad_norm": 0.6097026467323303,
"learning_rate": 4.6727992789320454e-05,
"loss": 0.4249,
"step": 28400
},
{
"epoch": 0.06592884755913425,
"grad_norm": 0.48773273825645447,
"learning_rate": 4.670494074471935e-05,
"loss": 0.3718,
"step": 28600
},
{
"epoch": 0.06638988845115618,
"grad_norm": 0.8855012059211731,
"learning_rate": 4.668188870011826e-05,
"loss": 0.394,
"step": 28800
},
{
"epoch": 0.06685092934317809,
"grad_norm": 0.26086971163749695,
"learning_rate": 4.6658836655517165e-05,
"loss": 0.3887,
"step": 29000
},
{
"epoch": 0.06731197023520001,
"grad_norm": 0.4091934859752655,
"learning_rate": 4.6635784610916063e-05,
"loss": 0.3906,
"step": 29200
},
{
"epoch": 0.06777301112722194,
"grad_norm": 0.6718190908432007,
"learning_rate": 4.661273256631497e-05,
"loss": 0.4077,
"step": 29400
},
{
"epoch": 0.06823405201924385,
"grad_norm": 0.2565561830997467,
"learning_rate": 4.658979578193688e-05,
"loss": 0.3875,
"step": 29600
},
{
"epoch": 0.06869509291126577,
"grad_norm": 0.8020517230033875,
"learning_rate": 4.6566743737335784e-05,
"loss": 0.4273,
"step": 29800
},
{
"epoch": 0.06915613380328768,
"grad_norm": 1.2540035247802734,
"learning_rate": 4.654369169273469e-05,
"loss": 0.3942,
"step": 30000
},
{
"epoch": 0.0696171746953096,
"grad_norm": 0.5798929333686829,
"learning_rate": 4.6520639648133595e-05,
"loss": 0.3875,
"step": 30200
},
{
"epoch": 0.07007821558733152,
"grad_norm": 0.34180527925491333,
"learning_rate": 4.64975876035325e-05,
"loss": 0.4077,
"step": 30400
},
{
"epoch": 0.07053925647935344,
"grad_norm": 0.3982234597206116,
"learning_rate": 4.647453555893141e-05,
"loss": 0.376,
"step": 30600
},
{
"epoch": 0.07100029737137535,
"grad_norm": 0.33950579166412354,
"learning_rate": 4.6451483514330306e-05,
"loss": 0.4314,
"step": 30800
},
{
"epoch": 0.07146133826339728,
"grad_norm": 0.4567508101463318,
"learning_rate": 4.642843146972921e-05,
"loss": 0.3924,
"step": 31000
},
{
"epoch": 0.07192237915541919,
"grad_norm": 0.598886251449585,
"learning_rate": 4.640537942512812e-05,
"loss": 0.3888,
"step": 31200
},
{
"epoch": 0.07238342004744111,
"grad_norm": 0.18900546431541443,
"learning_rate": 4.6382327380527016e-05,
"loss": 0.4076,
"step": 31400
},
{
"epoch": 0.07284446093946302,
"grad_norm": 0.12266331911087036,
"learning_rate": 4.635927533592592e-05,
"loss": 0.3928,
"step": 31600
},
{
"epoch": 0.07330550183148495,
"grad_norm": 0.772557258605957,
"learning_rate": 4.633622329132483e-05,
"loss": 0.3777,
"step": 31800
},
{
"epoch": 0.07376654272350686,
"grad_norm": 0.3965064585208893,
"learning_rate": 4.631317124672373e-05,
"loss": 0.3733,
"step": 32000
},
{
"epoch": 0.07422758361552878,
"grad_norm": 0.931974470615387,
"learning_rate": 4.629011920212263e-05,
"loss": 0.3632,
"step": 32200
},
{
"epoch": 0.07468862450755069,
"grad_norm": 0.32918283343315125,
"learning_rate": 4.626706715752154e-05,
"loss": 0.3621,
"step": 32400
},
{
"epoch": 0.07514966539957262,
"grad_norm": 0.4414158761501312,
"learning_rate": 4.624401511292044e-05,
"loss": 0.3968,
"step": 32600
},
{
"epoch": 0.07561070629159453,
"grad_norm": 0.6213604807853699,
"learning_rate": 4.622096306831934e-05,
"loss": 0.3723,
"step": 32800
},
{
"epoch": 0.07607174718361645,
"grad_norm": 0.4169836640357971,
"learning_rate": 4.619791102371825e-05,
"loss": 0.3803,
"step": 33000
},
{
"epoch": 0.07653278807563837,
"grad_norm": 0.505544900894165,
"learning_rate": 4.617485897911715e-05,
"loss": 0.3921,
"step": 33200
},
{
"epoch": 0.07699382896766029,
"grad_norm": 0.6366299390792847,
"learning_rate": 4.615180693451606e-05,
"loss": 0.3831,
"step": 33400
},
{
"epoch": 0.07745486985968221,
"grad_norm": 0.39639851450920105,
"learning_rate": 4.6128754889914965e-05,
"loss": 0.3789,
"step": 33600
},
{
"epoch": 0.07791591075170412,
"grad_norm": 0.18556788563728333,
"learning_rate": 4.610570284531387e-05,
"loss": 0.3632,
"step": 33800
},
{
"epoch": 0.07837695164372604,
"grad_norm": 0.4612889587879181,
"learning_rate": 4.6082650800712776e-05,
"loss": 0.3874,
"step": 34000
},
{
"epoch": 0.07883799253574796,
"grad_norm": 0.3722321689128876,
"learning_rate": 4.6059598756111675e-05,
"loss": 0.4,
"step": 34200
},
{
"epoch": 0.07929903342776988,
"grad_norm": 0.22102029621601105,
"learning_rate": 4.603654671151058e-05,
"loss": 0.3908,
"step": 34400
},
{
"epoch": 0.07976007431979179,
"grad_norm": 0.5308703184127808,
"learning_rate": 4.6013494666909486e-05,
"loss": 0.3572,
"step": 34600
},
{
"epoch": 0.08022111521181371,
"grad_norm": 0.485630065202713,
"learning_rate": 4.5990442622308385e-05,
"loss": 0.416,
"step": 34800
},
{
"epoch": 0.08068215610383563,
"grad_norm": 0.495767205953598,
"learning_rate": 4.5967505837930294e-05,
"loss": 0.4,
"step": 35000
},
{
"epoch": 0.08114319699585755,
"grad_norm": 0.25368234515190125,
"learning_rate": 4.59444537933292e-05,
"loss": 0.3695,
"step": 35200
},
{
"epoch": 0.08160423788787946,
"grad_norm": 0.3967105448246002,
"learning_rate": 4.5921401748728106e-05,
"loss": 0.3828,
"step": 35400
},
{
"epoch": 0.08206527877990138,
"grad_norm": 0.6415128707885742,
"learning_rate": 4.589834970412701e-05,
"loss": 0.3552,
"step": 35600
},
{
"epoch": 0.0825263196719233,
"grad_norm": 0.29484426975250244,
"learning_rate": 4.587529765952591e-05,
"loss": 0.3676,
"step": 35800
},
{
"epoch": 0.08298736056394522,
"grad_norm": 0.5850203633308411,
"learning_rate": 4.5852245614924816e-05,
"loss": 0.4134,
"step": 36000
},
{
"epoch": 0.08344840145596713,
"grad_norm": 0.43537184596061707,
"learning_rate": 4.582919357032372e-05,
"loss": 0.3782,
"step": 36200
},
{
"epoch": 0.08390944234798905,
"grad_norm": 0.5117996335029602,
"learning_rate": 4.580614152572262e-05,
"loss": 0.3466,
"step": 36400
},
{
"epoch": 0.08437048324001098,
"grad_norm": 1.2749828100204468,
"learning_rate": 4.578308948112153e-05,
"loss": 0.3923,
"step": 36600
},
{
"epoch": 0.08483152413203289,
"grad_norm": 0.8420085310935974,
"learning_rate": 4.576003743652044e-05,
"loss": 0.3871,
"step": 36800
},
{
"epoch": 0.08529256502405481,
"grad_norm": 0.44337111711502075,
"learning_rate": 4.573698539191934e-05,
"loss": 0.4209,
"step": 37000
},
{
"epoch": 0.08575360591607673,
"grad_norm": 0.44473299384117126,
"learning_rate": 4.571404860754125e-05,
"loss": 0.4037,
"step": 37200
},
{
"epoch": 0.08621464680809865,
"grad_norm": 0.38705477118492126,
"learning_rate": 4.569099656294015e-05,
"loss": 0.396,
"step": 37400
},
{
"epoch": 0.08667568770012056,
"grad_norm": 0.5287489295005798,
"learning_rate": 4.566805977856206e-05,
"loss": 0.4225,
"step": 37600
},
{
"epoch": 0.08713672859214248,
"grad_norm": 0.17981275916099548,
"learning_rate": 4.564500773396097e-05,
"loss": 0.3732,
"step": 37800
},
{
"epoch": 0.0875977694841644,
"grad_norm": 0.7367402911186218,
"learning_rate": 4.5621955689359866e-05,
"loss": 0.3751,
"step": 38000
},
{
"epoch": 0.08805881037618632,
"grad_norm": 0.5823915600776672,
"learning_rate": 4.559890364475877e-05,
"loss": 0.3804,
"step": 38200
},
{
"epoch": 0.08851985126820823,
"grad_norm": 1.2252907752990723,
"learning_rate": 4.557585160015768e-05,
"loss": 0.3596,
"step": 38400
},
{
"epoch": 0.08898089216023015,
"grad_norm": 0.26793625950813293,
"learning_rate": 4.555279955555658e-05,
"loss": 0.386,
"step": 38600
},
{
"epoch": 0.08944193305225207,
"grad_norm": 0.578952968120575,
"learning_rate": 4.552974751095548e-05,
"loss": 0.3662,
"step": 38800
},
{
"epoch": 0.08990297394427399,
"grad_norm": 0.3861071467399597,
"learning_rate": 4.5506695466354395e-05,
"loss": 0.4183,
"step": 39000
},
{
"epoch": 0.0903640148362959,
"grad_norm": 0.5592653155326843,
"learning_rate": 4.54836434217533e-05,
"loss": 0.3702,
"step": 39200
},
{
"epoch": 0.09082505572831782,
"grad_norm": 0.44036003947257996,
"learning_rate": 4.54605913771522e-05,
"loss": 0.3941,
"step": 39400
},
{
"epoch": 0.09128609662033974,
"grad_norm": 0.28817713260650635,
"learning_rate": 4.5437539332551105e-05,
"loss": 0.4177,
"step": 39600
},
{
"epoch": 0.09174713751236166,
"grad_norm": 0.3406611979007721,
"learning_rate": 4.541448728795001e-05,
"loss": 0.4048,
"step": 39800
},
{
"epoch": 0.09220817840438357,
"grad_norm": 0.1624402552843094,
"learning_rate": 4.539155050357192e-05,
"loss": 0.4067,
"step": 40000
},
{
"epoch": 0.0926692192964055,
"grad_norm": 0.35895049571990967,
"learning_rate": 4.536849845897082e-05,
"loss": 0.3828,
"step": 40200
},
{
"epoch": 0.09313026018842742,
"grad_norm": 1.1362278461456299,
"learning_rate": 4.5345446414369724e-05,
"loss": 0.3641,
"step": 40400
},
{
"epoch": 0.09359130108044933,
"grad_norm": 0.7713387608528137,
"learning_rate": 4.532239436976863e-05,
"loss": 0.3894,
"step": 40600
},
{
"epoch": 0.09405234197247125,
"grad_norm": 0.5108577013015747,
"learning_rate": 4.529934232516753e-05,
"loss": 0.3877,
"step": 40800
},
{
"epoch": 0.09451338286449316,
"grad_norm": 0.5325179100036621,
"learning_rate": 4.5276290280566435e-05,
"loss": 0.3784,
"step": 41000
},
{
"epoch": 0.09497442375651509,
"grad_norm": 0.3445191979408264,
"learning_rate": 4.525323823596534e-05,
"loss": 0.3462,
"step": 41200
},
{
"epoch": 0.095435464648537,
"grad_norm": 0.5994306206703186,
"learning_rate": 4.5230186191364246e-05,
"loss": 0.4067,
"step": 41400
},
{
"epoch": 0.09589650554055892,
"grad_norm": 0.22779715061187744,
"learning_rate": 4.5207134146763145e-05,
"loss": 0.3874,
"step": 41600
},
{
"epoch": 0.09635754643258083,
"grad_norm": 0.4003934860229492,
"learning_rate": 4.518408210216205e-05,
"loss": 0.3658,
"step": 41800
},
{
"epoch": 0.09681858732460276,
"grad_norm": 0.7105618715286255,
"learning_rate": 4.516103005756096e-05,
"loss": 0.3704,
"step": 42000
},
{
"epoch": 0.09727962821662467,
"grad_norm": 2.1603026390075684,
"learning_rate": 4.513797801295986e-05,
"loss": 0.3852,
"step": 42200
},
{
"epoch": 0.0977406691086466,
"grad_norm": 0.7211620211601257,
"learning_rate": 4.511492596835877e-05,
"loss": 0.4272,
"step": 42400
},
{
"epoch": 0.0982017100006685,
"grad_norm": 0.5221651792526245,
"learning_rate": 4.509187392375767e-05,
"loss": 0.3817,
"step": 42600
},
{
"epoch": 0.09866275089269043,
"grad_norm": 0.557681679725647,
"learning_rate": 4.506882187915657e-05,
"loss": 0.4047,
"step": 42800
},
{
"epoch": 0.09912379178471234,
"grad_norm": 0.316997766494751,
"learning_rate": 4.504588509477848e-05,
"loss": 0.3723,
"step": 43000
},
{
"epoch": 0.09958483267673426,
"grad_norm": 0.2839302718639374,
"learning_rate": 4.502283305017739e-05,
"loss": 0.3839,
"step": 43200
},
{
"epoch": 0.10004587356875617,
"grad_norm": 0.42442891001701355,
"learning_rate": 4.499978100557629e-05,
"loss": 0.4014,
"step": 43400
},
{
"epoch": 0.1005069144607781,
"grad_norm": 0.7007307410240173,
"learning_rate": 4.49767289609752e-05,
"loss": 0.4481,
"step": 43600
},
{
"epoch": 0.10096795535280002,
"grad_norm": 0.7117435932159424,
"learning_rate": 4.49536769163741e-05,
"loss": 0.3754,
"step": 43800
},
{
"epoch": 0.10142899624482193,
"grad_norm": 0.5472663640975952,
"learning_rate": 4.4930624871773e-05,
"loss": 0.3816,
"step": 44000
},
{
"epoch": 0.10189003713684386,
"grad_norm": 0.17194071412086487,
"learning_rate": 4.490757282717191e-05,
"loss": 0.3862,
"step": 44200
},
{
"epoch": 0.10235107802886577,
"grad_norm": 0.8156585097312927,
"learning_rate": 4.488452078257081e-05,
"loss": 0.3883,
"step": 44400
},
{
"epoch": 0.1028121189208877,
"grad_norm": 0.2466941624879837,
"learning_rate": 4.486146873796971e-05,
"loss": 0.3702,
"step": 44600
},
{
"epoch": 0.1032731598129096,
"grad_norm": 0.5899674892425537,
"learning_rate": 4.483841669336862e-05,
"loss": 0.3758,
"step": 44800
},
{
"epoch": 0.10373420070493153,
"grad_norm": 0.6639291048049927,
"learning_rate": 4.4815479908990535e-05,
"loss": 0.4195,
"step": 45000
},
{
"epoch": 0.10419524159695344,
"grad_norm": 0.715785026550293,
"learning_rate": 4.4792427864389434e-05,
"loss": 0.3925,
"step": 45200
},
{
"epoch": 0.10465628248897536,
"grad_norm": 0.42432162165641785,
"learning_rate": 4.476937581978834e-05,
"loss": 0.3665,
"step": 45400
},
{
"epoch": 0.10511732338099727,
"grad_norm": 0.3713330626487732,
"learning_rate": 4.4746323775187245e-05,
"loss": 0.3628,
"step": 45600
},
{
"epoch": 0.1055783642730192,
"grad_norm": 0.19220516085624695,
"learning_rate": 4.4723271730586144e-05,
"loss": 0.3595,
"step": 45800
},
{
"epoch": 0.10603940516504111,
"grad_norm": 0.4247741997241974,
"learning_rate": 4.470021968598505e-05,
"loss": 0.4114,
"step": 46000
},
{
"epoch": 0.10650044605706303,
"grad_norm": 0.41326048970222473,
"learning_rate": 4.4677167641383955e-05,
"loss": 0.3698,
"step": 46200
},
{
"epoch": 0.10696148694908494,
"grad_norm": 0.5432236194610596,
"learning_rate": 4.465411559678286e-05,
"loss": 0.3739,
"step": 46400
},
{
"epoch": 0.10742252784110687,
"grad_norm": 0.3301286995410919,
"learning_rate": 4.463106355218176e-05,
"loss": 0.3754,
"step": 46600
},
{
"epoch": 0.10788356873312878,
"grad_norm": 0.2771298885345459,
"learning_rate": 4.4608011507580666e-05,
"loss": 0.3852,
"step": 46800
},
{
"epoch": 0.1083446096251507,
"grad_norm": 0.7244779467582703,
"learning_rate": 4.458495946297957e-05,
"loss": 0.4135,
"step": 47000
},
{
"epoch": 0.10880565051717261,
"grad_norm": 0.6219808459281921,
"learning_rate": 4.456190741837847e-05,
"loss": 0.3608,
"step": 47200
},
{
"epoch": 0.10926669140919454,
"grad_norm": 0.391397088766098,
"learning_rate": 4.4538855373777376e-05,
"loss": 0.4037,
"step": 47400
},
{
"epoch": 0.10972773230121646,
"grad_norm": 0.6644930243492126,
"learning_rate": 4.451580332917628e-05,
"loss": 0.3762,
"step": 47600
},
{
"epoch": 0.11018877319323837,
"grad_norm": 0.7442438006401062,
"learning_rate": 4.449275128457519e-05,
"loss": 0.3932,
"step": 47800
},
{
"epoch": 0.1106498140852603,
"grad_norm": 0.40641096234321594,
"learning_rate": 4.446969923997409e-05,
"loss": 0.3568,
"step": 48000
},
{
"epoch": 0.11111085497728221,
"grad_norm": 0.331028014421463,
"learning_rate": 4.4446647195373e-05,
"loss": 0.3592,
"step": 48200
},
{
"epoch": 0.11157189586930413,
"grad_norm": 0.18387028574943542,
"learning_rate": 4.4423595150771904e-05,
"loss": 0.3802,
"step": 48400
},
{
"epoch": 0.11203293676132604,
"grad_norm": 0.23090212047100067,
"learning_rate": 4.44005431061708e-05,
"loss": 0.3684,
"step": 48600
},
{
"epoch": 0.11249397765334797,
"grad_norm": 0.22124917805194855,
"learning_rate": 4.437749106156971e-05,
"loss": 0.41,
"step": 48800
},
{
"epoch": 0.11295501854536988,
"grad_norm": 0.8326044678688049,
"learning_rate": 4.4354439016968615e-05,
"loss": 0.3694,
"step": 49000
},
{
"epoch": 0.1134160594373918,
"grad_norm": 0.15534135699272156,
"learning_rate": 4.4331502232590524e-05,
"loss": 0.4071,
"step": 49200
},
{
"epoch": 0.11387710032941371,
"grad_norm": 0.5642709732055664,
"learning_rate": 4.430845018798942e-05,
"loss": 0.377,
"step": 49400
},
{
"epoch": 0.11433814122143564,
"grad_norm": 0.4568231701850891,
"learning_rate": 4.428539814338833e-05,
"loss": 0.3748,
"step": 49600
},
{
"epoch": 0.11479918211345755,
"grad_norm": 1.3475579023361206,
"learning_rate": 4.4262346098787234e-05,
"loss": 0.3757,
"step": 49800
},
{
"epoch": 0.11526022300547947,
"grad_norm": 0.9372040033340454,
"learning_rate": 4.423929405418614e-05,
"loss": 0.3995,
"step": 50000
},
{
"epoch": 0.11526022300547947,
"eval_loss": 0.39160656929016113,
"eval_runtime": 223.8495,
"eval_samples_per_second": 19.576,
"eval_steps_per_second": 19.576,
"step": 50000
},
{
"epoch": 0.11572126389750138,
"grad_norm": 0.25584205985069275,
"learning_rate": 4.421624200958504e-05,
"loss": 0.3731,
"step": 50200
},
{
"epoch": 0.11618230478952331,
"grad_norm": 0.48883160948753357,
"learning_rate": 4.4193189964983944e-05,
"loss": 0.3661,
"step": 50400
},
{
"epoch": 0.11664334568154522,
"grad_norm": 0.5731106400489807,
"learning_rate": 4.417013792038285e-05,
"loss": 0.3732,
"step": 50600
},
{
"epoch": 0.11710438657356714,
"grad_norm": 0.5235938429832458,
"learning_rate": 4.414708587578175e-05,
"loss": 0.3933,
"step": 50800
},
{
"epoch": 0.11756542746558905,
"grad_norm": 0.29224804043769836,
"learning_rate": 4.4124149091403665e-05,
"loss": 0.3732,
"step": 51000
},
{
"epoch": 0.11802646835761098,
"grad_norm": 0.22135676443576813,
"learning_rate": 4.410109704680257e-05,
"loss": 0.3712,
"step": 51200
},
{
"epoch": 0.1184875092496329,
"grad_norm": 0.31122633814811707,
"learning_rate": 4.4078045002201476e-05,
"loss": 0.3698,
"step": 51400
},
{
"epoch": 0.11894855014165481,
"grad_norm": 0.6024936437606812,
"learning_rate": 4.4054992957600375e-05,
"loss": 0.3668,
"step": 51600
},
{
"epoch": 0.11940959103367674,
"grad_norm": 0.23626506328582764,
"learning_rate": 4.403194091299928e-05,
"loss": 0.353,
"step": 51800
},
{
"epoch": 0.11987063192569865,
"grad_norm": 0.3983624279499054,
"learning_rate": 4.4008888868398186e-05,
"loss": 0.4459,
"step": 52000
},
{
"epoch": 0.12033167281772057,
"grad_norm": 0.8529877662658691,
"learning_rate": 4.3985836823797085e-05,
"loss": 0.4018,
"step": 52200
},
{
"epoch": 0.12079271370974248,
"grad_norm": 0.42104384303092957,
"learning_rate": 4.396278477919599e-05,
"loss": 0.417,
"step": 52400
},
{
"epoch": 0.12125375460176441,
"grad_norm": 0.6153512597084045,
"learning_rate": 4.39397327345949e-05,
"loss": 0.3811,
"step": 52600
},
{
"epoch": 0.12171479549378632,
"grad_norm": 0.48799267411231995,
"learning_rate": 4.39166806899938e-05,
"loss": 0.3605,
"step": 52800
},
{
"epoch": 0.12217583638580824,
"grad_norm": 0.21338334679603577,
"learning_rate": 4.38936286453927e-05,
"loss": 0.3721,
"step": 53000
},
{
"epoch": 0.12263687727783015,
"grad_norm": 0.14639084041118622,
"learning_rate": 4.387057660079161e-05,
"loss": 0.3972,
"step": 53200
},
{
"epoch": 0.12309791816985208,
"grad_norm": 0.6060304045677185,
"learning_rate": 4.384752455619051e-05,
"loss": 0.3994,
"step": 53400
},
{
"epoch": 0.12355895906187399,
"grad_norm": 0.2732614576816559,
"learning_rate": 4.382447251158942e-05,
"loss": 0.3849,
"step": 53600
},
{
"epoch": 0.12401999995389591,
"grad_norm": 1.3727577924728394,
"learning_rate": 4.3801420466988324e-05,
"loss": 0.383,
"step": 53800
},
{
"epoch": 0.12448104084591782,
"grad_norm": 0.5051097869873047,
"learning_rate": 4.377836842238723e-05,
"loss": 0.3637,
"step": 54000
},
{
"epoch": 0.12494208173793975,
"grad_norm": 0.48982998728752136,
"learning_rate": 4.3755316377786135e-05,
"loss": 0.3963,
"step": 54200
},
{
"epoch": 0.12540312262996167,
"grad_norm": 0.8108826279640198,
"learning_rate": 4.373237959340804e-05,
"loss": 0.401,
"step": 54400
},
{
"epoch": 0.12586416352198357,
"grad_norm": 0.19028553366661072,
"learning_rate": 4.370932754880694e-05,
"loss": 0.3573,
"step": 54600
},
{
"epoch": 0.1263252044140055,
"grad_norm": 0.30870822072029114,
"learning_rate": 4.368627550420585e-05,
"loss": 0.3821,
"step": 54800
},
{
"epoch": 0.12678624530602742,
"grad_norm": 0.48654425144195557,
"learning_rate": 4.3663223459604755e-05,
"loss": 0.4015,
"step": 55000
},
{
"epoch": 0.12724728619804934,
"grad_norm": 0.9675493836402893,
"learning_rate": 4.3640171415003654e-05,
"loss": 0.3857,
"step": 55200
},
{
"epoch": 0.12770832709007127,
"grad_norm": 0.634908139705658,
"learning_rate": 4.361711937040256e-05,
"loss": 0.3725,
"step": 55400
},
{
"epoch": 0.12816936798209316,
"grad_norm": 0.33301594853401184,
"learning_rate": 4.3594067325801465e-05,
"loss": 0.4018,
"step": 55600
},
{
"epoch": 0.1286304088741151,
"grad_norm": 0.7453652024269104,
"learning_rate": 4.3571015281200364e-05,
"loss": 0.3927,
"step": 55800
},
{
"epoch": 0.129091449766137,
"grad_norm": 0.4528222978115082,
"learning_rate": 4.354796323659927e-05,
"loss": 0.3553,
"step": 56000
},
{
"epoch": 0.12955249065815894,
"grad_norm": 0.43707868456840515,
"learning_rate": 4.3524911191998175e-05,
"loss": 0.3924,
"step": 56200
},
{
"epoch": 0.13001353155018083,
"grad_norm": 0.7208048105239868,
"learning_rate": 4.350185914739708e-05,
"loss": 0.4174,
"step": 56400
},
{
"epoch": 0.13047457244220276,
"grad_norm": 0.3886493742465973,
"learning_rate": 4.347880710279598e-05,
"loss": 0.3744,
"step": 56600
},
{
"epoch": 0.13093561333422468,
"grad_norm": 0.4961196184158325,
"learning_rate": 4.345575505819489e-05,
"loss": 0.3933,
"step": 56800
},
{
"epoch": 0.1313966542262466,
"grad_norm": 0.34462639689445496,
"learning_rate": 4.34327030135938e-05,
"loss": 0.3862,
"step": 57000
},
{
"epoch": 0.1318576951182685,
"grad_norm": 0.42773371934890747,
"learning_rate": 4.34097662292157e-05,
"loss": 0.3799,
"step": 57200
},
{
"epoch": 0.13231873601029043,
"grad_norm": 0.16554999351501465,
"learning_rate": 4.3386714184614606e-05,
"loss": 0.3879,
"step": 57400
},
{
"epoch": 0.13277977690231235,
"grad_norm": 0.4824365973472595,
"learning_rate": 4.336366214001351e-05,
"loss": 0.3976,
"step": 57600
},
{
"epoch": 0.13324081779433428,
"grad_norm": 1.0460017919540405,
"learning_rate": 4.334061009541242e-05,
"loss": 0.3784,
"step": 57800
},
{
"epoch": 0.13370185868635617,
"grad_norm": 0.3153966963291168,
"learning_rate": 4.3317558050811316e-05,
"loss": 0.3614,
"step": 58000
},
{
"epoch": 0.1341628995783781,
"grad_norm": 0.3575451076030731,
"learning_rate": 4.329450600621022e-05,
"loss": 0.3862,
"step": 58200
},
{
"epoch": 0.13462394047040002,
"grad_norm": 0.40099725127220154,
"learning_rate": 4.327145396160913e-05,
"loss": 0.4005,
"step": 58400
},
{
"epoch": 0.13508498136242195,
"grad_norm": 0.671525776386261,
"learning_rate": 4.324840191700803e-05,
"loss": 0.3847,
"step": 58600
},
{
"epoch": 0.13554602225444387,
"grad_norm": 0.5234976410865784,
"learning_rate": 4.322534987240693e-05,
"loss": 0.36,
"step": 58800
},
{
"epoch": 0.13600706314646577,
"grad_norm": 0.6042407155036926,
"learning_rate": 4.320229782780584e-05,
"loss": 0.3557,
"step": 59000
},
{
"epoch": 0.1364681040384877,
"grad_norm": 0.22175343334674835,
"learning_rate": 4.3179245783204744e-05,
"loss": 0.3696,
"step": 59200
},
{
"epoch": 0.13692914493050962,
"grad_norm": 0.14810913801193237,
"learning_rate": 4.315619373860364e-05,
"loss": 0.3465,
"step": 59400
},
{
"epoch": 0.13739018582253154,
"grad_norm": 0.24461548030376434,
"learning_rate": 4.313314169400255e-05,
"loss": 0.3719,
"step": 59600
},
{
"epoch": 0.13785122671455344,
"grad_norm": 0.29650434851646423,
"learning_rate": 4.311008964940146e-05,
"loss": 0.3653,
"step": 59800
},
{
"epoch": 0.13831226760657536,
"grad_norm": 0.5877122282981873,
"learning_rate": 4.308703760480036e-05,
"loss": 0.3811,
"step": 60000
},
{
"epoch": 0.1387733084985973,
"grad_norm": 0.504462718963623,
"learning_rate": 4.3063985560199265e-05,
"loss": 0.3833,
"step": 60200
},
{
"epoch": 0.1392343493906192,
"grad_norm": 0.32623839378356934,
"learning_rate": 4.304093351559817e-05,
"loss": 0.3952,
"step": 60400
},
{
"epoch": 0.1396953902826411,
"grad_norm": 0.2459900826215744,
"learning_rate": 4.301799673122008e-05,
"loss": 0.395,
"step": 60600
},
{
"epoch": 0.14015643117466303,
"grad_norm": 0.5604976415634155,
"learning_rate": 4.299494468661898e-05,
"loss": 0.3823,
"step": 60800
},
{
"epoch": 0.14061747206668496,
"grad_norm": 0.5950489640235901,
"learning_rate": 4.2971892642017885e-05,
"loss": 0.423,
"step": 61000
},
{
"epoch": 0.14107851295870688,
"grad_norm": 0.704501211643219,
"learning_rate": 4.294884059741679e-05,
"loss": 0.3696,
"step": 61200
},
{
"epoch": 0.14153955385072878,
"grad_norm": 0.3407481014728546,
"learning_rate": 4.2925788552815696e-05,
"loss": 0.3643,
"step": 61400
},
{
"epoch": 0.1420005947427507,
"grad_norm": 0.34696170687675476,
"learning_rate": 4.2902736508214595e-05,
"loss": 0.3646,
"step": 61600
},
{
"epoch": 0.14246163563477263,
"grad_norm": 0.7647753357887268,
"learning_rate": 4.28796844636135e-05,
"loss": 0.3582,
"step": 61800
},
{
"epoch": 0.14292267652679455,
"grad_norm": 0.2633316218852997,
"learning_rate": 4.2856632419012406e-05,
"loss": 0.385,
"step": 62000
},
{
"epoch": 0.14338371741881648,
"grad_norm": 0.25915348529815674,
"learning_rate": 4.2833580374411305e-05,
"loss": 0.387,
"step": 62200
},
{
"epoch": 0.14384475831083837,
"grad_norm": 0.36526253819465637,
"learning_rate": 4.281052832981021e-05,
"loss": 0.3416,
"step": 62400
},
{
"epoch": 0.1443057992028603,
"grad_norm": 0.28252243995666504,
"learning_rate": 4.2787476285209116e-05,
"loss": 0.4045,
"step": 62600
},
{
"epoch": 0.14476684009488222,
"grad_norm": 0.595001757144928,
"learning_rate": 4.276442424060802e-05,
"loss": 0.3602,
"step": 62800
},
{
"epoch": 0.14522788098690415,
"grad_norm": 1.0779852867126465,
"learning_rate": 4.274148745622993e-05,
"loss": 0.4336,
"step": 63000
},
{
"epoch": 0.14568892187892604,
"grad_norm": 0.6181137561798096,
"learning_rate": 4.271843541162884e-05,
"loss": 0.4191,
"step": 63200
},
{
"epoch": 0.14614996277094797,
"grad_norm": 0.6328127384185791,
"learning_rate": 4.269538336702774e-05,
"loss": 0.3691,
"step": 63400
},
{
"epoch": 0.1466110036629699,
"grad_norm": 0.6281986832618713,
"learning_rate": 4.267233132242665e-05,
"loss": 0.394,
"step": 63600
},
{
"epoch": 0.14707204455499182,
"grad_norm": 0.2862294912338257,
"learning_rate": 4.264927927782555e-05,
"loss": 0.3842,
"step": 63800
},
{
"epoch": 0.1475330854470137,
"grad_norm": 0.31163880228996277,
"learning_rate": 4.262622723322445e-05,
"loss": 0.3908,
"step": 64000
},
{
"epoch": 0.14799412633903564,
"grad_norm": 0.38868942856788635,
"learning_rate": 4.260317518862336e-05,
"loss": 0.4189,
"step": 64200
},
{
"epoch": 0.14845516723105756,
"grad_norm": 0.4506785571575165,
"learning_rate": 4.258012314402226e-05,
"loss": 0.3614,
"step": 64400
},
{
"epoch": 0.14891620812307949,
"grad_norm": 0.4143483638763428,
"learning_rate": 4.255707109942116e-05,
"loss": 0.358,
"step": 64600
},
{
"epoch": 0.14937724901510138,
"grad_norm": 0.6284642815589905,
"learning_rate": 4.253401905482007e-05,
"loss": 0.42,
"step": 64800
},
{
"epoch": 0.1498382899071233,
"grad_norm": 0.33402329683303833,
"learning_rate": 4.2510967010218975e-05,
"loss": 0.3909,
"step": 65000
},
{
"epoch": 0.15029933079914523,
"grad_norm": 0.6994507908821106,
"learning_rate": 4.2487914965617873e-05,
"loss": 0.3907,
"step": 65200
},
{
"epoch": 0.15076037169116716,
"grad_norm": 0.18129920959472656,
"learning_rate": 4.246486292101678e-05,
"loss": 0.4027,
"step": 65400
},
{
"epoch": 0.15122141258318905,
"grad_norm": 0.4378039836883545,
"learning_rate": 4.2441810876415685e-05,
"loss": 0.339,
"step": 65600
},
{
"epoch": 0.15168245347521098,
"grad_norm": 0.4359930753707886,
"learning_rate": 4.241875883181459e-05,
"loss": 0.385,
"step": 65800
},
{
"epoch": 0.1521434943672329,
"grad_norm": 0.36101460456848145,
"learning_rate": 4.2395706787213496e-05,
"loss": 0.3766,
"step": 66000
},
{
"epoch": 0.15260453525925483,
"grad_norm": 0.4215210974216461,
"learning_rate": 4.23726547426124e-05,
"loss": 0.3822,
"step": 66200
},
{
"epoch": 0.15306557615127675,
"grad_norm": 0.7895861864089966,
"learning_rate": 4.23496026980113e-05,
"loss": 0.3817,
"step": 66400
},
{
"epoch": 0.15352661704329865,
"grad_norm": 0.12895886600017548,
"learning_rate": 4.2326550653410206e-05,
"loss": 0.3831,
"step": 66600
},
{
"epoch": 0.15398765793532057,
"grad_norm": 0.3126397132873535,
"learning_rate": 4.230349860880911e-05,
"loss": 0.4431,
"step": 66800
},
{
"epoch": 0.1544486988273425,
"grad_norm": 0.746769368648529,
"learning_rate": 4.228056182443102e-05,
"loss": 0.4479,
"step": 67000
},
{
"epoch": 0.15490973971936442,
"grad_norm": 0.2802204489707947,
"learning_rate": 4.225750977982992e-05,
"loss": 0.4137,
"step": 67200
},
{
"epoch": 0.15537078061138632,
"grad_norm": 0.3978649079799652,
"learning_rate": 4.2234457735228826e-05,
"loss": 0.3551,
"step": 67400
},
{
"epoch": 0.15583182150340824,
"grad_norm": 0.45735758543014526,
"learning_rate": 4.221140569062773e-05,
"loss": 0.3373,
"step": 67600
},
{
"epoch": 0.15629286239543017,
"grad_norm": 0.38934004306793213,
"learning_rate": 4.218835364602664e-05,
"loss": 0.3969,
"step": 67800
},
{
"epoch": 0.1567539032874521,
"grad_norm": 0.8273873329162598,
"learning_rate": 4.2165301601425536e-05,
"loss": 0.3722,
"step": 68000
},
{
"epoch": 0.157214944179474,
"grad_norm": 0.38158196210861206,
"learning_rate": 4.214224955682444e-05,
"loss": 0.3623,
"step": 68200
},
{
"epoch": 0.1576759850714959,
"grad_norm": 0.28139957785606384,
"learning_rate": 4.211919751222335e-05,
"loss": 0.3819,
"step": 68400
},
{
"epoch": 0.15813702596351784,
"grad_norm": 0.9534692168235779,
"learning_rate": 4.209614546762225e-05,
"loss": 0.3787,
"step": 68600
},
{
"epoch": 0.15859806685553976,
"grad_norm": 0.45207953453063965,
"learning_rate": 4.207309342302116e-05,
"loss": 0.3768,
"step": 68800
},
{
"epoch": 0.15905910774756166,
"grad_norm": 0.22342385351657867,
"learning_rate": 4.2050041378420065e-05,
"loss": 0.3738,
"step": 69000
},
{
"epoch": 0.15952014863958358,
"grad_norm": 0.2750399708747864,
"learning_rate": 4.202698933381897e-05,
"loss": 0.3786,
"step": 69200
},
{
"epoch": 0.1599811895316055,
"grad_norm": 0.41750824451446533,
"learning_rate": 4.200393728921787e-05,
"loss": 0.3925,
"step": 69400
},
{
"epoch": 0.16044223042362743,
"grad_norm": 0.23955592513084412,
"learning_rate": 4.198100050483978e-05,
"loss": 0.3731,
"step": 69600
},
{
"epoch": 0.16090327131564935,
"grad_norm": 0.999873161315918,
"learning_rate": 4.195806372046169e-05,
"loss": 0.3481,
"step": 69800
},
{
"epoch": 0.16136431220767125,
"grad_norm": 0.4593783915042877,
"learning_rate": 4.193501167586059e-05,
"loss": 0.408,
"step": 70000
},
{
"epoch": 0.16182535309969318,
"grad_norm": 0.25219962000846863,
"learning_rate": 4.191195963125949e-05,
"loss": 0.3467,
"step": 70200
},
{
"epoch": 0.1622863939917151,
"grad_norm": 0.32533201575279236,
"learning_rate": 4.18889075866584e-05,
"loss": 0.3932,
"step": 70400
},
{
"epoch": 0.16274743488373702,
"grad_norm": 0.2120884209871292,
"learning_rate": 4.1865855542057303e-05,
"loss": 0.3372,
"step": 70600
},
{
"epoch": 0.16320847577575892,
"grad_norm": 0.7418591976165771,
"learning_rate": 4.184280349745621e-05,
"loss": 0.3624,
"step": 70800
},
{
"epoch": 0.16366951666778085,
"grad_norm": 0.4451257288455963,
"learning_rate": 4.181975145285511e-05,
"loss": 0.3574,
"step": 71000
},
{
"epoch": 0.16413055755980277,
"grad_norm": 0.5629644989967346,
"learning_rate": 4.179669940825402e-05,
"loss": 0.3707,
"step": 71200
},
{
"epoch": 0.1645915984518247,
"grad_norm": 0.6238035559654236,
"learning_rate": 4.1773647363652926e-05,
"loss": 0.3889,
"step": 71400
},
{
"epoch": 0.1650526393438466,
"grad_norm": 0.4385073781013489,
"learning_rate": 4.1750595319051825e-05,
"loss": 0.4079,
"step": 71600
},
{
"epoch": 0.16551368023586852,
"grad_norm": 0.38517189025878906,
"learning_rate": 4.172754327445073e-05,
"loss": 0.3997,
"step": 71800
},
{
"epoch": 0.16597472112789044,
"grad_norm": 0.5004132986068726,
"learning_rate": 4.1704491229849636e-05,
"loss": 0.3702,
"step": 72000
},
{
"epoch": 0.16643576201991236,
"grad_norm": 0.30892640352249146,
"learning_rate": 4.1681439185248535e-05,
"loss": 0.3735,
"step": 72200
},
{
"epoch": 0.16689680291193426,
"grad_norm": 0.3698577582836151,
"learning_rate": 4.165838714064744e-05,
"loss": 0.3747,
"step": 72400
},
{
"epoch": 0.16735784380395619,
"grad_norm": 0.375169575214386,
"learning_rate": 4.163533509604635e-05,
"loss": 0.4043,
"step": 72600
},
{
"epoch": 0.1678188846959781,
"grad_norm": 1.9246922731399536,
"learning_rate": 4.161228305144525e-05,
"loss": 0.4098,
"step": 72800
},
{
"epoch": 0.16827992558800003,
"grad_norm": 0.18416735529899597,
"learning_rate": 4.158934626706716e-05,
"loss": 0.4062,
"step": 73000
},
{
"epoch": 0.16874096648002196,
"grad_norm": 0.4047417938709259,
"learning_rate": 4.156629422246606e-05,
"loss": 0.3883,
"step": 73200
},
{
"epoch": 0.16920200737204386,
"grad_norm": 0.5749362111091614,
"learning_rate": 4.1543242177864966e-05,
"loss": 0.3586,
"step": 73400
},
{
"epoch": 0.16966304826406578,
"grad_norm": 0.282154381275177,
"learning_rate": 4.152019013326387e-05,
"loss": 0.3946,
"step": 73600
},
{
"epoch": 0.1701240891560877,
"grad_norm": 0.6659444570541382,
"learning_rate": 4.149713808866277e-05,
"loss": 0.3716,
"step": 73800
},
{
"epoch": 0.17058513004810963,
"grad_norm": 0.6463894844055176,
"learning_rate": 4.1474086044061676e-05,
"loss": 0.3946,
"step": 74000
},
{
"epoch": 0.17104617094013153,
"grad_norm": 0.39749765396118164,
"learning_rate": 4.145103399946059e-05,
"loss": 0.3984,
"step": 74200
},
{
"epoch": 0.17150721183215345,
"grad_norm": 0.329479455947876,
"learning_rate": 4.142798195485949e-05,
"loss": 0.3581,
"step": 74400
},
{
"epoch": 0.17196825272417537,
"grad_norm": 0.7334747314453125,
"learning_rate": 4.140492991025839e-05,
"loss": 0.355,
"step": 74600
},
{
"epoch": 0.1724292936161973,
"grad_norm": 0.5938326120376587,
"learning_rate": 4.13818778656573e-05,
"loss": 0.3764,
"step": 74800
},
{
"epoch": 0.1728903345082192,
"grad_norm": 0.22325685620307922,
"learning_rate": 4.1358825821056205e-05,
"loss": 0.3873,
"step": 75000
},
{
"epoch": 0.17335137540024112,
"grad_norm": 0.542846143245697,
"learning_rate": 4.1335773776455104e-05,
"loss": 0.3939,
"step": 75200
},
{
"epoch": 0.17381241629226304,
"grad_norm": 0.41635704040527344,
"learning_rate": 4.131272173185401e-05,
"loss": 0.3403,
"step": 75400
},
{
"epoch": 0.17427345718428497,
"grad_norm": 0.44018426537513733,
"learning_rate": 4.1289669687252915e-05,
"loss": 0.4018,
"step": 75600
},
{
"epoch": 0.17473449807630687,
"grad_norm": 0.5704178214073181,
"learning_rate": 4.1266617642651814e-05,
"loss": 0.3701,
"step": 75800
},
{
"epoch": 0.1751955389683288,
"grad_norm": 0.8065271377563477,
"learning_rate": 4.124356559805072e-05,
"loss": 0.37,
"step": 76000
},
{
"epoch": 0.17565657986035071,
"grad_norm": 0.83006751537323,
"learning_rate": 4.122062881367263e-05,
"loss": 0.3737,
"step": 76200
},
{
"epoch": 0.17611762075237264,
"grad_norm": 0.5519546866416931,
"learning_rate": 4.1197576769071534e-05,
"loss": 0.4437,
"step": 76400
},
{
"epoch": 0.17657866164439456,
"grad_norm": 0.4186224043369293,
"learning_rate": 4.117452472447044e-05,
"loss": 0.4014,
"step": 76600
},
{
"epoch": 0.17703970253641646,
"grad_norm": 0.41330209374427795,
"learning_rate": 4.115147267986934e-05,
"loss": 0.4055,
"step": 76800
},
{
"epoch": 0.17750074342843838,
"grad_norm": 0.3060867488384247,
"learning_rate": 4.1128420635268245e-05,
"loss": 0.3556,
"step": 77000
},
{
"epoch": 0.1779617843204603,
"grad_norm": 0.3334102928638458,
"learning_rate": 4.110536859066715e-05,
"loss": 0.3847,
"step": 77200
},
{
"epoch": 0.17842282521248223,
"grad_norm": 0.49521735310554504,
"learning_rate": 4.1082316546066056e-05,
"loss": 0.3945,
"step": 77400
},
{
"epoch": 0.17888386610450413,
"grad_norm": 0.27854031324386597,
"learning_rate": 4.105926450146496e-05,
"loss": 0.4074,
"step": 77600
},
{
"epoch": 0.17934490699652605,
"grad_norm": 0.38079917430877686,
"learning_rate": 4.103621245686387e-05,
"loss": 0.4136,
"step": 77800
},
{
"epoch": 0.17980594788854798,
"grad_norm": 0.6132557392120361,
"learning_rate": 4.1013160412262766e-05,
"loss": 0.3652,
"step": 78000
},
{
"epoch": 0.1802669887805699,
"grad_norm": 0.39130258560180664,
"learning_rate": 4.099010836766167e-05,
"loss": 0.4007,
"step": 78200
},
{
"epoch": 0.1807280296725918,
"grad_norm": 0.29027581214904785,
"learning_rate": 4.096705632306058e-05,
"loss": 0.3481,
"step": 78400
},
{
"epoch": 0.18118907056461372,
"grad_norm": 0.36792126297950745,
"learning_rate": 4.094400427845948e-05,
"loss": 0.3697,
"step": 78600
},
{
"epoch": 0.18165011145663565,
"grad_norm": 0.2508639395236969,
"learning_rate": 4.092095223385838e-05,
"loss": 0.3639,
"step": 78800
},
{
"epoch": 0.18211115234865757,
"grad_norm": 0.44309931993484497,
"learning_rate": 4.089790018925729e-05,
"loss": 0.3895,
"step": 79000
},
{
"epoch": 0.18257219324067947,
"grad_norm": 0.6594695448875427,
"learning_rate": 4.08749634048792e-05,
"loss": 0.3893,
"step": 79200
},
{
"epoch": 0.1830332341327014,
"grad_norm": 0.48919928073883057,
"learning_rate": 4.08519113602781e-05,
"loss": 0.3657,
"step": 79400
},
{
"epoch": 0.18349427502472332,
"grad_norm": 0.1823994517326355,
"learning_rate": 4.0828859315677e-05,
"loss": 0.3877,
"step": 79600
},
{
"epoch": 0.18395531591674524,
"grad_norm": 0.85259610414505,
"learning_rate": 4.080580727107591e-05,
"loss": 0.3676,
"step": 79800
},
{
"epoch": 0.18441635680876714,
"grad_norm": 0.17565611004829407,
"learning_rate": 4.078275522647481e-05,
"loss": 0.364,
"step": 80000
},
{
"epoch": 0.18487739770078906,
"grad_norm": 0.3634127080440521,
"learning_rate": 4.075970318187372e-05,
"loss": 0.3705,
"step": 80200
},
{
"epoch": 0.185338438592811,
"grad_norm": 0.2691134214401245,
"learning_rate": 4.0736651137272624e-05,
"loss": 0.3858,
"step": 80400
},
{
"epoch": 0.1857994794848329,
"grad_norm": 0.8339262008666992,
"learning_rate": 4.071359909267153e-05,
"loss": 0.3336,
"step": 80600
},
{
"epoch": 0.18626052037685484,
"grad_norm": 0.4361639618873596,
"learning_rate": 4.069054704807043e-05,
"loss": 0.3684,
"step": 80800
},
{
"epoch": 0.18672156126887673,
"grad_norm": 0.9091641306877136,
"learning_rate": 4.0667495003469335e-05,
"loss": 0.4046,
"step": 81000
},
{
"epoch": 0.18718260216089866,
"grad_norm": 0.5257648229598999,
"learning_rate": 4.064444295886824e-05,
"loss": 0.371,
"step": 81200
},
{
"epoch": 0.18764364305292058,
"grad_norm": 0.3674139380455017,
"learning_rate": 4.0621390914267146e-05,
"loss": 0.3515,
"step": 81400
},
{
"epoch": 0.1881046839449425,
"grad_norm": 0.407173752784729,
"learning_rate": 4.0598338869666045e-05,
"loss": 0.3533,
"step": 81600
},
{
"epoch": 0.1885657248369644,
"grad_norm": 0.24924825131893158,
"learning_rate": 4.057528682506495e-05,
"loss": 0.3972,
"step": 81800
},
{
"epoch": 0.18902676572898633,
"grad_norm": 0.7661758065223694,
"learning_rate": 4.0552234780463856e-05,
"loss": 0.3557,
"step": 82000
},
{
"epoch": 0.18948780662100825,
"grad_norm": 0.29369255900382996,
"learning_rate": 4.0529182735862755e-05,
"loss": 0.3525,
"step": 82200
},
{
"epoch": 0.18994884751303018,
"grad_norm": 0.2929767966270447,
"learning_rate": 4.050613069126166e-05,
"loss": 0.3582,
"step": 82400
},
{
"epoch": 0.19040988840505207,
"grad_norm": 0.49124881625175476,
"learning_rate": 4.0483078646660566e-05,
"loss": 0.3545,
"step": 82600
},
{
"epoch": 0.190870929297074,
"grad_norm": 0.4029316306114197,
"learning_rate": 4.046002660205947e-05,
"loss": 0.4113,
"step": 82800
},
{
"epoch": 0.19133197018909592,
"grad_norm": 0.28346729278564453,
"learning_rate": 4.043708981768138e-05,
"loss": 0.4012,
"step": 83000
},
{
"epoch": 0.19179301108111785,
"grad_norm": 0.5860701203346252,
"learning_rate": 4.041403777308029e-05,
"loss": 0.3506,
"step": 83200
},
{
"epoch": 0.19225405197313974,
"grad_norm": 0.4684862494468689,
"learning_rate": 4.039098572847919e-05,
"loss": 0.3634,
"step": 83400
},
{
"epoch": 0.19271509286516167,
"grad_norm": 0.4674646258354187,
"learning_rate": 4.03679336838781e-05,
"loss": 0.4294,
"step": 83600
},
{
"epoch": 0.1931761337571836,
"grad_norm": 0.39276108145713806,
"learning_rate": 4.0344881639277e-05,
"loss": 0.3642,
"step": 83800
},
{
"epoch": 0.19363717464920552,
"grad_norm": 0.6815670132637024,
"learning_rate": 4.03218295946759e-05,
"loss": 0.401,
"step": 84000
},
{
"epoch": 0.19409821554122744,
"grad_norm": 0.3022634983062744,
"learning_rate": 4.029877755007481e-05,
"loss": 0.3625,
"step": 84200
},
{
"epoch": 0.19455925643324934,
"grad_norm": 0.8782984614372253,
"learning_rate": 4.027572550547371e-05,
"loss": 0.397,
"step": 84400
},
{
"epoch": 0.19502029732527126,
"grad_norm": 0.4724620580673218,
"learning_rate": 4.025267346087261e-05,
"loss": 0.3774,
"step": 84600
},
{
"epoch": 0.1954813382172932,
"grad_norm": 0.40024200081825256,
"learning_rate": 4.022962141627152e-05,
"loss": 0.3816,
"step": 84800
},
{
"epoch": 0.1959423791093151,
"grad_norm": 0.6734246611595154,
"learning_rate": 4.0206569371670425e-05,
"loss": 0.3684,
"step": 85000
},
{
"epoch": 0.196403420001337,
"grad_norm": 0.8082005977630615,
"learning_rate": 4.0183517327069323e-05,
"loss": 0.3824,
"step": 85200
},
{
"epoch": 0.19686446089335893,
"grad_norm": 0.24818405508995056,
"learning_rate": 4.016046528246823e-05,
"loss": 0.3897,
"step": 85400
},
{
"epoch": 0.19732550178538086,
"grad_norm": 0.4388584494590759,
"learning_rate": 4.0137413237867135e-05,
"loss": 0.3997,
"step": 85600
},
{
"epoch": 0.19778654267740278,
"grad_norm": 0.4702792167663574,
"learning_rate": 4.0114361193266034e-05,
"loss": 0.3859,
"step": 85800
},
{
"epoch": 0.19824758356942468,
"grad_norm": 0.6281085014343262,
"learning_rate": 4.009130914866494e-05,
"loss": 0.3671,
"step": 86000
},
{
"epoch": 0.1987086244614466,
"grad_norm": 0.7686699032783508,
"learning_rate": 4.0068257104063845e-05,
"loss": 0.3523,
"step": 86200
},
{
"epoch": 0.19916966535346853,
"grad_norm": 0.3837953805923462,
"learning_rate": 4.004532031968576e-05,
"loss": 0.3687,
"step": 86400
},
{
"epoch": 0.19963070624549045,
"grad_norm": 0.2947828471660614,
"learning_rate": 4.002226827508466e-05,
"loss": 0.3817,
"step": 86600
},
{
"epoch": 0.20009174713751235,
"grad_norm": 0.49799805879592896,
"learning_rate": 3.9999216230483566e-05,
"loss": 0.3523,
"step": 86800
},
{
"epoch": 0.20055278802953427,
"grad_norm": 0.33872556686401367,
"learning_rate": 3.997616418588247e-05,
"loss": 0.3945,
"step": 87000
},
{
"epoch": 0.2010138289215562,
"grad_norm": 0.5729738473892212,
"learning_rate": 3.995311214128137e-05,
"loss": 0.3645,
"step": 87200
},
{
"epoch": 0.20147486981357812,
"grad_norm": 0.4476766288280487,
"learning_rate": 3.9930060096680276e-05,
"loss": 0.3659,
"step": 87400
},
{
"epoch": 0.20193591070560005,
"grad_norm": 0.3485075831413269,
"learning_rate": 3.990700805207918e-05,
"loss": 0.367,
"step": 87600
},
{
"epoch": 0.20239695159762194,
"grad_norm": 0.2224113792181015,
"learning_rate": 3.988395600747809e-05,
"loss": 0.3751,
"step": 87800
},
{
"epoch": 0.20285799248964387,
"grad_norm": 0.5686330795288086,
"learning_rate": 3.9860903962876986e-05,
"loss": 0.3821,
"step": 88000
},
{
"epoch": 0.2033190333816658,
"grad_norm": 0.15622970461845398,
"learning_rate": 3.983785191827589e-05,
"loss": 0.3928,
"step": 88200
},
{
"epoch": 0.20378007427368772,
"grad_norm": 0.5265315771102905,
"learning_rate": 3.98147998736748e-05,
"loss": 0.4138,
"step": 88400
},
{
"epoch": 0.2042411151657096,
"grad_norm": 0.21689961850643158,
"learning_rate": 3.97917478290737e-05,
"loss": 0.3423,
"step": 88600
},
{
"epoch": 0.20470215605773154,
"grad_norm": 0.6536559462547302,
"learning_rate": 3.97686957844726e-05,
"loss": 0.3807,
"step": 88800
},
{
"epoch": 0.20516319694975346,
"grad_norm": 0.29682546854019165,
"learning_rate": 3.974575900009452e-05,
"loss": 0.4259,
"step": 89000
},
{
"epoch": 0.2056242378417754,
"grad_norm": 0.4027779698371887,
"learning_rate": 3.9722706955493424e-05,
"loss": 0.4414,
"step": 89200
},
{
"epoch": 0.20608527873379728,
"grad_norm": 0.21460078656673431,
"learning_rate": 3.969965491089232e-05,
"loss": 0.3868,
"step": 89400
},
{
"epoch": 0.2065463196258192,
"grad_norm": 0.761016845703125,
"learning_rate": 3.967660286629123e-05,
"loss": 0.4153,
"step": 89600
},
{
"epoch": 0.20700736051784113,
"grad_norm": 0.5944260954856873,
"learning_rate": 3.9653550821690134e-05,
"loss": 0.3542,
"step": 89800
},
{
"epoch": 0.20746840140986306,
"grad_norm": 0.2797197997570038,
"learning_rate": 3.963049877708904e-05,
"loss": 0.3613,
"step": 90000
},
{
"epoch": 0.20792944230188495,
"grad_norm": 0.4935290217399597,
"learning_rate": 3.960744673248794e-05,
"loss": 0.3444,
"step": 90200
},
{
"epoch": 0.20839048319390688,
"grad_norm": 0.41999584436416626,
"learning_rate": 3.9584394687886844e-05,
"loss": 0.4135,
"step": 90400
},
{
"epoch": 0.2088515240859288,
"grad_norm": 0.43173354864120483,
"learning_rate": 3.956134264328575e-05,
"loss": 0.3429,
"step": 90600
},
{
"epoch": 0.20931256497795073,
"grad_norm": 0.6350071430206299,
"learning_rate": 3.953829059868465e-05,
"loss": 0.3484,
"step": 90800
},
{
"epoch": 0.20977360586997262,
"grad_norm": 0.39693182706832886,
"learning_rate": 3.9515238554083554e-05,
"loss": 0.3639,
"step": 91000
},
{
"epoch": 0.21023464676199455,
"grad_norm": 0.43670088052749634,
"learning_rate": 3.949218650948246e-05,
"loss": 0.3712,
"step": 91200
},
{
"epoch": 0.21069568765401647,
"grad_norm": 0.2015966773033142,
"learning_rate": 3.9469134464881366e-05,
"loss": 0.3768,
"step": 91400
},
{
"epoch": 0.2111567285460384,
"grad_norm": 0.5477193593978882,
"learning_rate": 3.9446082420280265e-05,
"loss": 0.3708,
"step": 91600
},
{
"epoch": 0.21161776943806032,
"grad_norm": 0.28118032217025757,
"learning_rate": 3.942303037567917e-05,
"loss": 0.4038,
"step": 91800
},
{
"epoch": 0.21207881033008222,
"grad_norm": 0.38235944509506226,
"learning_rate": 3.9399978331078076e-05,
"loss": 0.3982,
"step": 92000
},
{
"epoch": 0.21253985122210414,
"grad_norm": 0.6877797842025757,
"learning_rate": 3.9376926286476975e-05,
"loss": 0.3778,
"step": 92200
},
{
"epoch": 0.21300089211412607,
"grad_norm": 0.09272262454032898,
"learning_rate": 3.935387424187589e-05,
"loss": 0.372,
"step": 92400
},
{
"epoch": 0.213461933006148,
"grad_norm": 0.36820438504219055,
"learning_rate": 3.933082219727479e-05,
"loss": 0.3554,
"step": 92600
},
{
"epoch": 0.2139229738981699,
"grad_norm": 0.7568904161453247,
"learning_rate": 3.930777015267369e-05,
"loss": 0.4154,
"step": 92800
},
{
"epoch": 0.2143840147901918,
"grad_norm": 0.28105273842811584,
"learning_rate": 3.92847181080726e-05,
"loss": 0.3858,
"step": 93000
},
{
"epoch": 0.21484505568221374,
"grad_norm": 0.6823813319206238,
"learning_rate": 3.92616660634715e-05,
"loss": 0.3674,
"step": 93200
},
{
"epoch": 0.21530609657423566,
"grad_norm": 0.3331362307071686,
"learning_rate": 3.923872927909341e-05,
"loss": 0.3947,
"step": 93400
},
{
"epoch": 0.21576713746625756,
"grad_norm": 0.955589234828949,
"learning_rate": 3.921579249471532e-05,
"loss": 0.3967,
"step": 93600
},
{
"epoch": 0.21622817835827948,
"grad_norm": 0.15500684082508087,
"learning_rate": 3.919274045011422e-05,
"loss": 0.4331,
"step": 93800
},
{
"epoch": 0.2166892192503014,
"grad_norm": 0.7300329208374023,
"learning_rate": 3.9169688405513126e-05,
"loss": 0.3831,
"step": 94000
},
{
"epoch": 0.21715026014232333,
"grad_norm": 0.31543096899986267,
"learning_rate": 3.914663636091203e-05,
"loss": 0.3766,
"step": 94200
},
{
"epoch": 0.21761130103434523,
"grad_norm": 0.5309344530105591,
"learning_rate": 3.912358431631094e-05,
"loss": 0.3616,
"step": 94400
},
{
"epoch": 0.21807234192636715,
"grad_norm": 0.761132538318634,
"learning_rate": 3.910053227170984e-05,
"loss": 0.3881,
"step": 94600
},
{
"epoch": 0.21853338281838908,
"grad_norm": 0.4186858534812927,
"learning_rate": 3.907748022710874e-05,
"loss": 0.4144,
"step": 94800
},
{
"epoch": 0.218994423710411,
"grad_norm": 0.4831596910953522,
"learning_rate": 3.9054428182507655e-05,
"loss": 0.3443,
"step": 95000
},
{
"epoch": 0.21945546460243293,
"grad_norm": 0.4133339822292328,
"learning_rate": 3.9031376137906554e-05,
"loss": 0.3922,
"step": 95200
},
{
"epoch": 0.21991650549445482,
"grad_norm": 0.3791184425354004,
"learning_rate": 3.900832409330546e-05,
"loss": 0.3633,
"step": 95400
},
{
"epoch": 0.22037754638647675,
"grad_norm": 0.48449036478996277,
"learning_rate": 3.898538730892737e-05,
"loss": 0.3642,
"step": 95600
},
{
"epoch": 0.22083858727849867,
"grad_norm": 0.07597929239273071,
"learning_rate": 3.8962335264326274e-05,
"loss": 0.3742,
"step": 95800
},
{
"epoch": 0.2212996281705206,
"grad_norm": 0.3345245122909546,
"learning_rate": 3.893928321972517e-05,
"loss": 0.3612,
"step": 96000
},
{
"epoch": 0.2217606690625425,
"grad_norm": 0.5337228178977966,
"learning_rate": 3.891623117512408e-05,
"loss": 0.3891,
"step": 96200
},
{
"epoch": 0.22222170995456442,
"grad_norm": 0.290238618850708,
"learning_rate": 3.8893179130522984e-05,
"loss": 0.3824,
"step": 96400
},
{
"epoch": 0.22268275084658634,
"grad_norm": 0.6779570579528809,
"learning_rate": 3.887012708592189e-05,
"loss": 0.3519,
"step": 96600
},
{
"epoch": 0.22314379173860827,
"grad_norm": 0.1940668225288391,
"learning_rate": 3.884707504132079e-05,
"loss": 0.366,
"step": 96800
},
{
"epoch": 0.22360483263063016,
"grad_norm": 0.36103132367134094,
"learning_rate": 3.8824022996719695e-05,
"loss": 0.3825,
"step": 97000
},
{
"epoch": 0.2240658735226521,
"grad_norm": 0.29168155789375305,
"learning_rate": 3.88009709521186e-05,
"loss": 0.3756,
"step": 97200
},
{
"epoch": 0.224526914414674,
"grad_norm": 0.29785749316215515,
"learning_rate": 3.87779189075175e-05,
"loss": 0.4082,
"step": 97400
},
{
"epoch": 0.22498795530669594,
"grad_norm": 0.4983058273792267,
"learning_rate": 3.8754866862916405e-05,
"loss": 0.3565,
"step": 97600
},
{
"epoch": 0.22544899619871783,
"grad_norm": 0.5154557824134827,
"learning_rate": 3.873181481831531e-05,
"loss": 0.3443,
"step": 97800
},
{
"epoch": 0.22591003709073976,
"grad_norm": 0.7208424806594849,
"learning_rate": 3.8708762773714216e-05,
"loss": 0.3998,
"step": 98000
},
{
"epoch": 0.22637107798276168,
"grad_norm": 0.5739054679870605,
"learning_rate": 3.868571072911312e-05,
"loss": 0.3697,
"step": 98200
},
{
"epoch": 0.2268321188747836,
"grad_norm": 0.29941099882125854,
"learning_rate": 3.866265868451203e-05,
"loss": 0.3808,
"step": 98400
},
{
"epoch": 0.22729315976680553,
"grad_norm": 0.4051118791103363,
"learning_rate": 3.863960663991093e-05,
"loss": 0.3629,
"step": 98600
},
{
"epoch": 0.22775420065882743,
"grad_norm": 0.47142454981803894,
"learning_rate": 3.861655459530983e-05,
"loss": 0.3822,
"step": 98800
},
{
"epoch": 0.22821524155084935,
"grad_norm": 0.7139914631843567,
"learning_rate": 3.859350255070874e-05,
"loss": 0.3672,
"step": 99000
},
{
"epoch": 0.22867628244287128,
"grad_norm": 0.2713923752307892,
"learning_rate": 3.8570450506107644e-05,
"loss": 0.382,
"step": 99200
},
{
"epoch": 0.2291373233348932,
"grad_norm": 1.1755609512329102,
"learning_rate": 3.854739846150654e-05,
"loss": 0.3441,
"step": 99400
},
{
"epoch": 0.2295983642269151,
"grad_norm": 0.45880043506622314,
"learning_rate": 3.852434641690545e-05,
"loss": 0.3607,
"step": 99600
},
{
"epoch": 0.23005940511893702,
"grad_norm": 0.4835624098777771,
"learning_rate": 3.8501294372304354e-05,
"loss": 0.3876,
"step": 99800
},
{
"epoch": 0.23052044601095895,
"grad_norm": 0.6053724884986877,
"learning_rate": 3.847835758792626e-05,
"loss": 0.3592,
"step": 100000
},
{
"epoch": 0.23052044601095895,
"eval_loss": 0.38054874539375305,
"eval_runtime": 222.7393,
"eval_samples_per_second": 19.673,
"eval_steps_per_second": 19.673,
"step": 100000
},
{
"epoch": 0.23098148690298087,
"grad_norm": 0.27187952399253845,
"learning_rate": 3.845530554332516e-05,
"loss": 0.3387,
"step": 100200
},
{
"epoch": 0.23144252779500277,
"grad_norm": 0.6491442918777466,
"learning_rate": 3.843236875894708e-05,
"loss": 0.3694,
"step": 100400
},
{
"epoch": 0.2319035686870247,
"grad_norm": 0.4726333022117615,
"learning_rate": 3.8409316714345984e-05,
"loss": 0.3497,
"step": 100600
},
{
"epoch": 0.23236460957904662,
"grad_norm": 0.5095975995063782,
"learning_rate": 3.838626466974489e-05,
"loss": 0.3846,
"step": 100800
},
{
"epoch": 0.23282565047106854,
"grad_norm": 0.7148911356925964,
"learning_rate": 3.836321262514379e-05,
"loss": 0.3906,
"step": 101000
},
{
"epoch": 0.23328669136309044,
"grad_norm": 0.24845024943351746,
"learning_rate": 3.8340160580542694e-05,
"loss": 0.3442,
"step": 101200
},
{
"epoch": 0.23374773225511236,
"grad_norm": 0.53382807970047,
"learning_rate": 3.83171085359416e-05,
"loss": 0.3392,
"step": 101400
},
{
"epoch": 0.23420877314713429,
"grad_norm": 0.45207053422927856,
"learning_rate": 3.8294056491340505e-05,
"loss": 0.35,
"step": 101600
},
{
"epoch": 0.2346698140391562,
"grad_norm": 0.6363802552223206,
"learning_rate": 3.8271004446739404e-05,
"loss": 0.4104,
"step": 101800
},
{
"epoch": 0.2351308549311781,
"grad_norm": 0.14711298048496246,
"learning_rate": 3.824795240213831e-05,
"loss": 0.3633,
"step": 102000
},
{
"epoch": 0.23559189582320003,
"grad_norm": 0.2457539439201355,
"learning_rate": 3.8224900357537215e-05,
"loss": 0.3591,
"step": 102200
},
{
"epoch": 0.23605293671522196,
"grad_norm": 0.4251687526702881,
"learning_rate": 3.8201848312936114e-05,
"loss": 0.3299,
"step": 102400
},
{
"epoch": 0.23651397760724388,
"grad_norm": 0.3099224865436554,
"learning_rate": 3.817879626833502e-05,
"loss": 0.3716,
"step": 102600
},
{
"epoch": 0.2369750184992658,
"grad_norm": 0.5165499448776245,
"learning_rate": 3.8155744223733926e-05,
"loss": 0.3809,
"step": 102800
},
{
"epoch": 0.2374360593912877,
"grad_norm": 0.6008449792861938,
"learning_rate": 3.813269217913283e-05,
"loss": 0.3745,
"step": 103000
},
{
"epoch": 0.23789710028330963,
"grad_norm": 0.395580530166626,
"learning_rate": 3.810964013453173e-05,
"loss": 0.3795,
"step": 103200
},
{
"epoch": 0.23835814117533155,
"grad_norm": 0.2354406863451004,
"learning_rate": 3.8086588089930636e-05,
"loss": 0.3883,
"step": 103400
},
{
"epoch": 0.23881918206735347,
"grad_norm": 0.5513392090797424,
"learning_rate": 3.806353604532954e-05,
"loss": 0.3734,
"step": 103600
},
{
"epoch": 0.23928022295937537,
"grad_norm": 0.5765381455421448,
"learning_rate": 3.804048400072845e-05,
"loss": 0.3719,
"step": 103800
},
{
"epoch": 0.2397412638513973,
"grad_norm": 1.0008771419525146,
"learning_rate": 3.801743195612735e-05,
"loss": 0.3685,
"step": 104000
},
{
"epoch": 0.24020230474341922,
"grad_norm": 0.6254777312278748,
"learning_rate": 3.799437991152626e-05,
"loss": 0.3871,
"step": 104200
},
{
"epoch": 0.24066334563544114,
"grad_norm": 0.5210611820220947,
"learning_rate": 3.797132786692516e-05,
"loss": 0.3631,
"step": 104400
},
{
"epoch": 0.24112438652746304,
"grad_norm": 0.2938978374004364,
"learning_rate": 3.794827582232406e-05,
"loss": 0.3688,
"step": 104600
},
{
"epoch": 0.24158542741948497,
"grad_norm": 0.4644298553466797,
"learning_rate": 3.792522377772297e-05,
"loss": 0.3557,
"step": 104800
},
{
"epoch": 0.2420464683115069,
"grad_norm": 0.2099383920431137,
"learning_rate": 3.7902171733121875e-05,
"loss": 0.3884,
"step": 105000
},
{
"epoch": 0.24250750920352881,
"grad_norm": 1.0953824520111084,
"learning_rate": 3.7879119688520773e-05,
"loss": 0.3834,
"step": 105200
},
{
"epoch": 0.2429685500955507,
"grad_norm": 0.30743712186813354,
"learning_rate": 3.785606764391968e-05,
"loss": 0.3477,
"step": 105400
},
{
"epoch": 0.24342959098757264,
"grad_norm": 0.29531943798065186,
"learning_rate": 3.783313085954159e-05,
"loss": 0.3719,
"step": 105600
},
{
"epoch": 0.24389063187959456,
"grad_norm": 0.4399455785751343,
"learning_rate": 3.7810078814940494e-05,
"loss": 0.3556,
"step": 105800
},
{
"epoch": 0.24435167277161648,
"grad_norm": 0.29192543029785156,
"learning_rate": 3.778702677033939e-05,
"loss": 0.3712,
"step": 106000
},
{
"epoch": 0.2448127136636384,
"grad_norm": 0.30115434527397156,
"learning_rate": 3.77639747257383e-05,
"loss": 0.3738,
"step": 106200
},
{
"epoch": 0.2452737545556603,
"grad_norm": 0.41634321212768555,
"learning_rate": 3.7740922681137204e-05,
"loss": 0.3795,
"step": 106400
},
{
"epoch": 0.24573479544768223,
"grad_norm": 0.6761085391044617,
"learning_rate": 3.771787063653611e-05,
"loss": 0.3648,
"step": 106600
},
{
"epoch": 0.24619583633970415,
"grad_norm": 0.2420043796300888,
"learning_rate": 3.769493385215802e-05,
"loss": 0.372,
"step": 106800
},
{
"epoch": 0.24665687723172608,
"grad_norm": 1.5752192735671997,
"learning_rate": 3.7671881807556925e-05,
"loss": 0.358,
"step": 107000
},
{
"epoch": 0.24711791812374798,
"grad_norm": 0.2581362724304199,
"learning_rate": 3.764882976295583e-05,
"loss": 0.3178,
"step": 107200
},
{
"epoch": 0.2475789590157699,
"grad_norm": 0.5073797702789307,
"learning_rate": 3.762577771835473e-05,
"loss": 0.3553,
"step": 107400
},
{
"epoch": 0.24803999990779182,
"grad_norm": 0.29270970821380615,
"learning_rate": 3.7602725673753635e-05,
"loss": 0.3394,
"step": 107600
},
{
"epoch": 0.24850104079981375,
"grad_norm": 0.41889190673828125,
"learning_rate": 3.757967362915254e-05,
"loss": 0.3872,
"step": 107800
},
{
"epoch": 0.24896208169183565,
"grad_norm": 0.47351330518722534,
"learning_rate": 3.7556621584551446e-05,
"loss": 0.352,
"step": 108000
},
{
"epoch": 0.24942312258385757,
"grad_norm": 0.6487288475036621,
"learning_rate": 3.7533569539950345e-05,
"loss": 0.4207,
"step": 108200
},
{
"epoch": 0.2498841634758795,
"grad_norm": 0.7773205637931824,
"learning_rate": 3.751051749534925e-05,
"loss": 0.3683,
"step": 108400
},
{
"epoch": 0.2503452043679014,
"grad_norm": 0.5849452018737793,
"learning_rate": 3.748746545074816e-05,
"loss": 0.4069,
"step": 108600
},
{
"epoch": 0.25080624525992334,
"grad_norm": 0.3614829480648041,
"learning_rate": 3.7464413406147056e-05,
"loss": 0.3911,
"step": 108800
},
{
"epoch": 0.25126728615194527,
"grad_norm": 0.6186047792434692,
"learning_rate": 3.744136136154596e-05,
"loss": 0.3587,
"step": 109000
},
{
"epoch": 0.25172832704396714,
"grad_norm": 0.5673872828483582,
"learning_rate": 3.741830931694487e-05,
"loss": 0.3343,
"step": 109200
},
{
"epoch": 0.25218936793598906,
"grad_norm": 0.4475044012069702,
"learning_rate": 3.739525727234377e-05,
"loss": 0.3744,
"step": 109400
},
{
"epoch": 0.252650408828011,
"grad_norm": 0.19151391088962555,
"learning_rate": 3.737220522774267e-05,
"loss": 0.3572,
"step": 109600
},
{
"epoch": 0.2531114497200329,
"grad_norm": 0.8350563645362854,
"learning_rate": 3.7349153183141584e-05,
"loss": 0.3807,
"step": 109800
},
{
"epoch": 0.25357249061205483,
"grad_norm": 0.42143338918685913,
"learning_rate": 3.732610113854049e-05,
"loss": 0.3423,
"step": 110000
},
{
"epoch": 0.25403353150407676,
"grad_norm": 0.4613721966743469,
"learning_rate": 3.730304909393939e-05,
"loss": 0.4052,
"step": 110200
},
{
"epoch": 0.2544945723960987,
"grad_norm": 0.13319946825504303,
"learning_rate": 3.7279997049338294e-05,
"loss": 0.3886,
"step": 110400
},
{
"epoch": 0.2549556132881206,
"grad_norm": 0.2844022512435913,
"learning_rate": 3.72569450047372e-05,
"loss": 0.3652,
"step": 110600
},
{
"epoch": 0.25541665418014253,
"grad_norm": 0.45382627844810486,
"learning_rate": 3.723400822035911e-05,
"loss": 0.3932,
"step": 110800
},
{
"epoch": 0.2558776950721644,
"grad_norm": 0.3049659729003906,
"learning_rate": 3.721095617575801e-05,
"loss": 0.3663,
"step": 111000
},
{
"epoch": 0.2563387359641863,
"grad_norm": 1.9019083976745605,
"learning_rate": 3.7187904131156914e-05,
"loss": 0.3575,
"step": 111200
},
{
"epoch": 0.25679977685620825,
"grad_norm": 0.4642539620399475,
"learning_rate": 3.716485208655582e-05,
"loss": 0.3589,
"step": 111400
},
{
"epoch": 0.2572608177482302,
"grad_norm": 0.18418247997760773,
"learning_rate": 3.714191530217773e-05,
"loss": 0.3916,
"step": 111600
},
{
"epoch": 0.2577218586402521,
"grad_norm": 0.1706364005804062,
"learning_rate": 3.711886325757663e-05,
"loss": 0.3464,
"step": 111800
},
{
"epoch": 0.258182899532274,
"grad_norm": 0.19704899191856384,
"learning_rate": 3.709581121297553e-05,
"loss": 0.3477,
"step": 112000
},
{
"epoch": 0.25864394042429595,
"grad_norm": 0.77329421043396,
"learning_rate": 3.7072759168374446e-05,
"loss": 0.3626,
"step": 112200
},
{
"epoch": 0.2591049813163179,
"grad_norm": 0.5576704740524292,
"learning_rate": 3.7049707123773345e-05,
"loss": 0.343,
"step": 112400
},
{
"epoch": 0.25956602220833974,
"grad_norm": 0.28931859135627747,
"learning_rate": 3.702665507917225e-05,
"loss": 0.3759,
"step": 112600
},
{
"epoch": 0.26002706310036167,
"grad_norm": 0.3293726146221161,
"learning_rate": 3.7003603034571156e-05,
"loss": 0.3628,
"step": 112800
},
{
"epoch": 0.2604881039923836,
"grad_norm": 0.41042861342430115,
"learning_rate": 3.698055098997006e-05,
"loss": 0.4341,
"step": 113000
},
{
"epoch": 0.2609491448844055,
"grad_norm": 0.39789122343063354,
"learning_rate": 3.695749894536896e-05,
"loss": 0.3487,
"step": 113200
},
{
"epoch": 0.26141018577642744,
"grad_norm": 0.35746097564697266,
"learning_rate": 3.6934446900767866e-05,
"loss": 0.3373,
"step": 113400
},
{
"epoch": 0.26187122666844936,
"grad_norm": 0.36856091022491455,
"learning_rate": 3.691139485616677e-05,
"loss": 0.3654,
"step": 113600
},
{
"epoch": 0.2623322675604713,
"grad_norm": 0.4984491467475891,
"learning_rate": 3.688834281156567e-05,
"loss": 0.3868,
"step": 113800
},
{
"epoch": 0.2627933084524932,
"grad_norm": 0.3570007085800171,
"learning_rate": 3.6865290766964576e-05,
"loss": 0.3884,
"step": 114000
},
{
"epoch": 0.26325434934451514,
"grad_norm": 0.6426229476928711,
"learning_rate": 3.684223872236348e-05,
"loss": 0.3609,
"step": 114200
},
{
"epoch": 0.263715390236537,
"grad_norm": 0.8099267482757568,
"learning_rate": 3.681918667776239e-05,
"loss": 0.3449,
"step": 114400
},
{
"epoch": 0.26417643112855893,
"grad_norm": 0.2700574994087219,
"learning_rate": 3.679613463316129e-05,
"loss": 0.365,
"step": 114600
},
{
"epoch": 0.26463747202058086,
"grad_norm": 0.5823246836662292,
"learning_rate": 3.677308258856019e-05,
"loss": 0.3602,
"step": 114800
},
{
"epoch": 0.2650985129126028,
"grad_norm": 0.6081487536430359,
"learning_rate": 3.67500305439591e-05,
"loss": 0.3636,
"step": 115000
},
{
"epoch": 0.2655595538046247,
"grad_norm": 0.54152512550354,
"learning_rate": 3.6726978499358e-05,
"loss": 0.3867,
"step": 115200
},
{
"epoch": 0.26602059469664663,
"grad_norm": 0.4905381500720978,
"learning_rate": 3.67039264547569e-05,
"loss": 0.3979,
"step": 115400
},
{
"epoch": 0.26648163558866855,
"grad_norm": 0.5496036410331726,
"learning_rate": 3.6680874410155815e-05,
"loss": 0.3633,
"step": 115600
},
{
"epoch": 0.2669426764806905,
"grad_norm": 0.3739512264728546,
"learning_rate": 3.6657937625777724e-05,
"loss": 0.3696,
"step": 115800
},
{
"epoch": 0.26740371737271235,
"grad_norm": 0.5083029866218567,
"learning_rate": 3.663488558117662e-05,
"loss": 0.3503,
"step": 116000
},
{
"epoch": 0.26786475826473427,
"grad_norm": 0.3220144510269165,
"learning_rate": 3.661183353657553e-05,
"loss": 0.3357,
"step": 116200
},
{
"epoch": 0.2683257991567562,
"grad_norm": 0.4314993917942047,
"learning_rate": 3.6588781491974434e-05,
"loss": 0.3882,
"step": 116400
},
{
"epoch": 0.2687868400487781,
"grad_norm": 0.3649137318134308,
"learning_rate": 3.656572944737334e-05,
"loss": 0.3614,
"step": 116600
},
{
"epoch": 0.26924788094080004,
"grad_norm": 0.3894297182559967,
"learning_rate": 3.654267740277224e-05,
"loss": 0.397,
"step": 116800
},
{
"epoch": 0.26970892183282197,
"grad_norm": 0.5946565270423889,
"learning_rate": 3.651974061839415e-05,
"loss": 0.3693,
"step": 117000
},
{
"epoch": 0.2701699627248439,
"grad_norm": 0.4491449296474457,
"learning_rate": 3.6496688573793054e-05,
"loss": 0.3515,
"step": 117200
},
{
"epoch": 0.2706310036168658,
"grad_norm": 0.6024273037910461,
"learning_rate": 3.647363652919196e-05,
"loss": 0.3783,
"step": 117400
},
{
"epoch": 0.27109204450888774,
"grad_norm": 0.7367308735847473,
"learning_rate": 3.645058448459086e-05,
"loss": 0.3341,
"step": 117600
},
{
"epoch": 0.2715530854009096,
"grad_norm": 0.33618679642677307,
"learning_rate": 3.6427532439989764e-05,
"loss": 0.3715,
"step": 117800
},
{
"epoch": 0.27201412629293154,
"grad_norm": 0.5849551558494568,
"learning_rate": 3.640448039538867e-05,
"loss": 0.3448,
"step": 118000
},
{
"epoch": 0.27247516718495346,
"grad_norm": 0.6120061278343201,
"learning_rate": 3.6381428350787576e-05,
"loss": 0.3821,
"step": 118200
},
{
"epoch": 0.2729362080769754,
"grad_norm": 0.5839222073554993,
"learning_rate": 3.635837630618648e-05,
"loss": 0.3572,
"step": 118400
},
{
"epoch": 0.2733972489689973,
"grad_norm": 0.6287630200386047,
"learning_rate": 3.633532426158539e-05,
"loss": 0.3998,
"step": 118600
},
{
"epoch": 0.27385828986101923,
"grad_norm": 0.5530860424041748,
"learning_rate": 3.6312272216984286e-05,
"loss": 0.3421,
"step": 118800
},
{
"epoch": 0.27431933075304116,
"grad_norm": 0.2004370242357254,
"learning_rate": 3.628922017238319e-05,
"loss": 0.4035,
"step": 119000
},
{
"epoch": 0.2747803716450631,
"grad_norm": 0.2633645534515381,
"learning_rate": 3.62662833880051e-05,
"loss": 0.3639,
"step": 119200
},
{
"epoch": 0.27524141253708495,
"grad_norm": 0.24792851507663727,
"learning_rate": 3.6243231343404006e-05,
"loss": 0.3699,
"step": 119400
},
{
"epoch": 0.2757024534291069,
"grad_norm": 0.4882837235927582,
"learning_rate": 3.622017929880291e-05,
"loss": 0.3675,
"step": 119600
},
{
"epoch": 0.2761634943211288,
"grad_norm": 0.17261354625225067,
"learning_rate": 3.619712725420181e-05,
"loss": 0.3776,
"step": 119800
},
{
"epoch": 0.2766245352131507,
"grad_norm": 0.32434573769569397,
"learning_rate": 3.6174075209600717e-05,
"loss": 0.362,
"step": 120000
},
{
"epoch": 0.27708557610517265,
"grad_norm": 0.33273622393608093,
"learning_rate": 3.615102316499962e-05,
"loss": 0.3278,
"step": 120200
},
{
"epoch": 0.2775466169971946,
"grad_norm": 0.4601978063583374,
"learning_rate": 3.612797112039852e-05,
"loss": 0.3419,
"step": 120400
},
{
"epoch": 0.2780076578892165,
"grad_norm": 0.1586538702249527,
"learning_rate": 3.610491907579743e-05,
"loss": 0.3518,
"step": 120600
},
{
"epoch": 0.2784686987812384,
"grad_norm": 0.26959162950515747,
"learning_rate": 3.608198229141934e-05,
"loss": 0.4276,
"step": 120800
},
{
"epoch": 0.27892973967326035,
"grad_norm": 0.32289379835128784,
"learning_rate": 3.605893024681825e-05,
"loss": 0.3759,
"step": 121000
},
{
"epoch": 0.2793907805652822,
"grad_norm": 0.30840426683425903,
"learning_rate": 3.603587820221715e-05,
"loss": 0.3508,
"step": 121200
},
{
"epoch": 0.27985182145730414,
"grad_norm": 0.31268706917762756,
"learning_rate": 3.601282615761605e-05,
"loss": 0.366,
"step": 121400
},
{
"epoch": 0.28031286234932606,
"grad_norm": 0.4846327602863312,
"learning_rate": 3.598977411301496e-05,
"loss": 0.381,
"step": 121600
},
{
"epoch": 0.280773903241348,
"grad_norm": 0.2978130877017975,
"learning_rate": 3.596672206841386e-05,
"loss": 0.3774,
"step": 121800
},
{
"epoch": 0.2812349441333699,
"grad_norm": 0.2098592072725296,
"learning_rate": 3.594367002381276e-05,
"loss": 0.4139,
"step": 122000
},
{
"epoch": 0.28169598502539184,
"grad_norm": 0.7932277917861938,
"learning_rate": 3.592061797921167e-05,
"loss": 0.3784,
"step": 122200
},
{
"epoch": 0.28215702591741376,
"grad_norm": 0.38202640414237976,
"learning_rate": 3.5897565934610575e-05,
"loss": 0.3619,
"step": 122400
},
{
"epoch": 0.2826180668094357,
"grad_norm": 0.721820056438446,
"learning_rate": 3.5874513890009474e-05,
"loss": 0.3554,
"step": 122600
},
{
"epoch": 0.28307910770145756,
"grad_norm": 0.2776962220668793,
"learning_rate": 3.585146184540838e-05,
"loss": 0.3506,
"step": 122800
},
{
"epoch": 0.2835401485934795,
"grad_norm": 0.2675781548023224,
"learning_rate": 3.5828409800807285e-05,
"loss": 0.4022,
"step": 123000
},
{
"epoch": 0.2840011894855014,
"grad_norm": 0.33172500133514404,
"learning_rate": 3.5805357756206184e-05,
"loss": 0.3806,
"step": 123200
},
{
"epoch": 0.28446223037752333,
"grad_norm": 0.8561096787452698,
"learning_rate": 3.578230571160509e-05,
"loss": 0.3721,
"step": 123400
},
{
"epoch": 0.28492327126954525,
"grad_norm": 0.36494210362434387,
"learning_rate": 3.5759253667003995e-05,
"loss": 0.3801,
"step": 123600
},
{
"epoch": 0.2853843121615672,
"grad_norm": 0.19472463428974152,
"learning_rate": 3.57362016224029e-05,
"loss": 0.3504,
"step": 123800
},
{
"epoch": 0.2858453530535891,
"grad_norm": 0.25896406173706055,
"learning_rate": 3.57131495778018e-05,
"loss": 0.4038,
"step": 124000
},
{
"epoch": 0.286306393945611,
"grad_norm": 0.19645366072654724,
"learning_rate": 3.569009753320071e-05,
"loss": 0.3839,
"step": 124200
},
{
"epoch": 0.28676743483763295,
"grad_norm": 0.3399136960506439,
"learning_rate": 3.566704548859962e-05,
"loss": 0.4061,
"step": 124400
},
{
"epoch": 0.2872284757296548,
"grad_norm": 0.44564691185951233,
"learning_rate": 3.564399344399852e-05,
"loss": 0.3816,
"step": 124600
},
{
"epoch": 0.28768951662167674,
"grad_norm": 0.360584020614624,
"learning_rate": 3.562094139939742e-05,
"loss": 0.3855,
"step": 124800
},
{
"epoch": 0.28815055751369867,
"grad_norm": 0.5581395030021667,
"learning_rate": 3.559788935479633e-05,
"loss": 0.3838,
"step": 125000
},
{
"epoch": 0.2886115984057206,
"grad_norm": 0.3356562554836273,
"learning_rate": 3.557483731019523e-05,
"loss": 0.3627,
"step": 125200
},
{
"epoch": 0.2890726392977425,
"grad_norm": 0.5357953906059265,
"learning_rate": 3.555178526559413e-05,
"loss": 0.3621,
"step": 125400
},
{
"epoch": 0.28953368018976444,
"grad_norm": 0.917598307132721,
"learning_rate": 3.552873322099304e-05,
"loss": 0.3559,
"step": 125600
},
{
"epoch": 0.28999472108178637,
"grad_norm": 0.41881221532821655,
"learning_rate": 3.5505681176391944e-05,
"loss": 0.3691,
"step": 125800
},
{
"epoch": 0.2904557619738083,
"grad_norm": 0.19681178033351898,
"learning_rate": 3.548262913179084e-05,
"loss": 0.3906,
"step": 126000
},
{
"epoch": 0.29091680286583016,
"grad_norm": 0.39573216438293457,
"learning_rate": 3.545957708718975e-05,
"loss": 0.3128,
"step": 126200
},
{
"epoch": 0.2913778437578521,
"grad_norm": 0.6583923697471619,
"learning_rate": 3.5436525042588654e-05,
"loss": 0.3515,
"step": 126400
},
{
"epoch": 0.291838884649874,
"grad_norm": 0.7501808404922485,
"learning_rate": 3.541347299798756e-05,
"loss": 0.3552,
"step": 126600
},
{
"epoch": 0.29229992554189593,
"grad_norm": 0.5151230692863464,
"learning_rate": 3.539042095338646e-05,
"loss": 0.3594,
"step": 126800
},
{
"epoch": 0.29276096643391786,
"grad_norm": 1.6434541940689087,
"learning_rate": 3.5367368908785365e-05,
"loss": 0.3525,
"step": 127000
},
{
"epoch": 0.2932220073259398,
"grad_norm": 0.5371947288513184,
"learning_rate": 3.534443212440728e-05,
"loss": 0.3204,
"step": 127200
},
{
"epoch": 0.2936830482179617,
"grad_norm": 0.5988975763320923,
"learning_rate": 3.532138007980618e-05,
"loss": 0.3895,
"step": 127400
},
{
"epoch": 0.29414408910998363,
"grad_norm": 0.6697775721549988,
"learning_rate": 3.529844329542809e-05,
"loss": 0.3967,
"step": 127600
},
{
"epoch": 0.29460513000200556,
"grad_norm": 0.5715062618255615,
"learning_rate": 3.5275391250826994e-05,
"loss": 0.3771,
"step": 127800
},
{
"epoch": 0.2950661708940274,
"grad_norm": 0.5021243691444397,
"learning_rate": 3.52523392062259e-05,
"loss": 0.3299,
"step": 128000
},
{
"epoch": 0.29552721178604935,
"grad_norm": 0.3863165080547333,
"learning_rate": 3.52292871616248e-05,
"loss": 0.3812,
"step": 128200
},
{
"epoch": 0.2959882526780713,
"grad_norm": 0.5982155799865723,
"learning_rate": 3.5206235117023705e-05,
"loss": 0.3938,
"step": 128400
},
{
"epoch": 0.2964492935700932,
"grad_norm": 0.2971329092979431,
"learning_rate": 3.5183298332645614e-05,
"loss": 0.3658,
"step": 128600
},
{
"epoch": 0.2969103344621151,
"grad_norm": 0.4200974702835083,
"learning_rate": 3.516024628804452e-05,
"loss": 0.3504,
"step": 128800
},
{
"epoch": 0.29737137535413705,
"grad_norm": 0.3119615316390991,
"learning_rate": 3.5137194243443425e-05,
"loss": 0.4029,
"step": 129000
},
{
"epoch": 0.29783241624615897,
"grad_norm": 0.4038570523262024,
"learning_rate": 3.5114142198842324e-05,
"loss": 0.3641,
"step": 129200
},
{
"epoch": 0.2982934571381809,
"grad_norm": 0.42492878437042236,
"learning_rate": 3.509109015424123e-05,
"loss": 0.3451,
"step": 129400
},
{
"epoch": 0.29875449803020276,
"grad_norm": 0.29803556203842163,
"learning_rate": 3.506803810964014e-05,
"loss": 0.3291,
"step": 129600
},
{
"epoch": 0.2992155389222247,
"grad_norm": 0.3618007004261017,
"learning_rate": 3.504498606503904e-05,
"loss": 0.3792,
"step": 129800
},
{
"epoch": 0.2996765798142466,
"grad_norm": 0.4154590666294098,
"learning_rate": 3.502193402043795e-05,
"loss": 0.3735,
"step": 130000
},
{
"epoch": 0.30013762070626854,
"grad_norm": 0.46263667941093445,
"learning_rate": 3.499888197583685e-05,
"loss": 0.3744,
"step": 130200
},
{
"epoch": 0.30059866159829046,
"grad_norm": 0.304043710231781,
"learning_rate": 3.497582993123575e-05,
"loss": 0.3492,
"step": 130400
},
{
"epoch": 0.3010597024903124,
"grad_norm": 0.2621666193008423,
"learning_rate": 3.495277788663466e-05,
"loss": 0.3646,
"step": 130600
},
{
"epoch": 0.3015207433823343,
"grad_norm": 0.2853315770626068,
"learning_rate": 3.492972584203356e-05,
"loss": 0.3668,
"step": 130800
},
{
"epoch": 0.30198178427435624,
"grad_norm": 0.5060180425643921,
"learning_rate": 3.490667379743247e-05,
"loss": 0.3556,
"step": 131000
},
{
"epoch": 0.3024428251663781,
"grad_norm": 0.3390871286392212,
"learning_rate": 3.488362175283137e-05,
"loss": 0.3362,
"step": 131200
},
{
"epoch": 0.30290386605840003,
"grad_norm": 0.24981509149074554,
"learning_rate": 3.486056970823027e-05,
"loss": 0.3824,
"step": 131400
},
{
"epoch": 0.30336490695042195,
"grad_norm": 0.5607753992080688,
"learning_rate": 3.483751766362918e-05,
"loss": 0.3819,
"step": 131600
},
{
"epoch": 0.3038259478424439,
"grad_norm": 0.42747706174850464,
"learning_rate": 3.481446561902808e-05,
"loss": 0.3431,
"step": 131800
},
{
"epoch": 0.3042869887344658,
"grad_norm": 0.3314417600631714,
"learning_rate": 3.479141357442698e-05,
"loss": 0.3728,
"step": 132000
},
{
"epoch": 0.3047480296264877,
"grad_norm": 0.18589608371257782,
"learning_rate": 3.476836152982589e-05,
"loss": 0.3517,
"step": 132200
},
{
"epoch": 0.30520907051850965,
"grad_norm": 0.8516743183135986,
"learning_rate": 3.4745309485224795e-05,
"loss": 0.3371,
"step": 132400
},
{
"epoch": 0.3056701114105316,
"grad_norm": 0.3326774835586548,
"learning_rate": 3.4722257440623693e-05,
"loss": 0.3634,
"step": 132600
},
{
"epoch": 0.3061311523025535,
"grad_norm": 0.33257177472114563,
"learning_rate": 3.46992053960226e-05,
"loss": 0.3877,
"step": 132800
},
{
"epoch": 0.30659219319457537,
"grad_norm": 0.30312174558639526,
"learning_rate": 3.467615335142151e-05,
"loss": 0.3876,
"step": 133000
},
{
"epoch": 0.3070532340865973,
"grad_norm": 0.6828961968421936,
"learning_rate": 3.465310130682041e-05,
"loss": 0.3678,
"step": 133200
},
{
"epoch": 0.3075142749786192,
"grad_norm": 0.807819664478302,
"learning_rate": 3.4630049262219316e-05,
"loss": 0.3931,
"step": 133400
},
{
"epoch": 0.30797531587064114,
"grad_norm": 0.4881526529788971,
"learning_rate": 3.460699721761822e-05,
"loss": 0.359,
"step": 133600
},
{
"epoch": 0.30843635676266307,
"grad_norm": 0.3525223135948181,
"learning_rate": 3.458394517301712e-05,
"loss": 0.3485,
"step": 133800
},
{
"epoch": 0.308897397654685,
"grad_norm": 0.585360586643219,
"learning_rate": 3.4560893128416026e-05,
"loss": 0.3501,
"step": 134000
},
{
"epoch": 0.3093584385467069,
"grad_norm": 0.6815674304962158,
"learning_rate": 3.4537956344037936e-05,
"loss": 0.3918,
"step": 134200
},
{
"epoch": 0.30981947943872884,
"grad_norm": 0.3282436728477478,
"learning_rate": 3.451490429943684e-05,
"loss": 0.3809,
"step": 134400
},
{
"epoch": 0.3102805203307507,
"grad_norm": 0.7657988667488098,
"learning_rate": 3.449185225483575e-05,
"loss": 0.3542,
"step": 134600
},
{
"epoch": 0.31074156122277263,
"grad_norm": 0.10266648232936859,
"learning_rate": 3.4468800210234646e-05,
"loss": 0.3286,
"step": 134800
},
{
"epoch": 0.31120260211479456,
"grad_norm": 0.7599309682846069,
"learning_rate": 3.444574816563355e-05,
"loss": 0.4035,
"step": 135000
},
{
"epoch": 0.3116636430068165,
"grad_norm": 0.7811179757118225,
"learning_rate": 3.442269612103246e-05,
"loss": 0.3777,
"step": 135200
},
{
"epoch": 0.3121246838988384,
"grad_norm": 0.32962509989738464,
"learning_rate": 3.4399644076431356e-05,
"loss": 0.3933,
"step": 135400
},
{
"epoch": 0.31258572479086033,
"grad_norm": 0.2695685029029846,
"learning_rate": 3.437659203183026e-05,
"loss": 0.3488,
"step": 135600
},
{
"epoch": 0.31304676568288226,
"grad_norm": 0.19855330884456635,
"learning_rate": 3.435353998722917e-05,
"loss": 0.431,
"step": 135800
},
{
"epoch": 0.3135078065749042,
"grad_norm": 0.3451155424118042,
"learning_rate": 3.433048794262807e-05,
"loss": 0.3484,
"step": 136000
},
{
"epoch": 0.3139688474669261,
"grad_norm": 0.39298340678215027,
"learning_rate": 3.430743589802698e-05,
"loss": 0.3772,
"step": 136200
},
{
"epoch": 0.314429888358948,
"grad_norm": 0.66849684715271,
"learning_rate": 3.428449911364889e-05,
"loss": 0.3694,
"step": 136400
},
{
"epoch": 0.3148909292509699,
"grad_norm": 0.13003210723400116,
"learning_rate": 3.4261447069047794e-05,
"loss": 0.3615,
"step": 136600
},
{
"epoch": 0.3153519701429918,
"grad_norm": 0.4612247943878174,
"learning_rate": 3.423839502444669e-05,
"loss": 0.3447,
"step": 136800
},
{
"epoch": 0.31581301103501375,
"grad_norm": 0.5026991963386536,
"learning_rate": 3.42153429798456e-05,
"loss": 0.3431,
"step": 137000
},
{
"epoch": 0.31627405192703567,
"grad_norm": 0.09885114431381226,
"learning_rate": 3.4192290935244504e-05,
"loss": 0.3644,
"step": 137200
},
{
"epoch": 0.3167350928190576,
"grad_norm": 0.5941045880317688,
"learning_rate": 3.416923889064341e-05,
"loss": 0.3834,
"step": 137400
},
{
"epoch": 0.3171961337110795,
"grad_norm": 0.19133034348487854,
"learning_rate": 3.414618684604231e-05,
"loss": 0.4226,
"step": 137600
},
{
"epoch": 0.31765717460310144,
"grad_norm": 0.6926956176757812,
"learning_rate": 3.4123134801441214e-05,
"loss": 0.3627,
"step": 137800
},
{
"epoch": 0.3181182154951233,
"grad_norm": 0.40847301483154297,
"learning_rate": 3.410008275684012e-05,
"loss": 0.378,
"step": 138000
},
{
"epoch": 0.31857925638714524,
"grad_norm": 0.1576453298330307,
"learning_rate": 3.407703071223902e-05,
"loss": 0.3619,
"step": 138200
},
{
"epoch": 0.31904029727916716,
"grad_norm": 0.3131788372993469,
"learning_rate": 3.4053978667637924e-05,
"loss": 0.3627,
"step": 138400
},
{
"epoch": 0.3195013381711891,
"grad_norm": 0.43251073360443115,
"learning_rate": 3.403092662303683e-05,
"loss": 0.3512,
"step": 138600
},
{
"epoch": 0.319962379063211,
"grad_norm": 0.5372440218925476,
"learning_rate": 3.4007874578435736e-05,
"loss": 0.3255,
"step": 138800
},
{
"epoch": 0.32042341995523294,
"grad_norm": 0.12687037885189056,
"learning_rate": 3.398482253383464e-05,
"loss": 0.3604,
"step": 139000
},
{
"epoch": 0.32088446084725486,
"grad_norm": 0.5006986856460571,
"learning_rate": 3.396177048923355e-05,
"loss": 0.3645,
"step": 139200
},
{
"epoch": 0.3213455017392768,
"grad_norm": 0.21529662609100342,
"learning_rate": 3.393871844463245e-05,
"loss": 0.3547,
"step": 139400
},
{
"epoch": 0.3218065426312987,
"grad_norm": 0.29573652148246765,
"learning_rate": 3.391566640003135e-05,
"loss": 0.4053,
"step": 139600
},
{
"epoch": 0.3222675835233206,
"grad_norm": 2.115875720977783,
"learning_rate": 3.389261435543026e-05,
"loss": 0.362,
"step": 139800
},
{
"epoch": 0.3227286244153425,
"grad_norm": 0.45856016874313354,
"learning_rate": 3.386956231082916e-05,
"loss": 0.3758,
"step": 140000
},
{
"epoch": 0.3231896653073644,
"grad_norm": 0.5273020267486572,
"learning_rate": 3.384651026622806e-05,
"loss": 0.3999,
"step": 140200
},
{
"epoch": 0.32365070619938635,
"grad_norm": 0.3578619360923767,
"learning_rate": 3.382345822162697e-05,
"loss": 0.3878,
"step": 140400
},
{
"epoch": 0.3241117470914083,
"grad_norm": 1.3471113443374634,
"learning_rate": 3.380040617702587e-05,
"loss": 0.388,
"step": 140600
},
{
"epoch": 0.3245727879834302,
"grad_norm": 0.6462660431861877,
"learning_rate": 3.377746939264778e-05,
"loss": 0.3548,
"step": 140800
},
{
"epoch": 0.3250338288754521,
"grad_norm": 0.35268470644950867,
"learning_rate": 3.375441734804669e-05,
"loss": 0.3416,
"step": 141000
},
{
"epoch": 0.32549486976747405,
"grad_norm": 0.550477921962738,
"learning_rate": 3.373136530344559e-05,
"loss": 0.4072,
"step": 141200
},
{
"epoch": 0.3259559106594959,
"grad_norm": 0.4942178726196289,
"learning_rate": 3.370831325884449e-05,
"loss": 0.3698,
"step": 141400
},
{
"epoch": 0.32641695155151784,
"grad_norm": 0.5894572734832764,
"learning_rate": 3.36852612142434e-05,
"loss": 0.3597,
"step": 141600
},
{
"epoch": 0.32687799244353977,
"grad_norm": 0.3127411901950836,
"learning_rate": 3.366232442986531e-05,
"loss": 0.3811,
"step": 141800
},
{
"epoch": 0.3273390333355617,
"grad_norm": 0.13163967430591583,
"learning_rate": 3.363927238526421e-05,
"loss": 0.3539,
"step": 142000
},
{
"epoch": 0.3278000742275836,
"grad_norm": 0.2200063318014145,
"learning_rate": 3.361622034066312e-05,
"loss": 0.3721,
"step": 142200
},
{
"epoch": 0.32826111511960554,
"grad_norm": 0.29487523436546326,
"learning_rate": 3.3593168296062025e-05,
"loss": 0.3633,
"step": 142400
},
{
"epoch": 0.32872215601162746,
"grad_norm": 1.0105232000350952,
"learning_rate": 3.3570116251460924e-05,
"loss": 0.3418,
"step": 142600
},
{
"epoch": 0.3291831969036494,
"grad_norm": 0.7691797614097595,
"learning_rate": 3.354706420685983e-05,
"loss": 0.3391,
"step": 142800
},
{
"epoch": 0.3296442377956713,
"grad_norm": 0.39732518792152405,
"learning_rate": 3.3524012162258735e-05,
"loss": 0.3645,
"step": 143000
},
{
"epoch": 0.3301052786876932,
"grad_norm": 0.803773820400238,
"learning_rate": 3.3500960117657634e-05,
"loss": 0.393,
"step": 143200
},
{
"epoch": 0.3305663195797151,
"grad_norm": 0.6117516160011292,
"learning_rate": 3.347802333327954e-05,
"loss": 0.3921,
"step": 143400
},
{
"epoch": 0.33102736047173703,
"grad_norm": 0.44789832830429077,
"learning_rate": 3.345497128867845e-05,
"loss": 0.3448,
"step": 143600
},
{
"epoch": 0.33148840136375896,
"grad_norm": 0.2916577458381653,
"learning_rate": 3.3431919244077354e-05,
"loss": 0.3646,
"step": 143800
},
{
"epoch": 0.3319494422557809,
"grad_norm": 0.40400460362434387,
"learning_rate": 3.340886719947626e-05,
"loss": 0.3671,
"step": 144000
},
{
"epoch": 0.3324104831478028,
"grad_norm": 0.3641209602355957,
"learning_rate": 3.338581515487516e-05,
"loss": 0.3441,
"step": 144200
},
{
"epoch": 0.33287152403982473,
"grad_norm": 0.2829863131046295,
"learning_rate": 3.336276311027407e-05,
"loss": 0.3808,
"step": 144400
},
{
"epoch": 0.33333256493184665,
"grad_norm": 0.5035982728004456,
"learning_rate": 3.333971106567298e-05,
"loss": 0.3448,
"step": 144600
},
{
"epoch": 0.3337936058238685,
"grad_norm": 0.5748453140258789,
"learning_rate": 3.3316659021071876e-05,
"loss": 0.3579,
"step": 144800
},
{
"epoch": 0.33425464671589045,
"grad_norm": 0.2809416949748993,
"learning_rate": 3.329360697647078e-05,
"loss": 0.3717,
"step": 145000
},
{
"epoch": 0.33471568760791237,
"grad_norm": 0.6484798192977905,
"learning_rate": 3.327055493186969e-05,
"loss": 0.3724,
"step": 145200
},
{
"epoch": 0.3351767284999343,
"grad_norm": 0.9448397755622864,
"learning_rate": 3.3247502887268586e-05,
"loss": 0.3637,
"step": 145400
},
{
"epoch": 0.3356377693919562,
"grad_norm": 0.6135724782943726,
"learning_rate": 3.322445084266749e-05,
"loss": 0.3346,
"step": 145600
},
{
"epoch": 0.33609881028397814,
"grad_norm": 0.6418340802192688,
"learning_rate": 3.32013987980664e-05,
"loss": 0.4078,
"step": 145800
},
{
"epoch": 0.33655985117600007,
"grad_norm": 0.3337765038013458,
"learning_rate": 3.31783467534653e-05,
"loss": 0.3822,
"step": 146000
},
{
"epoch": 0.337020892068022,
"grad_norm": 0.5620834231376648,
"learning_rate": 3.31552947088642e-05,
"loss": 0.3983,
"step": 146200
},
{
"epoch": 0.3374819329600439,
"grad_norm": 0.48074471950531006,
"learning_rate": 3.313224266426311e-05,
"loss": 0.3393,
"step": 146400
},
{
"epoch": 0.3379429738520658,
"grad_norm": 0.4533754289150238,
"learning_rate": 3.3109190619662014e-05,
"loss": 0.4026,
"step": 146600
},
{
"epoch": 0.3384040147440877,
"grad_norm": 0.26355665922164917,
"learning_rate": 3.308613857506091e-05,
"loss": 0.3575,
"step": 146800
},
{
"epoch": 0.33886505563610964,
"grad_norm": 0.518338680267334,
"learning_rate": 3.306308653045982e-05,
"loss": 0.3459,
"step": 147000
},
{
"epoch": 0.33932609652813156,
"grad_norm": 0.42096418142318726,
"learning_rate": 3.3040034485858724e-05,
"loss": 0.3495,
"step": 147200
},
{
"epoch": 0.3397871374201535,
"grad_norm": 0.49113744497299194,
"learning_rate": 3.301698244125763e-05,
"loss": 0.3588,
"step": 147400
},
{
"epoch": 0.3402481783121754,
"grad_norm": 0.7098137736320496,
"learning_rate": 3.299393039665653e-05,
"loss": 0.3575,
"step": 147600
},
{
"epoch": 0.34070921920419733,
"grad_norm": 0.2632584869861603,
"learning_rate": 3.297087835205544e-05,
"loss": 0.3423,
"step": 147800
},
{
"epoch": 0.34117026009621926,
"grad_norm": 0.30534541606903076,
"learning_rate": 3.2947826307454346e-05,
"loss": 0.3698,
"step": 148000
},
{
"epoch": 0.3416313009882411,
"grad_norm": 0.528553307056427,
"learning_rate": 3.2924774262853245e-05,
"loss": 0.3642,
"step": 148200
},
{
"epoch": 0.34209234188026305,
"grad_norm": 0.760848879814148,
"learning_rate": 3.290172221825215e-05,
"loss": 0.3216,
"step": 148400
},
{
"epoch": 0.342553382772285,
"grad_norm": 0.30855801701545715,
"learning_rate": 3.287867017365106e-05,
"loss": 0.3593,
"step": 148600
},
{
"epoch": 0.3430144236643069,
"grad_norm": 0.4254257082939148,
"learning_rate": 3.2855618129049956e-05,
"loss": 0.3681,
"step": 148800
},
{
"epoch": 0.3434754645563288,
"grad_norm": 0.08992951363325119,
"learning_rate": 3.2832681344671865e-05,
"loss": 0.3681,
"step": 149000
},
{
"epoch": 0.34393650544835075,
"grad_norm": 0.2630908489227295,
"learning_rate": 3.280962930007077e-05,
"loss": 0.3939,
"step": 149200
},
{
"epoch": 0.3443975463403727,
"grad_norm": 0.4658574163913727,
"learning_rate": 3.2786577255469676e-05,
"loss": 0.3763,
"step": 149400
},
{
"epoch": 0.3448585872323946,
"grad_norm": 0.5836665630340576,
"learning_rate": 3.276352521086858e-05,
"loss": 0.3489,
"step": 149600
},
{
"epoch": 0.3453196281244165,
"grad_norm": 0.21611973643302917,
"learning_rate": 3.274047316626748e-05,
"loss": 0.3759,
"step": 149800
},
{
"epoch": 0.3457806690164384,
"grad_norm": 0.41169923543930054,
"learning_rate": 3.2717421121666386e-05,
"loss": 0.3499,
"step": 150000
},
{
"epoch": 0.3457806690164384,
"eval_loss": 0.37137243151664734,
"eval_runtime": 221.1568,
"eval_samples_per_second": 19.814,
"eval_steps_per_second": 19.814,
"step": 150000
},
{
"epoch": 0.3462417099084603,
"grad_norm": 0.4313965439796448,
"learning_rate": 3.269436907706529e-05,
"loss": 0.3839,
"step": 150200
},
{
"epoch": 0.34670275080048224,
"grad_norm": 0.7531281113624573,
"learning_rate": 3.267131703246419e-05,
"loss": 0.4073,
"step": 150400
},
{
"epoch": 0.34716379169250416,
"grad_norm": 0.3439179062843323,
"learning_rate": 3.26482649878631e-05,
"loss": 0.3944,
"step": 150600
},
{
"epoch": 0.3476248325845261,
"grad_norm": 0.2340434342622757,
"learning_rate": 3.262521294326201e-05,
"loss": 0.3425,
"step": 150800
},
{
"epoch": 0.348085873476548,
"grad_norm": 0.34943705797195435,
"learning_rate": 3.260216089866091e-05,
"loss": 0.3645,
"step": 151000
},
{
"epoch": 0.34854691436856994,
"grad_norm": 0.2850348651409149,
"learning_rate": 3.2579108854059814e-05,
"loss": 0.3706,
"step": 151200
},
{
"epoch": 0.34900795526059186,
"grad_norm": 0.47787681221961975,
"learning_rate": 3.255605680945872e-05,
"loss": 0.3715,
"step": 151400
},
{
"epoch": 0.34946899615261373,
"grad_norm": 0.5039366483688354,
"learning_rate": 3.2533004764857625e-05,
"loss": 0.3512,
"step": 151600
},
{
"epoch": 0.34993003704463566,
"grad_norm": 0.6152231693267822,
"learning_rate": 3.251006798047953e-05,
"loss": 0.3988,
"step": 151800
},
{
"epoch": 0.3503910779366576,
"grad_norm": 0.4916805326938629,
"learning_rate": 3.248701593587843e-05,
"loss": 0.3704,
"step": 152000
},
{
"epoch": 0.3508521188286795,
"grad_norm": 0.4728657007217407,
"learning_rate": 3.246396389127734e-05,
"loss": 0.3847,
"step": 152200
},
{
"epoch": 0.35131315972070143,
"grad_norm": 0.7450031638145447,
"learning_rate": 3.2440911846676245e-05,
"loss": 0.3593,
"step": 152400
},
{
"epoch": 0.35177420061272335,
"grad_norm": 0.41172704100608826,
"learning_rate": 3.2417859802075143e-05,
"loss": 0.354,
"step": 152600
},
{
"epoch": 0.3522352415047453,
"grad_norm": 1.0868451595306396,
"learning_rate": 3.239480775747405e-05,
"loss": 0.3515,
"step": 152800
},
{
"epoch": 0.3526962823967672,
"grad_norm": 0.38387322425842285,
"learning_rate": 3.2371755712872955e-05,
"loss": 0.3977,
"step": 153000
},
{
"epoch": 0.3531573232887891,
"grad_norm": 1.5377986431121826,
"learning_rate": 3.2348703668271854e-05,
"loss": 0.3565,
"step": 153200
},
{
"epoch": 0.353618364180811,
"grad_norm": 0.21696561574935913,
"learning_rate": 3.232565162367076e-05,
"loss": 0.3571,
"step": 153400
},
{
"epoch": 0.3540794050728329,
"grad_norm": 0.45013427734375,
"learning_rate": 3.2302599579069665e-05,
"loss": 0.3321,
"step": 153600
},
{
"epoch": 0.35454044596485484,
"grad_norm": 0.25292137265205383,
"learning_rate": 3.227954753446857e-05,
"loss": 0.3819,
"step": 153800
},
{
"epoch": 0.35500148685687677,
"grad_norm": 0.478595495223999,
"learning_rate": 3.2256495489867476e-05,
"loss": 0.3597,
"step": 154000
},
{
"epoch": 0.3554625277488987,
"grad_norm": 0.09058533608913422,
"learning_rate": 3.223344344526638e-05,
"loss": 0.3605,
"step": 154200
},
{
"epoch": 0.3559235686409206,
"grad_norm": 0.6636003851890564,
"learning_rate": 3.221039140066529e-05,
"loss": 0.3523,
"step": 154400
},
{
"epoch": 0.35638460953294254,
"grad_norm": 0.45204317569732666,
"learning_rate": 3.218733935606419e-05,
"loss": 0.3943,
"step": 154600
},
{
"epoch": 0.35684565042496447,
"grad_norm": 0.2915719449520111,
"learning_rate": 3.216428731146309e-05,
"loss": 0.3488,
"step": 154800
},
{
"epoch": 0.35730669131698634,
"grad_norm": 0.5923722386360168,
"learning_rate": 3.2141350527085e-05,
"loss": 0.3838,
"step": 155000
},
{
"epoch": 0.35776773220900826,
"grad_norm": 0.7770951390266418,
"learning_rate": 3.211829848248391e-05,
"loss": 0.3696,
"step": 155200
},
{
"epoch": 0.3582287731010302,
"grad_norm": 0.41361093521118164,
"learning_rate": 3.2095246437882806e-05,
"loss": 0.3501,
"step": 155400
},
{
"epoch": 0.3586898139930521,
"grad_norm": 0.27737417817115784,
"learning_rate": 3.207219439328171e-05,
"loss": 0.4097,
"step": 155600
},
{
"epoch": 0.35915085488507403,
"grad_norm": 0.19973890483379364,
"learning_rate": 3.204914234868062e-05,
"loss": 0.3427,
"step": 155800
},
{
"epoch": 0.35961189577709596,
"grad_norm": 0.4910184144973755,
"learning_rate": 3.202609030407952e-05,
"loss": 0.3669,
"step": 156000
},
{
"epoch": 0.3600729366691179,
"grad_norm": 0.21652765572071075,
"learning_rate": 3.200303825947842e-05,
"loss": 0.3388,
"step": 156200
},
{
"epoch": 0.3605339775611398,
"grad_norm": 0.5381657481193542,
"learning_rate": 3.197998621487733e-05,
"loss": 0.3737,
"step": 156400
},
{
"epoch": 0.3609950184531617,
"grad_norm": 0.2810543477535248,
"learning_rate": 3.195693417027623e-05,
"loss": 0.3726,
"step": 156600
},
{
"epoch": 0.3614560593451836,
"grad_norm": 0.6559444069862366,
"learning_rate": 3.193388212567514e-05,
"loss": 0.3326,
"step": 156800
},
{
"epoch": 0.3619171002372055,
"grad_norm": 0.26379209756851196,
"learning_rate": 3.1910830081074045e-05,
"loss": 0.3866,
"step": 157000
},
{
"epoch": 0.36237814112922745,
"grad_norm": 0.5602028965950012,
"learning_rate": 3.188777803647295e-05,
"loss": 0.3198,
"step": 157200
},
{
"epoch": 0.3628391820212494,
"grad_norm": 0.75434809923172,
"learning_rate": 3.186472599187185e-05,
"loss": 0.381,
"step": 157400
},
{
"epoch": 0.3633002229132713,
"grad_norm": 0.6652121543884277,
"learning_rate": 3.184178920749376e-05,
"loss": 0.3596,
"step": 157600
},
{
"epoch": 0.3637612638052932,
"grad_norm": 0.2597079575061798,
"learning_rate": 3.1818737162892664e-05,
"loss": 0.3435,
"step": 157800
},
{
"epoch": 0.36422230469731515,
"grad_norm": 0.3559524118900299,
"learning_rate": 3.179568511829157e-05,
"loss": 0.3256,
"step": 158000
},
{
"epoch": 0.36468334558933707,
"grad_norm": 0.6930160522460938,
"learning_rate": 3.177263307369047e-05,
"loss": 0.3654,
"step": 158200
},
{
"epoch": 0.36514438648135894,
"grad_norm": 0.292402058839798,
"learning_rate": 3.1749581029089374e-05,
"loss": 0.3639,
"step": 158400
},
{
"epoch": 0.36560542737338086,
"grad_norm": 0.22204717993736267,
"learning_rate": 3.172652898448828e-05,
"loss": 0.3602,
"step": 158600
},
{
"epoch": 0.3660664682654028,
"grad_norm": 0.2245527058839798,
"learning_rate": 3.1703476939887186e-05,
"loss": 0.3404,
"step": 158800
},
{
"epoch": 0.3665275091574247,
"grad_norm": 0.3532883822917938,
"learning_rate": 3.1680424895286085e-05,
"loss": 0.3444,
"step": 159000
},
{
"epoch": 0.36698855004944664,
"grad_norm": 0.4170125126838684,
"learning_rate": 3.165737285068499e-05,
"loss": 0.3648,
"step": 159200
},
{
"epoch": 0.36744959094146856,
"grad_norm": 0.5711910128593445,
"learning_rate": 3.1634320806083896e-05,
"loss": 0.3678,
"step": 159400
},
{
"epoch": 0.3679106318334905,
"grad_norm": 0.588743269443512,
"learning_rate": 3.161138402170581e-05,
"loss": 0.3585,
"step": 159600
},
{
"epoch": 0.3683716727255124,
"grad_norm": 0.568601667881012,
"learning_rate": 3.158833197710471e-05,
"loss": 0.3709,
"step": 159800
},
{
"epoch": 0.3688327136175343,
"grad_norm": 0.3680901825428009,
"learning_rate": 3.1565279932503617e-05,
"loss": 0.3963,
"step": 160000
},
{
"epoch": 0.3692937545095562,
"grad_norm": 0.18371005356311798,
"learning_rate": 3.154222788790252e-05,
"loss": 0.3637,
"step": 160200
},
{
"epoch": 0.36975479540157813,
"grad_norm": 0.40033024549484253,
"learning_rate": 3.151917584330142e-05,
"loss": 0.3469,
"step": 160400
},
{
"epoch": 0.37021583629360005,
"grad_norm": 0.4920560121536255,
"learning_rate": 3.149612379870033e-05,
"loss": 0.3626,
"step": 160600
},
{
"epoch": 0.370676877185622,
"grad_norm": 0.2677069902420044,
"learning_rate": 3.147307175409923e-05,
"loss": 0.3825,
"step": 160800
},
{
"epoch": 0.3711379180776439,
"grad_norm": 0.6608094573020935,
"learning_rate": 3.145001970949814e-05,
"loss": 0.343,
"step": 161000
},
{
"epoch": 0.3715989589696658,
"grad_norm": 0.4441094398498535,
"learning_rate": 3.142696766489704e-05,
"loss": 0.3384,
"step": 161200
},
{
"epoch": 0.37205999986168775,
"grad_norm": 0.5351240634918213,
"learning_rate": 3.140391562029594e-05,
"loss": 0.3675,
"step": 161400
},
{
"epoch": 0.3725210407537097,
"grad_norm": 0.4210915267467499,
"learning_rate": 3.138086357569485e-05,
"loss": 0.3683,
"step": 161600
},
{
"epoch": 0.37298208164573154,
"grad_norm": 0.4051463007926941,
"learning_rate": 3.135781153109375e-05,
"loss": 0.3382,
"step": 161800
},
{
"epoch": 0.37344312253775347,
"grad_norm": 0.39557942748069763,
"learning_rate": 3.1334874746715657e-05,
"loss": 0.3746,
"step": 162000
},
{
"epoch": 0.3739041634297754,
"grad_norm": 0.4568265676498413,
"learning_rate": 3.131182270211457e-05,
"loss": 0.3292,
"step": 162200
},
{
"epoch": 0.3743652043217973,
"grad_norm": 0.635125994682312,
"learning_rate": 3.1288770657513475e-05,
"loss": 0.38,
"step": 162400
},
{
"epoch": 0.37482624521381924,
"grad_norm": 0.3739936351776123,
"learning_rate": 3.1265718612912374e-05,
"loss": 0.3324,
"step": 162600
},
{
"epoch": 0.37528728610584117,
"grad_norm": 0.22976283729076385,
"learning_rate": 3.124266656831128e-05,
"loss": 0.3721,
"step": 162800
},
{
"epoch": 0.3757483269978631,
"grad_norm": 0.5608423352241516,
"learning_rate": 3.1219614523710185e-05,
"loss": 0.3858,
"step": 163000
},
{
"epoch": 0.376209367889885,
"grad_norm": 0.623247504234314,
"learning_rate": 3.1196562479109084e-05,
"loss": 0.3803,
"step": 163200
},
{
"epoch": 0.3766704087819069,
"grad_norm": 0.16723869740962982,
"learning_rate": 3.117351043450799e-05,
"loss": 0.3606,
"step": 163400
},
{
"epoch": 0.3771314496739288,
"grad_norm": 0.6487811803817749,
"learning_rate": 3.1150458389906895e-05,
"loss": 0.3727,
"step": 163600
},
{
"epoch": 0.37759249056595073,
"grad_norm": 0.8133066892623901,
"learning_rate": 3.11274063453058e-05,
"loss": 0.33,
"step": 163800
},
{
"epoch": 0.37805353145797266,
"grad_norm": 0.12610138952732086,
"learning_rate": 3.11043543007047e-05,
"loss": 0.3791,
"step": 164000
},
{
"epoch": 0.3785145723499946,
"grad_norm": 0.2422199547290802,
"learning_rate": 3.108141751632661e-05,
"loss": 0.377,
"step": 164200
},
{
"epoch": 0.3789756132420165,
"grad_norm": 0.5799381136894226,
"learning_rate": 3.1058365471725515e-05,
"loss": 0.3877,
"step": 164400
},
{
"epoch": 0.37943665413403843,
"grad_norm": 0.536668062210083,
"learning_rate": 3.103531342712442e-05,
"loss": 0.3819,
"step": 164600
},
{
"epoch": 0.37989769502606036,
"grad_norm": 0.3266075551509857,
"learning_rate": 3.101226138252332e-05,
"loss": 0.33,
"step": 164800
},
{
"epoch": 0.3803587359180823,
"grad_norm": 1.5646326541900635,
"learning_rate": 3.0989209337922225e-05,
"loss": 0.3804,
"step": 165000
},
{
"epoch": 0.38081977681010415,
"grad_norm": 0.4181995987892151,
"learning_rate": 3.096615729332114e-05,
"loss": 0.3309,
"step": 165200
},
{
"epoch": 0.3812808177021261,
"grad_norm": 1.0578486919403076,
"learning_rate": 3.0943105248720036e-05,
"loss": 0.3788,
"step": 165400
},
{
"epoch": 0.381741858594148,
"grad_norm": 0.1641608625650406,
"learning_rate": 3.092005320411894e-05,
"loss": 0.3626,
"step": 165600
},
{
"epoch": 0.3822028994861699,
"grad_norm": 0.41752827167510986,
"learning_rate": 3.089700115951785e-05,
"loss": 0.352,
"step": 165800
},
{
"epoch": 0.38266394037819185,
"grad_norm": 0.5097513794898987,
"learning_rate": 3.087394911491675e-05,
"loss": 0.3778,
"step": 166000
},
{
"epoch": 0.38312498127021377,
"grad_norm": 0.20745956897735596,
"learning_rate": 3.085089707031565e-05,
"loss": 0.3733,
"step": 166200
},
{
"epoch": 0.3835860221622357,
"grad_norm": 0.5006649494171143,
"learning_rate": 3.082784502571456e-05,
"loss": 0.3526,
"step": 166400
},
{
"epoch": 0.3840470630542576,
"grad_norm": 0.675798773765564,
"learning_rate": 3.0804792981113464e-05,
"loss": 0.3667,
"step": 166600
},
{
"epoch": 0.3845081039462795,
"grad_norm": 0.4306504428386688,
"learning_rate": 3.078174093651236e-05,
"loss": 0.3668,
"step": 166800
},
{
"epoch": 0.3849691448383014,
"grad_norm": 0.5318405032157898,
"learning_rate": 3.075868889191127e-05,
"loss": 0.356,
"step": 167000
},
{
"epoch": 0.38543018573032334,
"grad_norm": 0.3948398232460022,
"learning_rate": 3.0735636847310174e-05,
"loss": 0.3652,
"step": 167200
},
{
"epoch": 0.38589122662234526,
"grad_norm": 0.16420459747314453,
"learning_rate": 3.071258480270908e-05,
"loss": 0.3631,
"step": 167400
},
{
"epoch": 0.3863522675143672,
"grad_norm": 0.20000016689300537,
"learning_rate": 3.068953275810798e-05,
"loss": 0.3692,
"step": 167600
},
{
"epoch": 0.3868133084063891,
"grad_norm": 0.24762466549873352,
"learning_rate": 3.0666480713506884e-05,
"loss": 0.3466,
"step": 167800
},
{
"epoch": 0.38727434929841104,
"grad_norm": 0.4398600161075592,
"learning_rate": 3.064342866890579e-05,
"loss": 0.3402,
"step": 168000
},
{
"epoch": 0.38773539019043296,
"grad_norm": 0.4209122061729431,
"learning_rate": 3.062037662430469e-05,
"loss": 0.3637,
"step": 168200
},
{
"epoch": 0.3881964310824549,
"grad_norm": 0.7803798317909241,
"learning_rate": 3.0597324579703594e-05,
"loss": 0.3975,
"step": 168400
},
{
"epoch": 0.38865747197447675,
"grad_norm": 1.5571372509002686,
"learning_rate": 3.057427253510251e-05,
"loss": 0.3043,
"step": 168600
},
{
"epoch": 0.3891185128664987,
"grad_norm": 0.5619482398033142,
"learning_rate": 3.0551335750724416e-05,
"loss": 0.3588,
"step": 168800
},
{
"epoch": 0.3895795537585206,
"grad_norm": 0.3675914406776428,
"learning_rate": 3.0528283706123315e-05,
"loss": 0.3447,
"step": 169000
},
{
"epoch": 0.3900405946505425,
"grad_norm": 0.634750247001648,
"learning_rate": 3.050523166152222e-05,
"loss": 0.4005,
"step": 169200
},
{
"epoch": 0.39050163554256445,
"grad_norm": 0.22075869143009186,
"learning_rate": 3.0482179616921126e-05,
"loss": 0.3735,
"step": 169400
},
{
"epoch": 0.3909626764345864,
"grad_norm": 0.6765059232711792,
"learning_rate": 3.045912757232003e-05,
"loss": 0.3328,
"step": 169600
},
{
"epoch": 0.3914237173266083,
"grad_norm": 0.4642723500728607,
"learning_rate": 3.043607552771893e-05,
"loss": 0.3533,
"step": 169800
},
{
"epoch": 0.3918847582186302,
"grad_norm": 0.47227638959884644,
"learning_rate": 3.0413023483117836e-05,
"loss": 0.3677,
"step": 170000
},
{
"epoch": 0.3923457991106521,
"grad_norm": 0.4289513826370239,
"learning_rate": 3.038997143851674e-05,
"loss": 0.3872,
"step": 170200
},
{
"epoch": 0.392806840002674,
"grad_norm": 0.28258103132247925,
"learning_rate": 3.0366919393915644e-05,
"loss": 0.3523,
"step": 170400
},
{
"epoch": 0.39326788089469594,
"grad_norm": 0.22584015130996704,
"learning_rate": 3.0343867349314547e-05,
"loss": 0.3155,
"step": 170600
},
{
"epoch": 0.39372892178671787,
"grad_norm": 0.3940613567829132,
"learning_rate": 3.0320815304713452e-05,
"loss": 0.3714,
"step": 170800
},
{
"epoch": 0.3941899626787398,
"grad_norm": 1.565744400024414,
"learning_rate": 3.029787852033536e-05,
"loss": 0.3636,
"step": 171000
},
{
"epoch": 0.3946510035707617,
"grad_norm": 0.2669508457183838,
"learning_rate": 3.027482647573427e-05,
"loss": 0.3792,
"step": 171200
},
{
"epoch": 0.39511204446278364,
"grad_norm": 0.3558444380760193,
"learning_rate": 3.0251774431133173e-05,
"loss": 0.3859,
"step": 171400
},
{
"epoch": 0.39557308535480556,
"grad_norm": 0.44814062118530273,
"learning_rate": 3.0228722386532075e-05,
"loss": 0.3213,
"step": 171600
},
{
"epoch": 0.3960341262468275,
"grad_norm": 0.2663359045982361,
"learning_rate": 3.020567034193098e-05,
"loss": 0.3421,
"step": 171800
},
{
"epoch": 0.39649516713884936,
"grad_norm": 0.20757733285427094,
"learning_rate": 3.0182618297329883e-05,
"loss": 0.3539,
"step": 172000
},
{
"epoch": 0.3969562080308713,
"grad_norm": 0.9609633684158325,
"learning_rate": 3.015956625272879e-05,
"loss": 0.338,
"step": 172200
},
{
"epoch": 0.3974172489228932,
"grad_norm": 0.23215247690677643,
"learning_rate": 3.013651420812769e-05,
"loss": 0.3854,
"step": 172400
},
{
"epoch": 0.39787828981491513,
"grad_norm": 0.16182895004749298,
"learning_rate": 3.0113462163526597e-05,
"loss": 0.3395,
"step": 172600
},
{
"epoch": 0.39833933070693706,
"grad_norm": 0.1874891072511673,
"learning_rate": 3.00904101189255e-05,
"loss": 0.3958,
"step": 172800
},
{
"epoch": 0.398800371598959,
"grad_norm": 0.5180594325065613,
"learning_rate": 3.0067358074324405e-05,
"loss": 0.3845,
"step": 173000
},
{
"epoch": 0.3992614124909809,
"grad_norm": 0.33921676874160767,
"learning_rate": 3.0044306029723307e-05,
"loss": 0.3373,
"step": 173200
},
{
"epoch": 0.39972245338300283,
"grad_norm": 0.3913256525993347,
"learning_rate": 3.002125398512221e-05,
"loss": 0.3362,
"step": 173400
},
{
"epoch": 0.4001834942750247,
"grad_norm": 0.6773241758346558,
"learning_rate": 2.999831720074412e-05,
"loss": 0.3762,
"step": 173600
},
{
"epoch": 0.4006445351670466,
"grad_norm": 0.4255892038345337,
"learning_rate": 2.9975265156143024e-05,
"loss": 0.3851,
"step": 173800
},
{
"epoch": 0.40110557605906855,
"grad_norm": 0.3191074728965759,
"learning_rate": 2.9952213111541927e-05,
"loss": 0.3841,
"step": 174000
},
{
"epoch": 0.40156661695109047,
"grad_norm": 0.48218560218811035,
"learning_rate": 2.9929161066940836e-05,
"loss": 0.3539,
"step": 174200
},
{
"epoch": 0.4020276578431124,
"grad_norm": 0.539185643196106,
"learning_rate": 2.990610902233974e-05,
"loss": 0.371,
"step": 174400
},
{
"epoch": 0.4024886987351343,
"grad_norm": 0.717741072177887,
"learning_rate": 2.9883056977738644e-05,
"loss": 0.3342,
"step": 174600
},
{
"epoch": 0.40294973962715624,
"grad_norm": 0.3354889452457428,
"learning_rate": 2.9860004933137546e-05,
"loss": 0.3453,
"step": 174800
},
{
"epoch": 0.40341078051917817,
"grad_norm": 0.612332284450531,
"learning_rate": 2.983695288853645e-05,
"loss": 0.3595,
"step": 175000
},
{
"epoch": 0.4038718214112001,
"grad_norm": 0.41890302300453186,
"learning_rate": 2.981401610415836e-05,
"loss": 0.3628,
"step": 175200
},
{
"epoch": 0.40433286230322196,
"grad_norm": 0.5085733532905579,
"learning_rate": 2.9790964059557263e-05,
"loss": 0.353,
"step": 175400
},
{
"epoch": 0.4047939031952439,
"grad_norm": 0.38240352272987366,
"learning_rate": 2.976791201495617e-05,
"loss": 0.3728,
"step": 175600
},
{
"epoch": 0.4052549440872658,
"grad_norm": 0.28365951776504517,
"learning_rate": 2.974485997035507e-05,
"loss": 0.3734,
"step": 175800
},
{
"epoch": 0.40571598497928774,
"grad_norm": 0.541234016418457,
"learning_rate": 2.9721807925753977e-05,
"loss": 0.3635,
"step": 176000
},
{
"epoch": 0.40617702587130966,
"grad_norm": 0.46731624007225037,
"learning_rate": 2.969875588115288e-05,
"loss": 0.3766,
"step": 176200
},
{
"epoch": 0.4066380667633316,
"grad_norm": 0.49456965923309326,
"learning_rate": 2.967570383655178e-05,
"loss": 0.3849,
"step": 176400
},
{
"epoch": 0.4070991076553535,
"grad_norm": 0.7649428248405457,
"learning_rate": 2.9652651791950687e-05,
"loss": 0.3723,
"step": 176600
},
{
"epoch": 0.40756014854737543,
"grad_norm": 0.13006378710269928,
"learning_rate": 2.962959974734959e-05,
"loss": 0.3357,
"step": 176800
},
{
"epoch": 0.4080211894393973,
"grad_norm": 0.5410711765289307,
"learning_rate": 2.9606547702748498e-05,
"loss": 0.3545,
"step": 177000
},
{
"epoch": 0.4084822303314192,
"grad_norm": 0.4390261769294739,
"learning_rate": 2.9583495658147404e-05,
"loss": 0.329,
"step": 177200
},
{
"epoch": 0.40894327122344115,
"grad_norm": 0.7517630457878113,
"learning_rate": 2.9560558873769313e-05,
"loss": 0.4301,
"step": 177400
},
{
"epoch": 0.4094043121154631,
"grad_norm": 0.3450630307197571,
"learning_rate": 2.9537506829168215e-05,
"loss": 0.4252,
"step": 177600
},
{
"epoch": 0.409865353007485,
"grad_norm": 0.5781650543212891,
"learning_rate": 2.9514454784567118e-05,
"loss": 0.3579,
"step": 177800
},
{
"epoch": 0.4103263938995069,
"grad_norm": 0.35803350806236267,
"learning_rate": 2.9491402739966023e-05,
"loss": 0.3327,
"step": 178000
},
{
"epoch": 0.41078743479152885,
"grad_norm": 0.25453415513038635,
"learning_rate": 2.9468350695364926e-05,
"loss": 0.3547,
"step": 178200
},
{
"epoch": 0.4112484756835508,
"grad_norm": 0.23303724825382233,
"learning_rate": 2.944529865076383e-05,
"loss": 0.382,
"step": 178400
},
{
"epoch": 0.41170951657557264,
"grad_norm": 0.633512556552887,
"learning_rate": 2.9422246606162734e-05,
"loss": 0.4116,
"step": 178600
},
{
"epoch": 0.41217055746759457,
"grad_norm": 0.8377290368080139,
"learning_rate": 2.939919456156164e-05,
"loss": 0.3229,
"step": 178800
},
{
"epoch": 0.4126315983596165,
"grad_norm": 0.3785347640514374,
"learning_rate": 2.937614251696054e-05,
"loss": 0.3494,
"step": 179000
},
{
"epoch": 0.4130926392516384,
"grad_norm": 0.3271910548210144,
"learning_rate": 2.9353090472359447e-05,
"loss": 0.3765,
"step": 179200
},
{
"epoch": 0.41355368014366034,
"grad_norm": 0.5429534316062927,
"learning_rate": 2.933003842775835e-05,
"loss": 0.405,
"step": 179400
},
{
"epoch": 0.41401472103568226,
"grad_norm": 0.25977814197540283,
"learning_rate": 2.9306986383157252e-05,
"loss": 0.3348,
"step": 179600
},
{
"epoch": 0.4144757619277042,
"grad_norm": 0.8113681077957153,
"learning_rate": 2.9283934338556158e-05,
"loss": 0.3925,
"step": 179800
},
{
"epoch": 0.4149368028197261,
"grad_norm": 0.25824105739593506,
"learning_rate": 2.926099755417807e-05,
"loss": 0.367,
"step": 180000
},
{
"epoch": 0.41539784371174804,
"grad_norm": 0.48763301968574524,
"learning_rate": 2.9237945509576976e-05,
"loss": 0.361,
"step": 180200
},
{
"epoch": 0.4158588846037699,
"grad_norm": 0.7430822849273682,
"learning_rate": 2.9214893464975878e-05,
"loss": 0.3683,
"step": 180400
},
{
"epoch": 0.41631992549579183,
"grad_norm": 0.4560760259628296,
"learning_rate": 2.9191841420374784e-05,
"loss": 0.3855,
"step": 180600
},
{
"epoch": 0.41678096638781376,
"grad_norm": 0.7530708909034729,
"learning_rate": 2.9168789375773686e-05,
"loss": 0.354,
"step": 180800
},
{
"epoch": 0.4172420072798357,
"grad_norm": 8.643664360046387,
"learning_rate": 2.9145737331172592e-05,
"loss": 0.3783,
"step": 181000
},
{
"epoch": 0.4177030481718576,
"grad_norm": 1.6691298484802246,
"learning_rate": 2.9122685286571494e-05,
"loss": 0.3413,
"step": 181200
},
{
"epoch": 0.41816408906387953,
"grad_norm": 0.5962491631507874,
"learning_rate": 2.9099633241970396e-05,
"loss": 0.3715,
"step": 181400
},
{
"epoch": 0.41862512995590145,
"grad_norm": 0.5724808573722839,
"learning_rate": 2.9076581197369302e-05,
"loss": 0.3599,
"step": 181600
},
{
"epoch": 0.4190861708479234,
"grad_norm": 1.6196781396865845,
"learning_rate": 2.9053529152768204e-05,
"loss": 0.3475,
"step": 181800
},
{
"epoch": 0.41954721173994525,
"grad_norm": 0.21996203064918518,
"learning_rate": 2.903047710816711e-05,
"loss": 0.4023,
"step": 182000
},
{
"epoch": 0.42000825263196717,
"grad_norm": 0.38969552516937256,
"learning_rate": 2.9007425063566012e-05,
"loss": 0.4015,
"step": 182200
},
{
"epoch": 0.4204692935239891,
"grad_norm": 0.926848828792572,
"learning_rate": 2.8984373018964918e-05,
"loss": 0.367,
"step": 182400
},
{
"epoch": 0.420930334416011,
"grad_norm": 0.5029491782188416,
"learning_rate": 2.896143623458683e-05,
"loss": 0.3743,
"step": 182600
},
{
"epoch": 0.42139137530803294,
"grad_norm": 0.528469979763031,
"learning_rate": 2.8938384189985733e-05,
"loss": 0.383,
"step": 182800
},
{
"epoch": 0.42185241620005487,
"grad_norm": 0.26606419682502747,
"learning_rate": 2.8915447405607642e-05,
"loss": 0.3678,
"step": 183000
},
{
"epoch": 0.4223134570920768,
"grad_norm": 0.3999669551849365,
"learning_rate": 2.8892395361006548e-05,
"loss": 0.3596,
"step": 183200
},
{
"epoch": 0.4227744979840987,
"grad_norm": 0.20571675896644592,
"learning_rate": 2.886934331640545e-05,
"loss": 0.3083,
"step": 183400
},
{
"epoch": 0.42323553887612064,
"grad_norm": 0.7978609204292297,
"learning_rate": 2.8846291271804356e-05,
"loss": 0.3577,
"step": 183600
},
{
"epoch": 0.4236965797681425,
"grad_norm": 0.5432894825935364,
"learning_rate": 2.8823239227203258e-05,
"loss": 0.3537,
"step": 183800
},
{
"epoch": 0.42415762066016444,
"grad_norm": 0.7559936046600342,
"learning_rate": 2.880018718260216e-05,
"loss": 0.3599,
"step": 184000
},
{
"epoch": 0.42461866155218636,
"grad_norm": 0.43294602632522583,
"learning_rate": 2.8777135138001066e-05,
"loss": 0.3685,
"step": 184200
},
{
"epoch": 0.4250797024442083,
"grad_norm": 0.5277533531188965,
"learning_rate": 2.8754083093399968e-05,
"loss": 0.3711,
"step": 184400
},
{
"epoch": 0.4255407433362302,
"grad_norm": 0.27945324778556824,
"learning_rate": 2.8731031048798874e-05,
"loss": 0.3497,
"step": 184600
},
{
"epoch": 0.42600178422825213,
"grad_norm": 0.39421379566192627,
"learning_rate": 2.8707979004197776e-05,
"loss": 0.3578,
"step": 184800
},
{
"epoch": 0.42646282512027406,
"grad_norm": 0.6292276978492737,
"learning_rate": 2.8684926959596682e-05,
"loss": 0.3854,
"step": 185000
},
{
"epoch": 0.426923866012296,
"grad_norm": 0.4721149504184723,
"learning_rate": 2.8661874914995584e-05,
"loss": 0.3399,
"step": 185200
},
{
"epoch": 0.42738490690431785,
"grad_norm": 0.7457978129386902,
"learning_rate": 2.863882287039449e-05,
"loss": 0.3572,
"step": 185400
},
{
"epoch": 0.4278459477963398,
"grad_norm": 0.3315475583076477,
"learning_rate": 2.86157708257934e-05,
"loss": 0.3338,
"step": 185600
},
{
"epoch": 0.4283069886883617,
"grad_norm": 0.16093194484710693,
"learning_rate": 2.85927187811923e-05,
"loss": 0.3546,
"step": 185800
},
{
"epoch": 0.4287680295803836,
"grad_norm": 0.12288182973861694,
"learning_rate": 2.8569666736591207e-05,
"loss": 0.3824,
"step": 186000
},
{
"epoch": 0.42922907047240555,
"grad_norm": 0.39890772104263306,
"learning_rate": 2.854661469199011e-05,
"loss": 0.3824,
"step": 186200
},
{
"epoch": 0.4296901113644275,
"grad_norm": 0.5391467809677124,
"learning_rate": 2.852356264738901e-05,
"loss": 0.3725,
"step": 186400
},
{
"epoch": 0.4301511522564494,
"grad_norm": 0.44454967975616455,
"learning_rate": 2.8500510602787917e-05,
"loss": 0.3432,
"step": 186600
},
{
"epoch": 0.4306121931484713,
"grad_norm": 3.51567006111145,
"learning_rate": 2.847745855818682e-05,
"loss": 0.3408,
"step": 186800
},
{
"epoch": 0.43107323404049325,
"grad_norm": 0.5184333324432373,
"learning_rate": 2.8454406513585725e-05,
"loss": 0.3294,
"step": 187000
},
{
"epoch": 0.4315342749325151,
"grad_norm": 0.3833082318305969,
"learning_rate": 2.8431354468984627e-05,
"loss": 0.363,
"step": 187200
},
{
"epoch": 0.43199531582453704,
"grad_norm": 0.5856477618217468,
"learning_rate": 2.8408417684606537e-05,
"loss": 0.3914,
"step": 187400
},
{
"epoch": 0.43245635671655897,
"grad_norm": 2.194856643676758,
"learning_rate": 2.838536564000544e-05,
"loss": 0.3899,
"step": 187600
},
{
"epoch": 0.4329173976085809,
"grad_norm": 0.9959438443183899,
"learning_rate": 2.8362313595404345e-05,
"loss": 0.3547,
"step": 187800
},
{
"epoch": 0.4333784385006028,
"grad_norm": 0.48765039443969727,
"learning_rate": 2.8339261550803247e-05,
"loss": 0.3666,
"step": 188000
},
{
"epoch": 0.43383947939262474,
"grad_norm": 0.6853535175323486,
"learning_rate": 2.8316209506202152e-05,
"loss": 0.3288,
"step": 188200
},
{
"epoch": 0.43430052028464666,
"grad_norm": 0.8288220763206482,
"learning_rate": 2.8293157461601055e-05,
"loss": 0.3955,
"step": 188400
},
{
"epoch": 0.4347615611766686,
"grad_norm": 0.14839661121368408,
"learning_rate": 2.8270105416999964e-05,
"loss": 0.392,
"step": 188600
},
{
"epoch": 0.43522260206869046,
"grad_norm": 0.210972398519516,
"learning_rate": 2.824705337239887e-05,
"loss": 0.3915,
"step": 188800
},
{
"epoch": 0.4356836429607124,
"grad_norm": 0.7078728675842285,
"learning_rate": 2.8224001327797772e-05,
"loss": 0.3538,
"step": 189000
},
{
"epoch": 0.4361446838527343,
"grad_norm": 0.2222944051027298,
"learning_rate": 2.8200949283196677e-05,
"loss": 0.3471,
"step": 189200
},
{
"epoch": 0.43660572474475623,
"grad_norm": 0.5914934277534485,
"learning_rate": 2.817789723859558e-05,
"loss": 0.3522,
"step": 189400
},
{
"epoch": 0.43706676563677815,
"grad_norm": 0.4873872995376587,
"learning_rate": 2.8154845193994482e-05,
"loss": 0.3399,
"step": 189600
},
{
"epoch": 0.4375278065288001,
"grad_norm": 0.3443751037120819,
"learning_rate": 2.8131793149393388e-05,
"loss": 0.3416,
"step": 189800
},
{
"epoch": 0.437988847420822,
"grad_norm": 0.6381067037582397,
"learning_rate": 2.810874110479229e-05,
"loss": 0.3375,
"step": 190000
},
{
"epoch": 0.4384498883128439,
"grad_norm": 0.49749723076820374,
"learning_rate": 2.8085689060191196e-05,
"loss": 0.35,
"step": 190200
},
{
"epoch": 0.43891092920486585,
"grad_norm": 0.17748087644577026,
"learning_rate": 2.8062637015590098e-05,
"loss": 0.3652,
"step": 190400
},
{
"epoch": 0.4393719700968877,
"grad_norm": 0.3006545603275299,
"learning_rate": 2.8039584970989004e-05,
"loss": 0.3956,
"step": 190600
},
{
"epoch": 0.43983301098890965,
"grad_norm": 0.21853938698768616,
"learning_rate": 2.8016532926387906e-05,
"loss": 0.3516,
"step": 190800
},
{
"epoch": 0.44029405188093157,
"grad_norm": 0.7327430248260498,
"learning_rate": 2.7993596142009815e-05,
"loss": 0.3641,
"step": 191000
},
{
"epoch": 0.4407550927729535,
"grad_norm": 0.2265714704990387,
"learning_rate": 2.7970544097408717e-05,
"loss": 0.3908,
"step": 191200
},
{
"epoch": 0.4412161336649754,
"grad_norm": 0.2358558624982834,
"learning_rate": 2.7947492052807626e-05,
"loss": 0.3763,
"step": 191400
},
{
"epoch": 0.44167717455699734,
"grad_norm": 0.6405865550041199,
"learning_rate": 2.7924440008206532e-05,
"loss": 0.3656,
"step": 191600
},
{
"epoch": 0.44213821544901927,
"grad_norm": 0.5846646428108215,
"learning_rate": 2.7901387963605434e-05,
"loss": 0.3638,
"step": 191800
},
{
"epoch": 0.4425992563410412,
"grad_norm": 0.32096606492996216,
"learning_rate": 2.787833591900434e-05,
"loss": 0.3779,
"step": 192000
},
{
"epoch": 0.44306029723306306,
"grad_norm": 0.37898677587509155,
"learning_rate": 2.7855283874403242e-05,
"loss": 0.3424,
"step": 192200
},
{
"epoch": 0.443521338125085,
"grad_norm": 0.34216663241386414,
"learning_rate": 2.7832231829802148e-05,
"loss": 0.3218,
"step": 192400
},
{
"epoch": 0.4439823790171069,
"grad_norm": 0.38816431164741516,
"learning_rate": 2.7809295045424054e-05,
"loss": 0.3721,
"step": 192600
},
{
"epoch": 0.44444341990912883,
"grad_norm": 0.19693952798843384,
"learning_rate": 2.778624300082296e-05,
"loss": 0.3495,
"step": 192800
},
{
"epoch": 0.44490446080115076,
"grad_norm": 0.5186366438865662,
"learning_rate": 2.7763190956221862e-05,
"loss": 0.3485,
"step": 193000
},
{
"epoch": 0.4453655016931727,
"grad_norm": 0.35819756984710693,
"learning_rate": 2.7740138911620768e-05,
"loss": 0.3545,
"step": 193200
},
{
"epoch": 0.4458265425851946,
"grad_norm": 0.44119149446487427,
"learning_rate": 2.771708686701967e-05,
"loss": 0.3515,
"step": 193400
},
{
"epoch": 0.44628758347721653,
"grad_norm": 0.8689220547676086,
"learning_rate": 2.7694034822418576e-05,
"loss": 0.382,
"step": 193600
},
{
"epoch": 0.44674862436923846,
"grad_norm": 0.29297760128974915,
"learning_rate": 2.7670982777817478e-05,
"loss": 0.3283,
"step": 193800
},
{
"epoch": 0.4472096652612603,
"grad_norm": 0.414132684469223,
"learning_rate": 2.764793073321638e-05,
"loss": 0.3848,
"step": 194000
},
{
"epoch": 0.44767070615328225,
"grad_norm": 0.3628985583782196,
"learning_rate": 2.7624878688615286e-05,
"loss": 0.3735,
"step": 194200
},
{
"epoch": 0.4481317470453042,
"grad_norm": 0.5289758443832397,
"learning_rate": 2.7601826644014195e-05,
"loss": 0.338,
"step": 194400
},
{
"epoch": 0.4485927879373261,
"grad_norm": 4.156925678253174,
"learning_rate": 2.7578774599413097e-05,
"loss": 0.392,
"step": 194600
},
{
"epoch": 0.449053828829348,
"grad_norm": 0.3998264670372009,
"learning_rate": 2.7555722554812003e-05,
"loss": 0.3523,
"step": 194800
},
{
"epoch": 0.44951486972136995,
"grad_norm": 0.3306404948234558,
"learning_rate": 2.7532670510210905e-05,
"loss": 0.3124,
"step": 195000
},
{
"epoch": 0.44997591061339187,
"grad_norm": 0.41828057169914246,
"learning_rate": 2.750961846560981e-05,
"loss": 0.3753,
"step": 195200
},
{
"epoch": 0.4504369515054138,
"grad_norm": 0.5500707030296326,
"learning_rate": 2.7486566421008713e-05,
"loss": 0.4012,
"step": 195400
},
{
"epoch": 0.45089799239743567,
"grad_norm": 0.3156859874725342,
"learning_rate": 2.746351437640762e-05,
"loss": 0.3423,
"step": 195600
},
{
"epoch": 0.4513590332894576,
"grad_norm": 0.5669901967048645,
"learning_rate": 2.744046233180652e-05,
"loss": 0.3618,
"step": 195800
},
{
"epoch": 0.4518200741814795,
"grad_norm": 0.6025803089141846,
"learning_rate": 2.7417410287205427e-05,
"loss": 0.4058,
"step": 196000
},
{
"epoch": 0.45228111507350144,
"grad_norm": 0.38647380471229553,
"learning_rate": 2.739435824260433e-05,
"loss": 0.3387,
"step": 196200
},
{
"epoch": 0.45274215596552336,
"grad_norm": 0.6687199473381042,
"learning_rate": 2.737130619800323e-05,
"loss": 0.3785,
"step": 196400
},
{
"epoch": 0.4532031968575453,
"grad_norm": 0.6177826523780823,
"learning_rate": 2.7348254153402137e-05,
"loss": 0.3182,
"step": 196600
},
{
"epoch": 0.4536642377495672,
"grad_norm": 0.18052087724208832,
"learning_rate": 2.732520210880104e-05,
"loss": 0.3429,
"step": 196800
},
{
"epoch": 0.45412527864158914,
"grad_norm": 0.34992730617523193,
"learning_rate": 2.7302150064199945e-05,
"loss": 0.3834,
"step": 197000
},
{
"epoch": 0.45458631953361106,
"grad_norm": null,
"learning_rate": 2.7279213279821854e-05,
"loss": 0.3778,
"step": 197200
},
{
"epoch": 0.45504736042563293,
"grad_norm": 0.10320476442575455,
"learning_rate": 2.7256161235220763e-05,
"loss": 0.3591,
"step": 197400
},
{
"epoch": 0.45550840131765485,
"grad_norm": 0.4679946303367615,
"learning_rate": 2.7233109190619665e-05,
"loss": 0.3449,
"step": 197600
},
{
"epoch": 0.4559694422096768,
"grad_norm": 0.9031148552894592,
"learning_rate": 2.7210057146018568e-05,
"loss": 0.3661,
"step": 197800
},
{
"epoch": 0.4564304831016987,
"grad_norm": 0.5634335279464722,
"learning_rate": 2.7187005101417473e-05,
"loss": 0.3553,
"step": 198000
},
{
"epoch": 0.4568915239937206,
"grad_norm": 0.4014586806297302,
"learning_rate": 2.7163953056816376e-05,
"loss": 0.3447,
"step": 198200
},
{
"epoch": 0.45735256488574255,
"grad_norm": 0.4172525405883789,
"learning_rate": 2.714090101221528e-05,
"loss": 0.3593,
"step": 198400
},
{
"epoch": 0.4578136057777645,
"grad_norm": 0.4799288809299469,
"learning_rate": 2.7117848967614184e-05,
"loss": 0.3929,
"step": 198600
},
{
"epoch": 0.4582746466697864,
"grad_norm": 0.3821302056312561,
"learning_rate": 2.709479692301309e-05,
"loss": 0.3366,
"step": 198800
},
{
"epoch": 0.45873568756180827,
"grad_norm": 0.370421826839447,
"learning_rate": 2.707174487841199e-05,
"loss": 0.3137,
"step": 199000
},
{
"epoch": 0.4591967284538302,
"grad_norm": 0.3619524836540222,
"learning_rate": 2.7048692833810897e-05,
"loss": 0.354,
"step": 199200
},
{
"epoch": 0.4596577693458521,
"grad_norm": 0.4108444154262543,
"learning_rate": 2.70256407892098e-05,
"loss": 0.3598,
"step": 199400
},
{
"epoch": 0.46011881023787404,
"grad_norm": 0.3430013656616211,
"learning_rate": 2.7002588744608702e-05,
"loss": 0.3815,
"step": 199600
},
{
"epoch": 0.46057985112989597,
"grad_norm": 0.8741142153739929,
"learning_rate": 2.6979536700007608e-05,
"loss": 0.3517,
"step": 199800
},
{
"epoch": 0.4610408920219179,
"grad_norm": 0.5065380930900574,
"learning_rate": 2.6956599915629517e-05,
"loss": 0.3661,
"step": 200000
},
{
"epoch": 0.4610408920219179,
"eval_loss": 0.3632255792617798,
"eval_runtime": 223.8591,
"eval_samples_per_second": 19.575,
"eval_steps_per_second": 19.575,
"step": 200000
},
{
"epoch": 0.4615019329139398,
"grad_norm": 0.32413604855537415,
"learning_rate": 2.693354787102842e-05,
"loss": 0.3644,
"step": 200200
},
{
"epoch": 0.46196297380596174,
"grad_norm": 0.41090449690818787,
"learning_rate": 2.6910495826427328e-05,
"loss": 0.3813,
"step": 200400
},
{
"epoch": 0.46242401469798367,
"grad_norm": 0.8529661297798157,
"learning_rate": 2.6887443781826234e-05,
"loss": 0.3542,
"step": 200600
},
{
"epoch": 0.46288505559000553,
"grad_norm": 0.7049174308776855,
"learning_rate": 2.6864391737225136e-05,
"loss": 0.3432,
"step": 200800
},
{
"epoch": 0.46334609648202746,
"grad_norm": 0.3541184365749359,
"learning_rate": 2.6841339692624042e-05,
"loss": 0.3412,
"step": 201000
},
{
"epoch": 0.4638071373740494,
"grad_norm": 0.49452289938926697,
"learning_rate": 2.6818287648022944e-05,
"loss": 0.3766,
"step": 201200
},
{
"epoch": 0.4642681782660713,
"grad_norm": 0.4755121171474457,
"learning_rate": 2.6795235603421846e-05,
"loss": 0.3712,
"step": 201400
},
{
"epoch": 0.46472921915809323,
"grad_norm": 0.43829742074012756,
"learning_rate": 2.6772183558820752e-05,
"loss": 0.3475,
"step": 201600
},
{
"epoch": 0.46519026005011516,
"grad_norm": 1.0975539684295654,
"learning_rate": 2.6749131514219654e-05,
"loss": 0.3879,
"step": 201800
},
{
"epoch": 0.4656513009421371,
"grad_norm": 0.20283125340938568,
"learning_rate": 2.672607946961856e-05,
"loss": 0.3583,
"step": 202000
},
{
"epoch": 0.466112341834159,
"grad_norm": 0.3632067143917084,
"learning_rate": 2.6703027425017462e-05,
"loss": 0.322,
"step": 202200
},
{
"epoch": 0.4665733827261809,
"grad_norm": 0.4200218617916107,
"learning_rate": 2.6679975380416368e-05,
"loss": 0.3861,
"step": 202400
},
{
"epoch": 0.4670344236182028,
"grad_norm": 0.7758521437644958,
"learning_rate": 2.665692333581527e-05,
"loss": 0.3553,
"step": 202600
},
{
"epoch": 0.4674954645102247,
"grad_norm": 0.2579694390296936,
"learning_rate": 2.6633871291214173e-05,
"loss": 0.3825,
"step": 202800
},
{
"epoch": 0.46795650540224665,
"grad_norm": 0.5407618880271912,
"learning_rate": 2.6610819246613078e-05,
"loss": 0.3763,
"step": 203000
},
{
"epoch": 0.46841754629426857,
"grad_norm": 0.5165262222290039,
"learning_rate": 2.658776720201198e-05,
"loss": 0.3735,
"step": 203200
},
{
"epoch": 0.4688785871862905,
"grad_norm": 0.4201817810535431,
"learning_rate": 2.6564715157410886e-05,
"loss": 0.3706,
"step": 203400
},
{
"epoch": 0.4693396280783124,
"grad_norm": 0.261212021112442,
"learning_rate": 2.654166311280979e-05,
"loss": 0.3526,
"step": 203600
},
{
"epoch": 0.46980066897033435,
"grad_norm": 0.39151546359062195,
"learning_rate": 2.6518611068208697e-05,
"loss": 0.3323,
"step": 203800
},
{
"epoch": 0.4702617098623562,
"grad_norm": 0.4191110134124756,
"learning_rate": 2.6495559023607603e-05,
"loss": 0.355,
"step": 204000
},
{
"epoch": 0.47072275075437814,
"grad_norm": 0.3264467120170593,
"learning_rate": 2.6472506979006505e-05,
"loss": 0.3159,
"step": 204200
},
{
"epoch": 0.47118379164640006,
"grad_norm": 0.41095060110092163,
"learning_rate": 2.6449570194628415e-05,
"loss": 0.3712,
"step": 204400
},
{
"epoch": 0.471644832538422,
"grad_norm": 0.21805429458618164,
"learning_rate": 2.6426518150027317e-05,
"loss": 0.386,
"step": 204600
},
{
"epoch": 0.4721058734304439,
"grad_norm": 0.46556156873703003,
"learning_rate": 2.6403466105426223e-05,
"loss": 0.3039,
"step": 204800
},
{
"epoch": 0.47256691432246584,
"grad_norm": 0.4233507215976715,
"learning_rate": 2.6380529321048132e-05,
"loss": 0.3534,
"step": 205000
},
{
"epoch": 0.47302795521448776,
"grad_norm": 0.41719740629196167,
"learning_rate": 2.6357477276447034e-05,
"loss": 0.321,
"step": 205200
},
{
"epoch": 0.4734889961065097,
"grad_norm": 0.43227747082710266,
"learning_rate": 2.633442523184594e-05,
"loss": 0.3446,
"step": 205400
},
{
"epoch": 0.4739500369985316,
"grad_norm": 0.8075616955757141,
"learning_rate": 2.6311373187244842e-05,
"loss": 0.3575,
"step": 205600
},
{
"epoch": 0.4744110778905535,
"grad_norm": 0.48316138982772827,
"learning_rate": 2.6288321142643744e-05,
"loss": 0.3432,
"step": 205800
},
{
"epoch": 0.4748721187825754,
"grad_norm": 0.697849452495575,
"learning_rate": 2.626526909804265e-05,
"loss": 0.3618,
"step": 206000
},
{
"epoch": 0.4753331596745973,
"grad_norm": 1.1473220586776733,
"learning_rate": 2.624221705344156e-05,
"loss": 0.3621,
"step": 206200
},
{
"epoch": 0.47579420056661925,
"grad_norm": 0.35896036028862,
"learning_rate": 2.621916500884046e-05,
"loss": 0.3653,
"step": 206400
},
{
"epoch": 0.4762552414586412,
"grad_norm": 0.4214785695075989,
"learning_rate": 2.6196112964239367e-05,
"loss": 0.3663,
"step": 206600
},
{
"epoch": 0.4767162823506631,
"grad_norm": 0.6042930483818054,
"learning_rate": 2.617306091963827e-05,
"loss": 0.3765,
"step": 206800
},
{
"epoch": 0.477177323242685,
"grad_norm": 0.19935666024684906,
"learning_rate": 2.6150008875037175e-05,
"loss": 0.3685,
"step": 207000
},
{
"epoch": 0.47763836413470695,
"grad_norm": 0.5323235392570496,
"learning_rate": 2.6126956830436077e-05,
"loss": 0.3351,
"step": 207200
},
{
"epoch": 0.4780994050267288,
"grad_norm": 0.5466133952140808,
"learning_rate": 2.6103904785834983e-05,
"loss": 0.3499,
"step": 207400
},
{
"epoch": 0.47856044591875074,
"grad_norm": 0.4856882095336914,
"learning_rate": 2.6080852741233885e-05,
"loss": 0.3478,
"step": 207600
},
{
"epoch": 0.47902148681077267,
"grad_norm": 0.6916505694389343,
"learning_rate": 2.6057800696632788e-05,
"loss": 0.3367,
"step": 207800
},
{
"epoch": 0.4794825277027946,
"grad_norm": 0.5421025156974792,
"learning_rate": 2.6034748652031693e-05,
"loss": 0.3324,
"step": 208000
},
{
"epoch": 0.4799435685948165,
"grad_norm": 0.5142760872840881,
"learning_rate": 2.6011696607430596e-05,
"loss": 0.3528,
"step": 208200
},
{
"epoch": 0.48040460948683844,
"grad_norm": 0.5182974934577942,
"learning_rate": 2.59886445628295e-05,
"loss": 0.3312,
"step": 208400
},
{
"epoch": 0.48086565037886037,
"grad_norm": 0.1986812800168991,
"learning_rate": 2.5965592518228404e-05,
"loss": 0.3433,
"step": 208600
},
{
"epoch": 0.4813266912708823,
"grad_norm": 0.33641108870506287,
"learning_rate": 2.594254047362731e-05,
"loss": 0.3388,
"step": 208800
},
{
"epoch": 0.4817877321629042,
"grad_norm": 0.3251590430736542,
"learning_rate": 2.591948842902621e-05,
"loss": 0.3784,
"step": 209000
},
{
"epoch": 0.4822487730549261,
"grad_norm": 0.9723164439201355,
"learning_rate": 2.5896436384425117e-05,
"loss": 0.3798,
"step": 209200
},
{
"epoch": 0.482709813946948,
"grad_norm": 0.8655120134353638,
"learning_rate": 2.587338433982402e-05,
"loss": 0.3198,
"step": 209400
},
{
"epoch": 0.48317085483896993,
"grad_norm": 0.5627156496047974,
"learning_rate": 2.5850447555445932e-05,
"loss": 0.3499,
"step": 209600
},
{
"epoch": 0.48363189573099186,
"grad_norm": 0.17791125178337097,
"learning_rate": 2.5827395510844838e-05,
"loss": 0.3425,
"step": 209800
},
{
"epoch": 0.4840929366230138,
"grad_norm": 0.3033307194709778,
"learning_rate": 2.580434346624374e-05,
"loss": 0.3864,
"step": 210000
},
{
"epoch": 0.4845539775150357,
"grad_norm": 3.21570086479187,
"learning_rate": 2.5781291421642646e-05,
"loss": 0.355,
"step": 210200
},
{
"epoch": 0.48501501840705763,
"grad_norm": 0.3438258469104767,
"learning_rate": 2.5758239377041548e-05,
"loss": 0.3866,
"step": 210400
},
{
"epoch": 0.48547605929907955,
"grad_norm": 0.31849002838134766,
"learning_rate": 2.5735187332440454e-05,
"loss": 0.4093,
"step": 210600
},
{
"epoch": 0.4859371001911014,
"grad_norm": 0.5319012403488159,
"learning_rate": 2.5712135287839356e-05,
"loss": 0.3687,
"step": 210800
},
{
"epoch": 0.48639814108312335,
"grad_norm": 0.6535025835037231,
"learning_rate": 2.568908324323826e-05,
"loss": 0.3505,
"step": 211000
},
{
"epoch": 0.48685918197514527,
"grad_norm": 0.5355439782142639,
"learning_rate": 2.5666031198637164e-05,
"loss": 0.3309,
"step": 211200
},
{
"epoch": 0.4873202228671672,
"grad_norm": 0.22699476778507233,
"learning_rate": 2.5643094414259073e-05,
"loss": 0.3667,
"step": 211400
},
{
"epoch": 0.4877812637591891,
"grad_norm": 0.5435298085212708,
"learning_rate": 2.5620042369657975e-05,
"loss": 0.3565,
"step": 211600
},
{
"epoch": 0.48824230465121105,
"grad_norm": 0.562765896320343,
"learning_rate": 2.559710558527989e-05,
"loss": 0.3604,
"step": 211800
},
{
"epoch": 0.48870334554323297,
"grad_norm": 0.14014822244644165,
"learning_rate": 2.5574053540678794e-05,
"loss": 0.3412,
"step": 212000
},
{
"epoch": 0.4891643864352549,
"grad_norm": 0.21219348907470703,
"learning_rate": 2.55510014960777e-05,
"loss": 0.3547,
"step": 212200
},
{
"epoch": 0.4896254273272768,
"grad_norm": 0.3475276231765747,
"learning_rate": 2.55279494514766e-05,
"loss": 0.3801,
"step": 212400
},
{
"epoch": 0.4900864682192987,
"grad_norm": 0.5693538784980774,
"learning_rate": 2.5504897406875504e-05,
"loss": 0.3585,
"step": 212600
},
{
"epoch": 0.4905475091113206,
"grad_norm": 0.5602753758430481,
"learning_rate": 2.548184536227441e-05,
"loss": 0.3973,
"step": 212800
},
{
"epoch": 0.49100855000334254,
"grad_norm": 1.5884393453598022,
"learning_rate": 2.5458793317673312e-05,
"loss": 0.382,
"step": 213000
},
{
"epoch": 0.49146959089536446,
"grad_norm": 0.6807895302772522,
"learning_rate": 2.5435741273072218e-05,
"loss": 0.3551,
"step": 213200
},
{
"epoch": 0.4919306317873864,
"grad_norm": 0.576314389705658,
"learning_rate": 2.541268922847112e-05,
"loss": 0.3117,
"step": 213400
},
{
"epoch": 0.4923916726794083,
"grad_norm": 0.4557921886444092,
"learning_rate": 2.5389637183870026e-05,
"loss": 0.3377,
"step": 213600
},
{
"epoch": 0.49285271357143023,
"grad_norm": 0.6265081167221069,
"learning_rate": 2.5366585139268928e-05,
"loss": 0.3302,
"step": 213800
},
{
"epoch": 0.49331375446345216,
"grad_norm": 0.5826780796051025,
"learning_rate": 2.534353309466783e-05,
"loss": 0.3331,
"step": 214000
},
{
"epoch": 0.493774795355474,
"grad_norm": 0.33094868063926697,
"learning_rate": 2.5320481050066736e-05,
"loss": 0.3958,
"step": 214200
},
{
"epoch": 0.49423583624749595,
"grad_norm": 0.3664953410625458,
"learning_rate": 2.5297429005465638e-05,
"loss": 0.3522,
"step": 214400
},
{
"epoch": 0.4946968771395179,
"grad_norm": 0.8045748472213745,
"learning_rate": 2.5274376960864544e-05,
"loss": 0.3364,
"step": 214600
},
{
"epoch": 0.4951579180315398,
"grad_norm": 0.45286139845848083,
"learning_rate": 2.5251324916263446e-05,
"loss": 0.3899,
"step": 214800
},
{
"epoch": 0.4956189589235617,
"grad_norm": 0.2446468323469162,
"learning_rate": 2.522827287166235e-05,
"loss": 0.3338,
"step": 215000
},
{
"epoch": 0.49607999981558365,
"grad_norm": 0.5783166289329529,
"learning_rate": 2.520522082706126e-05,
"loss": 0.3378,
"step": 215200
},
{
"epoch": 0.4965410407076056,
"grad_norm": 0.4312882125377655,
"learning_rate": 2.5182168782460163e-05,
"loss": 0.3354,
"step": 215400
},
{
"epoch": 0.4970020815996275,
"grad_norm": 0.4972662329673767,
"learning_rate": 2.515911673785907e-05,
"loss": 0.3506,
"step": 215600
},
{
"epoch": 0.4974631224916494,
"grad_norm": 0.23885759711265564,
"learning_rate": 2.513606469325797e-05,
"loss": 0.3388,
"step": 215800
},
{
"epoch": 0.4979241633836713,
"grad_norm": 0.4714123010635376,
"learning_rate": 2.5113012648656877e-05,
"loss": 0.3611,
"step": 216000
},
{
"epoch": 0.4983852042756932,
"grad_norm": 0.43313103914260864,
"learning_rate": 2.5090075864278783e-05,
"loss": 0.3444,
"step": 216200
},
{
"epoch": 0.49884624516771514,
"grad_norm": 0.5866301655769348,
"learning_rate": 2.5067023819677688e-05,
"loss": 0.3329,
"step": 216400
},
{
"epoch": 0.49930728605973707,
"grad_norm": 0.34571877121925354,
"learning_rate": 2.504397177507659e-05,
"loss": 0.36,
"step": 216600
},
{
"epoch": 0.499768326951759,
"grad_norm": 0.14547963440418243,
"learning_rate": 2.5020919730475496e-05,
"loss": 0.361,
"step": 216800
},
{
"epoch": 0.5002293678437809,
"grad_norm": 0.5833833813667297,
"learning_rate": 2.49978676858744e-05,
"loss": 0.3469,
"step": 217000
},
{
"epoch": 0.5006904087358028,
"grad_norm": 0.33161187171936035,
"learning_rate": 2.4974815641273304e-05,
"loss": 0.353,
"step": 217200
},
{
"epoch": 0.5011514496278248,
"grad_norm": 0.700287401676178,
"learning_rate": 2.495176359667221e-05,
"loss": 0.3702,
"step": 217400
},
{
"epoch": 0.5016124905198467,
"grad_norm": 0.49698981642723083,
"learning_rate": 2.4928711552071112e-05,
"loss": 0.3624,
"step": 217600
},
{
"epoch": 0.5020735314118686,
"grad_norm": 1.1397374868392944,
"learning_rate": 2.4905659507470018e-05,
"loss": 0.3468,
"step": 217800
},
{
"epoch": 0.5025345723038905,
"grad_norm": 0.3919994831085205,
"learning_rate": 2.488260746286892e-05,
"loss": 0.3557,
"step": 218000
},
{
"epoch": 0.5029956131959125,
"grad_norm": 0.3460191488265991,
"learning_rate": 2.4859555418267826e-05,
"loss": 0.3653,
"step": 218200
},
{
"epoch": 0.5034566540879343,
"grad_norm": 0.3638404607772827,
"learning_rate": 2.4836503373666728e-05,
"loss": 0.3127,
"step": 218400
},
{
"epoch": 0.5039176949799562,
"grad_norm": 0.20301935076713562,
"learning_rate": 2.481345132906563e-05,
"loss": 0.3503,
"step": 218600
},
{
"epoch": 0.5043787358719781,
"grad_norm": 0.5420706868171692,
"learning_rate": 2.4790399284464536e-05,
"loss": 0.3478,
"step": 218800
},
{
"epoch": 0.504839776764,
"grad_norm": 0.401091605424881,
"learning_rate": 2.476734723986344e-05,
"loss": 0.3287,
"step": 219000
},
{
"epoch": 0.505300817656022,
"grad_norm": 0.24808503687381744,
"learning_rate": 2.4744295195262347e-05,
"loss": 0.3221,
"step": 219200
},
{
"epoch": 0.5057618585480439,
"grad_norm": 0.30465951561927795,
"learning_rate": 2.472124315066125e-05,
"loss": 0.3521,
"step": 219400
},
{
"epoch": 0.5062228994400658,
"grad_norm": 0.2896556556224823,
"learning_rate": 2.4698191106060152e-05,
"loss": 0.3335,
"step": 219600
},
{
"epoch": 0.5066839403320877,
"grad_norm": 0.30277949571609497,
"learning_rate": 2.4675139061459058e-05,
"loss": 0.3526,
"step": 219800
},
{
"epoch": 0.5071449812241097,
"grad_norm": 0.22430256009101868,
"learning_rate": 2.465208701685796e-05,
"loss": 0.3454,
"step": 220000
},
{
"epoch": 0.5076060221161316,
"grad_norm": 0.28608962893486023,
"learning_rate": 2.4629034972256866e-05,
"loss": 0.3798,
"step": 220200
},
{
"epoch": 0.5080670630081535,
"grad_norm": 0.6046766638755798,
"learning_rate": 2.4605982927655768e-05,
"loss": 0.364,
"step": 220400
},
{
"epoch": 0.5085281039001754,
"grad_norm": 0.43866434693336487,
"learning_rate": 2.4582930883054673e-05,
"loss": 0.345,
"step": 220600
},
{
"epoch": 0.5089891447921974,
"grad_norm": 0.39291495084762573,
"learning_rate": 2.4559994098676583e-05,
"loss": 0.3805,
"step": 220800
},
{
"epoch": 0.5094501856842193,
"grad_norm": 0.5116698741912842,
"learning_rate": 2.453694205407549e-05,
"loss": 0.3367,
"step": 221000
},
{
"epoch": 0.5099112265762412,
"grad_norm": 0.07824862003326416,
"learning_rate": 2.451389000947439e-05,
"loss": 0.311,
"step": 221200
},
{
"epoch": 0.5103722674682631,
"grad_norm": 0.47447752952575684,
"learning_rate": 2.44909532250963e-05,
"loss": 0.3678,
"step": 221400
},
{
"epoch": 0.5108333083602851,
"grad_norm": 0.18225307762622833,
"learning_rate": 2.4467901180495206e-05,
"loss": 0.3435,
"step": 221600
},
{
"epoch": 0.5112943492523069,
"grad_norm": 0.21971538662910461,
"learning_rate": 2.444484913589411e-05,
"loss": 0.3495,
"step": 221800
},
{
"epoch": 0.5117553901443288,
"grad_norm": 0.4918324053287506,
"learning_rate": 2.4421797091293014e-05,
"loss": 0.3465,
"step": 222000
},
{
"epoch": 0.5122164310363507,
"grad_norm": 0.6665719747543335,
"learning_rate": 2.439874504669192e-05,
"loss": 0.3679,
"step": 222200
},
{
"epoch": 0.5126774719283727,
"grad_norm": 0.20663170516490936,
"learning_rate": 2.437569300209082e-05,
"loss": 0.352,
"step": 222400
},
{
"epoch": 0.5131385128203946,
"grad_norm": 0.5381636619567871,
"learning_rate": 2.4352640957489724e-05,
"loss": 0.3477,
"step": 222600
},
{
"epoch": 0.5135995537124165,
"grad_norm": 0.621925950050354,
"learning_rate": 2.432958891288863e-05,
"loss": 0.3271,
"step": 222800
},
{
"epoch": 0.5140605946044384,
"grad_norm": 0.48457425832748413,
"learning_rate": 2.4306536868287532e-05,
"loss": 0.3827,
"step": 223000
},
{
"epoch": 0.5145216354964604,
"grad_norm": 0.3504721224308014,
"learning_rate": 2.428348482368644e-05,
"loss": 0.3364,
"step": 223200
},
{
"epoch": 0.5149826763884823,
"grad_norm": 0.4487791955471039,
"learning_rate": 2.4260432779085343e-05,
"loss": 0.3376,
"step": 223400
},
{
"epoch": 0.5154437172805042,
"grad_norm": 0.44401684403419495,
"learning_rate": 2.4237380734484245e-05,
"loss": 0.374,
"step": 223600
},
{
"epoch": 0.5159047581725261,
"grad_norm": 0.37068814039230347,
"learning_rate": 2.421432868988315e-05,
"loss": 0.3542,
"step": 223800
},
{
"epoch": 0.516365799064548,
"grad_norm": 0.7339873313903809,
"learning_rate": 2.4191276645282053e-05,
"loss": 0.3532,
"step": 224000
},
{
"epoch": 0.51682683995657,
"grad_norm": 0.9264025092124939,
"learning_rate": 2.416822460068096e-05,
"loss": 0.4053,
"step": 224200
},
{
"epoch": 0.5172878808485919,
"grad_norm": 0.7314459681510925,
"learning_rate": 2.414517255607986e-05,
"loss": 0.3376,
"step": 224400
},
{
"epoch": 0.5177489217406138,
"grad_norm": 0.184389129281044,
"learning_rate": 2.4122120511478767e-05,
"loss": 0.3643,
"step": 224600
},
{
"epoch": 0.5182099626326357,
"grad_norm": 0.6588215827941895,
"learning_rate": 2.409906846687767e-05,
"loss": 0.3549,
"step": 224800
},
{
"epoch": 0.5186710035246577,
"grad_norm": 0.37764155864715576,
"learning_rate": 2.4076016422276575e-05,
"loss": 0.3305,
"step": 225000
},
{
"epoch": 0.5191320444166795,
"grad_norm": 0.1763819456100464,
"learning_rate": 2.405296437767548e-05,
"loss": 0.3311,
"step": 225200
},
{
"epoch": 0.5195930853087014,
"grad_norm": 0.26624786853790283,
"learning_rate": 2.4029912333074383e-05,
"loss": 0.3688,
"step": 225400
},
{
"epoch": 0.5200541262007233,
"grad_norm": 0.6269540786743164,
"learning_rate": 2.400686028847329e-05,
"loss": 0.3835,
"step": 225600
},
{
"epoch": 0.5205151670927453,
"grad_norm": 0.22113919258117676,
"learning_rate": 2.398380824387219e-05,
"loss": 0.3678,
"step": 225800
},
{
"epoch": 0.5209762079847672,
"grad_norm": 1.164993405342102,
"learning_rate": 2.3960756199271097e-05,
"loss": 0.3185,
"step": 226000
},
{
"epoch": 0.5214372488767891,
"grad_norm": 0.6420906186103821,
"learning_rate": 2.393770415467e-05,
"loss": 0.3369,
"step": 226200
},
{
"epoch": 0.521898289768811,
"grad_norm": 0.41654080152511597,
"learning_rate": 2.39146521100689e-05,
"loss": 0.3131,
"step": 226400
},
{
"epoch": 0.522359330660833,
"grad_norm": 0.35806384682655334,
"learning_rate": 2.389160006546781e-05,
"loss": 0.3558,
"step": 226600
},
{
"epoch": 0.5228203715528549,
"grad_norm": 0.5440140962600708,
"learning_rate": 2.3868548020866712e-05,
"loss": 0.3477,
"step": 226800
},
{
"epoch": 0.5232814124448768,
"grad_norm": 0.5610035061836243,
"learning_rate": 2.3845495976265618e-05,
"loss": 0.3358,
"step": 227000
},
{
"epoch": 0.5237424533368987,
"grad_norm": 0.28187263011932373,
"learning_rate": 2.3822559191887524e-05,
"loss": 0.3599,
"step": 227200
},
{
"epoch": 0.5242034942289207,
"grad_norm": 0.5849646925926208,
"learning_rate": 2.379950714728643e-05,
"loss": 0.3884,
"step": 227400
},
{
"epoch": 0.5246645351209426,
"grad_norm": 0.34255972504615784,
"learning_rate": 2.3776455102685332e-05,
"loss": 0.3524,
"step": 227600
},
{
"epoch": 0.5251255760129645,
"grad_norm": 1.5666743516921997,
"learning_rate": 2.3753403058084238e-05,
"loss": 0.3348,
"step": 227800
},
{
"epoch": 0.5255866169049864,
"grad_norm": 0.4880357086658478,
"learning_rate": 2.3730351013483143e-05,
"loss": 0.3285,
"step": 228000
},
{
"epoch": 0.5260476577970084,
"grad_norm": 0.403134286403656,
"learning_rate": 2.3707298968882046e-05,
"loss": 0.364,
"step": 228200
},
{
"epoch": 0.5265086986890303,
"grad_norm": 0.4551350474357605,
"learning_rate": 2.368424692428095e-05,
"loss": 0.3532,
"step": 228400
},
{
"epoch": 0.5269697395810521,
"grad_norm": 0.4039537012577057,
"learning_rate": 2.3661194879679854e-05,
"loss": 0.3438,
"step": 228600
},
{
"epoch": 0.527430780473074,
"grad_norm": 0.36679184436798096,
"learning_rate": 2.363814283507876e-05,
"loss": 0.369,
"step": 228800
},
{
"epoch": 0.5278918213650959,
"grad_norm": 0.6764769554138184,
"learning_rate": 2.361509079047766e-05,
"loss": 0.3655,
"step": 229000
},
{
"epoch": 0.5283528622571179,
"grad_norm": 0.562163233757019,
"learning_rate": 2.3592038745876567e-05,
"loss": 0.3294,
"step": 229200
},
{
"epoch": 0.5288139031491398,
"grad_norm": 1.8040056228637695,
"learning_rate": 2.3569101961498476e-05,
"loss": 0.3375,
"step": 229400
},
{
"epoch": 0.5292749440411617,
"grad_norm": 0.3456130027770996,
"learning_rate": 2.3546049916897382e-05,
"loss": 0.3603,
"step": 229600
},
{
"epoch": 0.5297359849331836,
"grad_norm": 0.5677599310874939,
"learning_rate": 2.3522997872296284e-05,
"loss": 0.3942,
"step": 229800
},
{
"epoch": 0.5301970258252056,
"grad_norm": 0.34969085454940796,
"learning_rate": 2.3500061087918194e-05,
"loss": 0.3699,
"step": 230000
},
{
"epoch": 0.5306580667172275,
"grad_norm": 0.23239371180534363,
"learning_rate": 2.3477009043317096e-05,
"loss": 0.3459,
"step": 230200
},
{
"epoch": 0.5311191076092494,
"grad_norm": 0.21217826008796692,
"learning_rate": 2.3453956998716005e-05,
"loss": 0.3744,
"step": 230400
},
{
"epoch": 0.5315801485012713,
"grad_norm": 0.5926826596260071,
"learning_rate": 2.3430904954114907e-05,
"loss": 0.3403,
"step": 230600
},
{
"epoch": 0.5320411893932933,
"grad_norm": 0.1639653891324997,
"learning_rate": 2.340785290951381e-05,
"loss": 0.3301,
"step": 230800
},
{
"epoch": 0.5325022302853152,
"grad_norm": 0.186824232339859,
"learning_rate": 2.3384800864912715e-05,
"loss": 0.3454,
"step": 231000
},
{
"epoch": 0.5329632711773371,
"grad_norm": 0.33217012882232666,
"learning_rate": 2.3361748820311617e-05,
"loss": 0.3633,
"step": 231200
},
{
"epoch": 0.533424312069359,
"grad_norm": 0.18646745383739471,
"learning_rate": 2.3338696775710523e-05,
"loss": 0.3456,
"step": 231400
},
{
"epoch": 0.533885352961381,
"grad_norm": 0.6676353812217712,
"learning_rate": 2.3315644731109425e-05,
"loss": 0.3685,
"step": 231600
},
{
"epoch": 0.5343463938534029,
"grad_norm": 0.18490371108055115,
"learning_rate": 2.329259268650833e-05,
"loss": 0.3482,
"step": 231800
},
{
"epoch": 0.5348074347454247,
"grad_norm": 0.3708191215991974,
"learning_rate": 2.3269540641907233e-05,
"loss": 0.3458,
"step": 232000
},
{
"epoch": 0.5352684756374466,
"grad_norm": 0.6311525702476501,
"learning_rate": 2.324648859730614e-05,
"loss": 0.3915,
"step": 232200
},
{
"epoch": 0.5357295165294685,
"grad_norm": 0.16813811659812927,
"learning_rate": 2.3223436552705045e-05,
"loss": 0.3629,
"step": 232400
},
{
"epoch": 0.5361905574214905,
"grad_norm": 0.5054236054420471,
"learning_rate": 2.3200384508103947e-05,
"loss": 0.378,
"step": 232600
},
{
"epoch": 0.5366515983135124,
"grad_norm": 0.4837896525859833,
"learning_rate": 2.3177332463502853e-05,
"loss": 0.3627,
"step": 232800
},
{
"epoch": 0.5371126392055343,
"grad_norm": 0.6555543541908264,
"learning_rate": 2.3154280418901755e-05,
"loss": 0.3386,
"step": 233000
},
{
"epoch": 0.5375736800975562,
"grad_norm": 0.3020295798778534,
"learning_rate": 2.313122837430066e-05,
"loss": 0.3384,
"step": 233200
},
{
"epoch": 0.5380347209895782,
"grad_norm": 0.7077879905700684,
"learning_rate": 2.3108176329699563e-05,
"loss": 0.3504,
"step": 233400
},
{
"epoch": 0.5384957618816001,
"grad_norm": 0.3409593999385834,
"learning_rate": 2.3085124285098465e-05,
"loss": 0.3971,
"step": 233600
},
{
"epoch": 0.538956802773622,
"grad_norm": 0.7338614463806152,
"learning_rate": 2.3062072240497374e-05,
"loss": 0.3386,
"step": 233800
},
{
"epoch": 0.5394178436656439,
"grad_norm": 0.47513458132743835,
"learning_rate": 2.3039020195896277e-05,
"loss": 0.3627,
"step": 234000
},
{
"epoch": 0.5398788845576659,
"grad_norm": 0.7971121668815613,
"learning_rate": 2.3015968151295182e-05,
"loss": 0.3451,
"step": 234200
},
{
"epoch": 0.5403399254496878,
"grad_norm": 0.36057165265083313,
"learning_rate": 2.2992916106694085e-05,
"loss": 0.3373,
"step": 234400
},
{
"epoch": 0.5408009663417097,
"grad_norm": 0.328387051820755,
"learning_rate": 2.2969864062092987e-05,
"loss": 0.3457,
"step": 234600
},
{
"epoch": 0.5412620072337316,
"grad_norm": 0.36626243591308594,
"learning_rate": 2.2946812017491892e-05,
"loss": 0.3243,
"step": 234800
},
{
"epoch": 0.5417230481257536,
"grad_norm": 0.703411340713501,
"learning_rate": 2.2923759972890795e-05,
"loss": 0.3601,
"step": 235000
},
{
"epoch": 0.5421840890177755,
"grad_norm": 0.14761626720428467,
"learning_rate": 2.29007079282897e-05,
"loss": 0.3622,
"step": 235200
},
{
"epoch": 0.5426451299097973,
"grad_norm": 0.24205593764781952,
"learning_rate": 2.2877655883688606e-05,
"loss": 0.352,
"step": 235400
},
{
"epoch": 0.5431061708018192,
"grad_norm": 0.3818623423576355,
"learning_rate": 2.285460383908751e-05,
"loss": 0.3355,
"step": 235600
},
{
"epoch": 0.5435672116938411,
"grad_norm": 0.38053658604621887,
"learning_rate": 2.2831551794486414e-05,
"loss": 0.3757,
"step": 235800
},
{
"epoch": 0.5440282525858631,
"grad_norm": 1.4526900053024292,
"learning_rate": 2.2808499749885316e-05,
"loss": 0.3516,
"step": 236000
},
{
"epoch": 0.544489293477885,
"grad_norm": 0.6705203652381897,
"learning_rate": 2.2785447705284222e-05,
"loss": 0.3874,
"step": 236200
},
{
"epoch": 0.5449503343699069,
"grad_norm": 1.1619576215744019,
"learning_rate": 2.2762395660683124e-05,
"loss": 0.3557,
"step": 236400
},
{
"epoch": 0.5454113752619288,
"grad_norm": 0.22762836515903473,
"learning_rate": 2.2739458876305034e-05,
"loss": 0.3626,
"step": 236600
},
{
"epoch": 0.5458724161539508,
"grad_norm": 0.3335479497909546,
"learning_rate": 2.271640683170394e-05,
"loss": 0.3846,
"step": 236800
},
{
"epoch": 0.5463334570459727,
"grad_norm": 0.15371176600456238,
"learning_rate": 2.269347004732585e-05,
"loss": 0.3413,
"step": 237000
},
{
"epoch": 0.5467944979379946,
"grad_norm": 2.1034395694732666,
"learning_rate": 2.2670418002724754e-05,
"loss": 0.3586,
"step": 237200
},
{
"epoch": 0.5472555388300165,
"grad_norm": 0.5972880721092224,
"learning_rate": 2.2647365958123656e-05,
"loss": 0.3768,
"step": 237400
},
{
"epoch": 0.5477165797220385,
"grad_norm": 0.5265405178070068,
"learning_rate": 2.262431391352256e-05,
"loss": 0.3541,
"step": 237600
},
{
"epoch": 0.5481776206140604,
"grad_norm": 0.16690166294574738,
"learning_rate": 2.2601261868921464e-05,
"loss": 0.382,
"step": 237800
},
{
"epoch": 0.5486386615060823,
"grad_norm": 0.3716908395290375,
"learning_rate": 2.257820982432037e-05,
"loss": 0.3358,
"step": 238000
},
{
"epoch": 0.5490997023981042,
"grad_norm": 0.663103461265564,
"learning_rate": 2.2555157779719276e-05,
"loss": 0.3404,
"step": 238200
},
{
"epoch": 0.5495607432901262,
"grad_norm": 0.37385207414627075,
"learning_rate": 2.2532105735118178e-05,
"loss": 0.3701,
"step": 238400
},
{
"epoch": 0.5500217841821481,
"grad_norm": 0.41825392842292786,
"learning_rate": 2.2509168950740087e-05,
"loss": 0.3319,
"step": 238600
},
{
"epoch": 0.5504828250741699,
"grad_norm": 0.2013470083475113,
"learning_rate": 2.248611690613899e-05,
"loss": 0.3486,
"step": 238800
},
{
"epoch": 0.5509438659661918,
"grad_norm": 0.3899303376674652,
"learning_rate": 2.2463064861537895e-05,
"loss": 0.3422,
"step": 239000
},
{
"epoch": 0.5514049068582138,
"grad_norm": 0.6563234925270081,
"learning_rate": 2.2440012816936797e-05,
"loss": 0.3687,
"step": 239200
},
{
"epoch": 0.5518659477502357,
"grad_norm": 0.3409351110458374,
"learning_rate": 2.2416960772335703e-05,
"loss": 0.3556,
"step": 239400
},
{
"epoch": 0.5523269886422576,
"grad_norm": 0.6538074612617493,
"learning_rate": 2.239390872773461e-05,
"loss": 0.3613,
"step": 239600
},
{
"epoch": 0.5527880295342795,
"grad_norm": 0.3599226772785187,
"learning_rate": 2.237085668313351e-05,
"loss": 0.356,
"step": 239800
},
{
"epoch": 0.5532490704263014,
"grad_norm": 0.655312180519104,
"learning_rate": 2.2347804638532417e-05,
"loss": 0.3705,
"step": 240000
},
{
"epoch": 0.5537101113183234,
"grad_norm": 0.3415062725543976,
"learning_rate": 2.232475259393132e-05,
"loss": 0.3574,
"step": 240200
},
{
"epoch": 0.5541711522103453,
"grad_norm": 0.21841752529144287,
"learning_rate": 2.2301700549330225e-05,
"loss": 0.3384,
"step": 240400
},
{
"epoch": 0.5546321931023672,
"grad_norm": 0.5347087979316711,
"learning_rate": 2.2278648504729127e-05,
"loss": 0.3833,
"step": 240600
},
{
"epoch": 0.5550932339943891,
"grad_norm": 0.4798644483089447,
"learning_rate": 2.225559646012803e-05,
"loss": 0.3587,
"step": 240800
},
{
"epoch": 0.5555542748864111,
"grad_norm": 0.3243103325366974,
"learning_rate": 2.223254441552694e-05,
"loss": 0.36,
"step": 241000
},
{
"epoch": 0.556015315778433,
"grad_norm": 0.2395264357328415,
"learning_rate": 2.2209607631148848e-05,
"loss": 0.3705,
"step": 241200
},
{
"epoch": 0.5564763566704549,
"grad_norm": 0.445444256067276,
"learning_rate": 2.218655558654775e-05,
"loss": 0.3345,
"step": 241400
},
{
"epoch": 0.5569373975624768,
"grad_norm": 0.37865936756134033,
"learning_rate": 2.2163503541946652e-05,
"loss": 0.3447,
"step": 241600
},
{
"epoch": 0.5573984384544988,
"grad_norm": 0.6776481866836548,
"learning_rate": 2.2140451497345558e-05,
"loss": 0.3658,
"step": 241800
},
{
"epoch": 0.5578594793465207,
"grad_norm": 0.7007972598075867,
"learning_rate": 2.211739945274446e-05,
"loss": 0.3518,
"step": 242000
},
{
"epoch": 0.5583205202385425,
"grad_norm": 0.2781836688518524,
"learning_rate": 2.209434740814337e-05,
"loss": 0.3575,
"step": 242200
},
{
"epoch": 0.5587815611305644,
"grad_norm": 0.2978268265724182,
"learning_rate": 2.207129536354227e-05,
"loss": 0.3476,
"step": 242400
},
{
"epoch": 0.5592426020225864,
"grad_norm": 0.5814823508262634,
"learning_rate": 2.2048243318941174e-05,
"loss": 0.3442,
"step": 242600
},
{
"epoch": 0.5597036429146083,
"grad_norm": 0.5247181057929993,
"learning_rate": 2.202519127434008e-05,
"loss": 0.3294,
"step": 242800
},
{
"epoch": 0.5601646838066302,
"grad_norm": 0.4408251941204071,
"learning_rate": 2.2002139229738982e-05,
"loss": 0.3616,
"step": 243000
},
{
"epoch": 0.5606257246986521,
"grad_norm": 0.160074383020401,
"learning_rate": 2.1979087185137887e-05,
"loss": 0.3804,
"step": 243200
},
{
"epoch": 0.561086765590674,
"grad_norm": 1.4818050861358643,
"learning_rate": 2.195603514053679e-05,
"loss": 0.378,
"step": 243400
},
{
"epoch": 0.561547806482696,
"grad_norm": 0.20598101615905762,
"learning_rate": 2.1932983095935695e-05,
"loss": 0.3354,
"step": 243600
},
{
"epoch": 0.5620088473747179,
"grad_norm": 0.18128018081188202,
"learning_rate": 2.1910046311557605e-05,
"loss": 0.3813,
"step": 243800
},
{
"epoch": 0.5624698882667398,
"grad_norm": 0.33038538694381714,
"learning_rate": 2.188699426695651e-05,
"loss": 0.3415,
"step": 244000
},
{
"epoch": 0.5629309291587618,
"grad_norm": 0.24530620872974396,
"learning_rate": 2.1863942222355413e-05,
"loss": 0.3693,
"step": 244200
},
{
"epoch": 0.5633919700507837,
"grad_norm": 0.26805493235588074,
"learning_rate": 2.1841005437977322e-05,
"loss": 0.3657,
"step": 244400
},
{
"epoch": 0.5638530109428056,
"grad_norm": 0.4422042965888977,
"learning_rate": 2.1817953393376224e-05,
"loss": 0.3703,
"step": 244600
},
{
"epoch": 0.5643140518348275,
"grad_norm": 0.4342189133167267,
"learning_rate": 2.1794901348775133e-05,
"loss": 0.3295,
"step": 244800
},
{
"epoch": 0.5647750927268494,
"grad_norm": 0.30248162150382996,
"learning_rate": 2.1771849304174035e-05,
"loss": 0.3688,
"step": 245000
},
{
"epoch": 0.5652361336188714,
"grad_norm": 0.8874296545982361,
"learning_rate": 2.174879725957294e-05,
"loss": 0.3462,
"step": 245200
},
{
"epoch": 0.5656971745108933,
"grad_norm": 0.3769371211528778,
"learning_rate": 2.1725745214971843e-05,
"loss": 0.33,
"step": 245400
},
{
"epoch": 0.5661582154029151,
"grad_norm": 0.6655548810958862,
"learning_rate": 2.1702693170370746e-05,
"loss": 0.3478,
"step": 245600
},
{
"epoch": 0.566619256294937,
"grad_norm": 0.5909347534179688,
"learning_rate": 2.167964112576965e-05,
"loss": 0.3314,
"step": 245800
},
{
"epoch": 0.567080297186959,
"grad_norm": 0.38579100370407104,
"learning_rate": 2.1656589081168554e-05,
"loss": 0.3441,
"step": 246000
},
{
"epoch": 0.5675413380789809,
"grad_norm": 0.3247869610786438,
"learning_rate": 2.163353703656746e-05,
"loss": 0.3496,
"step": 246200
},
{
"epoch": 0.5680023789710028,
"grad_norm": 0.7525120973587036,
"learning_rate": 2.161048499196636e-05,
"loss": 0.3644,
"step": 246400
},
{
"epoch": 0.5684634198630247,
"grad_norm": 0.6024205088615417,
"learning_rate": 2.1587432947365267e-05,
"loss": 0.3265,
"step": 246600
},
{
"epoch": 0.5689244607550467,
"grad_norm": 0.2748749256134033,
"learning_rate": 2.1564380902764173e-05,
"loss": 0.3415,
"step": 246800
},
{
"epoch": 0.5693855016470686,
"grad_norm": 0.5099011659622192,
"learning_rate": 2.1541328858163075e-05,
"loss": 0.3354,
"step": 247000
},
{
"epoch": 0.5698465425390905,
"grad_norm": 0.46670734882354736,
"learning_rate": 2.151827681356198e-05,
"loss": 0.3642,
"step": 247200
},
{
"epoch": 0.5703075834311124,
"grad_norm": 0.17403045296669006,
"learning_rate": 2.1495224768960883e-05,
"loss": 0.3143,
"step": 247400
},
{
"epoch": 0.5707686243231344,
"grad_norm": 0.4343968331813812,
"learning_rate": 2.147217272435979e-05,
"loss": 0.3552,
"step": 247600
},
{
"epoch": 0.5712296652151563,
"grad_norm": 0.3077774941921234,
"learning_rate": 2.144912067975869e-05,
"loss": 0.371,
"step": 247800
},
{
"epoch": 0.5716907061071782,
"grad_norm": 0.27224865555763245,
"learning_rate": 2.1426068635157593e-05,
"loss": 0.3295,
"step": 248000
},
{
"epoch": 0.5721517469992001,
"grad_norm": 0.16844475269317627,
"learning_rate": 2.1403016590556502e-05,
"loss": 0.3573,
"step": 248200
},
{
"epoch": 0.572612787891222,
"grad_norm": 0.3499503433704376,
"learning_rate": 2.1379964545955405e-05,
"loss": 0.3118,
"step": 248400
},
{
"epoch": 0.573073828783244,
"grad_norm": 0.16508857905864716,
"learning_rate": 2.1357027761577314e-05,
"loss": 0.3891,
"step": 248600
},
{
"epoch": 0.5735348696752659,
"grad_norm": 0.5639446973800659,
"learning_rate": 2.1333975716976216e-05,
"loss": 0.3474,
"step": 248800
},
{
"epoch": 0.5739959105672877,
"grad_norm": 0.4935666620731354,
"learning_rate": 2.1310923672375122e-05,
"loss": 0.359,
"step": 249000
},
{
"epoch": 0.5744569514593096,
"grad_norm": 0.19311217963695526,
"learning_rate": 2.1287871627774024e-05,
"loss": 0.3633,
"step": 249200
},
{
"epoch": 0.5749179923513316,
"grad_norm": 0.678378701210022,
"learning_rate": 2.1264819583172933e-05,
"loss": 0.352,
"step": 249400
},
{
"epoch": 0.5753790332433535,
"grad_norm": 0.425797700881958,
"learning_rate": 2.1241767538571836e-05,
"loss": 0.3342,
"step": 249600
},
{
"epoch": 0.5758400741353754,
"grad_norm": 0.6611183881759644,
"learning_rate": 2.1218715493970738e-05,
"loss": 0.3479,
"step": 249800
},
{
"epoch": 0.5763011150273973,
"grad_norm": 0.6298643350601196,
"learning_rate": 2.1195663449369644e-05,
"loss": 0.3454,
"step": 250000
},
{
"epoch": 0.5763011150273973,
"eval_loss": 0.35522738099098206,
"eval_runtime": 223.3987,
"eval_samples_per_second": 19.615,
"eval_steps_per_second": 19.615,
"step": 250000
},
{
"epoch": 0.5767621559194193,
"grad_norm": 0.3302873373031616,
"learning_rate": 2.1172611404768546e-05,
"loss": 0.332,
"step": 250200
},
{
"epoch": 0.5772231968114412,
"grad_norm": 0.3365733325481415,
"learning_rate": 2.1149674620390455e-05,
"loss": 0.355,
"step": 250400
},
{
"epoch": 0.5776842377034631,
"grad_norm": 0.5758484601974487,
"learning_rate": 2.112662257578936e-05,
"loss": 0.3411,
"step": 250600
},
{
"epoch": 0.578145278595485,
"grad_norm": 0.5052395462989807,
"learning_rate": 2.1103570531188266e-05,
"loss": 0.3722,
"step": 250800
},
{
"epoch": 0.578606319487507,
"grad_norm": 0.37034520506858826,
"learning_rate": 2.108051848658717e-05,
"loss": 0.3388,
"step": 251000
},
{
"epoch": 0.5790673603795289,
"grad_norm": 0.823411226272583,
"learning_rate": 2.1057466441986074e-05,
"loss": 0.355,
"step": 251200
},
{
"epoch": 0.5795284012715508,
"grad_norm": 0.9128584265708923,
"learning_rate": 2.1034414397384977e-05,
"loss": 0.3564,
"step": 251400
},
{
"epoch": 0.5799894421635727,
"grad_norm": 0.3455374538898468,
"learning_rate": 2.1011362352783882e-05,
"loss": 0.3322,
"step": 251600
},
{
"epoch": 0.5804504830555947,
"grad_norm": 0.7634621858596802,
"learning_rate": 2.0988310308182785e-05,
"loss": 0.38,
"step": 251800
},
{
"epoch": 0.5809115239476166,
"grad_norm": 0.42627963423728943,
"learning_rate": 2.0965258263581687e-05,
"loss": 0.3143,
"step": 252000
},
{
"epoch": 0.5813725648396385,
"grad_norm": 0.9239504933357239,
"learning_rate": 2.0942206218980593e-05,
"loss": 0.3617,
"step": 252200
},
{
"epoch": 0.5818336057316603,
"grad_norm": 0.42547696828842163,
"learning_rate": 2.0919154174379498e-05,
"loss": 0.3366,
"step": 252400
},
{
"epoch": 0.5822946466236822,
"grad_norm": 0.21705199778079987,
"learning_rate": 2.0896102129778404e-05,
"loss": 0.3644,
"step": 252600
},
{
"epoch": 0.5827556875157042,
"grad_norm": 0.3512963354587555,
"learning_rate": 2.0873050085177306e-05,
"loss": 0.3542,
"step": 252800
},
{
"epoch": 0.5832167284077261,
"grad_norm": 0.27958160638809204,
"learning_rate": 2.0849998040576212e-05,
"loss": 0.3861,
"step": 253000
},
{
"epoch": 0.583677769299748,
"grad_norm": 0.34779396653175354,
"learning_rate": 2.0826945995975114e-05,
"loss": 0.3681,
"step": 253200
},
{
"epoch": 0.5841388101917699,
"grad_norm": 0.7137684226036072,
"learning_rate": 2.0803893951374016e-05,
"loss": 0.3475,
"step": 253400
},
{
"epoch": 0.5845998510837919,
"grad_norm": 1.9228719472885132,
"learning_rate": 2.0780841906772922e-05,
"loss": 0.3275,
"step": 253600
},
{
"epoch": 0.5850608919758138,
"grad_norm": 0.32466796040534973,
"learning_rate": 2.0757789862171824e-05,
"loss": 0.3225,
"step": 253800
},
{
"epoch": 0.5855219328678357,
"grad_norm": 0.23439137637615204,
"learning_rate": 2.073473781757073e-05,
"loss": 0.3916,
"step": 254000
},
{
"epoch": 0.5859829737598576,
"grad_norm": 0.6748563647270203,
"learning_rate": 2.0711685772969636e-05,
"loss": 0.3349,
"step": 254200
},
{
"epoch": 0.5864440146518796,
"grad_norm": 0.6619309782981873,
"learning_rate": 2.0688633728368538e-05,
"loss": 0.3694,
"step": 254400
},
{
"epoch": 0.5869050555439015,
"grad_norm": 0.5003235340118408,
"learning_rate": 2.0665581683767444e-05,
"loss": 0.3308,
"step": 254600
},
{
"epoch": 0.5873660964359234,
"grad_norm": 0.08772457391023636,
"learning_rate": 2.0642529639166346e-05,
"loss": 0.3203,
"step": 254800
},
{
"epoch": 0.5878271373279453,
"grad_norm": 0.939509928226471,
"learning_rate": 2.0619477594565252e-05,
"loss": 0.3432,
"step": 255000
},
{
"epoch": 0.5882881782199673,
"grad_norm": 0.5043662190437317,
"learning_rate": 2.0596425549964154e-05,
"loss": 0.3345,
"step": 255200
},
{
"epoch": 0.5887492191119892,
"grad_norm": 0.3611208200454712,
"learning_rate": 2.057337350536306e-05,
"loss": 0.3082,
"step": 255400
},
{
"epoch": 0.5892102600040111,
"grad_norm": 0.9913475513458252,
"learning_rate": 2.0550321460761962e-05,
"loss": 0.3548,
"step": 255600
},
{
"epoch": 0.5896713008960329,
"grad_norm": 0.3058832585811615,
"learning_rate": 2.0527269416160868e-05,
"loss": 0.3443,
"step": 255800
},
{
"epoch": 0.5901323417880548,
"grad_norm": 0.4234495759010315,
"learning_rate": 2.0504217371559773e-05,
"loss": 0.3319,
"step": 256000
},
{
"epoch": 0.5905933826800768,
"grad_norm": 0.5669376254081726,
"learning_rate": 2.0481165326958676e-05,
"loss": 0.4187,
"step": 256200
},
{
"epoch": 0.5910544235720987,
"grad_norm": 0.3601876199245453,
"learning_rate": 2.045811328235758e-05,
"loss": 0.3278,
"step": 256400
},
{
"epoch": 0.5915154644641206,
"grad_norm": 0.30421602725982666,
"learning_rate": 2.0435061237756484e-05,
"loss": 0.344,
"step": 256600
},
{
"epoch": 0.5919765053561425,
"grad_norm": 0.19996659457683563,
"learning_rate": 2.0412009193155386e-05,
"loss": 0.3191,
"step": 256800
},
{
"epoch": 0.5924375462481645,
"grad_norm": 0.3626260757446289,
"learning_rate": 2.038895714855429e-05,
"loss": 0.3599,
"step": 257000
},
{
"epoch": 0.5928985871401864,
"grad_norm": 0.8263654112815857,
"learning_rate": 2.0365905103953194e-05,
"loss": 0.4095,
"step": 257200
},
{
"epoch": 0.5933596280322083,
"grad_norm": 0.5469064116477966,
"learning_rate": 2.03428530593521e-05,
"loss": 0.3265,
"step": 257400
},
{
"epoch": 0.5938206689242302,
"grad_norm": 0.6776677966117859,
"learning_rate": 2.0319801014751005e-05,
"loss": 0.3639,
"step": 257600
},
{
"epoch": 0.5942817098162522,
"grad_norm": 0.43688419461250305,
"learning_rate": 2.029674897014991e-05,
"loss": 0.3872,
"step": 257800
},
{
"epoch": 0.5947427507082741,
"grad_norm": 0.31415465474128723,
"learning_rate": 2.0273927445994824e-05,
"loss": 0.3296,
"step": 258000
},
{
"epoch": 0.595203791600296,
"grad_norm": 0.34074509143829346,
"learning_rate": 2.0250875401393726e-05,
"loss": 0.3471,
"step": 258200
},
{
"epoch": 0.5956648324923179,
"grad_norm": 0.39756181836128235,
"learning_rate": 2.022793861701564e-05,
"loss": 0.3595,
"step": 258400
},
{
"epoch": 0.5961258733843399,
"grad_norm": 0.5300395488739014,
"learning_rate": 2.020488657241454e-05,
"loss": 0.3569,
"step": 258600
},
{
"epoch": 0.5965869142763618,
"grad_norm": 0.1204846054315567,
"learning_rate": 2.0181834527813446e-05,
"loss": 0.3552,
"step": 258800
},
{
"epoch": 0.5970479551683837,
"grad_norm": 0.6028696894645691,
"learning_rate": 2.015878248321235e-05,
"loss": 0.366,
"step": 259000
},
{
"epoch": 0.5975089960604055,
"grad_norm": 0.6177276968955994,
"learning_rate": 2.0135730438611254e-05,
"loss": 0.3559,
"step": 259200
},
{
"epoch": 0.5979700369524275,
"grad_norm": 0.1689945012331009,
"learning_rate": 2.0112678394010157e-05,
"loss": 0.3374,
"step": 259400
},
{
"epoch": 0.5984310778444494,
"grad_norm": 0.5522930026054382,
"learning_rate": 2.0089626349409062e-05,
"loss": 0.3193,
"step": 259600
},
{
"epoch": 0.5988921187364713,
"grad_norm": 0.7787148356437683,
"learning_rate": 2.0066574304807968e-05,
"loss": 0.3276,
"step": 259800
},
{
"epoch": 0.5993531596284932,
"grad_norm": 0.5511178970336914,
"learning_rate": 2.004352226020687e-05,
"loss": 0.3683,
"step": 260000
},
{
"epoch": 0.5998142005205152,
"grad_norm": 0.6916815638542175,
"learning_rate": 2.0020470215605776e-05,
"loss": 0.3789,
"step": 260200
},
{
"epoch": 0.6002752414125371,
"grad_norm": 0.7070001363754272,
"learning_rate": 1.9997418171004678e-05,
"loss": 0.3558,
"step": 260400
},
{
"epoch": 0.600736282304559,
"grad_norm": 0.6469032764434814,
"learning_rate": 1.997436612640358e-05,
"loss": 0.3496,
"step": 260600
},
{
"epoch": 0.6011973231965809,
"grad_norm": 0.6180225014686584,
"learning_rate": 1.9951314081802486e-05,
"loss": 0.3359,
"step": 260800
},
{
"epoch": 0.6016583640886028,
"grad_norm": 0.2272895723581314,
"learning_rate": 1.992826203720139e-05,
"loss": 0.3209,
"step": 261000
},
{
"epoch": 0.6021194049806248,
"grad_norm": 0.5025333166122437,
"learning_rate": 1.99053252528233e-05,
"loss": 0.385,
"step": 261200
},
{
"epoch": 0.6025804458726467,
"grad_norm": 0.6692554354667664,
"learning_rate": 1.9882273208222203e-05,
"loss": 0.3871,
"step": 261400
},
{
"epoch": 0.6030414867646686,
"grad_norm": 0.4041476547718048,
"learning_rate": 1.985922116362111e-05,
"loss": 0.3439,
"step": 261600
},
{
"epoch": 0.6035025276566905,
"grad_norm": 0.25086113810539246,
"learning_rate": 1.983616911902001e-05,
"loss": 0.338,
"step": 261800
},
{
"epoch": 0.6039635685487125,
"grad_norm": 0.4528360366821289,
"learning_rate": 1.9813117074418917e-05,
"loss": 0.3323,
"step": 262000
},
{
"epoch": 0.6044246094407344,
"grad_norm": 0.18659475445747375,
"learning_rate": 1.979006502981782e-05,
"loss": 0.3389,
"step": 262200
},
{
"epoch": 0.6048856503327562,
"grad_norm": 0.7832996845245361,
"learning_rate": 1.9767012985216725e-05,
"loss": 0.3366,
"step": 262400
},
{
"epoch": 0.6053466912247781,
"grad_norm": 0.40535497665405273,
"learning_rate": 1.974396094061563e-05,
"loss": 0.3432,
"step": 262600
},
{
"epoch": 0.6058077321168001,
"grad_norm": 0.8284438848495483,
"learning_rate": 1.9720908896014533e-05,
"loss": 0.3551,
"step": 262800
},
{
"epoch": 0.606268773008822,
"grad_norm": 0.6948146224021912,
"learning_rate": 1.969785685141344e-05,
"loss": 0.3009,
"step": 263000
},
{
"epoch": 0.6067298139008439,
"grad_norm": 0.4383789896965027,
"learning_rate": 1.967480480681234e-05,
"loss": 0.312,
"step": 263200
},
{
"epoch": 0.6071908547928658,
"grad_norm": 0.19989456236362457,
"learning_rate": 1.9651752762211247e-05,
"loss": 0.3654,
"step": 263400
},
{
"epoch": 0.6076518956848878,
"grad_norm": 0.20686183869838715,
"learning_rate": 1.962870071761015e-05,
"loss": 0.3405,
"step": 263600
},
{
"epoch": 0.6081129365769097,
"grad_norm": 0.29917094111442566,
"learning_rate": 1.960564867300905e-05,
"loss": 0.3437,
"step": 263800
},
{
"epoch": 0.6085739774689316,
"grad_norm": 0.19797885417938232,
"learning_rate": 1.9582596628407957e-05,
"loss": 0.3454,
"step": 264000
},
{
"epoch": 0.6090350183609535,
"grad_norm": 0.6084311008453369,
"learning_rate": 1.9559544583806863e-05,
"loss": 0.338,
"step": 264200
},
{
"epoch": 0.6094960592529755,
"grad_norm": 0.5069258809089661,
"learning_rate": 1.9536492539205768e-05,
"loss": 0.3492,
"step": 264400
},
{
"epoch": 0.6099571001449974,
"grad_norm": 0.20755960047245026,
"learning_rate": 1.951344049460467e-05,
"loss": 0.3348,
"step": 264600
},
{
"epoch": 0.6104181410370193,
"grad_norm": 0.3054589331150055,
"learning_rate": 1.9490388450003573e-05,
"loss": 0.2915,
"step": 264800
},
{
"epoch": 0.6108791819290412,
"grad_norm": 0.24750877916812897,
"learning_rate": 1.946733640540248e-05,
"loss": 0.3264,
"step": 265000
},
{
"epoch": 0.6113402228210632,
"grad_norm": 0.4821327030658722,
"learning_rate": 1.944428436080138e-05,
"loss": 0.3519,
"step": 265200
},
{
"epoch": 0.6118012637130851,
"grad_norm": 0.2095753401517868,
"learning_rate": 1.9421232316200286e-05,
"loss": 0.3379,
"step": 265400
},
{
"epoch": 0.612262304605107,
"grad_norm": 0.6056123375892639,
"learning_rate": 1.939818027159919e-05,
"loss": 0.3425,
"step": 265600
},
{
"epoch": 0.6127233454971288,
"grad_norm": 0.6389504671096802,
"learning_rate": 1.9375128226998094e-05,
"loss": 0.3508,
"step": 265800
},
{
"epoch": 0.6131843863891507,
"grad_norm": 0.7660078406333923,
"learning_rate": 1.9352191442620004e-05,
"loss": 0.3851,
"step": 266000
},
{
"epoch": 0.6136454272811727,
"grad_norm": 0.5258800387382507,
"learning_rate": 1.932913939801891e-05,
"loss": 0.3492,
"step": 266200
},
{
"epoch": 0.6141064681731946,
"grad_norm": 0.6075520515441895,
"learning_rate": 1.930608735341781e-05,
"loss": 0.3734,
"step": 266400
},
{
"epoch": 0.6145675090652165,
"grad_norm": 0.17170804738998413,
"learning_rate": 1.9283035308816717e-05,
"loss": 0.3496,
"step": 266600
},
{
"epoch": 0.6150285499572384,
"grad_norm": 0.2692940831184387,
"learning_rate": 1.925998326421562e-05,
"loss": 0.3072,
"step": 266800
},
{
"epoch": 0.6154895908492604,
"grad_norm": 0.609910786151886,
"learning_rate": 1.9236931219614522e-05,
"loss": 0.3492,
"step": 267000
},
{
"epoch": 0.6159506317412823,
"grad_norm": 0.5418997406959534,
"learning_rate": 1.921387917501343e-05,
"loss": 0.3085,
"step": 267200
},
{
"epoch": 0.6164116726333042,
"grad_norm": 0.19821816682815552,
"learning_rate": 1.9190827130412333e-05,
"loss": 0.3292,
"step": 267400
},
{
"epoch": 0.6168727135253261,
"grad_norm": 0.497954398393631,
"learning_rate": 1.916777508581124e-05,
"loss": 0.3311,
"step": 267600
},
{
"epoch": 0.6173337544173481,
"grad_norm": 0.3816941976547241,
"learning_rate": 1.914472304121014e-05,
"loss": 0.3401,
"step": 267800
},
{
"epoch": 0.61779479530937,
"grad_norm": 0.27292588353157043,
"learning_rate": 1.9121670996609047e-05,
"loss": 0.3438,
"step": 268000
},
{
"epoch": 0.6182558362013919,
"grad_norm": 0.7247112989425659,
"learning_rate": 1.909861895200795e-05,
"loss": 0.3413,
"step": 268200
},
{
"epoch": 0.6187168770934138,
"grad_norm": 0.283292680978775,
"learning_rate": 1.907556690740685e-05,
"loss": 0.3419,
"step": 268400
},
{
"epoch": 0.6191779179854358,
"grad_norm": 0.5497933626174927,
"learning_rate": 1.9052514862805757e-05,
"loss": 0.3034,
"step": 268600
},
{
"epoch": 0.6196389588774577,
"grad_norm": 0.47790005803108215,
"learning_rate": 1.902946281820466e-05,
"loss": 0.3425,
"step": 268800
},
{
"epoch": 0.6200999997694796,
"grad_norm": 0.3246474266052246,
"learning_rate": 1.900641077360357e-05,
"loss": 0.3357,
"step": 269000
},
{
"epoch": 0.6205610406615014,
"grad_norm": 0.2585969865322113,
"learning_rate": 1.898335872900247e-05,
"loss": 0.3227,
"step": 269200
},
{
"epoch": 0.6210220815535233,
"grad_norm": 0.5520527958869934,
"learning_rate": 1.8960306684401373e-05,
"loss": 0.3501,
"step": 269400
},
{
"epoch": 0.6214831224455453,
"grad_norm": 0.7343946099281311,
"learning_rate": 1.893725463980028e-05,
"loss": 0.3436,
"step": 269600
},
{
"epoch": 0.6219441633375672,
"grad_norm": 0.3210945725440979,
"learning_rate": 1.891420259519918e-05,
"loss": 0.3152,
"step": 269800
},
{
"epoch": 0.6224052042295891,
"grad_norm": 0.8530191779136658,
"learning_rate": 1.8891150550598087e-05,
"loss": 0.3823,
"step": 270000
},
{
"epoch": 0.622866245121611,
"grad_norm": 0.31033867597579956,
"learning_rate": 1.8868329026443003e-05,
"loss": 0.3261,
"step": 270200
},
{
"epoch": 0.623327286013633,
"grad_norm": 1.5595014095306396,
"learning_rate": 1.8845276981841905e-05,
"loss": 0.3501,
"step": 270400
},
{
"epoch": 0.6237883269056549,
"grad_norm": 0.4385709762573242,
"learning_rate": 1.882222493724081e-05,
"loss": 0.3177,
"step": 270600
},
{
"epoch": 0.6242493677976768,
"grad_norm": 0.5608983039855957,
"learning_rate": 1.8799172892639713e-05,
"loss": 0.3314,
"step": 270800
},
{
"epoch": 0.6247104086896987,
"grad_norm": 0.39087677001953125,
"learning_rate": 1.8776120848038615e-05,
"loss": 0.3407,
"step": 271000
},
{
"epoch": 0.6251714495817207,
"grad_norm": 0.3214382529258728,
"learning_rate": 1.875306880343752e-05,
"loss": 0.3623,
"step": 271200
},
{
"epoch": 0.6256324904737426,
"grad_norm": 0.38440006971359253,
"learning_rate": 1.8730016758836427e-05,
"loss": 0.3189,
"step": 271400
},
{
"epoch": 0.6260935313657645,
"grad_norm": 0.700528085231781,
"learning_rate": 1.8706964714235332e-05,
"loss": 0.3526,
"step": 271600
},
{
"epoch": 0.6265545722577864,
"grad_norm": 0.2343386709690094,
"learning_rate": 1.8683912669634235e-05,
"loss": 0.3662,
"step": 271800
},
{
"epoch": 0.6270156131498084,
"grad_norm": 0.18461963534355164,
"learning_rate": 1.8660860625033137e-05,
"loss": 0.3564,
"step": 272000
},
{
"epoch": 0.6274766540418303,
"grad_norm": 0.32862597703933716,
"learning_rate": 1.8637808580432043e-05,
"loss": 0.3444,
"step": 272200
},
{
"epoch": 0.6279376949338522,
"grad_norm": 0.6182105541229248,
"learning_rate": 1.8614756535830945e-05,
"loss": 0.362,
"step": 272400
},
{
"epoch": 0.628398735825874,
"grad_norm": 0.24440090358257294,
"learning_rate": 1.8591819751452854e-05,
"loss": 0.338,
"step": 272600
},
{
"epoch": 0.628859776717896,
"grad_norm": 0.4330989718437195,
"learning_rate": 1.856876770685176e-05,
"loss": 0.3412,
"step": 272800
},
{
"epoch": 0.6293208176099179,
"grad_norm": 0.46450743079185486,
"learning_rate": 1.8545715662250665e-05,
"loss": 0.3651,
"step": 273000
},
{
"epoch": 0.6297818585019398,
"grad_norm": 0.5325597524642944,
"learning_rate": 1.8522663617649568e-05,
"loss": 0.3657,
"step": 273200
},
{
"epoch": 0.6302428993939617,
"grad_norm": 0.5117043256759644,
"learning_rate": 1.8499611573048473e-05,
"loss": 0.3488,
"step": 273400
},
{
"epoch": 0.6307039402859836,
"grad_norm": 0.8107678294181824,
"learning_rate": 1.8476559528447376e-05,
"loss": 0.3549,
"step": 273600
},
{
"epoch": 0.6311649811780056,
"grad_norm": 0.2212335169315338,
"learning_rate": 1.845350748384628e-05,
"loss": 0.3951,
"step": 273800
},
{
"epoch": 0.6316260220700275,
"grad_norm": 0.40533021092414856,
"learning_rate": 1.8430455439245184e-05,
"loss": 0.3285,
"step": 274000
},
{
"epoch": 0.6320870629620494,
"grad_norm": 0.42866331338882446,
"learning_rate": 1.840740339464409e-05,
"loss": 0.3469,
"step": 274200
},
{
"epoch": 0.6325481038540713,
"grad_norm": 0.197109192609787,
"learning_rate": 1.8384351350042995e-05,
"loss": 0.3117,
"step": 274400
},
{
"epoch": 0.6330091447460933,
"grad_norm": 0.2393321990966797,
"learning_rate": 1.8361299305441897e-05,
"loss": 0.3373,
"step": 274600
},
{
"epoch": 0.6334701856381152,
"grad_norm": 0.3973337411880493,
"learning_rate": 1.8338247260840803e-05,
"loss": 0.3274,
"step": 274800
},
{
"epoch": 0.6339312265301371,
"grad_norm": 0.11754538118839264,
"learning_rate": 1.831531047646271e-05,
"loss": 0.3895,
"step": 275000
},
{
"epoch": 0.634392267422159,
"grad_norm": 0.4234245717525482,
"learning_rate": 1.8292258431861614e-05,
"loss": 0.3814,
"step": 275200
},
{
"epoch": 0.634853308314181,
"grad_norm": 0.22934797406196594,
"learning_rate": 1.8269206387260517e-05,
"loss": 0.3123,
"step": 275400
},
{
"epoch": 0.6353143492062029,
"grad_norm": 0.34416547417640686,
"learning_rate": 1.8246154342659422e-05,
"loss": 0.3331,
"step": 275600
},
{
"epoch": 0.6357753900982248,
"grad_norm": 0.5943341851234436,
"learning_rate": 1.8223102298058328e-05,
"loss": 0.3385,
"step": 275800
},
{
"epoch": 0.6362364309902466,
"grad_norm": 0.1284875124692917,
"learning_rate": 1.820005025345723e-05,
"loss": 0.3151,
"step": 276000
},
{
"epoch": 0.6366974718822686,
"grad_norm": 0.6041738390922546,
"learning_rate": 1.8176998208856136e-05,
"loss": 0.338,
"step": 276200
},
{
"epoch": 0.6371585127742905,
"grad_norm": 0.7428380846977234,
"learning_rate": 1.815394616425504e-05,
"loss": 0.3695,
"step": 276400
},
{
"epoch": 0.6376195536663124,
"grad_norm": 0.23863576352596283,
"learning_rate": 1.8130894119653944e-05,
"loss": 0.3437,
"step": 276600
},
{
"epoch": 0.6380805945583343,
"grad_norm": 0.8898888230323792,
"learning_rate": 1.8107842075052846e-05,
"loss": 0.3513,
"step": 276800
},
{
"epoch": 0.6385416354503562,
"grad_norm": 0.41250279545783997,
"learning_rate": 1.8084790030451752e-05,
"loss": 0.3346,
"step": 277000
},
{
"epoch": 0.6390026763423782,
"grad_norm": 0.25133389234542847,
"learning_rate": 1.8061737985850654e-05,
"loss": 0.3481,
"step": 277200
},
{
"epoch": 0.6394637172344001,
"grad_norm": 0.23383691906929016,
"learning_rate": 1.803868594124956e-05,
"loss": 0.3645,
"step": 277400
},
{
"epoch": 0.639924758126422,
"grad_norm": 0.20733849704265594,
"learning_rate": 1.8015633896648466e-05,
"loss": 0.325,
"step": 277600
},
{
"epoch": 0.640385799018444,
"grad_norm": 0.2532244622707367,
"learning_rate": 1.7992581852047368e-05,
"loss": 0.312,
"step": 277800
},
{
"epoch": 0.6408468399104659,
"grad_norm": 0.2864329218864441,
"learning_rate": 1.7969529807446274e-05,
"loss": 0.3687,
"step": 278000
},
{
"epoch": 0.6413078808024878,
"grad_norm": 0.40944796800613403,
"learning_rate": 1.7946477762845176e-05,
"loss": 0.3106,
"step": 278200
},
{
"epoch": 0.6417689216945097,
"grad_norm": 0.3300291895866394,
"learning_rate": 1.792342571824408e-05,
"loss": 0.3314,
"step": 278400
},
{
"epoch": 0.6422299625865316,
"grad_norm": 0.5356222987174988,
"learning_rate": 1.7900373673642984e-05,
"loss": 0.3097,
"step": 278600
},
{
"epoch": 0.6426910034785536,
"grad_norm": 0.4537760317325592,
"learning_rate": 1.7877321629041886e-05,
"loss": 0.334,
"step": 278800
},
{
"epoch": 0.6431520443705755,
"grad_norm": 0.6557602882385254,
"learning_rate": 1.78543848446638e-05,
"loss": 0.3511,
"step": 279000
},
{
"epoch": 0.6436130852625974,
"grad_norm": 0.680885374546051,
"learning_rate": 1.7831332800062704e-05,
"loss": 0.3514,
"step": 279200
},
{
"epoch": 0.6440741261546192,
"grad_norm": 0.5116409063339233,
"learning_rate": 1.780839601568461e-05,
"loss": 0.3398,
"step": 279400
},
{
"epoch": 0.6445351670466412,
"grad_norm": 0.2696648836135864,
"learning_rate": 1.7785343971083516e-05,
"loss": 0.3566,
"step": 279600
},
{
"epoch": 0.6449962079386631,
"grad_norm": 0.22589097917079926,
"learning_rate": 1.7762291926482418e-05,
"loss": 0.3471,
"step": 279800
},
{
"epoch": 0.645457248830685,
"grad_norm": 0.18393610417842865,
"learning_rate": 1.7739239881881324e-05,
"loss": 0.3202,
"step": 280000
},
{
"epoch": 0.6459182897227069,
"grad_norm": 0.5251309871673584,
"learning_rate": 1.771618783728023e-05,
"loss": 0.3312,
"step": 280200
},
{
"epoch": 0.6463793306147289,
"grad_norm": 0.29071182012557983,
"learning_rate": 1.7693135792679132e-05,
"loss": 0.3569,
"step": 280400
},
{
"epoch": 0.6468403715067508,
"grad_norm": 0.6973856687545776,
"learning_rate": 1.7670083748078038e-05,
"loss": 0.3912,
"step": 280600
},
{
"epoch": 0.6473014123987727,
"grad_norm": 0.488626092672348,
"learning_rate": 1.764703170347694e-05,
"loss": 0.3398,
"step": 280800
},
{
"epoch": 0.6477624532907946,
"grad_norm": 0.4092373251914978,
"learning_rate": 1.7623979658875845e-05,
"loss": 0.3756,
"step": 281000
},
{
"epoch": 0.6482234941828166,
"grad_norm": 0.39807140827178955,
"learning_rate": 1.7600927614274748e-05,
"loss": 0.3178,
"step": 281200
},
{
"epoch": 0.6486845350748385,
"grad_norm": 0.36558443307876587,
"learning_rate": 1.7577875569673653e-05,
"loss": 0.3401,
"step": 281400
},
{
"epoch": 0.6491455759668604,
"grad_norm": 0.36944320797920227,
"learning_rate": 1.755482352507256e-05,
"loss": 0.3851,
"step": 281600
},
{
"epoch": 0.6496066168588823,
"grad_norm": 0.5965989828109741,
"learning_rate": 1.753177148047146e-05,
"loss": 0.3416,
"step": 281800
},
{
"epoch": 0.6500676577509042,
"grad_norm": 0.4634806215763092,
"learning_rate": 1.7508719435870367e-05,
"loss": 0.3327,
"step": 282000
},
{
"epoch": 0.6505286986429262,
"grad_norm": 0.2554210424423218,
"learning_rate": 1.748566739126927e-05,
"loss": 0.3374,
"step": 282200
},
{
"epoch": 0.6509897395349481,
"grad_norm": 0.6238657236099243,
"learning_rate": 1.7462615346668175e-05,
"loss": 0.3655,
"step": 282400
},
{
"epoch": 0.65145078042697,
"grad_norm": 0.2654569745063782,
"learning_rate": 1.7439563302067077e-05,
"loss": 0.3364,
"step": 282600
},
{
"epoch": 0.6519118213189918,
"grad_norm": 0.4311518669128418,
"learning_rate": 1.741651125746598e-05,
"loss": 0.3669,
"step": 282800
},
{
"epoch": 0.6523728622110138,
"grad_norm": 0.5249995589256287,
"learning_rate": 1.7393459212864885e-05,
"loss": 0.3538,
"step": 283000
},
{
"epoch": 0.6528339031030357,
"grad_norm": 0.1709951013326645,
"learning_rate": 1.7370407168263788e-05,
"loss": 0.3348,
"step": 283200
},
{
"epoch": 0.6532949439950576,
"grad_norm": 0.4482337236404419,
"learning_rate": 1.73474703838857e-05,
"loss": 0.3178,
"step": 283400
},
{
"epoch": 0.6537559848870795,
"grad_norm": 0.40501514077186584,
"learning_rate": 1.732453359950761e-05,
"loss": 0.3584,
"step": 283600
},
{
"epoch": 0.6542170257791015,
"grad_norm": 0.3024444878101349,
"learning_rate": 1.730148155490651e-05,
"loss": 0.3886,
"step": 283800
},
{
"epoch": 0.6546780666711234,
"grad_norm": 0.24661915004253387,
"learning_rate": 1.7278429510305417e-05,
"loss": 0.3619,
"step": 284000
},
{
"epoch": 0.6551391075631453,
"grad_norm": 0.4417787492275238,
"learning_rate": 1.7255377465704323e-05,
"loss": 0.3392,
"step": 284200
},
{
"epoch": 0.6556001484551672,
"grad_norm": 0.39604651927948,
"learning_rate": 1.7232325421103225e-05,
"loss": 0.3229,
"step": 284400
},
{
"epoch": 0.6560611893471892,
"grad_norm": 0.6911141872406006,
"learning_rate": 1.720927337650213e-05,
"loss": 0.3314,
"step": 284600
},
{
"epoch": 0.6565222302392111,
"grad_norm": 0.9970071315765381,
"learning_rate": 1.7186221331901033e-05,
"loss": 0.3746,
"step": 284800
},
{
"epoch": 0.656983271131233,
"grad_norm": 0.42293471097946167,
"learning_rate": 1.716316928729994e-05,
"loss": 0.347,
"step": 285000
},
{
"epoch": 0.6574443120232549,
"grad_norm": 0.3817155957221985,
"learning_rate": 1.714011724269884e-05,
"loss": 0.3671,
"step": 285200
},
{
"epoch": 0.6579053529152769,
"grad_norm": 0.3805120587348938,
"learning_rate": 1.7117065198097747e-05,
"loss": 0.3126,
"step": 285400
},
{
"epoch": 0.6583663938072988,
"grad_norm": 0.6818868517875671,
"learning_rate": 1.709401315349665e-05,
"loss": 0.3531,
"step": 285600
},
{
"epoch": 0.6588274346993207,
"grad_norm": 0.3460671901702881,
"learning_rate": 1.7070961108895555e-05,
"loss": 0.3398,
"step": 285800
},
{
"epoch": 0.6592884755913426,
"grad_norm": 0.45356935262680054,
"learning_rate": 1.704790906429446e-05,
"loss": 0.3359,
"step": 286000
},
{
"epoch": 0.6597495164833644,
"grad_norm": 0.2916777729988098,
"learning_rate": 1.7024972279916366e-05,
"loss": 0.3394,
"step": 286200
},
{
"epoch": 0.6602105573753864,
"grad_norm": 1.864121913909912,
"learning_rate": 1.7001920235315272e-05,
"loss": 0.3568,
"step": 286400
},
{
"epoch": 0.6606715982674083,
"grad_norm": 0.6602578163146973,
"learning_rate": 1.6978868190714174e-05,
"loss": 0.3346,
"step": 286600
},
{
"epoch": 0.6611326391594302,
"grad_norm": 0.5515346527099609,
"learning_rate": 1.695581614611308e-05,
"loss": 0.3506,
"step": 286800
},
{
"epoch": 0.6615936800514521,
"grad_norm": 0.733218252658844,
"learning_rate": 1.6932764101511982e-05,
"loss": 0.3362,
"step": 287000
},
{
"epoch": 0.6620547209434741,
"grad_norm": 0.36931312084198,
"learning_rate": 1.6909712056910888e-05,
"loss": 0.3512,
"step": 287200
},
{
"epoch": 0.662515761835496,
"grad_norm": 0.5945706963539124,
"learning_rate": 1.6886660012309794e-05,
"loss": 0.3441,
"step": 287400
},
{
"epoch": 0.6629768027275179,
"grad_norm": 1.0846385955810547,
"learning_rate": 1.6863607967708696e-05,
"loss": 0.3436,
"step": 287600
},
{
"epoch": 0.6634378436195398,
"grad_norm": 0.2504749000072479,
"learning_rate": 1.68405559231076e-05,
"loss": 0.3626,
"step": 287800
},
{
"epoch": 0.6638988845115618,
"grad_norm": 0.5974973440170288,
"learning_rate": 1.6817503878506504e-05,
"loss": 0.3204,
"step": 288000
},
{
"epoch": 0.6643599254035837,
"grad_norm": 0.5793485045433044,
"learning_rate": 1.679445183390541e-05,
"loss": 0.3191,
"step": 288200
},
{
"epoch": 0.6648209662956056,
"grad_norm": 0.23215670883655548,
"learning_rate": 1.6771399789304312e-05,
"loss": 0.3516,
"step": 288400
},
{
"epoch": 0.6652820071876275,
"grad_norm": 0.5341312289237976,
"learning_rate": 1.6748347744703218e-05,
"loss": 0.3593,
"step": 288600
},
{
"epoch": 0.6657430480796495,
"grad_norm": 0.35843783617019653,
"learning_rate": 1.6725295700102123e-05,
"loss": 0.3264,
"step": 288800
},
{
"epoch": 0.6662040889716714,
"grad_norm": 0.2489808201789856,
"learning_rate": 1.6702243655501026e-05,
"loss": 0.3507,
"step": 289000
},
{
"epoch": 0.6666651298636933,
"grad_norm": 0.41247281432151794,
"learning_rate": 1.667919161089993e-05,
"loss": 0.3228,
"step": 289200
},
{
"epoch": 0.6671261707557152,
"grad_norm": 0.35904163122177124,
"learning_rate": 1.6656139566298833e-05,
"loss": 0.3518,
"step": 289400
},
{
"epoch": 0.667587211647737,
"grad_norm": 0.23721112310886383,
"learning_rate": 1.663308752169774e-05,
"loss": 0.3294,
"step": 289600
},
{
"epoch": 0.668048252539759,
"grad_norm": 0.40108099579811096,
"learning_rate": 1.661003547709664e-05,
"loss": 0.3152,
"step": 289800
},
{
"epoch": 0.6685092934317809,
"grad_norm": 0.7761299014091492,
"learning_rate": 1.6586983432495544e-05,
"loss": 0.3289,
"step": 290000
},
{
"epoch": 0.6689703343238028,
"grad_norm": 0.5430353283882141,
"learning_rate": 1.656393138789445e-05,
"loss": 0.319,
"step": 290200
},
{
"epoch": 0.6694313752158247,
"grad_norm": 1.571452021598816,
"learning_rate": 1.654087934329335e-05,
"loss": 0.3726,
"step": 290400
},
{
"epoch": 0.6698924161078467,
"grad_norm": 0.2540998160839081,
"learning_rate": 1.651782729869226e-05,
"loss": 0.3358,
"step": 290600
},
{
"epoch": 0.6703534569998686,
"grad_norm": 0.2784154415130615,
"learning_rate": 1.6494890514314167e-05,
"loss": 0.3547,
"step": 290800
},
{
"epoch": 0.6708144978918905,
"grad_norm": 0.6045355200767517,
"learning_rate": 1.6471838469713072e-05,
"loss": 0.3381,
"step": 291000
},
{
"epoch": 0.6712755387839124,
"grad_norm": 0.4733976423740387,
"learning_rate": 1.6448786425111975e-05,
"loss": 0.3338,
"step": 291200
},
{
"epoch": 0.6717365796759344,
"grad_norm": 0.28665849566459656,
"learning_rate": 1.642573438051088e-05,
"loss": 0.3373,
"step": 291400
},
{
"epoch": 0.6721976205679563,
"grad_norm": 0.38959574699401855,
"learning_rate": 1.6402682335909782e-05,
"loss": 0.3611,
"step": 291600
},
{
"epoch": 0.6726586614599782,
"grad_norm": 0.4198921322822571,
"learning_rate": 1.6379630291308688e-05,
"loss": 0.3811,
"step": 291800
},
{
"epoch": 0.6731197023520001,
"grad_norm": 0.2657308280467987,
"learning_rate": 1.6356578246707594e-05,
"loss": 0.3206,
"step": 292000
},
{
"epoch": 0.6735807432440221,
"grad_norm": 0.40216901898384094,
"learning_rate": 1.6333526202106496e-05,
"loss": 0.341,
"step": 292200
},
{
"epoch": 0.674041784136044,
"grad_norm": 0.554699182510376,
"learning_rate": 1.6310474157505402e-05,
"loss": 0.334,
"step": 292400
},
{
"epoch": 0.6745028250280659,
"grad_norm": 1.3959380388259888,
"learning_rate": 1.628753737312731e-05,
"loss": 0.39,
"step": 292600
},
{
"epoch": 0.6749638659200878,
"grad_norm": 0.327361136674881,
"learning_rate": 1.6264485328526213e-05,
"loss": 0.3169,
"step": 292800
},
{
"epoch": 0.6754249068121096,
"grad_norm": 0.38681721687316895,
"learning_rate": 1.624143328392512e-05,
"loss": 0.3708,
"step": 293000
},
{
"epoch": 0.6758859477041316,
"grad_norm": 0.28206613659858704,
"learning_rate": 1.6218381239324025e-05,
"loss": 0.3559,
"step": 293200
},
{
"epoch": 0.6763469885961535,
"grad_norm": 0.463687539100647,
"learning_rate": 1.6195329194722927e-05,
"loss": 0.3389,
"step": 293400
},
{
"epoch": 0.6768080294881754,
"grad_norm": 0.3705352246761322,
"learning_rate": 1.6172277150121833e-05,
"loss": 0.3122,
"step": 293600
},
{
"epoch": 0.6772690703801973,
"grad_norm": 0.38660258054733276,
"learning_rate": 1.6149225105520735e-05,
"loss": 0.3497,
"step": 293800
},
{
"epoch": 0.6777301112722193,
"grad_norm": 0.5014271140098572,
"learning_rate": 1.6126173060919637e-05,
"loss": 0.3093,
"step": 294000
},
{
"epoch": 0.6781911521642412,
"grad_norm": 0.20219211280345917,
"learning_rate": 1.6103121016318543e-05,
"loss": 0.3498,
"step": 294200
},
{
"epoch": 0.6786521930562631,
"grad_norm": 0.6576494574546814,
"learning_rate": 1.6080068971717445e-05,
"loss": 0.3497,
"step": 294400
},
{
"epoch": 0.679113233948285,
"grad_norm": 0.9610106945037842,
"learning_rate": 1.605701692711635e-05,
"loss": 0.3588,
"step": 294600
},
{
"epoch": 0.679574274840307,
"grad_norm": 0.12911242246627808,
"learning_rate": 1.6033964882515257e-05,
"loss": 0.3481,
"step": 294800
},
{
"epoch": 0.6800353157323289,
"grad_norm": 0.6255713105201721,
"learning_rate": 1.601091283791416e-05,
"loss": 0.3683,
"step": 295000
},
{
"epoch": 0.6804963566243508,
"grad_norm": 0.6422701478004456,
"learning_rate": 1.5987860793313064e-05,
"loss": 0.3424,
"step": 295200
},
{
"epoch": 0.6809573975163727,
"grad_norm": 0.22834673523902893,
"learning_rate": 1.5964924008934974e-05,
"loss": 0.3673,
"step": 295400
},
{
"epoch": 0.6814184384083947,
"grad_norm": 0.506917417049408,
"learning_rate": 1.5941871964333876e-05,
"loss": 0.3496,
"step": 295600
},
{
"epoch": 0.6818794793004166,
"grad_norm": 0.42795634269714355,
"learning_rate": 1.591881991973278e-05,
"loss": 0.3205,
"step": 295800
},
{
"epoch": 0.6823405201924385,
"grad_norm": 0.4438583254814148,
"learning_rate": 1.5895767875131687e-05,
"loss": 0.3466,
"step": 296000
},
{
"epoch": 0.6828015610844604,
"grad_norm": 0.10134997218847275,
"learning_rate": 1.587271583053059e-05,
"loss": 0.3721,
"step": 296200
},
{
"epoch": 0.6832626019764823,
"grad_norm": 0.32405492663383484,
"learning_rate": 1.5849663785929495e-05,
"loss": 0.3353,
"step": 296400
},
{
"epoch": 0.6837236428685042,
"grad_norm": 0.7178249359130859,
"learning_rate": 1.5826611741328398e-05,
"loss": 0.3526,
"step": 296600
},
{
"epoch": 0.6841846837605261,
"grad_norm": 0.27805209159851074,
"learning_rate": 1.5803559696727303e-05,
"loss": 0.3487,
"step": 296800
},
{
"epoch": 0.684645724652548,
"grad_norm": 0.6939868330955505,
"learning_rate": 1.5780507652126206e-05,
"loss": 0.3437,
"step": 297000
},
{
"epoch": 0.68510676554457,
"grad_norm": 0.39550527930259705,
"learning_rate": 1.575745560752511e-05,
"loss": 0.3214,
"step": 297200
},
{
"epoch": 0.6855678064365919,
"grad_norm": 0.26896151900291443,
"learning_rate": 1.573451882314702e-05,
"loss": 0.3804,
"step": 297400
},
{
"epoch": 0.6860288473286138,
"grad_norm": 0.1456928700208664,
"learning_rate": 1.5711466778545926e-05,
"loss": 0.3387,
"step": 297600
},
{
"epoch": 0.6864898882206357,
"grad_norm": 0.5462424159049988,
"learning_rate": 1.568841473394483e-05,
"loss": 0.3099,
"step": 297800
},
{
"epoch": 0.6869509291126576,
"grad_norm": 0.19774644076824188,
"learning_rate": 1.566536268934373e-05,
"loss": 0.3298,
"step": 298000
},
{
"epoch": 0.6874119700046796,
"grad_norm": 0.5376401543617249,
"learning_rate": 1.5642310644742636e-05,
"loss": 0.3553,
"step": 298200
},
{
"epoch": 0.6878730108967015,
"grad_norm": 0.533101499080658,
"learning_rate": 1.561925860014154e-05,
"loss": 0.3298,
"step": 298400
},
{
"epoch": 0.6883340517887234,
"grad_norm": 1.3765742778778076,
"learning_rate": 1.5596206555540444e-05,
"loss": 0.3452,
"step": 298600
},
{
"epoch": 0.6887950926807453,
"grad_norm": 0.1399991810321808,
"learning_rate": 1.5573154510939347e-05,
"loss": 0.3092,
"step": 298800
},
{
"epoch": 0.6892561335727673,
"grad_norm": 0.34275999665260315,
"learning_rate": 1.5550102466338252e-05,
"loss": 0.308,
"step": 299000
},
{
"epoch": 0.6897171744647892,
"grad_norm": 0.3125983476638794,
"learning_rate": 1.5527050421737158e-05,
"loss": 0.3385,
"step": 299200
},
{
"epoch": 0.6901782153568111,
"grad_norm": 0.30715808272361755,
"learning_rate": 1.550399837713606e-05,
"loss": 0.384,
"step": 299400
},
{
"epoch": 0.690639256248833,
"grad_norm": 0.6447780728340149,
"learning_rate": 1.5480946332534966e-05,
"loss": 0.3592,
"step": 299600
},
{
"epoch": 0.6911002971408549,
"grad_norm": 0.17741701006889343,
"learning_rate": 1.5458009548156875e-05,
"loss": 0.3644,
"step": 299800
},
{
"epoch": 0.6915613380328768,
"grad_norm": 0.5684409737586975,
"learning_rate": 1.5434957503555777e-05,
"loss": 0.3301,
"step": 300000
},
{
"epoch": 0.6915613380328768,
"eval_loss": 0.3471066951751709,
"eval_runtime": 222.7661,
"eval_samples_per_second": 19.671,
"eval_steps_per_second": 19.671,
"step": 300000
},
{
"epoch": 0.6920223789248987,
"grad_norm": 0.2992446720600128,
"learning_rate": 1.5411905458954683e-05,
"loss": 0.3235,
"step": 300200
},
{
"epoch": 0.6924834198169206,
"grad_norm": 0.5946826338768005,
"learning_rate": 1.538885341435359e-05,
"loss": 0.3463,
"step": 300400
},
{
"epoch": 0.6929444607089426,
"grad_norm": 0.35386455059051514,
"learning_rate": 1.536580136975249e-05,
"loss": 0.3689,
"step": 300600
},
{
"epoch": 0.6934055016009645,
"grad_norm": 0.27284950017929077,
"learning_rate": 1.5342749325151397e-05,
"loss": 0.3763,
"step": 300800
},
{
"epoch": 0.6938665424929864,
"grad_norm": 0.28267648816108704,
"learning_rate": 1.53196972805503e-05,
"loss": 0.3393,
"step": 301000
},
{
"epoch": 0.6943275833850083,
"grad_norm": 0.19581338763237,
"learning_rate": 1.52966452359492e-05,
"loss": 0.3471,
"step": 301200
},
{
"epoch": 0.6947886242770303,
"grad_norm": 0.4849194288253784,
"learning_rate": 1.5273593191348107e-05,
"loss": 0.3192,
"step": 301400
},
{
"epoch": 0.6952496651690522,
"grad_norm": 0.21700933575630188,
"learning_rate": 1.5250541146747011e-05,
"loss": 0.309,
"step": 301600
},
{
"epoch": 0.6957107060610741,
"grad_norm": 0.42427298426628113,
"learning_rate": 1.5227489102145915e-05,
"loss": 0.3603,
"step": 301800
},
{
"epoch": 0.696171746953096,
"grad_norm": 0.3934516906738281,
"learning_rate": 1.520443705754482e-05,
"loss": 0.337,
"step": 302000
},
{
"epoch": 0.696632787845118,
"grad_norm": 0.5094757676124573,
"learning_rate": 1.5181385012943725e-05,
"loss": 0.3197,
"step": 302200
},
{
"epoch": 0.6970938287371399,
"grad_norm": 0.4605288505554199,
"learning_rate": 1.5158332968342629e-05,
"loss": 0.311,
"step": 302400
},
{
"epoch": 0.6975548696291618,
"grad_norm": 0.5409327745437622,
"learning_rate": 1.5135396183964538e-05,
"loss": 0.3555,
"step": 302600
},
{
"epoch": 0.6980159105211837,
"grad_norm": 0.5775600671768188,
"learning_rate": 1.511234413936344e-05,
"loss": 0.3312,
"step": 302800
},
{
"epoch": 0.6984769514132056,
"grad_norm": 0.26445746421813965,
"learning_rate": 1.5089292094762344e-05,
"loss": 0.3464,
"step": 303000
},
{
"epoch": 0.6989379923052275,
"grad_norm": 0.6190831065177917,
"learning_rate": 1.5066240050161251e-05,
"loss": 0.3515,
"step": 303200
},
{
"epoch": 0.6993990331972494,
"grad_norm": 0.5285741090774536,
"learning_rate": 1.5043188005560155e-05,
"loss": 0.3492,
"step": 303400
},
{
"epoch": 0.6998600740892713,
"grad_norm": 0.512768030166626,
"learning_rate": 1.502013596095906e-05,
"loss": 0.3602,
"step": 303600
},
{
"epoch": 0.7003211149812932,
"grad_norm": 0.6661767959594727,
"learning_rate": 1.4997083916357962e-05,
"loss": 0.3544,
"step": 303800
},
{
"epoch": 0.7007821558733152,
"grad_norm": 0.5230671167373657,
"learning_rate": 1.4974031871756866e-05,
"loss": 0.3353,
"step": 304000
},
{
"epoch": 0.7012431967653371,
"grad_norm": 0.2725779414176941,
"learning_rate": 1.495097982715577e-05,
"loss": 0.3573,
"step": 304200
},
{
"epoch": 0.701704237657359,
"grad_norm": 0.5096437931060791,
"learning_rate": 1.4927927782554674e-05,
"loss": 0.3259,
"step": 304400
},
{
"epoch": 0.7021652785493809,
"grad_norm": 0.58111172914505,
"learning_rate": 1.4904875737953578e-05,
"loss": 0.3364,
"step": 304600
},
{
"epoch": 0.7026263194414029,
"grad_norm": 0.20404790341854095,
"learning_rate": 1.4881823693352482e-05,
"loss": 0.316,
"step": 304800
},
{
"epoch": 0.7030873603334248,
"grad_norm": 0.17927314341068268,
"learning_rate": 1.4858886908974392e-05,
"loss": 0.3376,
"step": 305000
},
{
"epoch": 0.7035484012254467,
"grad_norm": 0.20475348830223083,
"learning_rate": 1.4835834864373296e-05,
"loss": 0.3385,
"step": 305200
},
{
"epoch": 0.7040094421174686,
"grad_norm": 0.3952394425868988,
"learning_rate": 1.48127828197722e-05,
"loss": 0.3208,
"step": 305400
},
{
"epoch": 0.7044704830094906,
"grad_norm": 0.3956199288368225,
"learning_rate": 1.4789730775171104e-05,
"loss": 0.3508,
"step": 305600
},
{
"epoch": 0.7049315239015125,
"grad_norm": 0.6233502626419067,
"learning_rate": 1.4766678730570008e-05,
"loss": 0.3534,
"step": 305800
},
{
"epoch": 0.7053925647935344,
"grad_norm": 0.25527262687683105,
"learning_rate": 1.474362668596891e-05,
"loss": 0.3408,
"step": 306000
},
{
"epoch": 0.7058536056855563,
"grad_norm": 0.28162071108818054,
"learning_rate": 1.4720574641367818e-05,
"loss": 0.3097,
"step": 306200
},
{
"epoch": 0.7063146465775783,
"grad_norm": 0.6374503970146179,
"learning_rate": 1.4697522596766722e-05,
"loss": 0.3576,
"step": 306400
},
{
"epoch": 0.7067756874696001,
"grad_norm": 0.34172001481056213,
"learning_rate": 1.4674470552165626e-05,
"loss": 0.3222,
"step": 306600
},
{
"epoch": 0.707236728361622,
"grad_norm": 1.869160771369934,
"learning_rate": 1.465141850756453e-05,
"loss": 0.3419,
"step": 306800
},
{
"epoch": 0.7076977692536439,
"grad_norm": 0.3956671953201294,
"learning_rate": 1.4628366462963434e-05,
"loss": 0.2977,
"step": 307000
},
{
"epoch": 0.7081588101456658,
"grad_norm": 0.191593199968338,
"learning_rate": 1.4605314418362336e-05,
"loss": 0.3209,
"step": 307200
},
{
"epoch": 0.7086198510376878,
"grad_norm": 0.6856529116630554,
"learning_rate": 1.458226237376124e-05,
"loss": 0.3418,
"step": 307400
},
{
"epoch": 0.7090808919297097,
"grad_norm": 0.3877515494823456,
"learning_rate": 1.4559210329160144e-05,
"loss": 0.2946,
"step": 307600
},
{
"epoch": 0.7095419328217316,
"grad_norm": 0.2209376096725464,
"learning_rate": 1.4536158284559052e-05,
"loss": 0.3153,
"step": 307800
},
{
"epoch": 0.7100029737137535,
"grad_norm": 0.5207587480545044,
"learning_rate": 1.4513221500180959e-05,
"loss": 0.3586,
"step": 308000
},
{
"epoch": 0.7104640146057755,
"grad_norm": 5.93732213973999,
"learning_rate": 1.4490169455579863e-05,
"loss": 0.3589,
"step": 308200
},
{
"epoch": 0.7109250554977974,
"grad_norm": 0.19288334250450134,
"learning_rate": 1.4467117410978767e-05,
"loss": 0.3654,
"step": 308400
},
{
"epoch": 0.7113860963898193,
"grad_norm": 0.16748656332492828,
"learning_rate": 1.4444065366377671e-05,
"loss": 0.3587,
"step": 308600
},
{
"epoch": 0.7118471372818412,
"grad_norm": 0.145443856716156,
"learning_rate": 1.4421013321776575e-05,
"loss": 0.3815,
"step": 308800
},
{
"epoch": 0.7123081781738632,
"grad_norm": 0.400766521692276,
"learning_rate": 1.4397961277175479e-05,
"loss": 0.3757,
"step": 309000
},
{
"epoch": 0.7127692190658851,
"grad_norm": 0.2556675970554352,
"learning_rate": 1.4374909232574385e-05,
"loss": 0.3463,
"step": 309200
},
{
"epoch": 0.713230259957907,
"grad_norm": 0.5215242505073547,
"learning_rate": 1.4351857187973289e-05,
"loss": 0.3503,
"step": 309400
},
{
"epoch": 0.7136913008499289,
"grad_norm": 4.102840900421143,
"learning_rate": 1.4328805143372193e-05,
"loss": 0.3707,
"step": 309600
},
{
"epoch": 0.7141523417419507,
"grad_norm": 0.330891877412796,
"learning_rate": 1.4305753098771097e-05,
"loss": 0.3081,
"step": 309800
},
{
"epoch": 0.7146133826339727,
"grad_norm": 0.3524174392223358,
"learning_rate": 1.428270105417e-05,
"loss": 0.3688,
"step": 310000
},
{
"epoch": 0.7150744235259946,
"grad_norm": 0.6447340250015259,
"learning_rate": 1.4259649009568905e-05,
"loss": 0.3255,
"step": 310200
},
{
"epoch": 0.7155354644180165,
"grad_norm": 0.22535520792007446,
"learning_rate": 1.4236712225190816e-05,
"loss": 0.3136,
"step": 310400
},
{
"epoch": 0.7159965053100384,
"grad_norm": 0.24304255843162537,
"learning_rate": 1.421366018058972e-05,
"loss": 0.3298,
"step": 310600
},
{
"epoch": 0.7164575462020604,
"grad_norm": 0.46862804889678955,
"learning_rate": 1.4190608135988624e-05,
"loss": 0.3021,
"step": 310800
},
{
"epoch": 0.7169185870940823,
"grad_norm": 0.313894659280777,
"learning_rate": 1.4167556091387526e-05,
"loss": 0.3066,
"step": 311000
},
{
"epoch": 0.7173796279861042,
"grad_norm": 0.4070858657360077,
"learning_rate": 1.414450404678643e-05,
"loss": 0.3673,
"step": 311200
},
{
"epoch": 0.7178406688781261,
"grad_norm": 1.0770429372787476,
"learning_rate": 1.4121452002185334e-05,
"loss": 0.3161,
"step": 311400
},
{
"epoch": 0.7183017097701481,
"grad_norm": 0.48763588070869446,
"learning_rate": 1.4098399957584238e-05,
"loss": 0.3449,
"step": 311600
},
{
"epoch": 0.71876275066217,
"grad_norm": 0.3299145996570587,
"learning_rate": 1.4075347912983142e-05,
"loss": 0.3063,
"step": 311800
},
{
"epoch": 0.7192237915541919,
"grad_norm": 0.37613481283187866,
"learning_rate": 1.4052295868382046e-05,
"loss": 0.3353,
"step": 312000
},
{
"epoch": 0.7196848324462138,
"grad_norm": 0.3305826485157013,
"learning_rate": 1.4029243823780951e-05,
"loss": 0.3631,
"step": 312200
},
{
"epoch": 0.7201458733382358,
"grad_norm": 0.6210027933120728,
"learning_rate": 1.4006191779179855e-05,
"loss": 0.34,
"step": 312400
},
{
"epoch": 0.7206069142302577,
"grad_norm": 0.4074510633945465,
"learning_rate": 1.398313973457876e-05,
"loss": 0.3438,
"step": 312600
},
{
"epoch": 0.7210679551222796,
"grad_norm": 0.47203293442726135,
"learning_rate": 1.3960087689977663e-05,
"loss": 0.3126,
"step": 312800
},
{
"epoch": 0.7215289960143015,
"grad_norm": 0.3788442015647888,
"learning_rate": 1.3937035645376567e-05,
"loss": 0.3451,
"step": 313000
},
{
"epoch": 0.7219900369063234,
"grad_norm": 0.45450833439826965,
"learning_rate": 1.3913983600775471e-05,
"loss": 0.3496,
"step": 313200
},
{
"epoch": 0.7224510777983453,
"grad_norm": 0.3482572138309479,
"learning_rate": 1.3890931556174375e-05,
"loss": 0.3367,
"step": 313400
},
{
"epoch": 0.7229121186903672,
"grad_norm": 0.7931276559829712,
"learning_rate": 1.386787951157328e-05,
"loss": 0.3435,
"step": 313600
},
{
"epoch": 0.7233731595823891,
"grad_norm": 0.41410383582115173,
"learning_rate": 1.384494272719519e-05,
"loss": 0.3595,
"step": 313800
},
{
"epoch": 0.723834200474411,
"grad_norm": 0.3738642930984497,
"learning_rate": 1.3821890682594094e-05,
"loss": 0.3268,
"step": 314000
},
{
"epoch": 0.724295241366433,
"grad_norm": 0.3588384985923767,
"learning_rate": 1.3798838637992998e-05,
"loss": 0.311,
"step": 314200
},
{
"epoch": 0.7247562822584549,
"grad_norm": 0.642122209072113,
"learning_rate": 1.37757865933919e-05,
"loss": 0.3596,
"step": 314400
},
{
"epoch": 0.7252173231504768,
"grad_norm": 0.6884900331497192,
"learning_rate": 1.3752734548790804e-05,
"loss": 0.3292,
"step": 314600
},
{
"epoch": 0.7256783640424987,
"grad_norm": 0.4391751289367676,
"learning_rate": 1.3729682504189708e-05,
"loss": 0.3435,
"step": 314800
},
{
"epoch": 0.7261394049345207,
"grad_norm": 0.42878326773643494,
"learning_rate": 1.3706630459588616e-05,
"loss": 0.302,
"step": 315000
},
{
"epoch": 0.7266004458265426,
"grad_norm": 0.977390706539154,
"learning_rate": 1.368357841498752e-05,
"loss": 0.3475,
"step": 315200
},
{
"epoch": 0.7270614867185645,
"grad_norm": 0.5029573440551758,
"learning_rate": 1.3660526370386422e-05,
"loss": 0.3438,
"step": 315400
},
{
"epoch": 0.7275225276105864,
"grad_norm": 0.5833427309989929,
"learning_rate": 1.3637474325785326e-05,
"loss": 0.337,
"step": 315600
},
{
"epoch": 0.7279835685026084,
"grad_norm": 0.5248010754585266,
"learning_rate": 1.361442228118423e-05,
"loss": 0.3337,
"step": 315800
},
{
"epoch": 0.7284446093946303,
"grad_norm": 0.6147269606590271,
"learning_rate": 1.359148549680614e-05,
"loss": 0.3646,
"step": 316000
},
{
"epoch": 0.7289056502866522,
"grad_norm": 0.16760729253292084,
"learning_rate": 1.3568433452205043e-05,
"loss": 0.3588,
"step": 316200
},
{
"epoch": 0.7293666911786741,
"grad_norm": 0.4736160635948181,
"learning_rate": 1.3545381407603949e-05,
"loss": 0.2925,
"step": 316400
},
{
"epoch": 0.729827732070696,
"grad_norm": 0.5483229160308838,
"learning_rate": 1.3522329363002853e-05,
"loss": 0.386,
"step": 316600
},
{
"epoch": 0.7302887729627179,
"grad_norm": 0.19415533542633057,
"learning_rate": 1.3499277318401757e-05,
"loss": 0.342,
"step": 316800
},
{
"epoch": 0.7307498138547398,
"grad_norm": 0.585850179195404,
"learning_rate": 1.347622527380066e-05,
"loss": 0.3307,
"step": 317000
},
{
"epoch": 0.7312108547467617,
"grad_norm": 0.63239985704422,
"learning_rate": 1.3453173229199565e-05,
"loss": 0.3368,
"step": 317200
},
{
"epoch": 0.7316718956387837,
"grad_norm": 0.11184985190629959,
"learning_rate": 1.3430121184598469e-05,
"loss": 0.3593,
"step": 317400
},
{
"epoch": 0.7321329365308056,
"grad_norm": 0.5238900780677795,
"learning_rate": 1.3407069139997373e-05,
"loss": 0.3827,
"step": 317600
},
{
"epoch": 0.7325939774228275,
"grad_norm": 0.838485062122345,
"learning_rate": 1.3384017095396275e-05,
"loss": 0.3594,
"step": 317800
},
{
"epoch": 0.7330550183148494,
"grad_norm": 0.24753722548484802,
"learning_rate": 1.3360965050795182e-05,
"loss": 0.3659,
"step": 318000
},
{
"epoch": 0.7335160592068714,
"grad_norm": 0.2856638729572296,
"learning_rate": 1.3337913006194086e-05,
"loss": 0.3055,
"step": 318200
},
{
"epoch": 0.7339771000988933,
"grad_norm": 0.36176905035972595,
"learning_rate": 1.331486096159299e-05,
"loss": 0.3537,
"step": 318400
},
{
"epoch": 0.7344381409909152,
"grad_norm": 0.22868487238883972,
"learning_rate": 1.3291924177214898e-05,
"loss": 0.3492,
"step": 318600
},
{
"epoch": 0.7348991818829371,
"grad_norm": 0.35884350538253784,
"learning_rate": 1.3268872132613802e-05,
"loss": 0.3231,
"step": 318800
},
{
"epoch": 0.735360222774959,
"grad_norm": 0.2672630250453949,
"learning_rate": 1.3245820088012706e-05,
"loss": 0.3519,
"step": 319000
},
{
"epoch": 0.735821263666981,
"grad_norm": 0.1157107725739479,
"learning_rate": 1.322276804341161e-05,
"loss": 0.3305,
"step": 319200
},
{
"epoch": 0.7362823045590029,
"grad_norm": 0.2983868718147278,
"learning_rate": 1.3199715998810515e-05,
"loss": 0.3128,
"step": 319400
},
{
"epoch": 0.7367433454510248,
"grad_norm": 0.24847714602947235,
"learning_rate": 1.317666395420942e-05,
"loss": 0.3113,
"step": 319600
},
{
"epoch": 0.7372043863430467,
"grad_norm": 0.620329737663269,
"learning_rate": 1.3153611909608323e-05,
"loss": 0.3472,
"step": 319800
},
{
"epoch": 0.7376654272350686,
"grad_norm": 0.2390708178281784,
"learning_rate": 1.3130559865007227e-05,
"loss": 0.313,
"step": 320000
},
{
"epoch": 0.7381264681270905,
"grad_norm": 0.3180009424686432,
"learning_rate": 1.3107507820406131e-05,
"loss": 0.3331,
"step": 320200
},
{
"epoch": 0.7385875090191124,
"grad_norm": 0.3560062646865845,
"learning_rate": 1.3084455775805035e-05,
"loss": 0.3513,
"step": 320400
},
{
"epoch": 0.7390485499111343,
"grad_norm": 0.637478232383728,
"learning_rate": 1.306140373120394e-05,
"loss": 0.3453,
"step": 320600
},
{
"epoch": 0.7395095908031563,
"grad_norm": 0.44424542784690857,
"learning_rate": 1.3038351686602843e-05,
"loss": 0.3151,
"step": 320800
},
{
"epoch": 0.7399706316951782,
"grad_norm": 0.259972482919693,
"learning_rate": 1.3015299642001749e-05,
"loss": 0.3422,
"step": 321000
},
{
"epoch": 0.7404316725872001,
"grad_norm": 0.31797853112220764,
"learning_rate": 1.2992247597400653e-05,
"loss": 0.3446,
"step": 321200
},
{
"epoch": 0.740892713479222,
"grad_norm": 0.6182370781898499,
"learning_rate": 1.2969195552799557e-05,
"loss": 0.3612,
"step": 321400
},
{
"epoch": 0.741353754371244,
"grad_norm": 0.4620261788368225,
"learning_rate": 1.2946143508198461e-05,
"loss": 0.3548,
"step": 321600
},
{
"epoch": 0.7418147952632659,
"grad_norm": 0.3697432577610016,
"learning_rate": 1.2923091463597365e-05,
"loss": 0.3716,
"step": 321800
},
{
"epoch": 0.7422758361552878,
"grad_norm": 0.555289089679718,
"learning_rate": 1.2900039418996269e-05,
"loss": 0.3579,
"step": 322000
},
{
"epoch": 0.7427368770473097,
"grad_norm": 0.12596555054187775,
"learning_rate": 1.2876987374395171e-05,
"loss": 0.3565,
"step": 322200
},
{
"epoch": 0.7431979179393317,
"grad_norm": 0.23611651360988617,
"learning_rate": 1.2853935329794075e-05,
"loss": 0.3613,
"step": 322400
},
{
"epoch": 0.7436589588313536,
"grad_norm": 0.33889690041542053,
"learning_rate": 1.283088328519298e-05,
"loss": 0.3363,
"step": 322600
},
{
"epoch": 0.7441199997233755,
"grad_norm": 0.547919750213623,
"learning_rate": 1.280794650081489e-05,
"loss": 0.3519,
"step": 322800
},
{
"epoch": 0.7445810406153974,
"grad_norm": 0.22968533635139465,
"learning_rate": 1.2784894456213794e-05,
"loss": 0.376,
"step": 323000
},
{
"epoch": 0.7450420815074194,
"grad_norm": 0.6183640360832214,
"learning_rate": 1.2761842411612698e-05,
"loss": 0.3045,
"step": 323200
},
{
"epoch": 0.7455031223994412,
"grad_norm": 0.1490042358636856,
"learning_rate": 1.2738790367011602e-05,
"loss": 0.3224,
"step": 323400
},
{
"epoch": 0.7459641632914631,
"grad_norm": 0.6043654680252075,
"learning_rate": 1.2715738322410506e-05,
"loss": 0.312,
"step": 323600
},
{
"epoch": 0.746425204183485,
"grad_norm": 0.6506444811820984,
"learning_rate": 1.269268627780941e-05,
"loss": 0.3728,
"step": 323800
},
{
"epoch": 0.7468862450755069,
"grad_norm": 0.511868417263031,
"learning_rate": 1.2669634233208316e-05,
"loss": 0.315,
"step": 324000
},
{
"epoch": 0.7473472859675289,
"grad_norm": 0.5781561136245728,
"learning_rate": 1.264658218860722e-05,
"loss": 0.3568,
"step": 324200
},
{
"epoch": 0.7478083268595508,
"grad_norm": 0.6765771508216858,
"learning_rate": 1.2623530144006124e-05,
"loss": 0.3314,
"step": 324400
},
{
"epoch": 0.7482693677515727,
"grad_norm": 0.2340543568134308,
"learning_rate": 1.2600478099405028e-05,
"loss": 0.3698,
"step": 324600
},
{
"epoch": 0.7487304086435946,
"grad_norm": 0.27794602513313293,
"learning_rate": 1.2577426054803932e-05,
"loss": 0.3534,
"step": 324800
},
{
"epoch": 0.7491914495356166,
"grad_norm": 0.37269988656044006,
"learning_rate": 1.2554374010202836e-05,
"loss": 0.3537,
"step": 325000
},
{
"epoch": 0.7496524904276385,
"grad_norm": 0.2142777442932129,
"learning_rate": 1.2531552486047752e-05,
"loss": 0.329,
"step": 325200
},
{
"epoch": 0.7501135313196604,
"grad_norm": 0.24108269810676575,
"learning_rate": 1.2508500441446656e-05,
"loss": 0.3492,
"step": 325400
},
{
"epoch": 0.7505745722116823,
"grad_norm": 0.8712294697761536,
"learning_rate": 1.2485448396845558e-05,
"loss": 0.3148,
"step": 325600
},
{
"epoch": 0.7510356131037043,
"grad_norm": 0.5410193204879761,
"learning_rate": 1.2462396352244462e-05,
"loss": 0.3416,
"step": 325800
},
{
"epoch": 0.7514966539957262,
"grad_norm": 0.4121902585029602,
"learning_rate": 1.2439344307643366e-05,
"loss": 0.3509,
"step": 326000
},
{
"epoch": 0.7519576948877481,
"grad_norm": 0.363309770822525,
"learning_rate": 1.2416292263042272e-05,
"loss": 0.363,
"step": 326200
},
{
"epoch": 0.75241873577977,
"grad_norm": 0.4633045196533203,
"learning_rate": 1.2393240218441176e-05,
"loss": 0.3293,
"step": 326400
},
{
"epoch": 0.752879776671792,
"grad_norm": 0.26468852162361145,
"learning_rate": 1.237018817384008e-05,
"loss": 0.3698,
"step": 326600
},
{
"epoch": 0.7533408175638138,
"grad_norm": 0.644490122795105,
"learning_rate": 1.2347136129238984e-05,
"loss": 0.325,
"step": 326800
},
{
"epoch": 0.7538018584558357,
"grad_norm": 0.30917617678642273,
"learning_rate": 1.2324084084637888e-05,
"loss": 0.3339,
"step": 327000
},
{
"epoch": 0.7542628993478576,
"grad_norm": 0.44271737337112427,
"learning_rate": 1.2301032040036792e-05,
"loss": 0.3258,
"step": 327200
},
{
"epoch": 0.7547239402398795,
"grad_norm": 0.2927365005016327,
"learning_rate": 1.2277979995435696e-05,
"loss": 0.3547,
"step": 327400
},
{
"epoch": 0.7551849811319015,
"grad_norm": 0.2159711718559265,
"learning_rate": 1.22549279508346e-05,
"loss": 0.3181,
"step": 327600
},
{
"epoch": 0.7556460220239234,
"grad_norm": 0.5284668803215027,
"learning_rate": 1.2231875906233505e-05,
"loss": 0.3485,
"step": 327800
},
{
"epoch": 0.7561070629159453,
"grad_norm": 0.8608717322349548,
"learning_rate": 1.2208939121855413e-05,
"loss": 0.3521,
"step": 328000
},
{
"epoch": 0.7565681038079672,
"grad_norm": 0.4907655119895935,
"learning_rate": 1.2185887077254318e-05,
"loss": 0.3408,
"step": 328200
},
{
"epoch": 0.7570291446999892,
"grad_norm": 0.389893501996994,
"learning_rate": 1.2162835032653222e-05,
"loss": 0.3656,
"step": 328400
},
{
"epoch": 0.7574901855920111,
"grad_norm": 0.3046099841594696,
"learning_rate": 1.2139782988052126e-05,
"loss": 0.3565,
"step": 328600
},
{
"epoch": 0.757951226484033,
"grad_norm": 0.3519379794597626,
"learning_rate": 1.211673094345103e-05,
"loss": 0.3327,
"step": 328800
},
{
"epoch": 0.7584122673760549,
"grad_norm": 0.23121443390846252,
"learning_rate": 1.2093678898849934e-05,
"loss": 0.3565,
"step": 329000
},
{
"epoch": 0.7588733082680769,
"grad_norm": 0.15006621181964874,
"learning_rate": 1.2070626854248838e-05,
"loss": 0.3089,
"step": 329200
},
{
"epoch": 0.7593343491600988,
"grad_norm": 0.32709234952926636,
"learning_rate": 1.2047574809647742e-05,
"loss": 0.3345,
"step": 329400
},
{
"epoch": 0.7597953900521207,
"grad_norm": 0.5025938153266907,
"learning_rate": 1.2024522765046646e-05,
"loss": 0.2891,
"step": 329600
},
{
"epoch": 0.7602564309441426,
"grad_norm": 0.434060275554657,
"learning_rate": 1.200147072044555e-05,
"loss": 0.3349,
"step": 329800
},
{
"epoch": 0.7607174718361646,
"grad_norm": 0.17737102508544922,
"learning_rate": 1.1978418675844454e-05,
"loss": 0.3575,
"step": 330000
},
{
"epoch": 0.7611785127281864,
"grad_norm": 0.6926540732383728,
"learning_rate": 1.1955366631243358e-05,
"loss": 0.3402,
"step": 330200
},
{
"epoch": 0.7616395536202083,
"grad_norm": 0.1456059366464615,
"learning_rate": 1.1932314586642262e-05,
"loss": 0.3384,
"step": 330400
},
{
"epoch": 0.7621005945122302,
"grad_norm": 0.4401134252548218,
"learning_rate": 1.1909262542041166e-05,
"loss": 0.3215,
"step": 330600
},
{
"epoch": 0.7625616354042521,
"grad_norm": 0.5725939273834229,
"learning_rate": 1.1886210497440072e-05,
"loss": 0.3263,
"step": 330800
},
{
"epoch": 0.7630226762962741,
"grad_norm": 0.4161362051963806,
"learning_rate": 1.1863158452838976e-05,
"loss": 0.3996,
"step": 331000
},
{
"epoch": 0.763483717188296,
"grad_norm": 0.47353842854499817,
"learning_rate": 1.184010640823788e-05,
"loss": 0.3441,
"step": 331200
},
{
"epoch": 0.7639447580803179,
"grad_norm": 0.4981532394886017,
"learning_rate": 1.1817054363636782e-05,
"loss": 0.3374,
"step": 331400
},
{
"epoch": 0.7644057989723398,
"grad_norm": 0.5217335820198059,
"learning_rate": 1.1794002319035688e-05,
"loss": 0.2804,
"step": 331600
},
{
"epoch": 0.7648668398643618,
"grad_norm": 0.39930033683776855,
"learning_rate": 1.1770950274434592e-05,
"loss": 0.3612,
"step": 331800
},
{
"epoch": 0.7653278807563837,
"grad_norm": 0.6013798117637634,
"learning_rate": 1.1747898229833496e-05,
"loss": 0.3522,
"step": 332000
},
{
"epoch": 0.7657889216484056,
"grad_norm": 0.621385395526886,
"learning_rate": 1.172507670567841e-05,
"loss": 0.3119,
"step": 332200
},
{
"epoch": 0.7662499625404275,
"grad_norm": 0.6038312315940857,
"learning_rate": 1.1702024661077316e-05,
"loss": 0.3521,
"step": 332400
},
{
"epoch": 0.7667110034324495,
"grad_norm": 0.3375696539878845,
"learning_rate": 1.167897261647622e-05,
"loss": 0.3313,
"step": 332600
},
{
"epoch": 0.7671720443244714,
"grad_norm": 0.6996720433235168,
"learning_rate": 1.1655920571875124e-05,
"loss": 0.3565,
"step": 332800
},
{
"epoch": 0.7676330852164933,
"grad_norm": 0.4329245686531067,
"learning_rate": 1.1632868527274026e-05,
"loss": 0.3402,
"step": 333000
},
{
"epoch": 0.7680941261085152,
"grad_norm": 0.44923895597457886,
"learning_rate": 1.160981648267293e-05,
"loss": 0.3685,
"step": 333200
},
{
"epoch": 0.7685551670005372,
"grad_norm": 0.6261276602745056,
"learning_rate": 1.1586764438071836e-05,
"loss": 0.3702,
"step": 333400
},
{
"epoch": 0.769016207892559,
"grad_norm": 0.351365327835083,
"learning_rate": 1.156371239347074e-05,
"loss": 0.3791,
"step": 333600
},
{
"epoch": 0.7694772487845809,
"grad_norm": 0.3072427809238434,
"learning_rate": 1.1540660348869644e-05,
"loss": 0.3334,
"step": 333800
},
{
"epoch": 0.7699382896766028,
"grad_norm": 0.6335543990135193,
"learning_rate": 1.1517608304268548e-05,
"loss": 0.3679,
"step": 334000
},
{
"epoch": 0.7703993305686248,
"grad_norm": 0.6969138383865356,
"learning_rate": 1.1494556259667452e-05,
"loss": 0.3297,
"step": 334200
},
{
"epoch": 0.7708603714606467,
"grad_norm": 0.2591989040374756,
"learning_rate": 1.1471619475289361e-05,
"loss": 0.3308,
"step": 334400
},
{
"epoch": 0.7713214123526686,
"grad_norm": 0.43712693452835083,
"learning_rate": 1.1448567430688267e-05,
"loss": 0.3129,
"step": 334600
},
{
"epoch": 0.7717824532446905,
"grad_norm": 0.23401319980621338,
"learning_rate": 1.142551538608717e-05,
"loss": 0.3493,
"step": 334800
},
{
"epoch": 0.7722434941367124,
"grad_norm": 2.6639888286590576,
"learning_rate": 1.1402463341486073e-05,
"loss": 0.3675,
"step": 335000
},
{
"epoch": 0.7727045350287344,
"grad_norm": 0.3677375316619873,
"learning_rate": 1.1379411296884977e-05,
"loss": 0.3342,
"step": 335200
},
{
"epoch": 0.7731655759207563,
"grad_norm": 0.335509717464447,
"learning_rate": 1.1356359252283882e-05,
"loss": 0.3442,
"step": 335400
},
{
"epoch": 0.7736266168127782,
"grad_norm": 0.5779723525047302,
"learning_rate": 1.1333307207682786e-05,
"loss": 0.3273,
"step": 335600
},
{
"epoch": 0.7740876577048001,
"grad_norm": 0.2849660813808441,
"learning_rate": 1.131025516308169e-05,
"loss": 0.3444,
"step": 335800
},
{
"epoch": 0.7745486985968221,
"grad_norm": 0.23056674003601074,
"learning_rate": 1.1287203118480594e-05,
"loss": 0.3021,
"step": 336000
},
{
"epoch": 0.775009739488844,
"grad_norm": 0.5832560062408447,
"learning_rate": 1.1264151073879498e-05,
"loss": 0.348,
"step": 336200
},
{
"epoch": 0.7754707803808659,
"grad_norm": 0.3691689670085907,
"learning_rate": 1.1241099029278402e-05,
"loss": 0.3401,
"step": 336400
},
{
"epoch": 0.7759318212728878,
"grad_norm": 0.7762495279312134,
"learning_rate": 1.1218046984677306e-05,
"loss": 0.4003,
"step": 336600
},
{
"epoch": 0.7763928621649098,
"grad_norm": 0.7631283402442932,
"learning_rate": 1.119499494007621e-05,
"loss": 0.3371,
"step": 336800
},
{
"epoch": 0.7768539030569316,
"grad_norm": 1.1363520622253418,
"learning_rate": 1.1171942895475114e-05,
"loss": 0.3009,
"step": 337000
},
{
"epoch": 0.7773149439489535,
"grad_norm": 0.6495063304901123,
"learning_rate": 1.114889085087402e-05,
"loss": 0.3623,
"step": 337200
},
{
"epoch": 0.7777759848409754,
"grad_norm": 0.6790567636489868,
"learning_rate": 1.1125954066495928e-05,
"loss": 0.3501,
"step": 337400
},
{
"epoch": 0.7782370257329974,
"grad_norm": 0.26499179005622864,
"learning_rate": 1.1102902021894833e-05,
"loss": 0.339,
"step": 337600
},
{
"epoch": 0.7786980666250193,
"grad_norm": 0.27862033247947693,
"learning_rate": 1.1079849977293737e-05,
"loss": 0.3506,
"step": 337800
},
{
"epoch": 0.7791591075170412,
"grad_norm": 0.9641061425209045,
"learning_rate": 1.1056797932692641e-05,
"loss": 0.3389,
"step": 338000
},
{
"epoch": 0.7796201484090631,
"grad_norm": 0.6359947323799133,
"learning_rate": 1.1033745888091543e-05,
"loss": 0.3234,
"step": 338200
},
{
"epoch": 0.780081189301085,
"grad_norm": 0.36157238483428955,
"learning_rate": 1.1010693843490449e-05,
"loss": 0.3349,
"step": 338400
},
{
"epoch": 0.780542230193107,
"grad_norm": 0.33560287952423096,
"learning_rate": 1.0987641798889353e-05,
"loss": 0.339,
"step": 338600
},
{
"epoch": 0.7810032710851289,
"grad_norm": 0.44259363412857056,
"learning_rate": 1.0964589754288257e-05,
"loss": 0.3225,
"step": 338800
},
{
"epoch": 0.7814643119771508,
"grad_norm": 0.4264705181121826,
"learning_rate": 1.0941537709687161e-05,
"loss": 0.318,
"step": 339000
},
{
"epoch": 0.7819253528691728,
"grad_norm": 0.39866140484809875,
"learning_rate": 1.0918485665086067e-05,
"loss": 0.3443,
"step": 339200
},
{
"epoch": 0.7823863937611947,
"grad_norm": 0.08641880005598068,
"learning_rate": 1.0895433620484969e-05,
"loss": 0.3738,
"step": 339400
},
{
"epoch": 0.7828474346532166,
"grad_norm": 0.2622983753681183,
"learning_rate": 1.087249683610688e-05,
"loss": 0.3489,
"step": 339600
},
{
"epoch": 0.7833084755452385,
"grad_norm": 0.5842790603637695,
"learning_rate": 1.0849444791505784e-05,
"loss": 0.3097,
"step": 339800
},
{
"epoch": 0.7837695164372604,
"grad_norm": 0.6991069316864014,
"learning_rate": 1.0826392746904688e-05,
"loss": 0.2891,
"step": 340000
},
{
"epoch": 0.7842305573292824,
"grad_norm": 0.6470245718955994,
"learning_rate": 1.080334070230359e-05,
"loss": 0.3419,
"step": 340200
},
{
"epoch": 0.7846915982213042,
"grad_norm": 0.17487183213233948,
"learning_rate": 1.0780288657702496e-05,
"loss": 0.3527,
"step": 340400
},
{
"epoch": 0.7851526391133261,
"grad_norm": 0.4151386320590973,
"learning_rate": 1.07572366131014e-05,
"loss": 0.3574,
"step": 340600
},
{
"epoch": 0.785613680005348,
"grad_norm": 0.22517116367816925,
"learning_rate": 1.0734184568500304e-05,
"loss": 0.3171,
"step": 340800
},
{
"epoch": 0.78607472089737,
"grad_norm": 2.3805177211761475,
"learning_rate": 1.0711132523899208e-05,
"loss": 0.3545,
"step": 341000
},
{
"epoch": 0.7865357617893919,
"grad_norm": 0.4683511555194855,
"learning_rate": 1.0688080479298112e-05,
"loss": 0.329,
"step": 341200
},
{
"epoch": 0.7869968026814138,
"grad_norm": 0.6103031039237976,
"learning_rate": 1.0665028434697016e-05,
"loss": 0.3254,
"step": 341400
},
{
"epoch": 0.7874578435734357,
"grad_norm": 0.3412925601005554,
"learning_rate": 1.064197639009592e-05,
"loss": 0.3361,
"step": 341600
},
{
"epoch": 0.7879188844654577,
"grad_norm": 0.505245566368103,
"learning_rate": 1.0618924345494824e-05,
"loss": 0.3568,
"step": 341800
},
{
"epoch": 0.7883799253574796,
"grad_norm": 3.5416276454925537,
"learning_rate": 1.0595872300893728e-05,
"loss": 0.342,
"step": 342000
},
{
"epoch": 0.7888409662495015,
"grad_norm": 0.7549706697463989,
"learning_rate": 1.0572820256292633e-05,
"loss": 0.3579,
"step": 342200
},
{
"epoch": 0.7893020071415234,
"grad_norm": 0.2924489676952362,
"learning_rate": 1.0549768211691537e-05,
"loss": 0.3424,
"step": 342400
},
{
"epoch": 0.7897630480335454,
"grad_norm": 0.2786658704280853,
"learning_rate": 1.052671616709044e-05,
"loss": 0.3483,
"step": 342600
},
{
"epoch": 0.7902240889255673,
"grad_norm": 0.5424038171768188,
"learning_rate": 1.0503664122489344e-05,
"loss": 0.3607,
"step": 342800
},
{
"epoch": 0.7906851298175892,
"grad_norm": 0.4523806571960449,
"learning_rate": 1.048061207788825e-05,
"loss": 0.3069,
"step": 343000
},
{
"epoch": 0.7911461707096111,
"grad_norm": 0.4648999273777008,
"learning_rate": 1.0457560033287153e-05,
"loss": 0.3382,
"step": 343200
},
{
"epoch": 0.791607211601633,
"grad_norm": 0.5711612105369568,
"learning_rate": 1.0434623248909063e-05,
"loss": 0.3529,
"step": 343400
},
{
"epoch": 0.792068252493655,
"grad_norm": 0.5079129934310913,
"learning_rate": 1.0411571204307966e-05,
"loss": 0.3405,
"step": 343600
},
{
"epoch": 0.7925292933856768,
"grad_norm": 0.5167766213417053,
"learning_rate": 1.038851915970687e-05,
"loss": 0.3124,
"step": 343800
},
{
"epoch": 0.7929903342776987,
"grad_norm": 0.5312823057174683,
"learning_rate": 1.0365467115105774e-05,
"loss": 0.3303,
"step": 344000
},
{
"epoch": 0.7934513751697206,
"grad_norm": 0.5642597079277039,
"learning_rate": 1.0342415070504678e-05,
"loss": 0.3481,
"step": 344200
},
{
"epoch": 0.7939124160617426,
"grad_norm": 0.36466097831726074,
"learning_rate": 1.0319363025903584e-05,
"loss": 0.3229,
"step": 344400
},
{
"epoch": 0.7943734569537645,
"grad_norm": 0.3474145233631134,
"learning_rate": 1.0296310981302486e-05,
"loss": 0.336,
"step": 344600
},
{
"epoch": 0.7948344978457864,
"grad_norm": 0.42870771884918213,
"learning_rate": 1.027325893670139e-05,
"loss": 0.3317,
"step": 344800
},
{
"epoch": 0.7952955387378083,
"grad_norm": 0.2636290490627289,
"learning_rate": 1.0250206892100294e-05,
"loss": 0.3277,
"step": 345000
},
{
"epoch": 0.7957565796298303,
"grad_norm": 0.2751811444759369,
"learning_rate": 1.02271548474992e-05,
"loss": 0.3327,
"step": 345200
},
{
"epoch": 0.7962176205218522,
"grad_norm": 0.410022109746933,
"learning_rate": 1.0204102802898104e-05,
"loss": 0.3107,
"step": 345400
},
{
"epoch": 0.7966786614138741,
"grad_norm": 0.3887878656387329,
"learning_rate": 1.0181050758297008e-05,
"loss": 0.3489,
"step": 345600
},
{
"epoch": 0.797139702305896,
"grad_norm": 0.3395098149776459,
"learning_rate": 1.0157998713695912e-05,
"loss": 0.3418,
"step": 345800
},
{
"epoch": 0.797600743197918,
"grad_norm": 0.1733238846063614,
"learning_rate": 1.0134946669094816e-05,
"loss": 0.3379,
"step": 346000
},
{
"epoch": 0.7980617840899399,
"grad_norm": 0.7795068621635437,
"learning_rate": 1.011189462449372e-05,
"loss": 0.3483,
"step": 346200
},
{
"epoch": 0.7985228249819618,
"grad_norm": 0.40186524391174316,
"learning_rate": 1.0088842579892624e-05,
"loss": 0.3293,
"step": 346400
},
{
"epoch": 0.7989838658739837,
"grad_norm": 0.8612332940101624,
"learning_rate": 1.0065790535291528e-05,
"loss": 0.3449,
"step": 346600
},
{
"epoch": 0.7994449067660057,
"grad_norm": 0.4895592927932739,
"learning_rate": 1.0042738490690434e-05,
"loss": 0.337,
"step": 346800
},
{
"epoch": 0.7999059476580276,
"grad_norm": 0.49298328161239624,
"learning_rate": 1.0019686446089336e-05,
"loss": 0.3513,
"step": 347000
},
{
"epoch": 0.8003669885500494,
"grad_norm": 0.6252027153968811,
"learning_rate": 9.996749661711247e-06,
"loss": 0.3368,
"step": 347200
},
{
"epoch": 0.8008280294420713,
"grad_norm": 0.31628209352493286,
"learning_rate": 9.97369761711015e-06,
"loss": 0.3553,
"step": 347400
},
{
"epoch": 0.8012890703340932,
"grad_norm": 0.47392186522483826,
"learning_rate": 9.950645572509055e-06,
"loss": 0.3615,
"step": 347600
},
{
"epoch": 0.8017501112261152,
"grad_norm": 0.37043142318725586,
"learning_rate": 9.927593527907959e-06,
"loss": 0.3539,
"step": 347800
},
{
"epoch": 0.8022111521181371,
"grad_norm": 0.4550717771053314,
"learning_rate": 9.904541483306861e-06,
"loss": 0.337,
"step": 348000
},
{
"epoch": 0.802672193010159,
"grad_norm": 0.35163044929504395,
"learning_rate": 9.881489438705767e-06,
"loss": 0.3113,
"step": 348200
},
{
"epoch": 0.8031332339021809,
"grad_norm": 0.4993239939212799,
"learning_rate": 9.85843739410467e-06,
"loss": 0.3342,
"step": 348400
},
{
"epoch": 0.8035942747942029,
"grad_norm": 0.3358531892299652,
"learning_rate": 9.835385349503575e-06,
"loss": 0.3617,
"step": 348600
},
{
"epoch": 0.8040553156862248,
"grad_norm": 0.4327227771282196,
"learning_rate": 9.812333304902479e-06,
"loss": 0.3205,
"step": 348800
},
{
"epoch": 0.8045163565782467,
"grad_norm": 0.42475658655166626,
"learning_rate": 9.789281260301383e-06,
"loss": 0.3222,
"step": 349000
},
{
"epoch": 0.8049773974702686,
"grad_norm": 2.7171945571899414,
"learning_rate": 9.766229215700287e-06,
"loss": 0.3424,
"step": 349200
},
{
"epoch": 0.8054384383622906,
"grad_norm": 0.2042093276977539,
"learning_rate": 9.743292431322197e-06,
"loss": 0.3581,
"step": 349400
},
{
"epoch": 0.8058994792543125,
"grad_norm": 0.24315522611141205,
"learning_rate": 9.720240386721101e-06,
"loss": 0.3284,
"step": 349600
},
{
"epoch": 0.8063605201463344,
"grad_norm": 0.42488613724708557,
"learning_rate": 9.697188342120005e-06,
"loss": 0.3428,
"step": 349800
},
{
"epoch": 0.8068215610383563,
"grad_norm": 0.5881152749061584,
"learning_rate": 9.674136297518908e-06,
"loss": 0.3199,
"step": 350000
},
{
"epoch": 0.8068215610383563,
"eval_loss": 0.34083473682403564,
"eval_runtime": 223.8348,
"eval_samples_per_second": 19.577,
"eval_steps_per_second": 19.577,
"step": 350000
}
],
"logging_steps": 200,
"max_steps": 433801,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.3263892905984e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}