craa's picture
Upload folder using huggingface_hub
f879408 verified
{
"best_global_step": 89000,
"best_metric": 3.5322518348693848,
"best_model_checkpoint": "/scratch/cl5625/exceptions/models/resemble_to_hit_frequency_1001/checkpoint-40000",
"epoch": 31.75990675990676,
"eval_steps": 1000,
"global_step": 109000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014568764568764568,
"grad_norm": 1.0800504684448242,
"learning_rate": 0.000294,
"loss": 8.4423,
"step": 50
},
{
"epoch": 0.029137529137529136,
"grad_norm": 1.1450026035308838,
"learning_rate": 0.0005939999999999999,
"loss": 6.716,
"step": 100
},
{
"epoch": 0.043706293706293704,
"grad_norm": 0.8036891222000122,
"learning_rate": 0.0005998285714285713,
"loss": 6.3382,
"step": 150
},
{
"epoch": 0.05827505827505827,
"grad_norm": 0.45410433411598206,
"learning_rate": 0.0005996536443148687,
"loss": 6.1321,
"step": 200
},
{
"epoch": 0.07284382284382285,
"grad_norm": 0.4067866802215576,
"learning_rate": 0.0005994787172011662,
"loss": 6.0112,
"step": 250
},
{
"epoch": 0.08741258741258741,
"grad_norm": 0.5059288144111633,
"learning_rate": 0.0005993037900874635,
"loss": 5.8763,
"step": 300
},
{
"epoch": 0.10198135198135198,
"grad_norm": 0.4744279086589813,
"learning_rate": 0.0005991288629737609,
"loss": 5.7396,
"step": 350
},
{
"epoch": 0.11655011655011654,
"grad_norm": 0.4487614035606384,
"learning_rate": 0.0005989539358600582,
"loss": 5.6258,
"step": 400
},
{
"epoch": 0.13111888111888112,
"grad_norm": 0.49402862787246704,
"learning_rate": 0.0005987790087463557,
"loss": 5.5118,
"step": 450
},
{
"epoch": 0.1456876456876457,
"grad_norm": 0.5154410004615784,
"learning_rate": 0.000598604081632653,
"loss": 5.4117,
"step": 500
},
{
"epoch": 0.16025641025641027,
"grad_norm": 0.42797785997390747,
"learning_rate": 0.0005984291545189504,
"loss": 5.3483,
"step": 550
},
{
"epoch": 0.17482517482517482,
"grad_norm": 0.40208497643470764,
"learning_rate": 0.0005982542274052477,
"loss": 5.2713,
"step": 600
},
{
"epoch": 0.1893939393939394,
"grad_norm": 0.4364264905452728,
"learning_rate": 0.0005980793002915452,
"loss": 5.1872,
"step": 650
},
{
"epoch": 0.20396270396270397,
"grad_norm": 0.43082669377326965,
"learning_rate": 0.0005979043731778425,
"loss": 5.1489,
"step": 700
},
{
"epoch": 0.21853146853146854,
"grad_norm": 0.42554041743278503,
"learning_rate": 0.0005977294460641399,
"loss": 5.1058,
"step": 750
},
{
"epoch": 0.2331002331002331,
"grad_norm": 0.4088204801082611,
"learning_rate": 0.0005975545189504372,
"loss": 5.0338,
"step": 800
},
{
"epoch": 0.24766899766899766,
"grad_norm": 0.46788325905799866,
"learning_rate": 0.0005973795918367347,
"loss": 4.9637,
"step": 850
},
{
"epoch": 0.26223776223776224,
"grad_norm": 0.4362371861934662,
"learning_rate": 0.000597204664723032,
"loss": 4.9335,
"step": 900
},
{
"epoch": 0.2768065268065268,
"grad_norm": 0.4068501591682434,
"learning_rate": 0.0005970297376093294,
"loss": 4.8877,
"step": 950
},
{
"epoch": 0.2913752913752914,
"grad_norm": 0.43599948287010193,
"learning_rate": 0.0005968548104956268,
"loss": 4.8439,
"step": 1000
},
{
"epoch": 0.2913752913752914,
"eval_accuracy": 0.25089175031884764,
"eval_loss": 4.7751851081848145,
"eval_runtime": 179.8397,
"eval_samples_per_second": 92.538,
"eval_steps_per_second": 5.788,
"step": 1000
},
{
"epoch": 0.30594405594405594,
"grad_norm": 0.43511125445365906,
"learning_rate": 0.0005966798833819242,
"loss": 4.8042,
"step": 1050
},
{
"epoch": 0.32051282051282054,
"grad_norm": 0.5234044790267944,
"learning_rate": 0.0005965049562682215,
"loss": 4.7645,
"step": 1100
},
{
"epoch": 0.3350815850815851,
"grad_norm": 0.45317813754081726,
"learning_rate": 0.0005963300291545189,
"loss": 4.7255,
"step": 1150
},
{
"epoch": 0.34965034965034963,
"grad_norm": 0.46833401918411255,
"learning_rate": 0.0005961551020408162,
"loss": 4.6673,
"step": 1200
},
{
"epoch": 0.36421911421911424,
"grad_norm": 0.39988529682159424,
"learning_rate": 0.0005959801749271137,
"loss": 4.6489,
"step": 1250
},
{
"epoch": 0.3787878787878788,
"grad_norm": 0.46132174134254456,
"learning_rate": 0.000595805247813411,
"loss": 4.6077,
"step": 1300
},
{
"epoch": 0.39335664335664333,
"grad_norm": 0.5386888980865479,
"learning_rate": 0.0005956303206997084,
"loss": 4.6052,
"step": 1350
},
{
"epoch": 0.40792540792540793,
"grad_norm": 0.44581151008605957,
"learning_rate": 0.0005954553935860059,
"loss": 4.5552,
"step": 1400
},
{
"epoch": 0.4224941724941725,
"grad_norm": 0.4631711542606354,
"learning_rate": 0.0005952804664723032,
"loss": 4.5286,
"step": 1450
},
{
"epoch": 0.4370629370629371,
"grad_norm": 0.441871702671051,
"learning_rate": 0.0005951055393586005,
"loss": 4.5022,
"step": 1500
},
{
"epoch": 0.45163170163170163,
"grad_norm": 0.4177614152431488,
"learning_rate": 0.0005949306122448979,
"loss": 4.4857,
"step": 1550
},
{
"epoch": 0.4662004662004662,
"grad_norm": 0.4227793216705322,
"learning_rate": 0.0005947556851311952,
"loss": 4.4736,
"step": 1600
},
{
"epoch": 0.4807692307692308,
"grad_norm": 0.4034147262573242,
"learning_rate": 0.0005945807580174927,
"loss": 4.441,
"step": 1650
},
{
"epoch": 0.49533799533799533,
"grad_norm": 0.4163498282432556,
"learning_rate": 0.00059440583090379,
"loss": 4.4279,
"step": 1700
},
{
"epoch": 0.5099067599067599,
"grad_norm": 0.42639055848121643,
"learning_rate": 0.0005942309037900874,
"loss": 4.4093,
"step": 1750
},
{
"epoch": 0.5244755244755245,
"grad_norm": 0.4284451901912689,
"learning_rate": 0.0005940559766763847,
"loss": 4.3972,
"step": 1800
},
{
"epoch": 0.539044289044289,
"grad_norm": 0.4254048764705658,
"learning_rate": 0.0005938810495626822,
"loss": 4.3764,
"step": 1850
},
{
"epoch": 0.5536130536130536,
"grad_norm": 0.40332749485969543,
"learning_rate": 0.0005937061224489796,
"loss": 4.3664,
"step": 1900
},
{
"epoch": 0.5681818181818182,
"grad_norm": 0.41593775153160095,
"learning_rate": 0.0005935311953352769,
"loss": 4.3491,
"step": 1950
},
{
"epoch": 0.5827505827505828,
"grad_norm": 0.3947841227054596,
"learning_rate": 0.0005933562682215743,
"loss": 4.329,
"step": 2000
},
{
"epoch": 0.5827505827505828,
"eval_accuracy": 0.3000407334753674,
"eval_loss": 4.277864933013916,
"eval_runtime": 179.9996,
"eval_samples_per_second": 92.456,
"eval_steps_per_second": 5.783,
"step": 2000
},
{
"epoch": 0.5973193473193473,
"grad_norm": 0.3777885437011719,
"learning_rate": 0.0005931813411078717,
"loss": 4.3248,
"step": 2050
},
{
"epoch": 0.6118881118881119,
"grad_norm": 0.4356587827205658,
"learning_rate": 0.000593006413994169,
"loss": 4.3067,
"step": 2100
},
{
"epoch": 0.6264568764568764,
"grad_norm": 0.3846246898174286,
"learning_rate": 0.0005928314868804664,
"loss": 4.3008,
"step": 2150
},
{
"epoch": 0.6410256410256411,
"grad_norm": 0.3826352655887604,
"learning_rate": 0.0005926565597667638,
"loss": 4.2847,
"step": 2200
},
{
"epoch": 0.6555944055944056,
"grad_norm": 0.35496312379837036,
"learning_rate": 0.0005924816326530612,
"loss": 4.2659,
"step": 2250
},
{
"epoch": 0.6701631701631702,
"grad_norm": 0.3881349563598633,
"learning_rate": 0.0005923067055393586,
"loss": 4.2684,
"step": 2300
},
{
"epoch": 0.6847319347319347,
"grad_norm": 0.3730715215206146,
"learning_rate": 0.0005921317784256559,
"loss": 4.2634,
"step": 2350
},
{
"epoch": 0.6993006993006993,
"grad_norm": 0.38490796089172363,
"learning_rate": 0.0005919568513119533,
"loss": 4.242,
"step": 2400
},
{
"epoch": 0.7138694638694638,
"grad_norm": 0.3891685903072357,
"learning_rate": 0.0005917819241982507,
"loss": 4.2256,
"step": 2450
},
{
"epoch": 0.7284382284382285,
"grad_norm": 0.4048764705657959,
"learning_rate": 0.000591606997084548,
"loss": 4.2221,
"step": 2500
},
{
"epoch": 0.743006993006993,
"grad_norm": 0.3975249230861664,
"learning_rate": 0.0005914320699708454,
"loss": 4.2187,
"step": 2550
},
{
"epoch": 0.7575757575757576,
"grad_norm": 0.38864725828170776,
"learning_rate": 0.0005912571428571428,
"loss": 4.2054,
"step": 2600
},
{
"epoch": 0.7721445221445221,
"grad_norm": 0.4100123345851898,
"learning_rate": 0.0005910822157434402,
"loss": 4.2029,
"step": 2650
},
{
"epoch": 0.7867132867132867,
"grad_norm": 0.3994918763637543,
"learning_rate": 0.0005909072886297376,
"loss": 4.1821,
"step": 2700
},
{
"epoch": 0.8012820512820513,
"grad_norm": 0.4012759327888489,
"learning_rate": 0.0005907323615160349,
"loss": 4.1892,
"step": 2750
},
{
"epoch": 0.8158508158508159,
"grad_norm": 0.40569165349006653,
"learning_rate": 0.0005905574344023324,
"loss": 4.1752,
"step": 2800
},
{
"epoch": 0.8304195804195804,
"grad_norm": 0.4140167832374573,
"learning_rate": 0.0005903825072886297,
"loss": 4.1679,
"step": 2850
},
{
"epoch": 0.844988344988345,
"grad_norm": 0.40095993876457214,
"learning_rate": 0.000590207580174927,
"loss": 4.152,
"step": 2900
},
{
"epoch": 0.8595571095571095,
"grad_norm": 0.3606632351875305,
"learning_rate": 0.0005900326530612244,
"loss": 4.1414,
"step": 2950
},
{
"epoch": 0.8741258741258742,
"grad_norm": 0.3624541163444519,
"learning_rate": 0.0005898577259475218,
"loss": 4.1511,
"step": 3000
},
{
"epoch": 0.8741258741258742,
"eval_accuracy": 0.31668007594488373,
"eval_loss": 4.091818809509277,
"eval_runtime": 179.9935,
"eval_samples_per_second": 92.459,
"eval_steps_per_second": 5.784,
"step": 3000
},
{
"epoch": 0.8886946386946387,
"grad_norm": 0.37599942088127136,
"learning_rate": 0.0005896827988338192,
"loss": 4.1237,
"step": 3050
},
{
"epoch": 0.9032634032634033,
"grad_norm": 0.37333548069000244,
"learning_rate": 0.0005895078717201166,
"loss": 4.1181,
"step": 3100
},
{
"epoch": 0.9178321678321678,
"grad_norm": 0.3791821300983429,
"learning_rate": 0.000589332944606414,
"loss": 4.1161,
"step": 3150
},
{
"epoch": 0.9324009324009324,
"grad_norm": 0.34895625710487366,
"learning_rate": 0.0005891580174927114,
"loss": 4.1138,
"step": 3200
},
{
"epoch": 0.946969696969697,
"grad_norm": 0.4230435788631439,
"learning_rate": 0.0005889830903790087,
"loss": 4.0998,
"step": 3250
},
{
"epoch": 0.9615384615384616,
"grad_norm": 0.356535404920578,
"learning_rate": 0.000588808163265306,
"loss": 4.1032,
"step": 3300
},
{
"epoch": 0.9761072261072261,
"grad_norm": 0.39397764205932617,
"learning_rate": 0.0005886332361516035,
"loss": 4.0832,
"step": 3350
},
{
"epoch": 0.9906759906759907,
"grad_norm": 0.34773239493370056,
"learning_rate": 0.0005884583090379008,
"loss": 4.0945,
"step": 3400
},
{
"epoch": 1.0052447552447552,
"grad_norm": 0.3444010019302368,
"learning_rate": 0.0005882833819241982,
"loss": 4.0365,
"step": 3450
},
{
"epoch": 1.0198135198135199,
"grad_norm": 0.3705681264400482,
"learning_rate": 0.0005881084548104955,
"loss": 4.0123,
"step": 3500
},
{
"epoch": 1.0343822843822843,
"grad_norm": 0.351367712020874,
"learning_rate": 0.000587933527696793,
"loss": 4.0021,
"step": 3550
},
{
"epoch": 1.048951048951049,
"grad_norm": 0.34913498163223267,
"learning_rate": 0.0005877586005830904,
"loss": 4.0137,
"step": 3600
},
{
"epoch": 1.0635198135198136,
"grad_norm": 0.35815438628196716,
"learning_rate": 0.0005875836734693877,
"loss": 4.0028,
"step": 3650
},
{
"epoch": 1.078088578088578,
"grad_norm": 0.3701878488063812,
"learning_rate": 0.0005874087463556851,
"loss": 4.0022,
"step": 3700
},
{
"epoch": 1.0926573426573427,
"grad_norm": 0.3556549847126007,
"learning_rate": 0.0005872338192419825,
"loss": 4.011,
"step": 3750
},
{
"epoch": 1.1072261072261071,
"grad_norm": 0.3409753739833832,
"learning_rate": 0.0005870588921282798,
"loss": 4.0026,
"step": 3800
},
{
"epoch": 1.1217948717948718,
"grad_norm": 0.3481835126876831,
"learning_rate": 0.0005868839650145772,
"loss": 3.992,
"step": 3850
},
{
"epoch": 1.1363636363636362,
"grad_norm": 0.3449511229991913,
"learning_rate": 0.0005867090379008745,
"loss": 3.9925,
"step": 3900
},
{
"epoch": 1.150932400932401,
"grad_norm": 0.3285890817642212,
"learning_rate": 0.000586534110787172,
"loss": 3.996,
"step": 3950
},
{
"epoch": 1.1655011655011656,
"grad_norm": 0.34176114201545715,
"learning_rate": 0.0005863591836734694,
"loss": 3.9927,
"step": 4000
},
{
"epoch": 1.1655011655011656,
"eval_accuracy": 0.32579760119340617,
"eval_loss": 3.9809024333953857,
"eval_runtime": 179.99,
"eval_samples_per_second": 92.461,
"eval_steps_per_second": 5.784,
"step": 4000
},
{
"epoch": 1.18006993006993,
"grad_norm": 0.33748361468315125,
"learning_rate": 0.0005861842565597667,
"loss": 3.9866,
"step": 4050
},
{
"epoch": 1.1946386946386947,
"grad_norm": 0.3346062898635864,
"learning_rate": 0.0005860093294460641,
"loss": 3.9696,
"step": 4100
},
{
"epoch": 1.2092074592074593,
"grad_norm": 0.3950088322162628,
"learning_rate": 0.0005858344023323615,
"loss": 3.9875,
"step": 4150
},
{
"epoch": 1.2237762237762237,
"grad_norm": 0.3502199053764343,
"learning_rate": 0.0005856594752186588,
"loss": 3.9619,
"step": 4200
},
{
"epoch": 1.2383449883449884,
"grad_norm": 0.35575905442237854,
"learning_rate": 0.0005854845481049562,
"loss": 3.9642,
"step": 4250
},
{
"epoch": 1.2529137529137528,
"grad_norm": 0.3766721487045288,
"learning_rate": 0.0005853096209912535,
"loss": 3.973,
"step": 4300
},
{
"epoch": 1.2674825174825175,
"grad_norm": 0.3426792621612549,
"learning_rate": 0.000585134693877551,
"loss": 3.965,
"step": 4350
},
{
"epoch": 1.282051282051282,
"grad_norm": 0.32944539189338684,
"learning_rate": 0.0005849597667638484,
"loss": 3.9634,
"step": 4400
},
{
"epoch": 1.2966200466200466,
"grad_norm": 0.3352791368961334,
"learning_rate": 0.0005847848396501457,
"loss": 3.953,
"step": 4450
},
{
"epoch": 1.3111888111888113,
"grad_norm": 0.3523617386817932,
"learning_rate": 0.0005846099125364432,
"loss": 3.9577,
"step": 4500
},
{
"epoch": 1.3257575757575757,
"grad_norm": 0.35192176699638367,
"learning_rate": 0.0005844349854227405,
"loss": 3.9518,
"step": 4550
},
{
"epoch": 1.3403263403263403,
"grad_norm": 0.3310588598251343,
"learning_rate": 0.0005842600583090379,
"loss": 3.9505,
"step": 4600
},
{
"epoch": 1.354895104895105,
"grad_norm": 0.3588995933532715,
"learning_rate": 0.0005840851311953352,
"loss": 3.9455,
"step": 4650
},
{
"epoch": 1.3694638694638694,
"grad_norm": 0.36037853360176086,
"learning_rate": 0.0005839102040816325,
"loss": 3.929,
"step": 4700
},
{
"epoch": 1.384032634032634,
"grad_norm": 0.335283488035202,
"learning_rate": 0.00058373527696793,
"loss": 3.9381,
"step": 4750
},
{
"epoch": 1.3986013986013985,
"grad_norm": 0.4042842388153076,
"learning_rate": 0.0005835603498542273,
"loss": 3.9542,
"step": 4800
},
{
"epoch": 1.4131701631701632,
"grad_norm": 0.3445352613925934,
"learning_rate": 0.0005833854227405247,
"loss": 3.9421,
"step": 4850
},
{
"epoch": 1.4277389277389276,
"grad_norm": 0.33620062470436096,
"learning_rate": 0.0005832104956268222,
"loss": 3.9228,
"step": 4900
},
{
"epoch": 1.4423076923076923,
"grad_norm": 0.3565351665019989,
"learning_rate": 0.0005830355685131195,
"loss": 3.9231,
"step": 4950
},
{
"epoch": 1.456876456876457,
"grad_norm": 0.3392309248447418,
"learning_rate": 0.0005828606413994169,
"loss": 3.9166,
"step": 5000
},
{
"epoch": 1.456876456876457,
"eval_accuracy": 0.33273405109228976,
"eval_loss": 3.90755033493042,
"eval_runtime": 180.1089,
"eval_samples_per_second": 92.4,
"eval_steps_per_second": 5.78,
"step": 5000
},
{
"epoch": 1.4714452214452214,
"grad_norm": 0.33691731095314026,
"learning_rate": 0.0005826857142857142,
"loss": 3.919,
"step": 5050
},
{
"epoch": 1.486013986013986,
"grad_norm": 0.34635353088378906,
"learning_rate": 0.0005825107871720116,
"loss": 3.9288,
"step": 5100
},
{
"epoch": 1.5005827505827507,
"grad_norm": 0.33720529079437256,
"learning_rate": 0.000582335860058309,
"loss": 3.9096,
"step": 5150
},
{
"epoch": 1.5151515151515151,
"grad_norm": 0.33896803855895996,
"learning_rate": 0.0005821609329446063,
"loss": 3.9164,
"step": 5200
},
{
"epoch": 1.5297202797202796,
"grad_norm": 0.3564818799495697,
"learning_rate": 0.0005819860058309037,
"loss": 3.923,
"step": 5250
},
{
"epoch": 1.5442890442890445,
"grad_norm": 0.3230993151664734,
"learning_rate": 0.0005818110787172012,
"loss": 3.9313,
"step": 5300
},
{
"epoch": 1.558857808857809,
"grad_norm": 0.3241836428642273,
"learning_rate": 0.0005816361516034985,
"loss": 3.9051,
"step": 5350
},
{
"epoch": 1.5734265734265733,
"grad_norm": 0.3641699254512787,
"learning_rate": 0.0005814612244897959,
"loss": 3.8919,
"step": 5400
},
{
"epoch": 1.587995337995338,
"grad_norm": 0.33417633175849915,
"learning_rate": 0.0005812862973760932,
"loss": 3.9001,
"step": 5450
},
{
"epoch": 1.6025641025641026,
"grad_norm": 0.3381204903125763,
"learning_rate": 0.0005811113702623907,
"loss": 3.8948,
"step": 5500
},
{
"epoch": 1.617132867132867,
"grad_norm": 0.33705613017082214,
"learning_rate": 0.000580936443148688,
"loss": 3.9058,
"step": 5550
},
{
"epoch": 1.6317016317016317,
"grad_norm": 0.33371835947036743,
"learning_rate": 0.0005807615160349853,
"loss": 3.8844,
"step": 5600
},
{
"epoch": 1.6462703962703964,
"grad_norm": 0.3282848596572876,
"learning_rate": 0.0005805865889212827,
"loss": 3.8905,
"step": 5650
},
{
"epoch": 1.6608391608391608,
"grad_norm": 0.33143365383148193,
"learning_rate": 0.0005804116618075802,
"loss": 3.8805,
"step": 5700
},
{
"epoch": 1.6754079254079253,
"grad_norm": 0.3147096335887909,
"learning_rate": 0.0005802367346938775,
"loss": 3.8818,
"step": 5750
},
{
"epoch": 1.68997668997669,
"grad_norm": 0.3499462902545929,
"learning_rate": 0.0005800618075801749,
"loss": 3.887,
"step": 5800
},
{
"epoch": 1.7045454545454546,
"grad_norm": 0.3366160988807678,
"learning_rate": 0.0005798868804664722,
"loss": 3.8708,
"step": 5850
},
{
"epoch": 1.719114219114219,
"grad_norm": 0.33289647102355957,
"learning_rate": 0.0005797119533527697,
"loss": 3.881,
"step": 5900
},
{
"epoch": 1.7336829836829837,
"grad_norm": 0.34272122383117676,
"learning_rate": 0.000579537026239067,
"loss": 3.888,
"step": 5950
},
{
"epoch": 1.7482517482517483,
"grad_norm": 0.33425042033195496,
"learning_rate": 0.0005793620991253643,
"loss": 3.8693,
"step": 6000
},
{
"epoch": 1.7482517482517483,
"eval_accuracy": 0.3378537221389026,
"eval_loss": 3.8500373363494873,
"eval_runtime": 180.0024,
"eval_samples_per_second": 92.454,
"eval_steps_per_second": 5.783,
"step": 6000
},
{
"epoch": 1.7628205128205128,
"grad_norm": 0.33192870020866394,
"learning_rate": 0.0005791871720116617,
"loss": 3.8684,
"step": 6050
},
{
"epoch": 1.7773892773892774,
"grad_norm": 0.3180764615535736,
"learning_rate": 0.0005790122448979591,
"loss": 3.8647,
"step": 6100
},
{
"epoch": 1.791958041958042,
"grad_norm": 0.3597005009651184,
"learning_rate": 0.0005788373177842565,
"loss": 3.8541,
"step": 6150
},
{
"epoch": 1.8065268065268065,
"grad_norm": 0.30698323249816895,
"learning_rate": 0.0005786623906705539,
"loss": 3.8699,
"step": 6200
},
{
"epoch": 1.821095571095571,
"grad_norm": 0.32494407892227173,
"learning_rate": 0.0005784874635568512,
"loss": 3.8687,
"step": 6250
},
{
"epoch": 1.8356643356643356,
"grad_norm": 0.32841014862060547,
"learning_rate": 0.0005783125364431487,
"loss": 3.8528,
"step": 6300
},
{
"epoch": 1.8502331002331003,
"grad_norm": 0.3171062171459198,
"learning_rate": 0.000578137609329446,
"loss": 3.8483,
"step": 6350
},
{
"epoch": 1.8648018648018647,
"grad_norm": 0.3196788728237152,
"learning_rate": 0.0005779626822157434,
"loss": 3.8493,
"step": 6400
},
{
"epoch": 1.8793706293706294,
"grad_norm": 0.324875146150589,
"learning_rate": 0.0005777877551020408,
"loss": 3.8649,
"step": 6450
},
{
"epoch": 1.893939393939394,
"grad_norm": 0.3102283775806427,
"learning_rate": 0.0005776128279883381,
"loss": 3.8619,
"step": 6500
},
{
"epoch": 1.9085081585081585,
"grad_norm": 0.34474343061447144,
"learning_rate": 0.0005774379008746355,
"loss": 3.8556,
"step": 6550
},
{
"epoch": 1.9230769230769231,
"grad_norm": 0.3284788429737091,
"learning_rate": 0.0005772629737609329,
"loss": 3.8463,
"step": 6600
},
{
"epoch": 1.9376456876456878,
"grad_norm": 0.31729623675346375,
"learning_rate": 0.0005770880466472303,
"loss": 3.837,
"step": 6650
},
{
"epoch": 1.9522144522144522,
"grad_norm": 0.3377760946750641,
"learning_rate": 0.0005769131195335277,
"loss": 3.8615,
"step": 6700
},
{
"epoch": 1.9667832167832167,
"grad_norm": 0.31680819392204285,
"learning_rate": 0.000576738192419825,
"loss": 3.8351,
"step": 6750
},
{
"epoch": 1.9813519813519813,
"grad_norm": 0.3283533453941345,
"learning_rate": 0.0005765632653061224,
"loss": 3.8376,
"step": 6800
},
{
"epoch": 1.995920745920746,
"grad_norm": 0.32663795351982117,
"learning_rate": 0.0005763883381924198,
"loss": 3.8481,
"step": 6850
},
{
"epoch": 2.0104895104895104,
"grad_norm": 0.3307703733444214,
"learning_rate": 0.0005762134110787171,
"loss": 3.7685,
"step": 6900
},
{
"epoch": 2.025058275058275,
"grad_norm": 0.3428336977958679,
"learning_rate": 0.0005760384839650145,
"loss": 3.734,
"step": 6950
},
{
"epoch": 2.0396270396270397,
"grad_norm": 0.3403928577899933,
"learning_rate": 0.0005758635568513119,
"loss": 3.7327,
"step": 7000
},
{
"epoch": 2.0396270396270397,
"eval_accuracy": 0.342164485630514,
"eval_loss": 3.808783769607544,
"eval_runtime": 180.0566,
"eval_samples_per_second": 92.427,
"eval_steps_per_second": 5.782,
"step": 7000
},
{
"epoch": 2.054195804195804,
"grad_norm": 0.3355982303619385,
"learning_rate": 0.0005756886297376093,
"loss": 3.7526,
"step": 7050
},
{
"epoch": 2.0687645687645686,
"grad_norm": 0.34529364109039307,
"learning_rate": 0.0005755137026239067,
"loss": 3.7421,
"step": 7100
},
{
"epoch": 2.0833333333333335,
"grad_norm": 0.3153204023838043,
"learning_rate": 0.000575338775510204,
"loss": 3.752,
"step": 7150
},
{
"epoch": 2.097902097902098,
"grad_norm": 0.32373619079589844,
"learning_rate": 0.0005751638483965014,
"loss": 3.737,
"step": 7200
},
{
"epoch": 2.1124708624708624,
"grad_norm": 0.32524994015693665,
"learning_rate": 0.0005749889212827988,
"loss": 3.7506,
"step": 7250
},
{
"epoch": 2.1270396270396272,
"grad_norm": 0.33004939556121826,
"learning_rate": 0.0005748139941690962,
"loss": 3.7495,
"step": 7300
},
{
"epoch": 2.1416083916083917,
"grad_norm": 0.3301999270915985,
"learning_rate": 0.0005746390670553935,
"loss": 3.7555,
"step": 7350
},
{
"epoch": 2.156177156177156,
"grad_norm": 0.31961825489997864,
"learning_rate": 0.000574464139941691,
"loss": 3.7478,
"step": 7400
},
{
"epoch": 2.1707459207459205,
"grad_norm": 0.3425387144088745,
"learning_rate": 0.0005742892128279883,
"loss": 3.7535,
"step": 7450
},
{
"epoch": 2.1853146853146854,
"grad_norm": 0.33788540959358215,
"learning_rate": 0.0005741142857142857,
"loss": 3.7545,
"step": 7500
},
{
"epoch": 2.19988344988345,
"grad_norm": 0.32931485772132874,
"learning_rate": 0.000573939358600583,
"loss": 3.7457,
"step": 7550
},
{
"epoch": 2.2144522144522143,
"grad_norm": 0.3231325149536133,
"learning_rate": 0.0005737644314868805,
"loss": 3.7522,
"step": 7600
},
{
"epoch": 2.229020979020979,
"grad_norm": 0.30641666054725647,
"learning_rate": 0.0005735895043731778,
"loss": 3.7573,
"step": 7650
},
{
"epoch": 2.2435897435897436,
"grad_norm": 0.3495963513851166,
"learning_rate": 0.0005734145772594752,
"loss": 3.7483,
"step": 7700
},
{
"epoch": 2.258158508158508,
"grad_norm": 0.3273448646068573,
"learning_rate": 0.0005732396501457726,
"loss": 3.7492,
"step": 7750
},
{
"epoch": 2.2727272727272725,
"grad_norm": 0.3312014639377594,
"learning_rate": 0.0005730647230320698,
"loss": 3.7369,
"step": 7800
},
{
"epoch": 2.2872960372960374,
"grad_norm": 0.31098538637161255,
"learning_rate": 0.0005728897959183673,
"loss": 3.7632,
"step": 7850
},
{
"epoch": 2.301864801864802,
"grad_norm": 0.3138977885246277,
"learning_rate": 0.0005727148688046647,
"loss": 3.725,
"step": 7900
},
{
"epoch": 2.3164335664335667,
"grad_norm": 0.3393580913543701,
"learning_rate": 0.000572539941690962,
"loss": 3.7624,
"step": 7950
},
{
"epoch": 2.331002331002331,
"grad_norm": 0.3338955044746399,
"learning_rate": 0.0005723650145772595,
"loss": 3.7487,
"step": 8000
},
{
"epoch": 2.331002331002331,
"eval_accuracy": 0.3450254713570997,
"eval_loss": 3.777087450027466,
"eval_runtime": 180.0959,
"eval_samples_per_second": 92.406,
"eval_steps_per_second": 5.78,
"step": 8000
},
{
"epoch": 2.3455710955710956,
"grad_norm": 0.3354654014110565,
"learning_rate": 0.0005721900874635568,
"loss": 3.7489,
"step": 8050
},
{
"epoch": 2.36013986013986,
"grad_norm": 0.31632018089294434,
"learning_rate": 0.0005720151603498542,
"loss": 3.748,
"step": 8100
},
{
"epoch": 2.374708624708625,
"grad_norm": 0.31123656034469604,
"learning_rate": 0.0005718402332361515,
"loss": 3.7349,
"step": 8150
},
{
"epoch": 2.3892773892773893,
"grad_norm": 0.3092937469482422,
"learning_rate": 0.000571665306122449,
"loss": 3.7501,
"step": 8200
},
{
"epoch": 2.4038461538461537,
"grad_norm": 0.3420781195163727,
"learning_rate": 0.0005714903790087463,
"loss": 3.7408,
"step": 8250
},
{
"epoch": 2.4184149184149186,
"grad_norm": 0.3158037066459656,
"learning_rate": 0.0005713154518950437,
"loss": 3.7381,
"step": 8300
},
{
"epoch": 2.432983682983683,
"grad_norm": 0.32453060150146484,
"learning_rate": 0.000571140524781341,
"loss": 3.7396,
"step": 8350
},
{
"epoch": 2.4475524475524475,
"grad_norm": 0.3126128613948822,
"learning_rate": 0.0005709655976676385,
"loss": 3.7409,
"step": 8400
},
{
"epoch": 2.462121212121212,
"grad_norm": 0.31964123249053955,
"learning_rate": 0.0005707906705539358,
"loss": 3.7503,
"step": 8450
},
{
"epoch": 2.476689976689977,
"grad_norm": 0.3062613606452942,
"learning_rate": 0.0005706157434402332,
"loss": 3.7406,
"step": 8500
},
{
"epoch": 2.4912587412587412,
"grad_norm": 0.31508180499076843,
"learning_rate": 0.0005704408163265305,
"loss": 3.7365,
"step": 8550
},
{
"epoch": 2.5058275058275057,
"grad_norm": 0.3289349377155304,
"learning_rate": 0.000570265889212828,
"loss": 3.7355,
"step": 8600
},
{
"epoch": 2.5203962703962706,
"grad_norm": 0.3557591736316681,
"learning_rate": 0.0005700909620991253,
"loss": 3.7401,
"step": 8650
},
{
"epoch": 2.534965034965035,
"grad_norm": 0.3184148073196411,
"learning_rate": 0.0005699160349854227,
"loss": 3.7459,
"step": 8700
},
{
"epoch": 2.5495337995337994,
"grad_norm": 0.33620187640190125,
"learning_rate": 0.00056974110787172,
"loss": 3.7431,
"step": 8750
},
{
"epoch": 2.564102564102564,
"grad_norm": 0.31378746032714844,
"learning_rate": 0.0005695661807580175,
"loss": 3.756,
"step": 8800
},
{
"epoch": 2.5786713286713288,
"grad_norm": 0.32717159390449524,
"learning_rate": 0.0005693912536443148,
"loss": 3.7267,
"step": 8850
},
{
"epoch": 2.593240093240093,
"grad_norm": 0.2987724244594574,
"learning_rate": 0.0005692163265306122,
"loss": 3.732,
"step": 8900
},
{
"epoch": 2.607808857808858,
"grad_norm": 0.3157244324684143,
"learning_rate": 0.0005690413994169095,
"loss": 3.7428,
"step": 8950
},
{
"epoch": 2.6223776223776225,
"grad_norm": 0.307952344417572,
"learning_rate": 0.000568866472303207,
"loss": 3.7285,
"step": 9000
},
{
"epoch": 2.6223776223776225,
"eval_accuracy": 0.3477683958560039,
"eval_loss": 3.749556064605713,
"eval_runtime": 180.156,
"eval_samples_per_second": 92.375,
"eval_steps_per_second": 5.778,
"step": 9000
},
{
"epoch": 2.636946386946387,
"grad_norm": 0.3186863660812378,
"learning_rate": 0.0005686915451895044,
"loss": 3.7367,
"step": 9050
},
{
"epoch": 2.6515151515151514,
"grad_norm": 0.33932939171791077,
"learning_rate": 0.0005685166180758016,
"loss": 3.7358,
"step": 9100
},
{
"epoch": 2.666083916083916,
"grad_norm": 0.3142889142036438,
"learning_rate": 0.000568341690962099,
"loss": 3.7354,
"step": 9150
},
{
"epoch": 2.6806526806526807,
"grad_norm": 0.3227601647377014,
"learning_rate": 0.0005681667638483965,
"loss": 3.7269,
"step": 9200
},
{
"epoch": 2.695221445221445,
"grad_norm": 0.36613523960113525,
"learning_rate": 0.0005679918367346938,
"loss": 3.7232,
"step": 9250
},
{
"epoch": 2.70979020979021,
"grad_norm": 0.3181409239768982,
"learning_rate": 0.0005678169096209912,
"loss": 3.7404,
"step": 9300
},
{
"epoch": 2.7243589743589745,
"grad_norm": 0.32013779878616333,
"learning_rate": 0.0005676419825072885,
"loss": 3.7268,
"step": 9350
},
{
"epoch": 2.738927738927739,
"grad_norm": 0.31618836522102356,
"learning_rate": 0.000567467055393586,
"loss": 3.723,
"step": 9400
},
{
"epoch": 2.7534965034965033,
"grad_norm": 0.34557044506073,
"learning_rate": 0.0005672921282798833,
"loss": 3.7266,
"step": 9450
},
{
"epoch": 2.768065268065268,
"grad_norm": 0.3225458264350891,
"learning_rate": 0.0005671172011661807,
"loss": 3.7298,
"step": 9500
},
{
"epoch": 2.7826340326340326,
"grad_norm": 0.3175308406352997,
"learning_rate": 0.000566942274052478,
"loss": 3.7272,
"step": 9550
},
{
"epoch": 2.797202797202797,
"grad_norm": 0.3606358468532562,
"learning_rate": 0.0005667673469387755,
"loss": 3.7187,
"step": 9600
},
{
"epoch": 2.811771561771562,
"grad_norm": 0.32616934180259705,
"learning_rate": 0.0005665924198250728,
"loss": 3.7128,
"step": 9650
},
{
"epoch": 2.8263403263403264,
"grad_norm": 0.30753543972969055,
"learning_rate": 0.0005664174927113702,
"loss": 3.7311,
"step": 9700
},
{
"epoch": 2.840909090909091,
"grad_norm": 0.3168071508407593,
"learning_rate": 0.0005662425655976676,
"loss": 3.7225,
"step": 9750
},
{
"epoch": 2.8554778554778553,
"grad_norm": 0.3064960837364197,
"learning_rate": 0.000566067638483965,
"loss": 3.714,
"step": 9800
},
{
"epoch": 2.87004662004662,
"grad_norm": 0.3164961338043213,
"learning_rate": 0.0005658927113702623,
"loss": 3.7264,
"step": 9850
},
{
"epoch": 2.8846153846153846,
"grad_norm": 0.32540807127952576,
"learning_rate": 0.0005657177842565597,
"loss": 3.7172,
"step": 9900
},
{
"epoch": 2.8991841491841495,
"grad_norm": 0.318560391664505,
"learning_rate": 0.0005655428571428572,
"loss": 3.7118,
"step": 9950
},
{
"epoch": 2.913752913752914,
"grad_norm": 0.3398500084877014,
"learning_rate": 0.0005653679300291545,
"loss": 3.7212,
"step": 10000
},
{
"epoch": 2.913752913752914,
"eval_accuracy": 0.35038244076771785,
"eval_loss": 3.7192089557647705,
"eval_runtime": 180.0126,
"eval_samples_per_second": 92.449,
"eval_steps_per_second": 5.783,
"step": 10000
},
{
"epoch": 2.9283216783216783,
"grad_norm": 0.32436761260032654,
"learning_rate": 0.0005651930029154518,
"loss": 3.7122,
"step": 10050
},
{
"epoch": 2.9428904428904428,
"grad_norm": 0.3328739106655121,
"learning_rate": 0.0005650180758017492,
"loss": 3.7132,
"step": 10100
},
{
"epoch": 2.957459207459207,
"grad_norm": 0.31366440653800964,
"learning_rate": 0.0005648431486880466,
"loss": 3.7084,
"step": 10150
},
{
"epoch": 2.972027972027972,
"grad_norm": 0.3094634711742401,
"learning_rate": 0.000564668221574344,
"loss": 3.711,
"step": 10200
},
{
"epoch": 2.9865967365967365,
"grad_norm": 0.3342944383621216,
"learning_rate": 0.0005644932944606413,
"loss": 3.7073,
"step": 10250
},
{
"epoch": 3.001165501165501,
"grad_norm": 0.3029869496822357,
"learning_rate": 0.0005643183673469387,
"loss": 3.7038,
"step": 10300
},
{
"epoch": 3.015734265734266,
"grad_norm": 0.314151406288147,
"learning_rate": 0.0005641434402332362,
"loss": 3.5974,
"step": 10350
},
{
"epoch": 3.0303030303030303,
"grad_norm": 0.31103676557540894,
"learning_rate": 0.0005639685131195335,
"loss": 3.6124,
"step": 10400
},
{
"epoch": 3.0448717948717947,
"grad_norm": 0.3061332702636719,
"learning_rate": 0.0005637935860058308,
"loss": 3.6153,
"step": 10450
},
{
"epoch": 3.0594405594405596,
"grad_norm": 0.3213406205177307,
"learning_rate": 0.0005636186588921282,
"loss": 3.6026,
"step": 10500
},
{
"epoch": 3.074009324009324,
"grad_norm": 0.3310888111591339,
"learning_rate": 0.0005634437317784256,
"loss": 3.6145,
"step": 10550
},
{
"epoch": 3.0885780885780885,
"grad_norm": 0.3187083601951599,
"learning_rate": 0.000563268804664723,
"loss": 3.6114,
"step": 10600
},
{
"epoch": 3.1031468531468533,
"grad_norm": 0.3135937750339508,
"learning_rate": 0.0005630938775510203,
"loss": 3.6184,
"step": 10650
},
{
"epoch": 3.117715617715618,
"grad_norm": 0.3369690179824829,
"learning_rate": 0.0005629189504373177,
"loss": 3.6178,
"step": 10700
},
{
"epoch": 3.132284382284382,
"grad_norm": 0.3287160098552704,
"learning_rate": 0.0005627440233236151,
"loss": 3.6235,
"step": 10750
},
{
"epoch": 3.1468531468531467,
"grad_norm": 0.3253456652164459,
"learning_rate": 0.0005625690962099125,
"loss": 3.6301,
"step": 10800
},
{
"epoch": 3.1614219114219115,
"grad_norm": 0.3379002511501312,
"learning_rate": 0.0005623941690962099,
"loss": 3.6268,
"step": 10850
},
{
"epoch": 3.175990675990676,
"grad_norm": 0.3228496313095093,
"learning_rate": 0.0005622192419825073,
"loss": 3.623,
"step": 10900
},
{
"epoch": 3.1905594405594404,
"grad_norm": 0.32255882024765015,
"learning_rate": 0.0005620443148688046,
"loss": 3.6165,
"step": 10950
},
{
"epoch": 3.2051282051282053,
"grad_norm": 0.3215540945529938,
"learning_rate": 0.000561869387755102,
"loss": 3.6257,
"step": 11000
},
{
"epoch": 3.2051282051282053,
"eval_accuracy": 0.3524533334775781,
"eval_loss": 3.709486484527588,
"eval_runtime": 179.9331,
"eval_samples_per_second": 92.49,
"eval_steps_per_second": 5.785,
"step": 11000
},
{
"epoch": 3.2196969696969697,
"grad_norm": 0.31145378947257996,
"learning_rate": 0.0005616944606413993,
"loss": 3.6378,
"step": 11050
},
{
"epoch": 3.234265734265734,
"grad_norm": 0.3215027451515198,
"learning_rate": 0.0005615195335276968,
"loss": 3.6347,
"step": 11100
},
{
"epoch": 3.248834498834499,
"grad_norm": 0.32689639925956726,
"learning_rate": 0.0005613446064139941,
"loss": 3.6273,
"step": 11150
},
{
"epoch": 3.2634032634032635,
"grad_norm": 0.3098812401294708,
"learning_rate": 0.0005611696793002915,
"loss": 3.6261,
"step": 11200
},
{
"epoch": 3.277972027972028,
"grad_norm": 0.3190302848815918,
"learning_rate": 0.0005609947521865889,
"loss": 3.6275,
"step": 11250
},
{
"epoch": 3.2925407925407923,
"grad_norm": 0.31954291462898254,
"learning_rate": 0.0005608198250728863,
"loss": 3.6323,
"step": 11300
},
{
"epoch": 3.3071095571095572,
"grad_norm": 0.33193185925483704,
"learning_rate": 0.0005606448979591836,
"loss": 3.6185,
"step": 11350
},
{
"epoch": 3.3216783216783217,
"grad_norm": 0.30325186252593994,
"learning_rate": 0.000560469970845481,
"loss": 3.6268,
"step": 11400
},
{
"epoch": 3.336247086247086,
"grad_norm": 0.311393678188324,
"learning_rate": 0.0005602950437317783,
"loss": 3.6327,
"step": 11450
},
{
"epoch": 3.350815850815851,
"grad_norm": 0.33085450530052185,
"learning_rate": 0.0005601201166180758,
"loss": 3.6412,
"step": 11500
},
{
"epoch": 3.3653846153846154,
"grad_norm": 0.3296162486076355,
"learning_rate": 0.0005599451895043731,
"loss": 3.6398,
"step": 11550
},
{
"epoch": 3.37995337995338,
"grad_norm": 0.3392165005207062,
"learning_rate": 0.0005597702623906705,
"loss": 3.6299,
"step": 11600
},
{
"epoch": 3.3945221445221447,
"grad_norm": 0.3435153663158417,
"learning_rate": 0.0005595953352769679,
"loss": 3.6408,
"step": 11650
},
{
"epoch": 3.409090909090909,
"grad_norm": 0.3271493911743164,
"learning_rate": 0.0005594204081632653,
"loss": 3.6544,
"step": 11700
},
{
"epoch": 3.4236596736596736,
"grad_norm": 0.3151093125343323,
"learning_rate": 0.0005592454810495627,
"loss": 3.6352,
"step": 11750
},
{
"epoch": 3.438228438228438,
"grad_norm": 0.3355579674243927,
"learning_rate": 0.00055907055393586,
"loss": 3.6333,
"step": 11800
},
{
"epoch": 3.452797202797203,
"grad_norm": 0.3261067271232605,
"learning_rate": 0.0005588956268221573,
"loss": 3.6588,
"step": 11850
},
{
"epoch": 3.4673659673659674,
"grad_norm": 0.3100610673427582,
"learning_rate": 0.0005587206997084548,
"loss": 3.6338,
"step": 11900
},
{
"epoch": 3.481934731934732,
"grad_norm": 0.3188706636428833,
"learning_rate": 0.0005585457725947521,
"loss": 3.6304,
"step": 11950
},
{
"epoch": 3.4965034965034967,
"grad_norm": 0.3274799585342407,
"learning_rate": 0.0005583708454810495,
"loss": 3.6324,
"step": 12000
},
{
"epoch": 3.4965034965034967,
"eval_accuracy": 0.3537938693297391,
"eval_loss": 3.6916778087615967,
"eval_runtime": 180.0372,
"eval_samples_per_second": 92.436,
"eval_steps_per_second": 5.782,
"step": 12000
},
{
"epoch": 3.511072261072261,
"grad_norm": 0.3254401981830597,
"learning_rate": 0.0005581959183673468,
"loss": 3.636,
"step": 12050
},
{
"epoch": 3.5256410256410255,
"grad_norm": 0.3294544517993927,
"learning_rate": 0.0005580209912536443,
"loss": 3.6423,
"step": 12100
},
{
"epoch": 3.54020979020979,
"grad_norm": 0.32317084074020386,
"learning_rate": 0.0005578460641399417,
"loss": 3.644,
"step": 12150
},
{
"epoch": 3.554778554778555,
"grad_norm": 0.3209782540798187,
"learning_rate": 0.000557671137026239,
"loss": 3.6373,
"step": 12200
},
{
"epoch": 3.5693473193473193,
"grad_norm": 0.3246396780014038,
"learning_rate": 0.0005574962099125363,
"loss": 3.6479,
"step": 12250
},
{
"epoch": 3.583916083916084,
"grad_norm": 0.32658424973487854,
"learning_rate": 0.0005573212827988338,
"loss": 3.6435,
"step": 12300
},
{
"epoch": 3.5984848484848486,
"grad_norm": 0.34126392006874084,
"learning_rate": 0.0005571463556851311,
"loss": 3.6474,
"step": 12350
},
{
"epoch": 3.613053613053613,
"grad_norm": 0.31840071082115173,
"learning_rate": 0.0005569714285714285,
"loss": 3.6364,
"step": 12400
},
{
"epoch": 3.6276223776223775,
"grad_norm": 0.32670649886131287,
"learning_rate": 0.0005567965014577258,
"loss": 3.6318,
"step": 12450
},
{
"epoch": 3.642191142191142,
"grad_norm": 0.3206935524940491,
"learning_rate": 0.0005566215743440233,
"loss": 3.6369,
"step": 12500
},
{
"epoch": 3.656759906759907,
"grad_norm": 0.3333989083766937,
"learning_rate": 0.0005564466472303207,
"loss": 3.6437,
"step": 12550
},
{
"epoch": 3.6713286713286712,
"grad_norm": 0.33308637142181396,
"learning_rate": 0.000556271720116618,
"loss": 3.6476,
"step": 12600
},
{
"epoch": 3.685897435897436,
"grad_norm": 0.30366429686546326,
"learning_rate": 0.0005560967930029155,
"loss": 3.6361,
"step": 12650
},
{
"epoch": 3.7004662004662006,
"grad_norm": 0.3257286250591278,
"learning_rate": 0.0005559218658892128,
"loss": 3.6359,
"step": 12700
},
{
"epoch": 3.715034965034965,
"grad_norm": 0.3141056299209595,
"learning_rate": 0.0005557469387755101,
"loss": 3.6392,
"step": 12750
},
{
"epoch": 3.7296037296037294,
"grad_norm": 0.321039617061615,
"learning_rate": 0.0005555720116618075,
"loss": 3.6265,
"step": 12800
},
{
"epoch": 3.7441724941724943,
"grad_norm": 0.30979907512664795,
"learning_rate": 0.0005553970845481049,
"loss": 3.6325,
"step": 12850
},
{
"epoch": 3.7587412587412588,
"grad_norm": 0.31813862919807434,
"learning_rate": 0.0005552221574344023,
"loss": 3.6491,
"step": 12900
},
{
"epoch": 3.773310023310023,
"grad_norm": 0.32558414340019226,
"learning_rate": 0.0005550472303206997,
"loss": 3.6395,
"step": 12950
},
{
"epoch": 3.787878787878788,
"grad_norm": 0.3136826157569885,
"learning_rate": 0.000554872303206997,
"loss": 3.6403,
"step": 13000
},
{
"epoch": 3.787878787878788,
"eval_accuracy": 0.35539592726393576,
"eval_loss": 3.6737773418426514,
"eval_runtime": 180.0124,
"eval_samples_per_second": 92.449,
"eval_steps_per_second": 5.783,
"step": 13000
},
{
"epoch": 3.8024475524475525,
"grad_norm": 0.2937915325164795,
"learning_rate": 0.0005546973760932945,
"loss": 3.638,
"step": 13050
},
{
"epoch": 3.817016317016317,
"grad_norm": 0.3118761479854584,
"learning_rate": 0.0005545224489795918,
"loss": 3.638,
"step": 13100
},
{
"epoch": 3.8315850815850814,
"grad_norm": 0.32508912682533264,
"learning_rate": 0.0005543475218658891,
"loss": 3.6426,
"step": 13150
},
{
"epoch": 3.8461538461538463,
"grad_norm": 0.3132587969303131,
"learning_rate": 0.0005541725947521865,
"loss": 3.6273,
"step": 13200
},
{
"epoch": 3.8607226107226107,
"grad_norm": 0.32538720965385437,
"learning_rate": 0.0005539976676384839,
"loss": 3.6287,
"step": 13250
},
{
"epoch": 3.875291375291375,
"grad_norm": 0.3058031499385834,
"learning_rate": 0.0005538227405247813,
"loss": 3.6354,
"step": 13300
},
{
"epoch": 3.88986013986014,
"grad_norm": 0.3083172142505646,
"learning_rate": 0.0005536478134110787,
"loss": 3.6299,
"step": 13350
},
{
"epoch": 3.9044289044289044,
"grad_norm": 0.31901004910469055,
"learning_rate": 0.000553472886297376,
"loss": 3.6346,
"step": 13400
},
{
"epoch": 3.918997668997669,
"grad_norm": 0.30737531185150146,
"learning_rate": 0.0005532979591836735,
"loss": 3.6519,
"step": 13450
},
{
"epoch": 3.9335664335664333,
"grad_norm": 0.3150396943092346,
"learning_rate": 0.0005531230320699708,
"loss": 3.6308,
"step": 13500
},
{
"epoch": 3.948135198135198,
"grad_norm": 0.30913490056991577,
"learning_rate": 0.0005529481049562682,
"loss": 3.6363,
"step": 13550
},
{
"epoch": 3.9627039627039626,
"grad_norm": 0.32307544350624084,
"learning_rate": 0.0005527731778425655,
"loss": 3.628,
"step": 13600
},
{
"epoch": 3.9772727272727275,
"grad_norm": 0.32591161131858826,
"learning_rate": 0.0005525982507288629,
"loss": 3.6293,
"step": 13650
},
{
"epoch": 3.991841491841492,
"grad_norm": 0.35084959864616394,
"learning_rate": 0.0005524233236151603,
"loss": 3.6342,
"step": 13700
},
{
"epoch": 4.006410256410256,
"grad_norm": 0.3362487554550171,
"learning_rate": 0.0005522483965014576,
"loss": 3.577,
"step": 13750
},
{
"epoch": 4.020979020979021,
"grad_norm": 0.33940762281417847,
"learning_rate": 0.000552073469387755,
"loss": 3.5236,
"step": 13800
},
{
"epoch": 4.035547785547785,
"grad_norm": 0.3365685045719147,
"learning_rate": 0.0005518985422740525,
"loss": 3.5171,
"step": 13850
},
{
"epoch": 4.05011655011655,
"grad_norm": 0.32239189743995667,
"learning_rate": 0.0005517236151603498,
"loss": 3.5349,
"step": 13900
},
{
"epoch": 4.064685314685315,
"grad_norm": 0.3359779715538025,
"learning_rate": 0.0005515486880466472,
"loss": 3.5388,
"step": 13950
},
{
"epoch": 4.0792540792540795,
"grad_norm": 0.3400496542453766,
"learning_rate": 0.0005513737609329446,
"loss": 3.5471,
"step": 14000
},
{
"epoch": 4.0792540792540795,
"eval_accuracy": 0.3563893348849056,
"eval_loss": 3.6670174598693848,
"eval_runtime": 180.0,
"eval_samples_per_second": 92.456,
"eval_steps_per_second": 5.783,
"step": 14000
},
{
"epoch": 4.093822843822844,
"grad_norm": 0.3247426450252533,
"learning_rate": 0.0005511988338192419,
"loss": 3.5358,
"step": 14050
},
{
"epoch": 4.108391608391608,
"grad_norm": 0.32067304849624634,
"learning_rate": 0.0005510239067055393,
"loss": 3.5265,
"step": 14100
},
{
"epoch": 4.122960372960373,
"grad_norm": 0.31784480810165405,
"learning_rate": 0.0005508489795918366,
"loss": 3.5411,
"step": 14150
},
{
"epoch": 4.137529137529137,
"grad_norm": 0.30831626057624817,
"learning_rate": 0.0005506740524781341,
"loss": 3.5379,
"step": 14200
},
{
"epoch": 4.1520979020979025,
"grad_norm": 0.3176884651184082,
"learning_rate": 0.0005504991253644315,
"loss": 3.5452,
"step": 14250
},
{
"epoch": 4.166666666666667,
"grad_norm": 0.34430864453315735,
"learning_rate": 0.0005503241982507288,
"loss": 3.5519,
"step": 14300
},
{
"epoch": 4.181235431235431,
"grad_norm": 0.3338887691497803,
"learning_rate": 0.0005501492711370262,
"loss": 3.5369,
"step": 14350
},
{
"epoch": 4.195804195804196,
"grad_norm": 0.3119693100452423,
"learning_rate": 0.0005499743440233236,
"loss": 3.5491,
"step": 14400
},
{
"epoch": 4.21037296037296,
"grad_norm": 0.3248231112957001,
"learning_rate": 0.000549799416909621,
"loss": 3.5541,
"step": 14450
},
{
"epoch": 4.224941724941725,
"grad_norm": 0.32441389560699463,
"learning_rate": 0.0005496244897959183,
"loss": 3.5611,
"step": 14500
},
{
"epoch": 4.239510489510489,
"grad_norm": 0.3318880498409271,
"learning_rate": 0.0005494495626822156,
"loss": 3.5484,
"step": 14550
},
{
"epoch": 4.2540792540792545,
"grad_norm": 0.3246316611766815,
"learning_rate": 0.0005492746355685131,
"loss": 3.5641,
"step": 14600
},
{
"epoch": 4.268648018648019,
"grad_norm": 0.32765793800354004,
"learning_rate": 0.0005490997084548105,
"loss": 3.5597,
"step": 14650
},
{
"epoch": 4.283216783216783,
"grad_norm": 0.3181726336479187,
"learning_rate": 0.0005489247813411078,
"loss": 3.5587,
"step": 14700
},
{
"epoch": 4.297785547785548,
"grad_norm": 0.3304060697555542,
"learning_rate": 0.0005487498542274052,
"loss": 3.5593,
"step": 14750
},
{
"epoch": 4.312354312354312,
"grad_norm": 0.3318939507007599,
"learning_rate": 0.0005485749271137026,
"loss": 3.5622,
"step": 14800
},
{
"epoch": 4.326923076923077,
"grad_norm": 0.31229400634765625,
"learning_rate": 0.0005484,
"loss": 3.555,
"step": 14850
},
{
"epoch": 4.341491841491841,
"grad_norm": 0.3215874433517456,
"learning_rate": 0.0005482250728862973,
"loss": 3.5703,
"step": 14900
},
{
"epoch": 4.356060606060606,
"grad_norm": 0.3406248092651367,
"learning_rate": 0.0005480501457725946,
"loss": 3.5592,
"step": 14950
},
{
"epoch": 4.370629370629371,
"grad_norm": 0.3115122318267822,
"learning_rate": 0.0005478752186588921,
"loss": 3.5627,
"step": 15000
},
{
"epoch": 4.370629370629371,
"eval_accuracy": 0.3577426881412671,
"eval_loss": 3.655911922454834,
"eval_runtime": 180.1218,
"eval_samples_per_second": 92.393,
"eval_steps_per_second": 5.779,
"step": 15000
},
{
"epoch": 4.385198135198135,
"grad_norm": 0.32183435559272766,
"learning_rate": 0.0005477002915451894,
"loss": 3.5637,
"step": 15050
},
{
"epoch": 4.3997668997669,
"grad_norm": 0.3230733573436737,
"learning_rate": 0.0005475253644314868,
"loss": 3.5635,
"step": 15100
},
{
"epoch": 4.414335664335664,
"grad_norm": 0.3347214162349701,
"learning_rate": 0.0005473504373177842,
"loss": 3.5735,
"step": 15150
},
{
"epoch": 4.428904428904429,
"grad_norm": 0.3242194950580597,
"learning_rate": 0.0005471755102040816,
"loss": 3.5625,
"step": 15200
},
{
"epoch": 4.443473193473194,
"grad_norm": 0.31296148896217346,
"learning_rate": 0.000547000583090379,
"loss": 3.5629,
"step": 15250
},
{
"epoch": 4.458041958041958,
"grad_norm": 0.3409155607223511,
"learning_rate": 0.0005468256559766763,
"loss": 3.5572,
"step": 15300
},
{
"epoch": 4.472610722610723,
"grad_norm": 0.3338817059993744,
"learning_rate": 0.0005466507288629738,
"loss": 3.5601,
"step": 15350
},
{
"epoch": 4.487179487179487,
"grad_norm": 0.32212719321250916,
"learning_rate": 0.0005464758017492711,
"loss": 3.5644,
"step": 15400
},
{
"epoch": 4.501748251748252,
"grad_norm": 0.32063987851142883,
"learning_rate": 0.0005463008746355684,
"loss": 3.5586,
"step": 15450
},
{
"epoch": 4.516317016317016,
"grad_norm": 0.3092406392097473,
"learning_rate": 0.0005461259475218658,
"loss": 3.5675,
"step": 15500
},
{
"epoch": 4.5308857808857805,
"grad_norm": 0.31964099407196045,
"learning_rate": 0.0005459510204081633,
"loss": 3.5691,
"step": 15550
},
{
"epoch": 4.545454545454545,
"grad_norm": 0.31656357645988464,
"learning_rate": 0.0005457760932944606,
"loss": 3.5724,
"step": 15600
},
{
"epoch": 4.56002331002331,
"grad_norm": 0.3061751425266266,
"learning_rate": 0.000545601166180758,
"loss": 3.5734,
"step": 15650
},
{
"epoch": 4.574592074592075,
"grad_norm": 0.3436771333217621,
"learning_rate": 0.0005454262390670553,
"loss": 3.5662,
"step": 15700
},
{
"epoch": 4.589160839160839,
"grad_norm": 0.33170390129089355,
"learning_rate": 0.0005452513119533528,
"loss": 3.5748,
"step": 15750
},
{
"epoch": 4.603729603729604,
"grad_norm": 0.3121337294578552,
"learning_rate": 0.0005450763848396501,
"loss": 3.5799,
"step": 15800
},
{
"epoch": 4.618298368298368,
"grad_norm": 0.3196841776371002,
"learning_rate": 0.0005449014577259474,
"loss": 3.5751,
"step": 15850
},
{
"epoch": 4.632867132867133,
"grad_norm": 0.32077154517173767,
"learning_rate": 0.0005447265306122448,
"loss": 3.566,
"step": 15900
},
{
"epoch": 4.647435897435898,
"grad_norm": 0.34822878241539,
"learning_rate": 0.0005445516034985423,
"loss": 3.5787,
"step": 15950
},
{
"epoch": 4.662004662004662,
"grad_norm": 0.35842180252075195,
"learning_rate": 0.0005443766763848396,
"loss": 3.5772,
"step": 16000
},
{
"epoch": 4.662004662004662,
"eval_accuracy": 0.35919034927073673,
"eval_loss": 3.6416175365448,
"eval_runtime": 180.1041,
"eval_samples_per_second": 92.402,
"eval_steps_per_second": 5.78,
"step": 16000
},
{
"epoch": 4.676573426573427,
"grad_norm": 0.3213863670825958,
"learning_rate": 0.000544201749271137,
"loss": 3.5688,
"step": 16050
},
{
"epoch": 4.691142191142191,
"grad_norm": 0.3245983421802521,
"learning_rate": 0.0005440268221574343,
"loss": 3.5757,
"step": 16100
},
{
"epoch": 4.7057109557109555,
"grad_norm": 0.3060520887374878,
"learning_rate": 0.0005438518950437318,
"loss": 3.5767,
"step": 16150
},
{
"epoch": 4.72027972027972,
"grad_norm": 0.3220885097980499,
"learning_rate": 0.0005436769679300291,
"loss": 3.5617,
"step": 16200
},
{
"epoch": 4.734848484848484,
"grad_norm": 0.30982065200805664,
"learning_rate": 0.0005435020408163265,
"loss": 3.5732,
"step": 16250
},
{
"epoch": 4.74941724941725,
"grad_norm": 0.31846100091934204,
"learning_rate": 0.0005433271137026238,
"loss": 3.5751,
"step": 16300
},
{
"epoch": 4.763986013986014,
"grad_norm": 0.3250206410884857,
"learning_rate": 0.0005431521865889212,
"loss": 3.5711,
"step": 16350
},
{
"epoch": 4.778554778554779,
"grad_norm": 0.3231157064437866,
"learning_rate": 0.0005429772594752186,
"loss": 3.5673,
"step": 16400
},
{
"epoch": 4.793123543123543,
"grad_norm": 0.34479278326034546,
"learning_rate": 0.000542802332361516,
"loss": 3.5883,
"step": 16450
},
{
"epoch": 4.8076923076923075,
"grad_norm": 0.2996126413345337,
"learning_rate": 0.0005426274052478133,
"loss": 3.5754,
"step": 16500
},
{
"epoch": 4.822261072261072,
"grad_norm": 0.3211113214492798,
"learning_rate": 0.0005424524781341108,
"loss": 3.5794,
"step": 16550
},
{
"epoch": 4.836829836829837,
"grad_norm": 0.33680838346481323,
"learning_rate": 0.0005422775510204081,
"loss": 3.567,
"step": 16600
},
{
"epoch": 4.851398601398602,
"grad_norm": 0.2969048321247101,
"learning_rate": 0.0005421026239067055,
"loss": 3.5801,
"step": 16650
},
{
"epoch": 4.865967365967366,
"grad_norm": 0.31932759284973145,
"learning_rate": 0.0005419276967930028,
"loss": 3.5793,
"step": 16700
},
{
"epoch": 4.880536130536131,
"grad_norm": 0.2942551076412201,
"learning_rate": 0.0005417527696793002,
"loss": 3.5725,
"step": 16750
},
{
"epoch": 4.895104895104895,
"grad_norm": 0.30885228514671326,
"learning_rate": 0.0005415778425655976,
"loss": 3.5668,
"step": 16800
},
{
"epoch": 4.909673659673659,
"grad_norm": 0.29992324113845825,
"learning_rate": 0.000541402915451895,
"loss": 3.5678,
"step": 16850
},
{
"epoch": 4.924242424242424,
"grad_norm": 0.31865811347961426,
"learning_rate": 0.0005412279883381923,
"loss": 3.5728,
"step": 16900
},
{
"epoch": 4.938811188811189,
"grad_norm": 0.3378613293170929,
"learning_rate": 0.0005410530612244898,
"loss": 3.5738,
"step": 16950
},
{
"epoch": 4.953379953379954,
"grad_norm": 0.3021279275417328,
"learning_rate": 0.0005408781341107871,
"loss": 3.5756,
"step": 17000
},
{
"epoch": 4.953379953379954,
"eval_accuracy": 0.36047597018930483,
"eval_loss": 3.6271886825561523,
"eval_runtime": 180.0908,
"eval_samples_per_second": 92.409,
"eval_steps_per_second": 5.78,
"step": 17000
},
{
"epoch": 4.967948717948718,
"grad_norm": 0.3146043121814728,
"learning_rate": 0.0005407032069970845,
"loss": 3.5633,
"step": 17050
},
{
"epoch": 4.9825174825174825,
"grad_norm": 0.3014509677886963,
"learning_rate": 0.0005405282798833819,
"loss": 3.5643,
"step": 17100
},
{
"epoch": 4.997086247086247,
"grad_norm": 0.3128616213798523,
"learning_rate": 0.0005403533527696793,
"loss": 3.572,
"step": 17150
},
{
"epoch": 5.011655011655011,
"grad_norm": 0.34456515312194824,
"learning_rate": 0.0005401784256559766,
"loss": 3.4792,
"step": 17200
},
{
"epoch": 5.026223776223776,
"grad_norm": 0.32703524827957153,
"learning_rate": 0.000540003498542274,
"loss": 3.4653,
"step": 17250
},
{
"epoch": 5.040792540792541,
"grad_norm": 0.32900843024253845,
"learning_rate": 0.0005398285714285714,
"loss": 3.463,
"step": 17300
},
{
"epoch": 5.055361305361306,
"grad_norm": 0.31689518690109253,
"learning_rate": 0.0005396536443148688,
"loss": 3.4594,
"step": 17350
},
{
"epoch": 5.06993006993007,
"grad_norm": 0.3287120461463928,
"learning_rate": 0.0005394787172011661,
"loss": 3.4681,
"step": 17400
},
{
"epoch": 5.084498834498834,
"grad_norm": 0.3206745684146881,
"learning_rate": 0.0005393037900874635,
"loss": 3.4808,
"step": 17450
},
{
"epoch": 5.099067599067599,
"grad_norm": 0.3196023404598236,
"learning_rate": 0.0005391288629737609,
"loss": 3.4808,
"step": 17500
},
{
"epoch": 5.113636363636363,
"grad_norm": 0.33133959770202637,
"learning_rate": 0.0005389539358600583,
"loss": 3.4862,
"step": 17550
},
{
"epoch": 5.128205128205128,
"grad_norm": 0.3225027024745941,
"learning_rate": 0.0005387790087463557,
"loss": 3.4919,
"step": 17600
},
{
"epoch": 5.142773892773893,
"grad_norm": 0.3661787211894989,
"learning_rate": 0.0005386040816326529,
"loss": 3.4776,
"step": 17650
},
{
"epoch": 5.1573426573426575,
"grad_norm": 0.31561416387557983,
"learning_rate": 0.0005384291545189504,
"loss": 3.4934,
"step": 17700
},
{
"epoch": 5.171911421911422,
"grad_norm": 0.32750192284584045,
"learning_rate": 0.0005382542274052478,
"loss": 3.4987,
"step": 17750
},
{
"epoch": 5.186480186480186,
"grad_norm": 0.36426979303359985,
"learning_rate": 0.0005380793002915451,
"loss": 3.4982,
"step": 17800
},
{
"epoch": 5.201048951048951,
"grad_norm": 0.32267603278160095,
"learning_rate": 0.0005379043731778425,
"loss": 3.5052,
"step": 17850
},
{
"epoch": 5.215617715617715,
"grad_norm": 0.32066959142684937,
"learning_rate": 0.0005377294460641399,
"loss": 3.4943,
"step": 17900
},
{
"epoch": 5.230186480186481,
"grad_norm": 0.3105967044830322,
"learning_rate": 0.0005375545189504373,
"loss": 3.5037,
"step": 17950
},
{
"epoch": 5.244755244755245,
"grad_norm": 0.3213440179824829,
"learning_rate": 0.0005373795918367346,
"loss": 3.4948,
"step": 18000
},
{
"epoch": 5.244755244755245,
"eval_accuracy": 0.36040694435200493,
"eval_loss": 3.630403995513916,
"eval_runtime": 180.0023,
"eval_samples_per_second": 92.454,
"eval_steps_per_second": 5.783,
"step": 18000
},
{
"epoch": 5.2593240093240095,
"grad_norm": 0.33134472370147705,
"learning_rate": 0.000537204664723032,
"loss": 3.5058,
"step": 18050
},
{
"epoch": 5.273892773892774,
"grad_norm": 0.32371455430984497,
"learning_rate": 0.0005370297376093294,
"loss": 3.4951,
"step": 18100
},
{
"epoch": 5.288461538461538,
"grad_norm": 0.3160519301891327,
"learning_rate": 0.0005368548104956268,
"loss": 3.5056,
"step": 18150
},
{
"epoch": 5.303030303030303,
"grad_norm": 0.32032284140586853,
"learning_rate": 0.0005366798833819241,
"loss": 3.4996,
"step": 18200
},
{
"epoch": 5.317599067599067,
"grad_norm": 0.344394326210022,
"learning_rate": 0.0005365049562682215,
"loss": 3.5106,
"step": 18250
},
{
"epoch": 5.3321678321678325,
"grad_norm": 0.3405134379863739,
"learning_rate": 0.0005363300291545189,
"loss": 3.5028,
"step": 18300
},
{
"epoch": 5.346736596736597,
"grad_norm": 0.32548317313194275,
"learning_rate": 0.0005361551020408163,
"loss": 3.5205,
"step": 18350
},
{
"epoch": 5.361305361305361,
"grad_norm": 0.3317105174064636,
"learning_rate": 0.0005359801749271136,
"loss": 3.4986,
"step": 18400
},
{
"epoch": 5.375874125874126,
"grad_norm": 0.3342733383178711,
"learning_rate": 0.000535805247813411,
"loss": 3.5151,
"step": 18450
},
{
"epoch": 5.39044289044289,
"grad_norm": 0.33692413568496704,
"learning_rate": 0.0005356303206997085,
"loss": 3.5003,
"step": 18500
},
{
"epoch": 5.405011655011655,
"grad_norm": 0.31811004877090454,
"learning_rate": 0.0005354553935860058,
"loss": 3.5257,
"step": 18550
},
{
"epoch": 5.41958041958042,
"grad_norm": 0.3129310607910156,
"learning_rate": 0.0005352804664723031,
"loss": 3.5172,
"step": 18600
},
{
"epoch": 5.4341491841491845,
"grad_norm": 0.30926746129989624,
"learning_rate": 0.0005351055393586006,
"loss": 3.5206,
"step": 18650
},
{
"epoch": 5.448717948717949,
"grad_norm": 0.3198590576648712,
"learning_rate": 0.0005349306122448979,
"loss": 3.5003,
"step": 18700
},
{
"epoch": 5.463286713286713,
"grad_norm": 0.30657318234443665,
"learning_rate": 0.0005347556851311953,
"loss": 3.5242,
"step": 18750
},
{
"epoch": 5.477855477855478,
"grad_norm": 0.3101734519004822,
"learning_rate": 0.0005345807580174926,
"loss": 3.5127,
"step": 18800
},
{
"epoch": 5.492424242424242,
"grad_norm": 0.3366558253765106,
"learning_rate": 0.0005344058309037901,
"loss": 3.5179,
"step": 18850
},
{
"epoch": 5.506993006993007,
"grad_norm": 0.3367293179035187,
"learning_rate": 0.0005342309037900875,
"loss": 3.5164,
"step": 18900
},
{
"epoch": 5.521561771561771,
"grad_norm": 0.3203129172325134,
"learning_rate": 0.0005340559766763848,
"loss": 3.5129,
"step": 18950
},
{
"epoch": 5.536130536130536,
"grad_norm": 0.3326050341129303,
"learning_rate": 0.0005338810495626821,
"loss": 3.5348,
"step": 19000
},
{
"epoch": 5.536130536130536,
"eval_accuracy": 0.36132097813962316,
"eval_loss": 3.622004508972168,
"eval_runtime": 180.0706,
"eval_samples_per_second": 92.419,
"eval_steps_per_second": 5.781,
"step": 19000
},
{
"epoch": 5.550699300699301,
"grad_norm": 0.3246619701385498,
"learning_rate": 0.0005337061224489796,
"loss": 3.5178,
"step": 19050
},
{
"epoch": 5.565268065268065,
"grad_norm": 0.3267384171485901,
"learning_rate": 0.0005335311953352769,
"loss": 3.5183,
"step": 19100
},
{
"epoch": 5.57983682983683,
"grad_norm": 0.3426792323589325,
"learning_rate": 0.0005333562682215743,
"loss": 3.5213,
"step": 19150
},
{
"epoch": 5.594405594405594,
"grad_norm": 0.3265758454799652,
"learning_rate": 0.0005331813411078716,
"loss": 3.5194,
"step": 19200
},
{
"epoch": 5.608974358974359,
"grad_norm": 0.32385995984077454,
"learning_rate": 0.0005330064139941691,
"loss": 3.5222,
"step": 19250
},
{
"epoch": 5.623543123543124,
"grad_norm": 0.3385801911354065,
"learning_rate": 0.0005328314868804665,
"loss": 3.5114,
"step": 19300
},
{
"epoch": 5.638111888111888,
"grad_norm": 0.30696603655815125,
"learning_rate": 0.0005326565597667638,
"loss": 3.5162,
"step": 19350
},
{
"epoch": 5.652680652680653,
"grad_norm": 0.3337629735469818,
"learning_rate": 0.0005324816326530612,
"loss": 3.5178,
"step": 19400
},
{
"epoch": 5.667249417249417,
"grad_norm": 0.3224729895591736,
"learning_rate": 0.0005323067055393586,
"loss": 3.5234,
"step": 19450
},
{
"epoch": 5.681818181818182,
"grad_norm": 0.3160954713821411,
"learning_rate": 0.0005321317784256559,
"loss": 3.5084,
"step": 19500
},
{
"epoch": 5.696386946386946,
"grad_norm": 0.3182397186756134,
"learning_rate": 0.0005319568513119533,
"loss": 3.5209,
"step": 19550
},
{
"epoch": 5.7109557109557105,
"grad_norm": 0.3317508399486542,
"learning_rate": 0.0005317819241982506,
"loss": 3.5349,
"step": 19600
},
{
"epoch": 5.725524475524476,
"grad_norm": 0.3387415409088135,
"learning_rate": 0.0005316069970845481,
"loss": 3.5275,
"step": 19650
},
{
"epoch": 5.74009324009324,
"grad_norm": 0.33118101954460144,
"learning_rate": 0.0005314320699708454,
"loss": 3.5235,
"step": 19700
},
{
"epoch": 5.754662004662005,
"grad_norm": 0.32198289036750793,
"learning_rate": 0.0005312571428571428,
"loss": 3.5296,
"step": 19750
},
{
"epoch": 5.769230769230769,
"grad_norm": 0.3228715658187866,
"learning_rate": 0.0005310822157434403,
"loss": 3.5169,
"step": 19800
},
{
"epoch": 5.783799533799534,
"grad_norm": 0.3381294012069702,
"learning_rate": 0.0005309072886297376,
"loss": 3.5367,
"step": 19850
},
{
"epoch": 5.798368298368298,
"grad_norm": 0.3143836557865143,
"learning_rate": 0.0005307323615160349,
"loss": 3.5199,
"step": 19900
},
{
"epoch": 5.812937062937063,
"grad_norm": 0.32376548647880554,
"learning_rate": 0.0005305574344023323,
"loss": 3.5133,
"step": 19950
},
{
"epoch": 5.827505827505828,
"grad_norm": 0.31885069608688354,
"learning_rate": 0.0005303825072886296,
"loss": 3.5111,
"step": 20000
},
{
"epoch": 5.827505827505828,
"eval_accuracy": 0.36265845662931434,
"eval_loss": 3.611976146697998,
"eval_runtime": 180.1718,
"eval_samples_per_second": 92.367,
"eval_steps_per_second": 5.778,
"step": 20000
},
{
"epoch": 5.842074592074592,
"grad_norm": 0.3121870458126068,
"learning_rate": 0.0005302075801749271,
"loss": 3.524,
"step": 20050
},
{
"epoch": 5.856643356643357,
"grad_norm": 0.3262169063091278,
"learning_rate": 0.0005300326530612244,
"loss": 3.5309,
"step": 20100
},
{
"epoch": 5.871212121212121,
"grad_norm": 0.3347600996494293,
"learning_rate": 0.0005298577259475218,
"loss": 3.5402,
"step": 20150
},
{
"epoch": 5.8857808857808855,
"grad_norm": 0.31512966752052307,
"learning_rate": 0.0005296827988338193,
"loss": 3.5253,
"step": 20200
},
{
"epoch": 5.90034965034965,
"grad_norm": 0.3156508505344391,
"learning_rate": 0.0005295078717201166,
"loss": 3.5208,
"step": 20250
},
{
"epoch": 5.914918414918415,
"grad_norm": 0.31857651472091675,
"learning_rate": 0.000529332944606414,
"loss": 3.5183,
"step": 20300
},
{
"epoch": 5.92948717948718,
"grad_norm": 0.32981452345848083,
"learning_rate": 0.0005291580174927113,
"loss": 3.5222,
"step": 20350
},
{
"epoch": 5.944055944055944,
"grad_norm": 0.3362169563770294,
"learning_rate": 0.0005289830903790087,
"loss": 3.5272,
"step": 20400
},
{
"epoch": 5.958624708624709,
"grad_norm": 0.32992255687713623,
"learning_rate": 0.0005288081632653061,
"loss": 3.5311,
"step": 20450
},
{
"epoch": 5.973193473193473,
"grad_norm": 0.31938815116882324,
"learning_rate": 0.0005286332361516034,
"loss": 3.5149,
"step": 20500
},
{
"epoch": 5.9877622377622375,
"grad_norm": 0.33160415291786194,
"learning_rate": 0.0005284583090379008,
"loss": 3.5177,
"step": 20550
},
{
"epoch": 6.002331002331002,
"grad_norm": 0.3441227674484253,
"learning_rate": 0.0005282833819241983,
"loss": 3.51,
"step": 20600
},
{
"epoch": 6.016899766899767,
"grad_norm": 0.32598426938056946,
"learning_rate": 0.0005281084548104956,
"loss": 3.414,
"step": 20650
},
{
"epoch": 6.031468531468532,
"grad_norm": 0.31953006982803345,
"learning_rate": 0.000527933527696793,
"loss": 3.4213,
"step": 20700
},
{
"epoch": 6.046037296037296,
"grad_norm": 0.3276713192462921,
"learning_rate": 0.0005277586005830903,
"loss": 3.4137,
"step": 20750
},
{
"epoch": 6.0606060606060606,
"grad_norm": 0.338102251291275,
"learning_rate": 0.0005275836734693877,
"loss": 3.4243,
"step": 20800
},
{
"epoch": 6.075174825174825,
"grad_norm": 0.328328013420105,
"learning_rate": 0.0005274087463556851,
"loss": 3.4284,
"step": 20850
},
{
"epoch": 6.089743589743589,
"grad_norm": 0.33343490958213806,
"learning_rate": 0.0005272338192419824,
"loss": 3.4311,
"step": 20900
},
{
"epoch": 6.104312354312355,
"grad_norm": 0.3318384289741516,
"learning_rate": 0.0005270588921282798,
"loss": 3.4394,
"step": 20950
},
{
"epoch": 6.118881118881119,
"grad_norm": 0.34948527812957764,
"learning_rate": 0.0005268839650145772,
"loss": 3.4294,
"step": 21000
},
{
"epoch": 6.118881118881119,
"eval_accuracy": 0.36256932275423204,
"eval_loss": 3.613757371902466,
"eval_runtime": 180.2204,
"eval_samples_per_second": 92.342,
"eval_steps_per_second": 5.776,
"step": 21000
},
{
"epoch": 6.133449883449884,
"grad_norm": 0.3257012665271759,
"learning_rate": 0.0005267090379008746,
"loss": 3.4437,
"step": 21050
},
{
"epoch": 6.148018648018648,
"grad_norm": 0.3224816620349884,
"learning_rate": 0.000526534110787172,
"loss": 3.4454,
"step": 21100
},
{
"epoch": 6.1625874125874125,
"grad_norm": 0.3164311349391937,
"learning_rate": 0.0005263591836734693,
"loss": 3.4504,
"step": 21150
},
{
"epoch": 6.177156177156177,
"grad_norm": 0.3344584107398987,
"learning_rate": 0.0005261842565597668,
"loss": 3.4456,
"step": 21200
},
{
"epoch": 6.191724941724941,
"grad_norm": 0.32673484086990356,
"learning_rate": 0.0005260093294460641,
"loss": 3.4504,
"step": 21250
},
{
"epoch": 6.206293706293707,
"grad_norm": 0.33216020464897156,
"learning_rate": 0.0005258344023323614,
"loss": 3.4525,
"step": 21300
},
{
"epoch": 6.220862470862471,
"grad_norm": 0.3255676031112671,
"learning_rate": 0.0005256594752186588,
"loss": 3.4533,
"step": 21350
},
{
"epoch": 6.235431235431236,
"grad_norm": 0.3329181373119354,
"learning_rate": 0.0005254845481049562,
"loss": 3.456,
"step": 21400
},
{
"epoch": 6.25,
"grad_norm": 0.3463157117366791,
"learning_rate": 0.0005253096209912536,
"loss": 3.463,
"step": 21450
},
{
"epoch": 6.264568764568764,
"grad_norm": 0.3387112319469452,
"learning_rate": 0.000525134693877551,
"loss": 3.4693,
"step": 21500
},
{
"epoch": 6.279137529137529,
"grad_norm": 0.35688215494155884,
"learning_rate": 0.0005249597667638484,
"loss": 3.4673,
"step": 21550
},
{
"epoch": 6.293706293706293,
"grad_norm": 0.32070285081863403,
"learning_rate": 0.0005247848396501458,
"loss": 3.4506,
"step": 21600
},
{
"epoch": 6.308275058275059,
"grad_norm": 0.3204960525035858,
"learning_rate": 0.0005246099125364431,
"loss": 3.4585,
"step": 21650
},
{
"epoch": 6.322843822843823,
"grad_norm": 0.33638399839401245,
"learning_rate": 0.0005244349854227404,
"loss": 3.4617,
"step": 21700
},
{
"epoch": 6.3374125874125875,
"grad_norm": 0.3176703155040741,
"learning_rate": 0.0005242600583090379,
"loss": 3.4561,
"step": 21750
},
{
"epoch": 6.351981351981352,
"grad_norm": 0.3235324025154114,
"learning_rate": 0.0005240851311953352,
"loss": 3.4705,
"step": 21800
},
{
"epoch": 6.366550116550116,
"grad_norm": 0.34393689036369324,
"learning_rate": 0.0005239102040816326,
"loss": 3.4627,
"step": 21850
},
{
"epoch": 6.381118881118881,
"grad_norm": 0.3094463646411896,
"learning_rate": 0.00052373527696793,
"loss": 3.4661,
"step": 21900
},
{
"epoch": 6.395687645687646,
"grad_norm": 0.33415213227272034,
"learning_rate": 0.0005235603498542274,
"loss": 3.462,
"step": 21950
},
{
"epoch": 6.410256410256411,
"grad_norm": 0.32387012243270874,
"learning_rate": 0.0005233854227405248,
"loss": 3.468,
"step": 22000
},
{
"epoch": 6.410256410256411,
"eval_accuracy": 0.3633268431015672,
"eval_loss": 3.607569694519043,
"eval_runtime": 180.2515,
"eval_samples_per_second": 92.327,
"eval_steps_per_second": 5.775,
"step": 22000
},
{
"epoch": 6.424825174825175,
"grad_norm": 0.33040571212768555,
"learning_rate": 0.0005232104956268221,
"loss": 3.47,
"step": 22050
},
{
"epoch": 6.4393939393939394,
"grad_norm": 0.3241094648838043,
"learning_rate": 0.0005230355685131195,
"loss": 3.4797,
"step": 22100
},
{
"epoch": 6.453962703962704,
"grad_norm": 0.32219818234443665,
"learning_rate": 0.0005228606413994169,
"loss": 3.4814,
"step": 22150
},
{
"epoch": 6.468531468531468,
"grad_norm": 0.29795998334884644,
"learning_rate": 0.0005226857142857142,
"loss": 3.4686,
"step": 22200
},
{
"epoch": 6.483100233100233,
"grad_norm": 0.32461127638816833,
"learning_rate": 0.0005225107871720116,
"loss": 3.4817,
"step": 22250
},
{
"epoch": 6.497668997668998,
"grad_norm": 0.3403743803501129,
"learning_rate": 0.0005223358600583089,
"loss": 3.469,
"step": 22300
},
{
"epoch": 6.5122377622377625,
"grad_norm": 0.3380977213382721,
"learning_rate": 0.0005221609329446064,
"loss": 3.4631,
"step": 22350
},
{
"epoch": 6.526806526806527,
"grad_norm": 0.3517080545425415,
"learning_rate": 0.0005219860058309038,
"loss": 3.4956,
"step": 22400
},
{
"epoch": 6.541375291375291,
"grad_norm": 0.3424749970436096,
"learning_rate": 0.0005218110787172011,
"loss": 3.4799,
"step": 22450
},
{
"epoch": 6.555944055944056,
"grad_norm": 0.33320173621177673,
"learning_rate": 0.0005216361516034985,
"loss": 3.4824,
"step": 22500
},
{
"epoch": 6.57051282051282,
"grad_norm": 0.3647104799747467,
"learning_rate": 0.0005214612244897959,
"loss": 3.4817,
"step": 22550
},
{
"epoch": 6.585081585081585,
"grad_norm": 0.32451170682907104,
"learning_rate": 0.0005212862973760932,
"loss": 3.4947,
"step": 22600
},
{
"epoch": 6.59965034965035,
"grad_norm": 0.3204374313354492,
"learning_rate": 0.0005211113702623906,
"loss": 3.4697,
"step": 22650
},
{
"epoch": 6.6142191142191145,
"grad_norm": 0.35725289583206177,
"learning_rate": 0.0005209364431486879,
"loss": 3.4834,
"step": 22700
},
{
"epoch": 6.628787878787879,
"grad_norm": 0.3210899531841278,
"learning_rate": 0.0005207615160349854,
"loss": 3.4881,
"step": 22750
},
{
"epoch": 6.643356643356643,
"grad_norm": 0.32716649770736694,
"learning_rate": 0.0005205865889212828,
"loss": 3.4757,
"step": 22800
},
{
"epoch": 6.657925407925408,
"grad_norm": 0.30155670642852783,
"learning_rate": 0.0005204116618075801,
"loss": 3.4726,
"step": 22850
},
{
"epoch": 6.672494172494172,
"grad_norm": 0.3163397014141083,
"learning_rate": 0.0005202367346938776,
"loss": 3.4822,
"step": 22900
},
{
"epoch": 6.687062937062937,
"grad_norm": 0.32445988059043884,
"learning_rate": 0.0005200618075801749,
"loss": 3.4772,
"step": 22950
},
{
"epoch": 6.701631701631702,
"grad_norm": 0.3531341254711151,
"learning_rate": 0.0005198868804664723,
"loss": 3.463,
"step": 23000
},
{
"epoch": 6.701631701631702,
"eval_accuracy": 0.3641228156615039,
"eval_loss": 3.5994231700897217,
"eval_runtime": 180.3181,
"eval_samples_per_second": 92.292,
"eval_steps_per_second": 5.773,
"step": 23000
},
{
"epoch": 6.716200466200466,
"grad_norm": 0.31323182582855225,
"learning_rate": 0.0005197119533527696,
"loss": 3.4779,
"step": 23050
},
{
"epoch": 6.730769230769231,
"grad_norm": 0.33791857957839966,
"learning_rate": 0.000519537026239067,
"loss": 3.4815,
"step": 23100
},
{
"epoch": 6.745337995337995,
"grad_norm": 0.32510659098625183,
"learning_rate": 0.0005193620991253644,
"loss": 3.4744,
"step": 23150
},
{
"epoch": 6.75990675990676,
"grad_norm": 0.3223019540309906,
"learning_rate": 0.0005191871720116618,
"loss": 3.4814,
"step": 23200
},
{
"epoch": 6.774475524475524,
"grad_norm": 0.33018162846565247,
"learning_rate": 0.0005190122448979591,
"loss": 3.4914,
"step": 23250
},
{
"epoch": 6.7890442890442895,
"grad_norm": 0.34106600284576416,
"learning_rate": 0.0005188373177842566,
"loss": 3.4826,
"step": 23300
},
{
"epoch": 6.803613053613054,
"grad_norm": 0.3386887013912201,
"learning_rate": 0.0005186623906705539,
"loss": 3.4854,
"step": 23350
},
{
"epoch": 6.818181818181818,
"grad_norm": 0.3332427442073822,
"learning_rate": 0.0005184874635568513,
"loss": 3.4823,
"step": 23400
},
{
"epoch": 6.832750582750583,
"grad_norm": 0.3226874768733978,
"learning_rate": 0.0005183125364431486,
"loss": 3.4834,
"step": 23450
},
{
"epoch": 6.847319347319347,
"grad_norm": 0.3233667016029358,
"learning_rate": 0.000518137609329446,
"loss": 3.4903,
"step": 23500
},
{
"epoch": 6.861888111888112,
"grad_norm": 0.3243621289730072,
"learning_rate": 0.0005179626822157434,
"loss": 3.4887,
"step": 23550
},
{
"epoch": 6.876456876456876,
"grad_norm": 0.31626027822494507,
"learning_rate": 0.0005177877551020407,
"loss": 3.4999,
"step": 23600
},
{
"epoch": 6.891025641025641,
"grad_norm": 0.32455411553382874,
"learning_rate": 0.0005176128279883381,
"loss": 3.4913,
"step": 23650
},
{
"epoch": 6.905594405594406,
"grad_norm": 0.33513808250427246,
"learning_rate": 0.0005174379008746356,
"loss": 3.4868,
"step": 23700
},
{
"epoch": 6.92016317016317,
"grad_norm": 0.3218197226524353,
"learning_rate": 0.0005172629737609329,
"loss": 3.4826,
"step": 23750
},
{
"epoch": 6.934731934731935,
"grad_norm": 0.3316863477230072,
"learning_rate": 0.0005170880466472303,
"loss": 3.4793,
"step": 23800
},
{
"epoch": 6.949300699300699,
"grad_norm": 0.3216456472873688,
"learning_rate": 0.0005169131195335276,
"loss": 3.4953,
"step": 23850
},
{
"epoch": 6.963869463869464,
"grad_norm": 0.3314974308013916,
"learning_rate": 0.0005167381924198251,
"loss": 3.4799,
"step": 23900
},
{
"epoch": 6.978438228438229,
"grad_norm": 0.3145802319049835,
"learning_rate": 0.0005165632653061224,
"loss": 3.4943,
"step": 23950
},
{
"epoch": 6.993006993006993,
"grad_norm": 0.3357449471950531,
"learning_rate": 0.0005163883381924197,
"loss": 3.4931,
"step": 24000
},
{
"epoch": 6.993006993006993,
"eval_accuracy": 0.3651081095128422,
"eval_loss": 3.587930202484131,
"eval_runtime": 180.1844,
"eval_samples_per_second": 92.361,
"eval_steps_per_second": 5.777,
"step": 24000
},
{
"epoch": 7.007575757575758,
"grad_norm": 0.33467987179756165,
"learning_rate": 0.0005162134110787171,
"loss": 3.4232,
"step": 24050
},
{
"epoch": 7.022144522144522,
"grad_norm": 0.34825414419174194,
"learning_rate": 0.0005160384839650146,
"loss": 3.3667,
"step": 24100
},
{
"epoch": 7.036713286713287,
"grad_norm": 0.3400607407093048,
"learning_rate": 0.0005158635568513119,
"loss": 3.3739,
"step": 24150
},
{
"epoch": 7.051282051282051,
"grad_norm": 0.3337096869945526,
"learning_rate": 0.0005156886297376093,
"loss": 3.3744,
"step": 24200
},
{
"epoch": 7.0658508158508155,
"grad_norm": 0.32601627707481384,
"learning_rate": 0.0005155137026239066,
"loss": 3.3788,
"step": 24250
},
{
"epoch": 7.08041958041958,
"grad_norm": 0.3523777723312378,
"learning_rate": 0.0005153387755102041,
"loss": 3.3793,
"step": 24300
},
{
"epoch": 7.094988344988345,
"grad_norm": 0.3176375925540924,
"learning_rate": 0.0005151638483965014,
"loss": 3.3979,
"step": 24350
},
{
"epoch": 7.10955710955711,
"grad_norm": 0.34010785818099976,
"learning_rate": 0.0005149889212827987,
"loss": 3.3969,
"step": 24400
},
{
"epoch": 7.124125874125874,
"grad_norm": 0.3091638684272766,
"learning_rate": 0.0005148139941690961,
"loss": 3.3991,
"step": 24450
},
{
"epoch": 7.138694638694639,
"grad_norm": 0.33341875672340393,
"learning_rate": 0.0005146390670553936,
"loss": 3.3976,
"step": 24500
},
{
"epoch": 7.153263403263403,
"grad_norm": 0.32438334822654724,
"learning_rate": 0.0005144641399416909,
"loss": 3.4227,
"step": 24550
},
{
"epoch": 7.1678321678321675,
"grad_norm": 0.32416701316833496,
"learning_rate": 0.0005142892128279883,
"loss": 3.4131,
"step": 24600
},
{
"epoch": 7.182400932400933,
"grad_norm": 0.31941843032836914,
"learning_rate": 0.0005141142857142856,
"loss": 3.4218,
"step": 24650
},
{
"epoch": 7.196969696969697,
"grad_norm": 0.3458154499530792,
"learning_rate": 0.0005139393586005831,
"loss": 3.406,
"step": 24700
},
{
"epoch": 7.211538461538462,
"grad_norm": 0.3377438485622406,
"learning_rate": 0.0005137644314868804,
"loss": 3.4271,
"step": 24750
},
{
"epoch": 7.226107226107226,
"grad_norm": 0.32163214683532715,
"learning_rate": 0.0005135895043731778,
"loss": 3.4073,
"step": 24800
},
{
"epoch": 7.2406759906759905,
"grad_norm": 0.3218829333782196,
"learning_rate": 0.0005134145772594752,
"loss": 3.4237,
"step": 24850
},
{
"epoch": 7.255244755244755,
"grad_norm": 0.32908493280410767,
"learning_rate": 0.0005132396501457726,
"loss": 3.4279,
"step": 24900
},
{
"epoch": 7.269813519813519,
"grad_norm": 0.3292107880115509,
"learning_rate": 0.0005130647230320699,
"loss": 3.4245,
"step": 24950
},
{
"epoch": 7.284382284382285,
"grad_norm": 0.33104875683784485,
"learning_rate": 0.0005128897959183673,
"loss": 3.4302,
"step": 25000
},
{
"epoch": 7.284382284382285,
"eval_accuracy": 0.36487504441994895,
"eval_loss": 3.5962696075439453,
"eval_runtime": 180.2058,
"eval_samples_per_second": 92.35,
"eval_steps_per_second": 5.777,
"step": 25000
},
{
"epoch": 7.298951048951049,
"grad_norm": 0.3441121578216553,
"learning_rate": 0.0005127148688046647,
"loss": 3.4167,
"step": 25050
},
{
"epoch": 7.313519813519814,
"grad_norm": 0.3398892283439636,
"learning_rate": 0.0005125399416909621,
"loss": 3.4279,
"step": 25100
},
{
"epoch": 7.328088578088578,
"grad_norm": 0.31465670466423035,
"learning_rate": 0.0005123650145772594,
"loss": 3.4331,
"step": 25150
},
{
"epoch": 7.3426573426573425,
"grad_norm": 0.3508657217025757,
"learning_rate": 0.0005121900874635568,
"loss": 3.4321,
"step": 25200
},
{
"epoch": 7.357226107226107,
"grad_norm": 0.3288611173629761,
"learning_rate": 0.0005120151603498543,
"loss": 3.4375,
"step": 25250
},
{
"epoch": 7.371794871794872,
"grad_norm": 0.3296453356742859,
"learning_rate": 0.0005118402332361515,
"loss": 3.4207,
"step": 25300
},
{
"epoch": 7.386363636363637,
"grad_norm": 0.3522392511367798,
"learning_rate": 0.0005116653061224489,
"loss": 3.4314,
"step": 25350
},
{
"epoch": 7.400932400932401,
"grad_norm": 0.3410415053367615,
"learning_rate": 0.0005114903790087463,
"loss": 3.441,
"step": 25400
},
{
"epoch": 7.415501165501166,
"grad_norm": 0.3186326324939728,
"learning_rate": 0.0005113154518950437,
"loss": 3.4412,
"step": 25450
},
{
"epoch": 7.43006993006993,
"grad_norm": 0.34362727403640747,
"learning_rate": 0.0005111405247813411,
"loss": 3.4385,
"step": 25500
},
{
"epoch": 7.444638694638694,
"grad_norm": 0.3459414541721344,
"learning_rate": 0.0005109655976676384,
"loss": 3.4456,
"step": 25550
},
{
"epoch": 7.459207459207459,
"grad_norm": 0.32094958424568176,
"learning_rate": 0.0005107906705539358,
"loss": 3.4242,
"step": 25600
},
{
"epoch": 7.473776223776224,
"grad_norm": 0.33854368329048157,
"learning_rate": 0.0005106157434402332,
"loss": 3.4381,
"step": 25650
},
{
"epoch": 7.488344988344989,
"grad_norm": 0.3441680371761322,
"learning_rate": 0.0005104408163265306,
"loss": 3.4411,
"step": 25700
},
{
"epoch": 7.502913752913753,
"grad_norm": 0.34269794821739197,
"learning_rate": 0.0005102658892128279,
"loss": 3.4422,
"step": 25750
},
{
"epoch": 7.5174825174825175,
"grad_norm": 0.3172236382961273,
"learning_rate": 0.0005100909620991253,
"loss": 3.4436,
"step": 25800
},
{
"epoch": 7.532051282051282,
"grad_norm": 0.3277212977409363,
"learning_rate": 0.0005099160349854227,
"loss": 3.4417,
"step": 25850
},
{
"epoch": 7.546620046620046,
"grad_norm": 0.3453097641468048,
"learning_rate": 0.0005097411078717201,
"loss": 3.441,
"step": 25900
},
{
"epoch": 7.561188811188811,
"grad_norm": 0.33524438738822937,
"learning_rate": 0.0005095661807580174,
"loss": 3.4466,
"step": 25950
},
{
"epoch": 7.575757575757576,
"grad_norm": 0.3289114832878113,
"learning_rate": 0.0005093912536443149,
"loss": 3.4563,
"step": 26000
},
{
"epoch": 7.575757575757576,
"eval_accuracy": 0.36541772625834573,
"eval_loss": 3.5879688262939453,
"eval_runtime": 180.0683,
"eval_samples_per_second": 92.42,
"eval_steps_per_second": 5.781,
"step": 26000
},
{
"epoch": 7.590326340326341,
"grad_norm": 0.32762327790260315,
"learning_rate": 0.0005092163265306122,
"loss": 3.4424,
"step": 26050
},
{
"epoch": 7.604895104895105,
"grad_norm": 0.32108092308044434,
"learning_rate": 0.0005090413994169096,
"loss": 3.4444,
"step": 26100
},
{
"epoch": 7.619463869463869,
"grad_norm": 0.31724825501441956,
"learning_rate": 0.000508866472303207,
"loss": 3.4457,
"step": 26150
},
{
"epoch": 7.634032634032634,
"grad_norm": 0.3468092083930969,
"learning_rate": 0.0005086915451895044,
"loss": 3.4433,
"step": 26200
},
{
"epoch": 7.648601398601398,
"grad_norm": 0.3435969054698944,
"learning_rate": 0.0005085166180758017,
"loss": 3.4572,
"step": 26250
},
{
"epoch": 7.663170163170163,
"grad_norm": 0.34128132462501526,
"learning_rate": 0.0005083416909620991,
"loss": 3.4489,
"step": 26300
},
{
"epoch": 7.677738927738928,
"grad_norm": 0.33892133831977844,
"learning_rate": 0.0005081667638483964,
"loss": 3.4451,
"step": 26350
},
{
"epoch": 7.6923076923076925,
"grad_norm": 0.3309405744075775,
"learning_rate": 0.0005079918367346939,
"loss": 3.4451,
"step": 26400
},
{
"epoch": 7.706876456876457,
"grad_norm": 0.3350067138671875,
"learning_rate": 0.0005078169096209912,
"loss": 3.4521,
"step": 26450
},
{
"epoch": 7.721445221445221,
"grad_norm": 0.3235504627227783,
"learning_rate": 0.0005076419825072886,
"loss": 3.4527,
"step": 26500
},
{
"epoch": 7.736013986013986,
"grad_norm": 0.3295065462589264,
"learning_rate": 0.000507467055393586,
"loss": 3.4627,
"step": 26550
},
{
"epoch": 7.75058275058275,
"grad_norm": 0.3342365622520447,
"learning_rate": 0.0005072921282798834,
"loss": 3.4602,
"step": 26600
},
{
"epoch": 7.765151515151516,
"grad_norm": 0.33766648173332214,
"learning_rate": 0.0005071172011661807,
"loss": 3.4665,
"step": 26650
},
{
"epoch": 7.77972027972028,
"grad_norm": 0.33531129360198975,
"learning_rate": 0.0005069422740524781,
"loss": 3.4529,
"step": 26700
},
{
"epoch": 7.7942890442890445,
"grad_norm": 0.31520190834999084,
"learning_rate": 0.0005067673469387754,
"loss": 3.4565,
"step": 26750
},
{
"epoch": 7.808857808857809,
"grad_norm": 0.319269061088562,
"learning_rate": 0.0005065924198250729,
"loss": 3.4587,
"step": 26800
},
{
"epoch": 7.823426573426573,
"grad_norm": 0.33063361048698425,
"learning_rate": 0.0005064174927113702,
"loss": 3.4538,
"step": 26850
},
{
"epoch": 7.837995337995338,
"grad_norm": 0.3421882390975952,
"learning_rate": 0.0005062425655976676,
"loss": 3.4524,
"step": 26900
},
{
"epoch": 7.852564102564102,
"grad_norm": 0.3269641399383545,
"learning_rate": 0.0005060676384839649,
"loss": 3.4436,
"step": 26950
},
{
"epoch": 7.867132867132867,
"grad_norm": 0.3379534184932709,
"learning_rate": 0.0005058927113702624,
"loss": 3.4508,
"step": 27000
},
{
"epoch": 7.867132867132867,
"eval_accuracy": 0.36653001824304665,
"eval_loss": 3.576610803604126,
"eval_runtime": 180.045,
"eval_samples_per_second": 92.432,
"eval_steps_per_second": 5.782,
"step": 27000
},
{
"epoch": 7.881701631701632,
"grad_norm": 0.32535165548324585,
"learning_rate": 0.0005057177842565598,
"loss": 3.4509,
"step": 27050
},
{
"epoch": 7.896270396270396,
"grad_norm": 0.32828956842422485,
"learning_rate": 0.0005055428571428571,
"loss": 3.4521,
"step": 27100
},
{
"epoch": 7.910839160839161,
"grad_norm": 0.3364267945289612,
"learning_rate": 0.0005053679300291544,
"loss": 3.4542,
"step": 27150
},
{
"epoch": 7.925407925407925,
"grad_norm": 0.3117345869541168,
"learning_rate": 0.0005051930029154519,
"loss": 3.4532,
"step": 27200
},
{
"epoch": 7.93997668997669,
"grad_norm": 0.34424829483032227,
"learning_rate": 0.0005050180758017492,
"loss": 3.4511,
"step": 27250
},
{
"epoch": 7.954545454545455,
"grad_norm": 0.3215528130531311,
"learning_rate": 0.0005048431486880466,
"loss": 3.4591,
"step": 27300
},
{
"epoch": 7.9691142191142195,
"grad_norm": 0.32633933424949646,
"learning_rate": 0.0005046682215743439,
"loss": 3.4638,
"step": 27350
},
{
"epoch": 7.983682983682984,
"grad_norm": 0.3218749165534973,
"learning_rate": 0.0005044932944606414,
"loss": 3.4643,
"step": 27400
},
{
"epoch": 7.998251748251748,
"grad_norm": 0.3505411446094513,
"learning_rate": 0.0005043183673469388,
"loss": 3.471,
"step": 27450
},
{
"epoch": 8.012820512820513,
"grad_norm": 0.32962340116500854,
"learning_rate": 0.0005041434402332361,
"loss": 3.3527,
"step": 27500
},
{
"epoch": 8.027389277389277,
"grad_norm": 0.3332984447479248,
"learning_rate": 0.0005039685131195334,
"loss": 3.3579,
"step": 27550
},
{
"epoch": 8.041958041958042,
"grad_norm": 0.3268524706363678,
"learning_rate": 0.0005037935860058309,
"loss": 3.3558,
"step": 27600
},
{
"epoch": 8.056526806526806,
"grad_norm": 0.326473593711853,
"learning_rate": 0.0005036186588921282,
"loss": 3.3529,
"step": 27650
},
{
"epoch": 8.07109557109557,
"grad_norm": 0.3332017958164215,
"learning_rate": 0.0005034437317784256,
"loss": 3.3669,
"step": 27700
},
{
"epoch": 8.085664335664335,
"grad_norm": 0.34896111488342285,
"learning_rate": 0.000503268804664723,
"loss": 3.353,
"step": 27750
},
{
"epoch": 8.1002331002331,
"grad_norm": 0.3163587749004364,
"learning_rate": 0.0005030938775510204,
"loss": 3.3777,
"step": 27800
},
{
"epoch": 8.114801864801866,
"grad_norm": 0.3226380944252014,
"learning_rate": 0.0005029189504373178,
"loss": 3.3687,
"step": 27850
},
{
"epoch": 8.12937062937063,
"grad_norm": 0.3180595934391022,
"learning_rate": 0.0005027440233236151,
"loss": 3.3655,
"step": 27900
},
{
"epoch": 8.143939393939394,
"grad_norm": 0.3455767035484314,
"learning_rate": 0.0005025690962099126,
"loss": 3.3818,
"step": 27950
},
{
"epoch": 8.158508158508159,
"grad_norm": 0.3457156717777252,
"learning_rate": 0.0005023941690962099,
"loss": 3.3689,
"step": 28000
},
{
"epoch": 8.158508158508159,
"eval_accuracy": 0.36624509557903034,
"eval_loss": 3.5875673294067383,
"eval_runtime": 180.2264,
"eval_samples_per_second": 92.339,
"eval_steps_per_second": 5.776,
"step": 28000
},
{
"epoch": 8.173076923076923,
"grad_norm": 0.33135128021240234,
"learning_rate": 0.0005022192419825072,
"loss": 3.3743,
"step": 28050
},
{
"epoch": 8.187645687645688,
"grad_norm": 0.34318405389785767,
"learning_rate": 0.0005020443148688046,
"loss": 3.3838,
"step": 28100
},
{
"epoch": 8.202214452214452,
"grad_norm": 0.3445442318916321,
"learning_rate": 0.000501869387755102,
"loss": 3.3952,
"step": 28150
},
{
"epoch": 8.216783216783217,
"grad_norm": 0.33991461992263794,
"learning_rate": 0.0005016944606413994,
"loss": 3.39,
"step": 28200
},
{
"epoch": 8.231351981351981,
"grad_norm": 0.30327895283699036,
"learning_rate": 0.0005015195335276967,
"loss": 3.3729,
"step": 28250
},
{
"epoch": 8.245920745920746,
"grad_norm": 0.3228490650653839,
"learning_rate": 0.0005013446064139941,
"loss": 3.3843,
"step": 28300
},
{
"epoch": 8.26048951048951,
"grad_norm": 0.3313380777835846,
"learning_rate": 0.0005011696793002916,
"loss": 3.3977,
"step": 28350
},
{
"epoch": 8.275058275058274,
"grad_norm": 0.31908831000328064,
"learning_rate": 0.0005009947521865889,
"loss": 3.3963,
"step": 28400
},
{
"epoch": 8.289627039627039,
"grad_norm": 0.3229847252368927,
"learning_rate": 0.0005008198250728862,
"loss": 3.3839,
"step": 28450
},
{
"epoch": 8.304195804195805,
"grad_norm": 0.3282051086425781,
"learning_rate": 0.0005006448979591836,
"loss": 3.3839,
"step": 28500
},
{
"epoch": 8.31876456876457,
"grad_norm": 0.33158057928085327,
"learning_rate": 0.000500469970845481,
"loss": 3.3931,
"step": 28550
},
{
"epoch": 8.333333333333334,
"grad_norm": 0.33471545577049255,
"learning_rate": 0.0005002950437317784,
"loss": 3.401,
"step": 28600
},
{
"epoch": 8.347902097902098,
"grad_norm": 0.3325505554676056,
"learning_rate": 0.0005001201166180757,
"loss": 3.4004,
"step": 28650
},
{
"epoch": 8.362470862470863,
"grad_norm": 0.3467610776424408,
"learning_rate": 0.0004999451895043731,
"loss": 3.4166,
"step": 28700
},
{
"epoch": 8.377039627039627,
"grad_norm": 0.3516916334629059,
"learning_rate": 0.0004997702623906706,
"loss": 3.41,
"step": 28750
},
{
"epoch": 8.391608391608392,
"grad_norm": 0.3250814378261566,
"learning_rate": 0.0004995953352769679,
"loss": 3.3877,
"step": 28800
},
{
"epoch": 8.406177156177156,
"grad_norm": 0.3190094530582428,
"learning_rate": 0.0004994204081632653,
"loss": 3.4102,
"step": 28850
},
{
"epoch": 8.42074592074592,
"grad_norm": 0.32494157552719116,
"learning_rate": 0.0004992454810495626,
"loss": 3.3945,
"step": 28900
},
{
"epoch": 8.435314685314685,
"grad_norm": 0.32460781931877136,
"learning_rate": 0.00049907055393586,
"loss": 3.4025,
"step": 28950
},
{
"epoch": 8.44988344988345,
"grad_norm": 0.33918076753616333,
"learning_rate": 0.0004988956268221574,
"loss": 3.409,
"step": 29000
},
{
"epoch": 8.44988344988345,
"eval_accuracy": 0.36715372018689424,
"eval_loss": 3.5787601470947266,
"eval_runtime": 179.9871,
"eval_samples_per_second": 92.462,
"eval_steps_per_second": 5.784,
"step": 29000
},
{
"epoch": 8.464452214452214,
"grad_norm": 0.33270788192749023,
"learning_rate": 0.0004987206997084547,
"loss": 3.4046,
"step": 29050
},
{
"epoch": 8.479020979020978,
"grad_norm": 0.3704804480075836,
"learning_rate": 0.0004985457725947521,
"loss": 3.4126,
"step": 29100
},
{
"epoch": 8.493589743589745,
"grad_norm": 0.32601258158683777,
"learning_rate": 0.0004983708454810496,
"loss": 3.4148,
"step": 29150
},
{
"epoch": 8.508158508158509,
"grad_norm": 0.333025723695755,
"learning_rate": 0.0004981959183673469,
"loss": 3.4101,
"step": 29200
},
{
"epoch": 8.522727272727273,
"grad_norm": 0.37352901697158813,
"learning_rate": 0.0004980209912536443,
"loss": 3.4143,
"step": 29250
},
{
"epoch": 8.537296037296038,
"grad_norm": 0.33815521001815796,
"learning_rate": 0.0004978460641399417,
"loss": 3.4212,
"step": 29300
},
{
"epoch": 8.551864801864802,
"grad_norm": 0.33571699261665344,
"learning_rate": 0.000497671137026239,
"loss": 3.4171,
"step": 29350
},
{
"epoch": 8.566433566433567,
"grad_norm": 0.33257025480270386,
"learning_rate": 0.0004974962099125364,
"loss": 3.4078,
"step": 29400
},
{
"epoch": 8.581002331002331,
"grad_norm": 0.3310984671115875,
"learning_rate": 0.0004973212827988337,
"loss": 3.4084,
"step": 29450
},
{
"epoch": 8.595571095571096,
"grad_norm": 0.3633224368095398,
"learning_rate": 0.0004971463556851312,
"loss": 3.417,
"step": 29500
},
{
"epoch": 8.61013986013986,
"grad_norm": 0.3331892192363739,
"learning_rate": 0.0004969714285714286,
"loss": 3.4142,
"step": 29550
},
{
"epoch": 8.624708624708624,
"grad_norm": 0.3235609829425812,
"learning_rate": 0.0004967965014577259,
"loss": 3.4237,
"step": 29600
},
{
"epoch": 8.639277389277389,
"grad_norm": 0.3281348943710327,
"learning_rate": 0.0004966215743440233,
"loss": 3.417,
"step": 29650
},
{
"epoch": 8.653846153846153,
"grad_norm": 0.3530293405056,
"learning_rate": 0.0004964466472303207,
"loss": 3.4244,
"step": 29700
},
{
"epoch": 8.668414918414918,
"grad_norm": 0.3330443501472473,
"learning_rate": 0.000496271720116618,
"loss": 3.4279,
"step": 29750
},
{
"epoch": 8.682983682983682,
"grad_norm": 0.3074372112751007,
"learning_rate": 0.0004960967930029154,
"loss": 3.4178,
"step": 29800
},
{
"epoch": 8.697552447552448,
"grad_norm": 0.317383348941803,
"learning_rate": 0.0004959218658892127,
"loss": 3.4187,
"step": 29850
},
{
"epoch": 8.712121212121213,
"grad_norm": 0.3294063210487366,
"learning_rate": 0.0004957469387755102,
"loss": 3.4365,
"step": 29900
},
{
"epoch": 8.726689976689977,
"grad_norm": 0.32321542501449585,
"learning_rate": 0.0004955720116618075,
"loss": 3.4291,
"step": 29950
},
{
"epoch": 8.741258741258742,
"grad_norm": 0.33648818731307983,
"learning_rate": 0.0004953970845481049,
"loss": 3.4213,
"step": 30000
},
{
"epoch": 8.741258741258742,
"eval_accuracy": 0.3674972031012944,
"eval_loss": 3.5736160278320312,
"eval_runtime": 184.4878,
"eval_samples_per_second": 90.206,
"eval_steps_per_second": 5.643,
"step": 30000
},
{
"epoch": 8.755827505827506,
"grad_norm": 0.34832367300987244,
"learning_rate": 0.0004952221574344023,
"loss": 3.4256,
"step": 30050
},
{
"epoch": 8.77039627039627,
"grad_norm": 0.35622331500053406,
"learning_rate": 0.0004950472303206997,
"loss": 3.4305,
"step": 30100
},
{
"epoch": 8.784965034965035,
"grad_norm": 0.3356943726539612,
"learning_rate": 0.0004948723032069971,
"loss": 3.4211,
"step": 30150
},
{
"epoch": 8.7995337995338,
"grad_norm": 0.3233150243759155,
"learning_rate": 0.0004946973760932944,
"loss": 3.4292,
"step": 30200
},
{
"epoch": 8.814102564102564,
"grad_norm": 0.321884423494339,
"learning_rate": 0.0004945224489795917,
"loss": 3.4113,
"step": 30250
},
{
"epoch": 8.828671328671328,
"grad_norm": 0.3420848846435547,
"learning_rate": 0.0004943475218658892,
"loss": 3.4163,
"step": 30300
},
{
"epoch": 8.843240093240093,
"grad_norm": 0.33607736229896545,
"learning_rate": 0.0004941725947521865,
"loss": 3.4269,
"step": 30350
},
{
"epoch": 8.857808857808857,
"grad_norm": 0.327867329120636,
"learning_rate": 0.0004939976676384839,
"loss": 3.428,
"step": 30400
},
{
"epoch": 8.872377622377622,
"grad_norm": 0.33073338866233826,
"learning_rate": 0.0004938227405247813,
"loss": 3.4269,
"step": 30450
},
{
"epoch": 8.886946386946388,
"grad_norm": 0.3265506625175476,
"learning_rate": 0.0004936478134110787,
"loss": 3.427,
"step": 30500
},
{
"epoch": 8.901515151515152,
"grad_norm": 0.3573986291885376,
"learning_rate": 0.0004934728862973761,
"loss": 3.4453,
"step": 30550
},
{
"epoch": 8.916083916083917,
"grad_norm": 0.31229108572006226,
"learning_rate": 0.0004932979591836734,
"loss": 3.4326,
"step": 30600
},
{
"epoch": 8.930652680652681,
"grad_norm": 0.31914326548576355,
"learning_rate": 0.0004931230320699707,
"loss": 3.4377,
"step": 30650
},
{
"epoch": 8.945221445221446,
"grad_norm": 0.3208966851234436,
"learning_rate": 0.0004929481049562682,
"loss": 3.4355,
"step": 30700
},
{
"epoch": 8.95979020979021,
"grad_norm": 0.33020609617233276,
"learning_rate": 0.0004927731778425655,
"loss": 3.4301,
"step": 30750
},
{
"epoch": 8.974358974358974,
"grad_norm": 0.3164729177951813,
"learning_rate": 0.0004925982507288629,
"loss": 3.4351,
"step": 30800
},
{
"epoch": 8.988927738927739,
"grad_norm": 0.3445264399051666,
"learning_rate": 0.0004924233236151604,
"loss": 3.4289,
"step": 30850
},
{
"epoch": 9.003496503496503,
"grad_norm": 0.35110875964164734,
"learning_rate": 0.0004922483965014577,
"loss": 3.41,
"step": 30900
},
{
"epoch": 9.018065268065268,
"grad_norm": 0.34962424635887146,
"learning_rate": 0.0004920734693877551,
"loss": 3.3223,
"step": 30950
},
{
"epoch": 9.032634032634032,
"grad_norm": 0.349322110414505,
"learning_rate": 0.0004918985422740524,
"loss": 3.3257,
"step": 31000
},
{
"epoch": 9.032634032634032,
"eval_accuracy": 0.3677295626490023,
"eval_loss": 3.5775063037872314,
"eval_runtime": 184.5342,
"eval_samples_per_second": 90.184,
"eval_steps_per_second": 5.641,
"step": 31000
},
{
"epoch": 9.047202797202797,
"grad_norm": 0.33628126978874207,
"learning_rate": 0.0004917236151603499,
"loss": 3.3398,
"step": 31050
},
{
"epoch": 9.061771561771561,
"grad_norm": 0.3321647047996521,
"learning_rate": 0.0004915486880466472,
"loss": 3.3387,
"step": 31100
},
{
"epoch": 9.076340326340326,
"grad_norm": 0.3293364346027374,
"learning_rate": 0.0004913737609329445,
"loss": 3.3316,
"step": 31150
},
{
"epoch": 9.090909090909092,
"grad_norm": 0.36083984375,
"learning_rate": 0.0004911988338192419,
"loss": 3.3328,
"step": 31200
},
{
"epoch": 9.105477855477856,
"grad_norm": 0.3213653862476349,
"learning_rate": 0.0004910239067055393,
"loss": 3.3368,
"step": 31250
},
{
"epoch": 9.12004662004662,
"grad_norm": 0.328110009431839,
"learning_rate": 0.0004908489795918367,
"loss": 3.3341,
"step": 31300
},
{
"epoch": 9.134615384615385,
"grad_norm": 0.34902945160865784,
"learning_rate": 0.0004906740524781341,
"loss": 3.3506,
"step": 31350
},
{
"epoch": 9.14918414918415,
"grad_norm": 0.3422456979751587,
"learning_rate": 0.0004904991253644314,
"loss": 3.3336,
"step": 31400
},
{
"epoch": 9.163752913752914,
"grad_norm": 0.32111626863479614,
"learning_rate": 0.0004903241982507289,
"loss": 3.3448,
"step": 31450
},
{
"epoch": 9.178321678321678,
"grad_norm": 0.3301600515842438,
"learning_rate": 0.0004901492711370262,
"loss": 3.3602,
"step": 31500
},
{
"epoch": 9.192890442890443,
"grad_norm": 0.3596358895301819,
"learning_rate": 0.0004899743440233235,
"loss": 3.3425,
"step": 31550
},
{
"epoch": 9.207459207459207,
"grad_norm": 0.34408465027809143,
"learning_rate": 0.0004897994169096209,
"loss": 3.3586,
"step": 31600
},
{
"epoch": 9.222027972027972,
"grad_norm": 0.29452061653137207,
"learning_rate": 0.0004896244897959183,
"loss": 3.3461,
"step": 31650
},
{
"epoch": 9.236596736596736,
"grad_norm": 0.3462780714035034,
"learning_rate": 0.0004894495626822157,
"loss": 3.3546,
"step": 31700
},
{
"epoch": 9.2511655011655,
"grad_norm": 0.3364809453487396,
"learning_rate": 0.0004892746355685131,
"loss": 3.3604,
"step": 31750
},
{
"epoch": 9.265734265734265,
"grad_norm": 0.37016913294792175,
"learning_rate": 0.0004890997084548104,
"loss": 3.3713,
"step": 31800
},
{
"epoch": 9.280303030303031,
"grad_norm": 0.35441073775291443,
"learning_rate": 0.0004889247813411079,
"loss": 3.3603,
"step": 31850
},
{
"epoch": 9.294871794871796,
"grad_norm": 0.37994956970214844,
"learning_rate": 0.0004887498542274052,
"loss": 3.3599,
"step": 31900
},
{
"epoch": 9.30944055944056,
"grad_norm": 0.32939761877059937,
"learning_rate": 0.0004885749271137026,
"loss": 3.3698,
"step": 31950
},
{
"epoch": 9.324009324009324,
"grad_norm": 0.3467860817909241,
"learning_rate": 0.0004883999999999999,
"loss": 3.3722,
"step": 32000
},
{
"epoch": 9.324009324009324,
"eval_accuracy": 0.3676854660749181,
"eval_loss": 3.57747483253479,
"eval_runtime": 184.5767,
"eval_samples_per_second": 90.163,
"eval_steps_per_second": 5.64,
"step": 32000
},
{
"epoch": 9.338578088578089,
"grad_norm": 0.3501215875148773,
"learning_rate": 0.0004882250728862973,
"loss": 3.3923,
"step": 32050
},
{
"epoch": 9.353146853146853,
"grad_norm": 0.3472752571105957,
"learning_rate": 0.0004880501457725947,
"loss": 3.3728,
"step": 32100
},
{
"epoch": 9.367715617715618,
"grad_norm": 0.33848997950553894,
"learning_rate": 0.00048787521865889207,
"loss": 3.3655,
"step": 32150
},
{
"epoch": 9.382284382284382,
"grad_norm": 0.33940359950065613,
"learning_rate": 0.00048770029154518945,
"loss": 3.3753,
"step": 32200
},
{
"epoch": 9.396853146853147,
"grad_norm": 0.33698955178260803,
"learning_rate": 0.0004875253644314868,
"loss": 3.3753,
"step": 32250
},
{
"epoch": 9.411421911421911,
"grad_norm": 0.38616305589675903,
"learning_rate": 0.0004873504373177842,
"loss": 3.3864,
"step": 32300
},
{
"epoch": 9.425990675990676,
"grad_norm": 0.3368024528026581,
"learning_rate": 0.00048717551020408163,
"loss": 3.3848,
"step": 32350
},
{
"epoch": 9.44055944055944,
"grad_norm": 0.3477652370929718,
"learning_rate": 0.000487000583090379,
"loss": 3.3911,
"step": 32400
},
{
"epoch": 9.455128205128204,
"grad_norm": 0.3296109437942505,
"learning_rate": 0.00048682565597667633,
"loss": 3.3774,
"step": 32450
},
{
"epoch": 9.469696969696969,
"grad_norm": 0.3162249028682709,
"learning_rate": 0.0004866507288629737,
"loss": 3.389,
"step": 32500
},
{
"epoch": 9.484265734265735,
"grad_norm": 0.35383546352386475,
"learning_rate": 0.0004864758017492711,
"loss": 3.3884,
"step": 32550
},
{
"epoch": 9.4988344988345,
"grad_norm": 0.36289462447166443,
"learning_rate": 0.00048630087463556845,
"loss": 3.3902,
"step": 32600
},
{
"epoch": 9.513403263403264,
"grad_norm": 0.3470366299152374,
"learning_rate": 0.00048612594752186583,
"loss": 3.3957,
"step": 32650
},
{
"epoch": 9.527972027972028,
"grad_norm": 0.35183480381965637,
"learning_rate": 0.0004859510204081632,
"loss": 3.3927,
"step": 32700
},
{
"epoch": 9.542540792540793,
"grad_norm": 0.35853511095046997,
"learning_rate": 0.00048577609329446064,
"loss": 3.3904,
"step": 32750
},
{
"epoch": 9.557109557109557,
"grad_norm": 0.36935216188430786,
"learning_rate": 0.000485601166180758,
"loss": 3.3844,
"step": 32800
},
{
"epoch": 9.571678321678322,
"grad_norm": 0.339053213596344,
"learning_rate": 0.0004854262390670554,
"loss": 3.3895,
"step": 32850
},
{
"epoch": 9.586247086247086,
"grad_norm": 0.34945929050445557,
"learning_rate": 0.0004852513119533527,
"loss": 3.4071,
"step": 32900
},
{
"epoch": 9.60081585081585,
"grad_norm": 0.3158731758594513,
"learning_rate": 0.0004850763848396501,
"loss": 3.401,
"step": 32950
},
{
"epoch": 9.615384615384615,
"grad_norm": 0.33528274297714233,
"learning_rate": 0.00048490145772594746,
"loss": 3.3982,
"step": 33000
},
{
"epoch": 9.615384615384615,
"eval_accuracy": 0.36842334874792776,
"eval_loss": 3.5674285888671875,
"eval_runtime": 181.5104,
"eval_samples_per_second": 91.686,
"eval_steps_per_second": 5.735,
"step": 33000
},
{
"epoch": 9.62995337995338,
"grad_norm": 0.3466652035713196,
"learning_rate": 0.00048472653061224484,
"loss": 3.3889,
"step": 33050
},
{
"epoch": 9.644522144522144,
"grad_norm": 0.34382832050323486,
"learning_rate": 0.0004845516034985422,
"loss": 3.3952,
"step": 33100
},
{
"epoch": 9.659090909090908,
"grad_norm": 0.33218619227409363,
"learning_rate": 0.0004843766763848396,
"loss": 3.3955,
"step": 33150
},
{
"epoch": 9.673659673659674,
"grad_norm": 0.3392280638217926,
"learning_rate": 0.000484201749271137,
"loss": 3.4007,
"step": 33200
},
{
"epoch": 9.688228438228439,
"grad_norm": 0.32414713501930237,
"learning_rate": 0.0004840268221574344,
"loss": 3.3993,
"step": 33250
},
{
"epoch": 9.702797202797203,
"grad_norm": 0.3343014121055603,
"learning_rate": 0.00048385189504373177,
"loss": 3.4024,
"step": 33300
},
{
"epoch": 9.717365967365968,
"grad_norm": 0.3456258177757263,
"learning_rate": 0.0004836769679300291,
"loss": 3.4074,
"step": 33350
},
{
"epoch": 9.731934731934732,
"grad_norm": 0.33294492959976196,
"learning_rate": 0.00048350204081632647,
"loss": 3.3968,
"step": 33400
},
{
"epoch": 9.746503496503497,
"grad_norm": 0.32060715556144714,
"learning_rate": 0.00048332711370262384,
"loss": 3.398,
"step": 33450
},
{
"epoch": 9.761072261072261,
"grad_norm": 0.34071505069732666,
"learning_rate": 0.0004831521865889212,
"loss": 3.4091,
"step": 33500
},
{
"epoch": 9.775641025641026,
"grad_norm": 0.3401913046836853,
"learning_rate": 0.0004829772594752186,
"loss": 3.3941,
"step": 33550
},
{
"epoch": 9.79020979020979,
"grad_norm": 0.3486825227737427,
"learning_rate": 0.00048280233236151597,
"loss": 3.4085,
"step": 33600
},
{
"epoch": 9.804778554778554,
"grad_norm": 0.32629451155662537,
"learning_rate": 0.0004826274052478134,
"loss": 3.3988,
"step": 33650
},
{
"epoch": 9.819347319347319,
"grad_norm": 0.3566981256008148,
"learning_rate": 0.0004824524781341108,
"loss": 3.3986,
"step": 33700
},
{
"epoch": 9.833916083916083,
"grad_norm": 0.3448028564453125,
"learning_rate": 0.00048227755102040815,
"loss": 3.3956,
"step": 33750
},
{
"epoch": 9.848484848484848,
"grad_norm": 0.32237598299980164,
"learning_rate": 0.0004821026239067055,
"loss": 3.4087,
"step": 33800
},
{
"epoch": 9.863053613053612,
"grad_norm": 0.3462821841239929,
"learning_rate": 0.00048192769679300285,
"loss": 3.4071,
"step": 33850
},
{
"epoch": 9.877622377622378,
"grad_norm": 0.3197895288467407,
"learning_rate": 0.0004817527696793002,
"loss": 3.4081,
"step": 33900
},
{
"epoch": 9.892191142191143,
"grad_norm": 0.33993223309516907,
"learning_rate": 0.0004815778425655976,
"loss": 3.4168,
"step": 33950
},
{
"epoch": 9.906759906759907,
"grad_norm": 0.3364954888820648,
"learning_rate": 0.000481402915451895,
"loss": 3.4083,
"step": 34000
},
{
"epoch": 9.906759906759907,
"eval_accuracy": 0.36888501048087374,
"eval_loss": 3.5607762336730957,
"eval_runtime": 180.1014,
"eval_samples_per_second": 92.404,
"eval_steps_per_second": 5.78,
"step": 34000
},
{
"epoch": 9.921328671328672,
"grad_norm": 0.37011784315109253,
"learning_rate": 0.0004812279883381924,
"loss": 3.4251,
"step": 34050
},
{
"epoch": 9.935897435897436,
"grad_norm": 0.33454248309135437,
"learning_rate": 0.0004810530612244898,
"loss": 3.4185,
"step": 34100
},
{
"epoch": 9.9504662004662,
"grad_norm": 0.3181527853012085,
"learning_rate": 0.00048087813411078716,
"loss": 3.406,
"step": 34150
},
{
"epoch": 9.965034965034965,
"grad_norm": 0.34176966547966003,
"learning_rate": 0.00048070320699708453,
"loss": 3.4085,
"step": 34200
},
{
"epoch": 9.97960372960373,
"grad_norm": 0.32492902874946594,
"learning_rate": 0.00048052827988338186,
"loss": 3.4045,
"step": 34250
},
{
"epoch": 9.994172494172494,
"grad_norm": 0.345251202583313,
"learning_rate": 0.00048035335276967923,
"loss": 3.3892,
"step": 34300
},
{
"epoch": 10.008741258741258,
"grad_norm": 0.364636093378067,
"learning_rate": 0.0004801784256559766,
"loss": 3.3166,
"step": 34350
},
{
"epoch": 10.023310023310023,
"grad_norm": 0.3602936267852783,
"learning_rate": 0.000480003498542274,
"loss": 3.294,
"step": 34400
},
{
"epoch": 10.037878787878787,
"grad_norm": 0.32505398988723755,
"learning_rate": 0.00047982857142857136,
"loss": 3.3059,
"step": 34450
},
{
"epoch": 10.052447552447552,
"grad_norm": 0.3503585457801819,
"learning_rate": 0.0004796536443148688,
"loss": 3.292,
"step": 34500
},
{
"epoch": 10.067016317016318,
"grad_norm": 0.35856735706329346,
"learning_rate": 0.00047947871720116616,
"loss": 3.3011,
"step": 34550
},
{
"epoch": 10.081585081585082,
"grad_norm": 0.32884514331817627,
"learning_rate": 0.00047930379008746354,
"loss": 3.3162,
"step": 34600
},
{
"epoch": 10.096153846153847,
"grad_norm": 0.34173399209976196,
"learning_rate": 0.0004791288629737609,
"loss": 3.3267,
"step": 34650
},
{
"epoch": 10.110722610722611,
"grad_norm": 0.32754212617874146,
"learning_rate": 0.00047895393586005824,
"loss": 3.3069,
"step": 34700
},
{
"epoch": 10.125291375291376,
"grad_norm": 0.35599884390830994,
"learning_rate": 0.0004787790087463556,
"loss": 3.3275,
"step": 34750
},
{
"epoch": 10.13986013986014,
"grad_norm": 0.3487524092197418,
"learning_rate": 0.000478604081632653,
"loss": 3.3185,
"step": 34800
},
{
"epoch": 10.154428904428904,
"grad_norm": 0.35253459215164185,
"learning_rate": 0.00047842915451895037,
"loss": 3.326,
"step": 34850
},
{
"epoch": 10.168997668997669,
"grad_norm": 0.37047427892684937,
"learning_rate": 0.0004782542274052478,
"loss": 3.3348,
"step": 34900
},
{
"epoch": 10.183566433566433,
"grad_norm": 0.3368969261646271,
"learning_rate": 0.00047807930029154517,
"loss": 3.3406,
"step": 34950
},
{
"epoch": 10.198135198135198,
"grad_norm": 0.3401995897293091,
"learning_rate": 0.00047790437317784255,
"loss": 3.3294,
"step": 35000
},
{
"epoch": 10.198135198135198,
"eval_accuracy": 0.3686502991158813,
"eval_loss": 3.5734663009643555,
"eval_runtime": 179.963,
"eval_samples_per_second": 92.475,
"eval_steps_per_second": 5.785,
"step": 35000
},
{
"epoch": 10.212703962703962,
"grad_norm": 0.3233661949634552,
"learning_rate": 0.0004777294460641399,
"loss": 3.3418,
"step": 35050
},
{
"epoch": 10.227272727272727,
"grad_norm": 0.35350361466407776,
"learning_rate": 0.0004775545189504373,
"loss": 3.3428,
"step": 35100
},
{
"epoch": 10.241841491841491,
"grad_norm": 0.3602052628993988,
"learning_rate": 0.0004773795918367346,
"loss": 3.3347,
"step": 35150
},
{
"epoch": 10.256410256410255,
"grad_norm": 0.3384787142276764,
"learning_rate": 0.000477204664723032,
"loss": 3.335,
"step": 35200
},
{
"epoch": 10.270979020979022,
"grad_norm": 0.35919293761253357,
"learning_rate": 0.00047702973760932937,
"loss": 3.3324,
"step": 35250
},
{
"epoch": 10.285547785547786,
"grad_norm": 0.3487773835659027,
"learning_rate": 0.00047685481049562675,
"loss": 3.3435,
"step": 35300
},
{
"epoch": 10.30011655011655,
"grad_norm": 0.33635222911834717,
"learning_rate": 0.0004766798833819242,
"loss": 3.3492,
"step": 35350
},
{
"epoch": 10.314685314685315,
"grad_norm": 0.3445112705230713,
"learning_rate": 0.00047650495626822155,
"loss": 3.3516,
"step": 35400
},
{
"epoch": 10.32925407925408,
"grad_norm": 0.3628615438938141,
"learning_rate": 0.00047633002915451893,
"loss": 3.3535,
"step": 35450
},
{
"epoch": 10.343822843822844,
"grad_norm": 0.3586277663707733,
"learning_rate": 0.0004761551020408163,
"loss": 3.3652,
"step": 35500
},
{
"epoch": 10.358391608391608,
"grad_norm": 0.3344104588031769,
"learning_rate": 0.0004759801749271137,
"loss": 3.3564,
"step": 35550
},
{
"epoch": 10.372960372960373,
"grad_norm": 0.32970529794692993,
"learning_rate": 0.000475805247813411,
"loss": 3.3439,
"step": 35600
},
{
"epoch": 10.387529137529137,
"grad_norm": 0.3633076250553131,
"learning_rate": 0.0004756303206997084,
"loss": 3.3533,
"step": 35650
},
{
"epoch": 10.402097902097902,
"grad_norm": 0.34395188093185425,
"learning_rate": 0.00047545539358600575,
"loss": 3.3582,
"step": 35700
},
{
"epoch": 10.416666666666666,
"grad_norm": 0.3351273536682129,
"learning_rate": 0.00047528046647230313,
"loss": 3.3544,
"step": 35750
},
{
"epoch": 10.43123543123543,
"grad_norm": 0.34297794103622437,
"learning_rate": 0.00047510553935860056,
"loss": 3.3573,
"step": 35800
},
{
"epoch": 10.445804195804195,
"grad_norm": 0.3346062898635864,
"learning_rate": 0.00047493061224489794,
"loss": 3.3565,
"step": 35850
},
{
"epoch": 10.460372960372961,
"grad_norm": 0.3574366271495819,
"learning_rate": 0.0004747556851311953,
"loss": 3.3548,
"step": 35900
},
{
"epoch": 10.474941724941726,
"grad_norm": 0.35288873314857483,
"learning_rate": 0.0004745807580174927,
"loss": 3.3584,
"step": 35950
},
{
"epoch": 10.48951048951049,
"grad_norm": 0.32899612188339233,
"learning_rate": 0.00047440583090379006,
"loss": 3.3591,
"step": 36000
},
{
"epoch": 10.48951048951049,
"eval_accuracy": 0.3689316940539709,
"eval_loss": 3.566610336303711,
"eval_runtime": 179.9576,
"eval_samples_per_second": 92.477,
"eval_steps_per_second": 5.785,
"step": 36000
},
{
"epoch": 10.504079254079254,
"grad_norm": 0.3387123942375183,
"learning_rate": 0.0004742309037900874,
"loss": 3.3697,
"step": 36050
},
{
"epoch": 10.518648018648019,
"grad_norm": 0.3552062213420868,
"learning_rate": 0.00047405597667638476,
"loss": 3.3692,
"step": 36100
},
{
"epoch": 10.533216783216783,
"grad_norm": 0.30987149477005005,
"learning_rate": 0.00047388104956268214,
"loss": 3.3757,
"step": 36150
},
{
"epoch": 10.547785547785548,
"grad_norm": 0.38266798853874207,
"learning_rate": 0.00047370612244897957,
"loss": 3.371,
"step": 36200
},
{
"epoch": 10.562354312354312,
"grad_norm": 0.35734835267066956,
"learning_rate": 0.00047353119533527694,
"loss": 3.3526,
"step": 36250
},
{
"epoch": 10.576923076923077,
"grad_norm": 0.33983373641967773,
"learning_rate": 0.0004733562682215743,
"loss": 3.3765,
"step": 36300
},
{
"epoch": 10.591491841491841,
"grad_norm": 0.34860071539878845,
"learning_rate": 0.0004731813411078717,
"loss": 3.3727,
"step": 36350
},
{
"epoch": 10.606060606060606,
"grad_norm": 0.3304286301136017,
"learning_rate": 0.00047300641399416907,
"loss": 3.3738,
"step": 36400
},
{
"epoch": 10.62062937062937,
"grad_norm": 0.36452704668045044,
"learning_rate": 0.00047283148688046645,
"loss": 3.3727,
"step": 36450
},
{
"epoch": 10.635198135198134,
"grad_norm": 0.33509254455566406,
"learning_rate": 0.00047265655976676377,
"loss": 3.3678,
"step": 36500
},
{
"epoch": 10.649766899766899,
"grad_norm": 0.3406018912792206,
"learning_rate": 0.00047248163265306114,
"loss": 3.3738,
"step": 36550
},
{
"epoch": 10.664335664335665,
"grad_norm": 0.3569091856479645,
"learning_rate": 0.0004723067055393585,
"loss": 3.3718,
"step": 36600
},
{
"epoch": 10.67890442890443,
"grad_norm": 0.31912165880203247,
"learning_rate": 0.00047213177842565595,
"loss": 3.3627,
"step": 36650
},
{
"epoch": 10.693473193473194,
"grad_norm": 0.3501559793949127,
"learning_rate": 0.0004719568513119533,
"loss": 3.3774,
"step": 36700
},
{
"epoch": 10.708041958041958,
"grad_norm": 0.36387351155281067,
"learning_rate": 0.0004717819241982507,
"loss": 3.3652,
"step": 36750
},
{
"epoch": 10.722610722610723,
"grad_norm": 0.3268325924873352,
"learning_rate": 0.0004716069970845481,
"loss": 3.3754,
"step": 36800
},
{
"epoch": 10.737179487179487,
"grad_norm": 0.3691311776638031,
"learning_rate": 0.00047143206997084545,
"loss": 3.3789,
"step": 36850
},
{
"epoch": 10.751748251748252,
"grad_norm": 0.34827283024787903,
"learning_rate": 0.00047125714285714283,
"loss": 3.3962,
"step": 36900
},
{
"epoch": 10.766317016317016,
"grad_norm": 0.3422998785972595,
"learning_rate": 0.00047108221574344015,
"loss": 3.3801,
"step": 36950
},
{
"epoch": 10.78088578088578,
"grad_norm": 0.3483596444129944,
"learning_rate": 0.0004709072886297375,
"loss": 3.3894,
"step": 37000
},
{
"epoch": 10.78088578088578,
"eval_accuracy": 0.3693519638027098,
"eval_loss": 3.556098699569702,
"eval_runtime": 180.0859,
"eval_samples_per_second": 92.411,
"eval_steps_per_second": 5.781,
"step": 37000
},
{
"epoch": 10.795454545454545,
"grad_norm": 0.3572978973388672,
"learning_rate": 0.00047073236151603495,
"loss": 3.3816,
"step": 37050
},
{
"epoch": 10.81002331002331,
"grad_norm": 0.3622797727584839,
"learning_rate": 0.00047055743440233233,
"loss": 3.389,
"step": 37100
},
{
"epoch": 10.824592074592074,
"grad_norm": 0.3194954991340637,
"learning_rate": 0.0004703825072886297,
"loss": 3.3866,
"step": 37150
},
{
"epoch": 10.83916083916084,
"grad_norm": 0.34308746457099915,
"learning_rate": 0.0004702075801749271,
"loss": 3.3863,
"step": 37200
},
{
"epoch": 10.853729603729604,
"grad_norm": 0.33468618988990784,
"learning_rate": 0.00047003265306122446,
"loss": 3.3797,
"step": 37250
},
{
"epoch": 10.868298368298369,
"grad_norm": 0.3478671908378601,
"learning_rate": 0.00046985772594752183,
"loss": 3.3934,
"step": 37300
},
{
"epoch": 10.882867132867133,
"grad_norm": 0.34233909845352173,
"learning_rate": 0.0004696827988338192,
"loss": 3.3845,
"step": 37350
},
{
"epoch": 10.897435897435898,
"grad_norm": 0.3444744646549225,
"learning_rate": 0.00046950787172011653,
"loss": 3.3938,
"step": 37400
},
{
"epoch": 10.912004662004662,
"grad_norm": 0.3458085060119629,
"learning_rate": 0.0004693329446064139,
"loss": 3.3931,
"step": 37450
},
{
"epoch": 10.926573426573427,
"grad_norm": 0.3380485773086548,
"learning_rate": 0.00046915801749271134,
"loss": 3.3893,
"step": 37500
},
{
"epoch": 10.941142191142191,
"grad_norm": 0.33901792764663696,
"learning_rate": 0.0004689830903790087,
"loss": 3.3976,
"step": 37550
},
{
"epoch": 10.955710955710956,
"grad_norm": 0.3939083516597748,
"learning_rate": 0.0004688081632653061,
"loss": 3.3945,
"step": 37600
},
{
"epoch": 10.97027972027972,
"grad_norm": 0.3543170392513275,
"learning_rate": 0.00046863323615160346,
"loss": 3.3905,
"step": 37650
},
{
"epoch": 10.984848484848484,
"grad_norm": 0.33849748969078064,
"learning_rate": 0.00046845830903790084,
"loss": 3.3845,
"step": 37700
},
{
"epoch": 10.999417249417249,
"grad_norm": 0.3542396128177643,
"learning_rate": 0.0004682833819241982,
"loss": 3.3943,
"step": 37750
},
{
"epoch": 11.013986013986013,
"grad_norm": 0.3594779968261719,
"learning_rate": 0.0004681084548104956,
"loss": 3.2759,
"step": 37800
},
{
"epoch": 11.028554778554778,
"grad_norm": 0.335800439119339,
"learning_rate": 0.0004679335276967929,
"loss": 3.2729,
"step": 37850
},
{
"epoch": 11.043123543123544,
"grad_norm": 0.3539378345012665,
"learning_rate": 0.0004677586005830903,
"loss": 3.2811,
"step": 37900
},
{
"epoch": 11.057692307692308,
"grad_norm": 0.3634989857673645,
"learning_rate": 0.0004675836734693877,
"loss": 3.2793,
"step": 37950
},
{
"epoch": 11.072261072261073,
"grad_norm": 0.35441911220550537,
"learning_rate": 0.0004674087463556851,
"loss": 3.2944,
"step": 38000
},
{
"epoch": 11.072261072261073,
"eval_accuracy": 0.3693041043209704,
"eval_loss": 3.5667061805725098,
"eval_runtime": 179.9313,
"eval_samples_per_second": 92.491,
"eval_steps_per_second": 5.786,
"step": 38000
},
{
"epoch": 11.086829836829837,
"grad_norm": 0.33456510305404663,
"learning_rate": 0.00046723381924198247,
"loss": 3.3043,
"step": 38050
},
{
"epoch": 11.101398601398602,
"grad_norm": 0.35062792897224426,
"learning_rate": 0.00046705889212827985,
"loss": 3.2976,
"step": 38100
},
{
"epoch": 11.115967365967366,
"grad_norm": 0.35433903336524963,
"learning_rate": 0.0004668839650145772,
"loss": 3.3004,
"step": 38150
},
{
"epoch": 11.13053613053613,
"grad_norm": 0.34735456109046936,
"learning_rate": 0.0004667090379008746,
"loss": 3.3141,
"step": 38200
},
{
"epoch": 11.145104895104895,
"grad_norm": 0.36027348041534424,
"learning_rate": 0.000466534110787172,
"loss": 3.3177,
"step": 38250
},
{
"epoch": 11.15967365967366,
"grad_norm": 0.3495638370513916,
"learning_rate": 0.0004663591836734693,
"loss": 3.3033,
"step": 38300
},
{
"epoch": 11.174242424242424,
"grad_norm": 0.352952778339386,
"learning_rate": 0.0004661842565597667,
"loss": 3.3067,
"step": 38350
},
{
"epoch": 11.188811188811188,
"grad_norm": 0.35756683349609375,
"learning_rate": 0.0004660093294460641,
"loss": 3.3134,
"step": 38400
},
{
"epoch": 11.203379953379953,
"grad_norm": 0.3577975630760193,
"learning_rate": 0.0004658344023323615,
"loss": 3.3004,
"step": 38450
},
{
"epoch": 11.217948717948717,
"grad_norm": 0.32897868752479553,
"learning_rate": 0.00046565947521865885,
"loss": 3.3096,
"step": 38500
},
{
"epoch": 11.232517482517483,
"grad_norm": 0.32986345887184143,
"learning_rate": 0.00046548454810495623,
"loss": 3.3042,
"step": 38550
},
{
"epoch": 11.247086247086248,
"grad_norm": 0.3170894384384155,
"learning_rate": 0.0004653096209912536,
"loss": 3.3199,
"step": 38600
},
{
"epoch": 11.261655011655012,
"grad_norm": 0.3499142527580261,
"learning_rate": 0.000465134693877551,
"loss": 3.3325,
"step": 38650
},
{
"epoch": 11.276223776223777,
"grad_norm": 0.34623247385025024,
"learning_rate": 0.0004649597667638484,
"loss": 3.3242,
"step": 38700
},
{
"epoch": 11.290792540792541,
"grad_norm": 0.3531145453453064,
"learning_rate": 0.0004647848396501457,
"loss": 3.3253,
"step": 38750
},
{
"epoch": 11.305361305361306,
"grad_norm": 0.3613262474536896,
"learning_rate": 0.0004646099125364431,
"loss": 3.318,
"step": 38800
},
{
"epoch": 11.31993006993007,
"grad_norm": 0.34619051218032837,
"learning_rate": 0.0004644349854227405,
"loss": 3.3252,
"step": 38850
},
{
"epoch": 11.334498834498834,
"grad_norm": 0.34788867831230164,
"learning_rate": 0.00046426005830903786,
"loss": 3.3346,
"step": 38900
},
{
"epoch": 11.349067599067599,
"grad_norm": 0.35362058877944946,
"learning_rate": 0.00046408513119533523,
"loss": 3.3304,
"step": 38950
},
{
"epoch": 11.363636363636363,
"grad_norm": 0.32547181844711304,
"learning_rate": 0.0004639102040816326,
"loss": 3.3381,
"step": 39000
},
{
"epoch": 11.363636363636363,
"eval_accuracy": 0.36909796753598456,
"eval_loss": 3.563558578491211,
"eval_runtime": 180.0115,
"eval_samples_per_second": 92.45,
"eval_steps_per_second": 5.783,
"step": 39000
},
{
"epoch": 11.378205128205128,
"grad_norm": 0.37591269612312317,
"learning_rate": 0.00046373527696793,
"loss": 3.3343,
"step": 39050
},
{
"epoch": 11.392773892773892,
"grad_norm": 0.3608076870441437,
"learning_rate": 0.00046356034985422736,
"loss": 3.3386,
"step": 39100
},
{
"epoch": 11.407342657342657,
"grad_norm": 0.3387145400047302,
"learning_rate": 0.0004633854227405248,
"loss": 3.3433,
"step": 39150
},
{
"epoch": 11.421911421911421,
"grad_norm": 0.338716596364975,
"learning_rate": 0.0004632104956268221,
"loss": 3.3425,
"step": 39200
},
{
"epoch": 11.436480186480187,
"grad_norm": 0.3434636890888214,
"learning_rate": 0.0004630355685131195,
"loss": 3.3399,
"step": 39250
},
{
"epoch": 11.451048951048952,
"grad_norm": 0.3447709083557129,
"learning_rate": 0.00046286064139941687,
"loss": 3.3545,
"step": 39300
},
{
"epoch": 11.465617715617716,
"grad_norm": 0.3637690842151642,
"learning_rate": 0.00046268571428571424,
"loss": 3.3408,
"step": 39350
},
{
"epoch": 11.48018648018648,
"grad_norm": 0.3768569529056549,
"learning_rate": 0.0004625107871720116,
"loss": 3.3461,
"step": 39400
},
{
"epoch": 11.494755244755245,
"grad_norm": 0.33349111676216125,
"learning_rate": 0.000462335860058309,
"loss": 3.3351,
"step": 39450
},
{
"epoch": 11.50932400932401,
"grad_norm": 0.3535425364971161,
"learning_rate": 0.00046216093294460637,
"loss": 3.3506,
"step": 39500
},
{
"epoch": 11.523892773892774,
"grad_norm": 0.35613155364990234,
"learning_rate": 0.0004619860058309038,
"loss": 3.3595,
"step": 39550
},
{
"epoch": 11.538461538461538,
"grad_norm": 0.3751663565635681,
"learning_rate": 0.0004618110787172012,
"loss": 3.3434,
"step": 39600
},
{
"epoch": 11.553030303030303,
"grad_norm": 0.38919779658317566,
"learning_rate": 0.0004616361516034985,
"loss": 3.3436,
"step": 39650
},
{
"epoch": 11.567599067599067,
"grad_norm": 0.3445196747779846,
"learning_rate": 0.00046146122448979587,
"loss": 3.3509,
"step": 39700
},
{
"epoch": 11.582167832167832,
"grad_norm": 0.3505731523036957,
"learning_rate": 0.00046128629737609325,
"loss": 3.3374,
"step": 39750
},
{
"epoch": 11.596736596736596,
"grad_norm": 0.35699549317359924,
"learning_rate": 0.0004611113702623906,
"loss": 3.3467,
"step": 39800
},
{
"epoch": 11.61130536130536,
"grad_norm": 0.33985093235969543,
"learning_rate": 0.000460936443148688,
"loss": 3.3476,
"step": 39850
},
{
"epoch": 11.625874125874127,
"grad_norm": 0.37056589126586914,
"learning_rate": 0.0004607615160349854,
"loss": 3.3499,
"step": 39900
},
{
"epoch": 11.640442890442891,
"grad_norm": 0.3587411046028137,
"learning_rate": 0.00046058658892128275,
"loss": 3.3645,
"step": 39950
},
{
"epoch": 11.655011655011656,
"grad_norm": 0.3274442255496979,
"learning_rate": 0.0004604116618075802,
"loss": 3.3591,
"step": 40000
},
{
"epoch": 11.655011655011656,
"eval_accuracy": 0.37000906155199714,
"eval_loss": 3.5557005405426025,
"eval_runtime": 179.9923,
"eval_samples_per_second": 92.46,
"eval_steps_per_second": 5.784,
"step": 40000
},
{
"epoch": 11.66958041958042,
"grad_norm": 0.33285725116729736,
"learning_rate": 0.00046023673469387756,
"loss": 3.3547,
"step": 40050
},
{
"epoch": 11.684149184149184,
"grad_norm": 0.35055187344551086,
"learning_rate": 0.0004600618075801749,
"loss": 3.3695,
"step": 40100
},
{
"epoch": 11.698717948717949,
"grad_norm": 0.33380094170570374,
"learning_rate": 0.00045988688046647225,
"loss": 3.3513,
"step": 40150
},
{
"epoch": 11.713286713286713,
"grad_norm": 0.36289268732070923,
"learning_rate": 0.00045971195335276963,
"loss": 3.3489,
"step": 40200
},
{
"epoch": 11.727855477855478,
"grad_norm": 0.33579185605049133,
"learning_rate": 0.000459537026239067,
"loss": 3.3538,
"step": 40250
},
{
"epoch": 11.742424242424242,
"grad_norm": 0.3782014846801758,
"learning_rate": 0.0004593620991253644,
"loss": 3.3616,
"step": 40300
},
{
"epoch": 11.756993006993007,
"grad_norm": 0.34368279576301575,
"learning_rate": 0.00045918717201166176,
"loss": 3.3674,
"step": 40350
},
{
"epoch": 11.771561771561771,
"grad_norm": 0.3444232940673828,
"learning_rate": 0.00045901224489795913,
"loss": 3.3633,
"step": 40400
},
{
"epoch": 11.786130536130536,
"grad_norm": 0.33604544401168823,
"learning_rate": 0.00045883731778425656,
"loss": 3.3594,
"step": 40450
},
{
"epoch": 11.8006993006993,
"grad_norm": 0.33371636271476746,
"learning_rate": 0.00045866239067055394,
"loss": 3.3687,
"step": 40500
},
{
"epoch": 11.815268065268064,
"grad_norm": 0.3588174879550934,
"learning_rate": 0.00045848746355685126,
"loss": 3.3681,
"step": 40550
},
{
"epoch": 11.82983682983683,
"grad_norm": 0.3513983190059662,
"learning_rate": 0.00045831253644314864,
"loss": 3.3679,
"step": 40600
},
{
"epoch": 11.844405594405595,
"grad_norm": 0.3378084897994995,
"learning_rate": 0.000458137609329446,
"loss": 3.3578,
"step": 40650
},
{
"epoch": 11.85897435897436,
"grad_norm": 0.3297298550605774,
"learning_rate": 0.0004579626822157434,
"loss": 3.3709,
"step": 40700
},
{
"epoch": 11.873543123543124,
"grad_norm": 0.360269695520401,
"learning_rate": 0.00045778775510204076,
"loss": 3.3568,
"step": 40750
},
{
"epoch": 11.888111888111888,
"grad_norm": 0.3312649130821228,
"learning_rate": 0.00045761282798833814,
"loss": 3.3656,
"step": 40800
},
{
"epoch": 11.902680652680653,
"grad_norm": 0.36778295040130615,
"learning_rate": 0.00045743790087463557,
"loss": 3.3674,
"step": 40850
},
{
"epoch": 11.917249417249417,
"grad_norm": 0.33830592036247253,
"learning_rate": 0.00045726297376093294,
"loss": 3.3699,
"step": 40900
},
{
"epoch": 11.931818181818182,
"grad_norm": 0.37597140669822693,
"learning_rate": 0.0004570880466472303,
"loss": 3.3764,
"step": 40950
},
{
"epoch": 11.946386946386946,
"grad_norm": 0.34776413440704346,
"learning_rate": 0.00045691311953352764,
"loss": 3.357,
"step": 41000
},
{
"epoch": 11.946386946386946,
"eval_accuracy": 0.3705675005662,
"eval_loss": 3.548083782196045,
"eval_runtime": 179.9881,
"eval_samples_per_second": 92.462,
"eval_steps_per_second": 5.784,
"step": 41000
},
{
"epoch": 11.96095571095571,
"grad_norm": 0.35101014375686646,
"learning_rate": 0.000456738192419825,
"loss": 3.3761,
"step": 41050
},
{
"epoch": 11.975524475524475,
"grad_norm": 0.3674442172050476,
"learning_rate": 0.0004565632653061224,
"loss": 3.3691,
"step": 41100
},
{
"epoch": 11.99009324009324,
"grad_norm": 0.3214508891105652,
"learning_rate": 0.00045638833819241977,
"loss": 3.3713,
"step": 41150
},
{
"epoch": 12.004662004662004,
"grad_norm": 0.3173612952232361,
"learning_rate": 0.00045621341107871715,
"loss": 3.3369,
"step": 41200
},
{
"epoch": 12.01923076923077,
"grad_norm": 0.3481275737285614,
"learning_rate": 0.0004560384839650145,
"loss": 3.242,
"step": 41250
},
{
"epoch": 12.033799533799534,
"grad_norm": 0.33980754017829895,
"learning_rate": 0.00045586355685131195,
"loss": 3.2611,
"step": 41300
},
{
"epoch": 12.048368298368299,
"grad_norm": 0.35299932956695557,
"learning_rate": 0.0004556886297376093,
"loss": 3.2661,
"step": 41350
},
{
"epoch": 12.062937062937063,
"grad_norm": 0.3485766649246216,
"learning_rate": 0.0004555137026239067,
"loss": 3.2656,
"step": 41400
},
{
"epoch": 12.077505827505828,
"grad_norm": 0.33770278096199036,
"learning_rate": 0.000455338775510204,
"loss": 3.2791,
"step": 41450
},
{
"epoch": 12.092074592074592,
"grad_norm": 0.34751835465431213,
"learning_rate": 0.0004551638483965014,
"loss": 3.2841,
"step": 41500
},
{
"epoch": 12.106643356643357,
"grad_norm": 0.37227487564086914,
"learning_rate": 0.0004549889212827988,
"loss": 3.2799,
"step": 41550
},
{
"epoch": 12.121212121212121,
"grad_norm": 0.34621065855026245,
"learning_rate": 0.00045481399416909615,
"loss": 3.282,
"step": 41600
},
{
"epoch": 12.135780885780886,
"grad_norm": 0.3268602788448334,
"learning_rate": 0.00045463906705539353,
"loss": 3.2881,
"step": 41650
},
{
"epoch": 12.15034965034965,
"grad_norm": 0.39036524295806885,
"learning_rate": 0.0004544641399416909,
"loss": 3.2841,
"step": 41700
},
{
"epoch": 12.164918414918414,
"grad_norm": 0.35717472434043884,
"learning_rate": 0.00045428921282798833,
"loss": 3.2958,
"step": 41750
},
{
"epoch": 12.179487179487179,
"grad_norm": 0.35328230261802673,
"learning_rate": 0.0004541142857142857,
"loss": 3.2796,
"step": 41800
},
{
"epoch": 12.194055944055943,
"grad_norm": 0.33877119421958923,
"learning_rate": 0.0004539393586005831,
"loss": 3.2983,
"step": 41850
},
{
"epoch": 12.20862470862471,
"grad_norm": 0.3424377143383026,
"learning_rate": 0.0004537644314868804,
"loss": 3.2956,
"step": 41900
},
{
"epoch": 12.223193473193474,
"grad_norm": 0.35039380192756653,
"learning_rate": 0.0004535895043731778,
"loss": 3.3151,
"step": 41950
},
{
"epoch": 12.237762237762238,
"grad_norm": 0.36922869086265564,
"learning_rate": 0.00045341457725947516,
"loss": 3.3056,
"step": 42000
},
{
"epoch": 12.237762237762238,
"eval_accuracy": 0.3698839448724621,
"eval_loss": 3.5648937225341797,
"eval_runtime": 180.0535,
"eval_samples_per_second": 92.428,
"eval_steps_per_second": 5.782,
"step": 42000
},
{
"epoch": 12.252331002331003,
"grad_norm": 0.33937689661979675,
"learning_rate": 0.00045323965014577253,
"loss": 3.3066,
"step": 42050
},
{
"epoch": 12.266899766899767,
"grad_norm": 0.36293527483940125,
"learning_rate": 0.0004530647230320699,
"loss": 3.3046,
"step": 42100
},
{
"epoch": 12.281468531468532,
"grad_norm": 0.36257535219192505,
"learning_rate": 0.00045288979591836734,
"loss": 3.3095,
"step": 42150
},
{
"epoch": 12.296037296037296,
"grad_norm": 0.38248223066329956,
"learning_rate": 0.0004527148688046647,
"loss": 3.3109,
"step": 42200
},
{
"epoch": 12.31060606060606,
"grad_norm": 0.35680779814720154,
"learning_rate": 0.0004525399416909621,
"loss": 3.3174,
"step": 42250
},
{
"epoch": 12.325174825174825,
"grad_norm": 0.3464656174182892,
"learning_rate": 0.00045236501457725947,
"loss": 3.3103,
"step": 42300
},
{
"epoch": 12.33974358974359,
"grad_norm": 0.36527353525161743,
"learning_rate": 0.0004521900874635568,
"loss": 3.3141,
"step": 42350
},
{
"epoch": 12.354312354312354,
"grad_norm": 0.34814345836639404,
"learning_rate": 0.00045201516034985416,
"loss": 3.3219,
"step": 42400
},
{
"epoch": 12.368881118881118,
"grad_norm": 0.3569382131099701,
"learning_rate": 0.00045184023323615154,
"loss": 3.318,
"step": 42450
},
{
"epoch": 12.383449883449883,
"grad_norm": 0.3441168963909149,
"learning_rate": 0.0004516653061224489,
"loss": 3.3209,
"step": 42500
},
{
"epoch": 12.398018648018647,
"grad_norm": 0.3545966148376465,
"learning_rate": 0.0004514903790087463,
"loss": 3.3156,
"step": 42550
},
{
"epoch": 12.412587412587413,
"grad_norm": 0.349949449300766,
"learning_rate": 0.0004513154518950437,
"loss": 3.3224,
"step": 42600
},
{
"epoch": 12.427156177156178,
"grad_norm": 0.375348836183548,
"learning_rate": 0.0004511405247813411,
"loss": 3.3445,
"step": 42650
},
{
"epoch": 12.441724941724942,
"grad_norm": 0.3431892395019531,
"learning_rate": 0.0004509655976676385,
"loss": 3.3242,
"step": 42700
},
{
"epoch": 12.456293706293707,
"grad_norm": 0.3887154161930084,
"learning_rate": 0.00045079067055393585,
"loss": 3.3199,
"step": 42750
},
{
"epoch": 12.470862470862471,
"grad_norm": 0.3668091297149658,
"learning_rate": 0.00045061574344023317,
"loss": 3.3249,
"step": 42800
},
{
"epoch": 12.485431235431236,
"grad_norm": 0.35282227396965027,
"learning_rate": 0.00045044081632653055,
"loss": 3.323,
"step": 42850
},
{
"epoch": 12.5,
"grad_norm": 0.34184911847114563,
"learning_rate": 0.0004502658892128279,
"loss": 3.3244,
"step": 42900
},
{
"epoch": 12.514568764568764,
"grad_norm": 0.3672352135181427,
"learning_rate": 0.0004500909620991253,
"loss": 3.3266,
"step": 42950
},
{
"epoch": 12.529137529137529,
"grad_norm": 0.3395889103412628,
"learning_rate": 0.00044991603498542273,
"loss": 3.3235,
"step": 43000
},
{
"epoch": 12.529137529137529,
"eval_accuracy": 0.37045567165432236,
"eval_loss": 3.553213119506836,
"eval_runtime": 180.0175,
"eval_samples_per_second": 92.447,
"eval_steps_per_second": 5.783,
"step": 43000
},
{
"epoch": 12.543706293706293,
"grad_norm": 0.3517553508281708,
"learning_rate": 0.0004497411078717201,
"loss": 3.3239,
"step": 43050
},
{
"epoch": 12.558275058275058,
"grad_norm": 0.3382112383842468,
"learning_rate": 0.0004495661807580175,
"loss": 3.3364,
"step": 43100
},
{
"epoch": 12.572843822843822,
"grad_norm": 0.3736812472343445,
"learning_rate": 0.00044939125364431486,
"loss": 3.3348,
"step": 43150
},
{
"epoch": 12.587412587412587,
"grad_norm": 0.36248350143432617,
"learning_rate": 0.00044921632653061223,
"loss": 3.3275,
"step": 43200
},
{
"epoch": 12.601981351981351,
"grad_norm": 0.3704957365989685,
"learning_rate": 0.00044904139941690955,
"loss": 3.3323,
"step": 43250
},
{
"epoch": 12.616550116550117,
"grad_norm": 0.36183616518974304,
"learning_rate": 0.00044886647230320693,
"loss": 3.329,
"step": 43300
},
{
"epoch": 12.631118881118882,
"grad_norm": 0.3404022455215454,
"learning_rate": 0.0004486915451895043,
"loss": 3.3414,
"step": 43350
},
{
"epoch": 12.645687645687646,
"grad_norm": 0.3847573399543762,
"learning_rate": 0.0004485166180758017,
"loss": 3.3339,
"step": 43400
},
{
"epoch": 12.66025641025641,
"grad_norm": 0.3649556636810303,
"learning_rate": 0.0004483416909620991,
"loss": 3.3361,
"step": 43450
},
{
"epoch": 12.674825174825175,
"grad_norm": 0.3577769696712494,
"learning_rate": 0.0004481667638483965,
"loss": 3.3328,
"step": 43500
},
{
"epoch": 12.68939393939394,
"grad_norm": 0.3543531000614166,
"learning_rate": 0.00044799183673469386,
"loss": 3.3446,
"step": 43550
},
{
"epoch": 12.703962703962704,
"grad_norm": 0.3769163191318512,
"learning_rate": 0.00044781690962099124,
"loss": 3.3362,
"step": 43600
},
{
"epoch": 12.718531468531468,
"grad_norm": 0.36154523491859436,
"learning_rate": 0.0004476419825072886,
"loss": 3.338,
"step": 43650
},
{
"epoch": 12.733100233100233,
"grad_norm": 0.3562755584716797,
"learning_rate": 0.00044746705539358593,
"loss": 3.3321,
"step": 43700
},
{
"epoch": 12.747668997668997,
"grad_norm": 0.34634169936180115,
"learning_rate": 0.0004472921282798833,
"loss": 3.3404,
"step": 43750
},
{
"epoch": 12.762237762237762,
"grad_norm": 0.35489341616630554,
"learning_rate": 0.0004471172011661807,
"loss": 3.3451,
"step": 43800
},
{
"epoch": 12.776806526806526,
"grad_norm": 0.349941223859787,
"learning_rate": 0.00044694227405247806,
"loss": 3.3459,
"step": 43850
},
{
"epoch": 12.791375291375292,
"grad_norm": 0.3710970878601074,
"learning_rate": 0.0004467673469387755,
"loss": 3.3494,
"step": 43900
},
{
"epoch": 12.805944055944057,
"grad_norm": 0.3861698806285858,
"learning_rate": 0.00044659241982507287,
"loss": 3.343,
"step": 43950
},
{
"epoch": 12.820512820512821,
"grad_norm": 0.3910358250141144,
"learning_rate": 0.00044641749271137024,
"loss": 3.3355,
"step": 44000
},
{
"epoch": 12.820512820512821,
"eval_accuracy": 0.3705464518015038,
"eval_loss": 3.54892635345459,
"eval_runtime": 179.9708,
"eval_samples_per_second": 92.471,
"eval_steps_per_second": 5.784,
"step": 44000
},
{
"epoch": 12.835081585081586,
"grad_norm": 0.3828602135181427,
"learning_rate": 0.0004462425655976676,
"loss": 3.3659,
"step": 44050
},
{
"epoch": 12.84965034965035,
"grad_norm": 0.3822254538536072,
"learning_rate": 0.000446067638483965,
"loss": 3.3387,
"step": 44100
},
{
"epoch": 12.864219114219114,
"grad_norm": 0.3691919445991516,
"learning_rate": 0.0004458927113702623,
"loss": 3.3439,
"step": 44150
},
{
"epoch": 12.878787878787879,
"grad_norm": 0.36081933975219727,
"learning_rate": 0.0004457177842565597,
"loss": 3.3431,
"step": 44200
},
{
"epoch": 12.893356643356643,
"grad_norm": 0.36241626739501953,
"learning_rate": 0.00044554285714285707,
"loss": 3.3497,
"step": 44250
},
{
"epoch": 12.907925407925408,
"grad_norm": 0.3698984980583191,
"learning_rate": 0.0004453679300291545,
"loss": 3.3471,
"step": 44300
},
{
"epoch": 12.922494172494172,
"grad_norm": 0.3605395555496216,
"learning_rate": 0.0004451930029154519,
"loss": 3.3469,
"step": 44350
},
{
"epoch": 12.937062937062937,
"grad_norm": 0.3236188590526581,
"learning_rate": 0.00044501807580174925,
"loss": 3.3546,
"step": 44400
},
{
"epoch": 12.951631701631701,
"grad_norm": 0.3255074620246887,
"learning_rate": 0.0004448431486880466,
"loss": 3.3495,
"step": 44450
},
{
"epoch": 12.966200466200466,
"grad_norm": 0.34724318981170654,
"learning_rate": 0.000444668221574344,
"loss": 3.3554,
"step": 44500
},
{
"epoch": 12.98076923076923,
"grad_norm": 0.3772111237049103,
"learning_rate": 0.0004444932944606414,
"loss": 3.3509,
"step": 44550
},
{
"epoch": 12.995337995337996,
"grad_norm": 0.3767688572406769,
"learning_rate": 0.0004443183673469387,
"loss": 3.3577,
"step": 44600
},
{
"epoch": 13.00990675990676,
"grad_norm": 0.35660919547080994,
"learning_rate": 0.0004441434402332361,
"loss": 3.27,
"step": 44650
},
{
"epoch": 13.024475524475525,
"grad_norm": 0.3383883833885193,
"learning_rate": 0.00044396851311953345,
"loss": 3.2467,
"step": 44700
},
{
"epoch": 13.03904428904429,
"grad_norm": 0.3448445796966553,
"learning_rate": 0.0004437935860058309,
"loss": 3.24,
"step": 44750
},
{
"epoch": 13.053613053613054,
"grad_norm": 0.387921541929245,
"learning_rate": 0.00044361865889212826,
"loss": 3.254,
"step": 44800
},
{
"epoch": 13.068181818181818,
"grad_norm": 0.3401538133621216,
"learning_rate": 0.00044344373177842563,
"loss": 3.2603,
"step": 44850
},
{
"epoch": 13.082750582750583,
"grad_norm": 0.3475644886493683,
"learning_rate": 0.000443268804664723,
"loss": 3.2626,
"step": 44900
},
{
"epoch": 13.097319347319347,
"grad_norm": 0.4108971357345581,
"learning_rate": 0.0004430938775510204,
"loss": 3.2448,
"step": 44950
},
{
"epoch": 13.111888111888112,
"grad_norm": 0.3426775336265564,
"learning_rate": 0.00044291895043731776,
"loss": 3.2631,
"step": 45000
},
{
"epoch": 13.111888111888112,
"eval_accuracy": 0.37012735796140717,
"eval_loss": 3.561819553375244,
"eval_runtime": 179.9972,
"eval_samples_per_second": 92.457,
"eval_steps_per_second": 5.783,
"step": 45000
},
{
"epoch": 13.126456876456876,
"grad_norm": 0.35371506214141846,
"learning_rate": 0.0004427440233236151,
"loss": 3.2639,
"step": 45050
},
{
"epoch": 13.14102564102564,
"grad_norm": 0.3580614924430847,
"learning_rate": 0.00044256909620991246,
"loss": 3.2704,
"step": 45100
},
{
"epoch": 13.155594405594405,
"grad_norm": 0.34704920649528503,
"learning_rate": 0.0004423941690962099,
"loss": 3.2731,
"step": 45150
},
{
"epoch": 13.17016317016317,
"grad_norm": 0.3504481613636017,
"learning_rate": 0.00044221924198250726,
"loss": 3.2719,
"step": 45200
},
{
"epoch": 13.184731934731936,
"grad_norm": 0.33558207750320435,
"learning_rate": 0.00044204431486880464,
"loss": 3.2841,
"step": 45250
},
{
"epoch": 13.1993006993007,
"grad_norm": 0.3562605679035187,
"learning_rate": 0.000441869387755102,
"loss": 3.2794,
"step": 45300
},
{
"epoch": 13.213869463869464,
"grad_norm": 0.34473833441734314,
"learning_rate": 0.0004416944606413994,
"loss": 3.2687,
"step": 45350
},
{
"epoch": 13.228438228438229,
"grad_norm": 0.35833731293678284,
"learning_rate": 0.00044151953352769677,
"loss": 3.2814,
"step": 45400
},
{
"epoch": 13.243006993006993,
"grad_norm": 0.37955713272094727,
"learning_rate": 0.00044134460641399414,
"loss": 3.286,
"step": 45450
},
{
"epoch": 13.257575757575758,
"grad_norm": 0.3702819049358368,
"learning_rate": 0.00044116967930029146,
"loss": 3.2925,
"step": 45500
},
{
"epoch": 13.272144522144522,
"grad_norm": 0.36276885867118835,
"learning_rate": 0.00044099475218658884,
"loss": 3.2913,
"step": 45550
},
{
"epoch": 13.286713286713287,
"grad_norm": 0.36508500576019287,
"learning_rate": 0.00044081982507288627,
"loss": 3.2984,
"step": 45600
},
{
"epoch": 13.301282051282051,
"grad_norm": 0.3717924952507019,
"learning_rate": 0.00044064489795918365,
"loss": 3.2987,
"step": 45650
},
{
"epoch": 13.315850815850816,
"grad_norm": 0.3577413856983185,
"learning_rate": 0.000440469970845481,
"loss": 3.295,
"step": 45700
},
{
"epoch": 13.33041958041958,
"grad_norm": 0.38441523909568787,
"learning_rate": 0.0004402950437317784,
"loss": 3.3003,
"step": 45750
},
{
"epoch": 13.344988344988344,
"grad_norm": 0.33631208539009094,
"learning_rate": 0.00044012011661807577,
"loss": 3.2865,
"step": 45800
},
{
"epoch": 13.359557109557109,
"grad_norm": 0.35587334632873535,
"learning_rate": 0.00043994518950437315,
"loss": 3.3047,
"step": 45850
},
{
"epoch": 13.374125874125873,
"grad_norm": 0.37436777353286743,
"learning_rate": 0.0004397702623906705,
"loss": 3.3059,
"step": 45900
},
{
"epoch": 13.38869463869464,
"grad_norm": 0.399141401052475,
"learning_rate": 0.00043959533527696785,
"loss": 3.296,
"step": 45950
},
{
"epoch": 13.403263403263404,
"grad_norm": 0.385282427072525,
"learning_rate": 0.0004394204081632652,
"loss": 3.3058,
"step": 46000
},
{
"epoch": 13.403263403263404,
"eval_accuracy": 0.37068544420301736,
"eval_loss": 3.5576846599578857,
"eval_runtime": 179.9529,
"eval_samples_per_second": 92.48,
"eval_steps_per_second": 5.785,
"step": 46000
},
{
"epoch": 13.417832167832168,
"grad_norm": 0.3681379556655884,
"learning_rate": 0.00043924548104956265,
"loss": 3.3096,
"step": 46050
},
{
"epoch": 13.432400932400933,
"grad_norm": 0.38421115279197693,
"learning_rate": 0.00043907055393586003,
"loss": 3.3126,
"step": 46100
},
{
"epoch": 13.446969696969697,
"grad_norm": 0.3728811740875244,
"learning_rate": 0.0004388956268221574,
"loss": 3.3038,
"step": 46150
},
{
"epoch": 13.461538461538462,
"grad_norm": 0.3609063923358917,
"learning_rate": 0.0004387206997084548,
"loss": 3.3158,
"step": 46200
},
{
"epoch": 13.476107226107226,
"grad_norm": 0.3440368175506592,
"learning_rate": 0.00043854577259475215,
"loss": 3.3067,
"step": 46250
},
{
"epoch": 13.49067599067599,
"grad_norm": 0.33095043897628784,
"learning_rate": 0.00043837084548104953,
"loss": 3.324,
"step": 46300
},
{
"epoch": 13.505244755244755,
"grad_norm": 0.32994940876960754,
"learning_rate": 0.0004381959183673469,
"loss": 3.3235,
"step": 46350
},
{
"epoch": 13.51981351981352,
"grad_norm": 0.34858837723731995,
"learning_rate": 0.00043802099125364423,
"loss": 3.3207,
"step": 46400
},
{
"epoch": 13.534382284382284,
"grad_norm": 0.3519839942455292,
"learning_rate": 0.00043784606413994166,
"loss": 3.3126,
"step": 46450
},
{
"epoch": 13.548951048951048,
"grad_norm": 0.3784167170524597,
"learning_rate": 0.00043767113702623903,
"loss": 3.3128,
"step": 46500
},
{
"epoch": 13.563519813519813,
"grad_norm": 0.35100221633911133,
"learning_rate": 0.0004374962099125364,
"loss": 3.3071,
"step": 46550
},
{
"epoch": 13.578088578088579,
"grad_norm": 0.37419000267982483,
"learning_rate": 0.0004373212827988338,
"loss": 3.3138,
"step": 46600
},
{
"epoch": 13.592657342657343,
"grad_norm": 0.34843066334724426,
"learning_rate": 0.00043714635568513116,
"loss": 3.3234,
"step": 46650
},
{
"epoch": 13.607226107226108,
"grad_norm": 0.3874765634536743,
"learning_rate": 0.00043697142857142854,
"loss": 3.3175,
"step": 46700
},
{
"epoch": 13.621794871794872,
"grad_norm": 0.3625902235507965,
"learning_rate": 0.0004367965014577259,
"loss": 3.3113,
"step": 46750
},
{
"epoch": 13.636363636363637,
"grad_norm": 0.362448513507843,
"learning_rate": 0.00043662157434402334,
"loss": 3.3231,
"step": 46800
},
{
"epoch": 13.650932400932401,
"grad_norm": 0.3466891348361969,
"learning_rate": 0.0004364466472303206,
"loss": 3.3209,
"step": 46850
},
{
"epoch": 13.665501165501166,
"grad_norm": 0.3769454061985016,
"learning_rate": 0.00043627172011661804,
"loss": 3.3104,
"step": 46900
},
{
"epoch": 13.68006993006993,
"grad_norm": 0.42102763056755066,
"learning_rate": 0.0004360967930029154,
"loss": 3.3251,
"step": 46950
},
{
"epoch": 13.694638694638694,
"grad_norm": 0.35508814454078674,
"learning_rate": 0.0004359218658892128,
"loss": 3.3095,
"step": 47000
},
{
"epoch": 13.694638694638694,
"eval_accuracy": 0.3713083230108153,
"eval_loss": 3.5499608516693115,
"eval_runtime": 180.0856,
"eval_samples_per_second": 92.412,
"eval_steps_per_second": 5.781,
"step": 47000
},
{
"epoch": 13.709207459207459,
"grad_norm": 0.3345799446105957,
"learning_rate": 0.00043574693877551017,
"loss": 3.3378,
"step": 47050
},
{
"epoch": 13.723776223776223,
"grad_norm": 0.33664801716804504,
"learning_rate": 0.00043557201166180754,
"loss": 3.3381,
"step": 47100
},
{
"epoch": 13.738344988344988,
"grad_norm": 0.35207465291023254,
"learning_rate": 0.0004353970845481049,
"loss": 3.3346,
"step": 47150
},
{
"epoch": 13.752913752913752,
"grad_norm": 0.34713587164878845,
"learning_rate": 0.0004352221574344023,
"loss": 3.3475,
"step": 47200
},
{
"epoch": 13.767482517482517,
"grad_norm": 0.37446174025535583,
"learning_rate": 0.0004350472303206997,
"loss": 3.3309,
"step": 47250
},
{
"epoch": 13.782051282051283,
"grad_norm": 0.33329087495803833,
"learning_rate": 0.00043487230320699705,
"loss": 3.3289,
"step": 47300
},
{
"epoch": 13.796620046620047,
"grad_norm": 0.38883328437805176,
"learning_rate": 0.0004346973760932944,
"loss": 3.3214,
"step": 47350
},
{
"epoch": 13.811188811188812,
"grad_norm": 0.388072669506073,
"learning_rate": 0.0004345224489795918,
"loss": 3.3401,
"step": 47400
},
{
"epoch": 13.825757575757576,
"grad_norm": 0.3695523142814636,
"learning_rate": 0.0004343475218658892,
"loss": 3.3128,
"step": 47450
},
{
"epoch": 13.84032634032634,
"grad_norm": 0.37850746512413025,
"learning_rate": 0.00043417259475218655,
"loss": 3.3294,
"step": 47500
},
{
"epoch": 13.854895104895105,
"grad_norm": 0.3716648519039154,
"learning_rate": 0.0004339976676384839,
"loss": 3.3223,
"step": 47550
},
{
"epoch": 13.86946386946387,
"grad_norm": 0.38063937425613403,
"learning_rate": 0.0004338227405247813,
"loss": 3.3279,
"step": 47600
},
{
"epoch": 13.884032634032634,
"grad_norm": 0.39428770542144775,
"learning_rate": 0.00043364781341107873,
"loss": 3.3375,
"step": 47650
},
{
"epoch": 13.898601398601398,
"grad_norm": 0.34782904386520386,
"learning_rate": 0.0004334728862973761,
"loss": 3.3189,
"step": 47700
},
{
"epoch": 13.913170163170163,
"grad_norm": 0.37662190198898315,
"learning_rate": 0.00043329795918367343,
"loss": 3.3221,
"step": 47750
},
{
"epoch": 13.927738927738927,
"grad_norm": 0.36181744933128357,
"learning_rate": 0.0004331230320699708,
"loss": 3.3231,
"step": 47800
},
{
"epoch": 13.942307692307692,
"grad_norm": 0.34516653418540955,
"learning_rate": 0.0004329481049562682,
"loss": 3.3375,
"step": 47850
},
{
"epoch": 13.956876456876456,
"grad_norm": 0.38795268535614014,
"learning_rate": 0.00043277317784256556,
"loss": 3.343,
"step": 47900
},
{
"epoch": 13.971445221445222,
"grad_norm": 0.3683021664619446,
"learning_rate": 0.00043259825072886293,
"loss": 3.3304,
"step": 47950
},
{
"epoch": 13.986013986013987,
"grad_norm": 0.3830035328865051,
"learning_rate": 0.0004324233236151603,
"loss": 3.3335,
"step": 48000
},
{
"epoch": 13.986013986013987,
"eval_accuracy": 0.37180502682130023,
"eval_loss": 3.538908004760742,
"eval_runtime": 180.2136,
"eval_samples_per_second": 92.346,
"eval_steps_per_second": 5.776,
"step": 48000
},
{
"epoch": 14.000582750582751,
"grad_norm": 0.3480665981769562,
"learning_rate": 0.0004322483965014577,
"loss": 3.3392,
"step": 48050
},
{
"epoch": 14.015151515151516,
"grad_norm": 0.37040701508522034,
"learning_rate": 0.0004320734693877551,
"loss": 3.2289,
"step": 48100
},
{
"epoch": 14.02972027972028,
"grad_norm": 0.3557523190975189,
"learning_rate": 0.0004318985422740525,
"loss": 3.2138,
"step": 48150
},
{
"epoch": 14.044289044289044,
"grad_norm": 0.35745498538017273,
"learning_rate": 0.0004317236151603498,
"loss": 3.2378,
"step": 48200
},
{
"epoch": 14.058857808857809,
"grad_norm": 0.35629206895828247,
"learning_rate": 0.0004315486880466472,
"loss": 3.2303,
"step": 48250
},
{
"epoch": 14.073426573426573,
"grad_norm": 0.386960506439209,
"learning_rate": 0.00043137376093294456,
"loss": 3.2403,
"step": 48300
},
{
"epoch": 14.087995337995338,
"grad_norm": 0.3604874312877655,
"learning_rate": 0.00043119883381924194,
"loss": 3.2408,
"step": 48350
},
{
"epoch": 14.102564102564102,
"grad_norm": 0.3432071805000305,
"learning_rate": 0.0004310239067055393,
"loss": 3.2521,
"step": 48400
},
{
"epoch": 14.117132867132867,
"grad_norm": 0.367196261882782,
"learning_rate": 0.0004308489795918367,
"loss": 3.2488,
"step": 48450
},
{
"epoch": 14.131701631701631,
"grad_norm": 0.39279282093048096,
"learning_rate": 0.00043067405247813407,
"loss": 3.2613,
"step": 48500
},
{
"epoch": 14.146270396270396,
"grad_norm": 0.38095659017562866,
"learning_rate": 0.0004304991253644315,
"loss": 3.2409,
"step": 48550
},
{
"epoch": 14.16083916083916,
"grad_norm": 0.3556550145149231,
"learning_rate": 0.00043032419825072887,
"loss": 3.2639,
"step": 48600
},
{
"epoch": 14.175407925407926,
"grad_norm": 0.36897987127304077,
"learning_rate": 0.0004301492711370262,
"loss": 3.2696,
"step": 48650
},
{
"epoch": 14.18997668997669,
"grad_norm": 0.4175944924354553,
"learning_rate": 0.00042997434402332357,
"loss": 3.2581,
"step": 48700
},
{
"epoch": 14.204545454545455,
"grad_norm": 0.3549362123012543,
"learning_rate": 0.00042979941690962094,
"loss": 3.2691,
"step": 48750
},
{
"epoch": 14.21911421911422,
"grad_norm": 0.36222416162490845,
"learning_rate": 0.0004296244897959183,
"loss": 3.2571,
"step": 48800
},
{
"epoch": 14.233682983682984,
"grad_norm": 0.372361958026886,
"learning_rate": 0.0004294495626822157,
"loss": 3.2698,
"step": 48850
},
{
"epoch": 14.248251748251748,
"grad_norm": 0.3558199405670166,
"learning_rate": 0.00042927463556851307,
"loss": 3.276,
"step": 48900
},
{
"epoch": 14.262820512820513,
"grad_norm": 0.35009899735450745,
"learning_rate": 0.0004290997084548105,
"loss": 3.2863,
"step": 48950
},
{
"epoch": 14.277389277389277,
"grad_norm": 0.37568387389183044,
"learning_rate": 0.0004289247813411079,
"loss": 3.2777,
"step": 49000
},
{
"epoch": 14.277389277389277,
"eval_accuracy": 0.3711598057492996,
"eval_loss": 3.5574018955230713,
"eval_runtime": 180.0118,
"eval_samples_per_second": 92.45,
"eval_steps_per_second": 5.783,
"step": 49000
},
{
"epoch": 14.291958041958042,
"grad_norm": 0.36017411947250366,
"learning_rate": 0.00042874985422740525,
"loss": 3.2748,
"step": 49050
},
{
"epoch": 14.306526806526806,
"grad_norm": 0.34823670983314514,
"learning_rate": 0.0004285749271137026,
"loss": 3.2609,
"step": 49100
},
{
"epoch": 14.32109557109557,
"grad_norm": 0.39780181646347046,
"learning_rate": 0.00042839999999999995,
"loss": 3.2832,
"step": 49150
},
{
"epoch": 14.335664335664335,
"grad_norm": 0.3572824001312256,
"learning_rate": 0.0004282250728862973,
"loss": 3.2827,
"step": 49200
},
{
"epoch": 14.3502331002331,
"grad_norm": 0.37133660912513733,
"learning_rate": 0.0004280501457725947,
"loss": 3.2703,
"step": 49250
},
{
"epoch": 14.364801864801866,
"grad_norm": 0.38326296210289,
"learning_rate": 0.0004278752186588921,
"loss": 3.2889,
"step": 49300
},
{
"epoch": 14.37937062937063,
"grad_norm": 0.339304655790329,
"learning_rate": 0.00042770029154518945,
"loss": 3.2866,
"step": 49350
},
{
"epoch": 14.393939393939394,
"grad_norm": 0.37460795044898987,
"learning_rate": 0.0004275253644314869,
"loss": 3.2895,
"step": 49400
},
{
"epoch": 14.408508158508159,
"grad_norm": 0.357552170753479,
"learning_rate": 0.00042735043731778426,
"loss": 3.2825,
"step": 49450
},
{
"epoch": 14.423076923076923,
"grad_norm": 0.3532392978668213,
"learning_rate": 0.00042717551020408164,
"loss": 3.2863,
"step": 49500
},
{
"epoch": 14.437645687645688,
"grad_norm": 0.3605949878692627,
"learning_rate": 0.00042700058309037896,
"loss": 3.2907,
"step": 49550
},
{
"epoch": 14.452214452214452,
"grad_norm": 0.3634662628173828,
"learning_rate": 0.00042682565597667633,
"loss": 3.3028,
"step": 49600
},
{
"epoch": 14.466783216783217,
"grad_norm": 0.3738766312599182,
"learning_rate": 0.0004266507288629737,
"loss": 3.2952,
"step": 49650
},
{
"epoch": 14.481351981351981,
"grad_norm": 0.3741609752178192,
"learning_rate": 0.0004264758017492711,
"loss": 3.279,
"step": 49700
},
{
"epoch": 14.495920745920746,
"grad_norm": 0.33341366052627563,
"learning_rate": 0.00042630087463556846,
"loss": 3.3027,
"step": 49750
},
{
"epoch": 14.51048951048951,
"grad_norm": 0.35941943526268005,
"learning_rate": 0.00042612594752186584,
"loss": 3.2978,
"step": 49800
},
{
"epoch": 14.525058275058274,
"grad_norm": 0.3725701868534088,
"learning_rate": 0.00042595102040816327,
"loss": 3.3026,
"step": 49850
},
{
"epoch": 14.539627039627039,
"grad_norm": 0.36116182804107666,
"learning_rate": 0.00042577609329446064,
"loss": 3.2948,
"step": 49900
},
{
"epoch": 14.554195804195803,
"grad_norm": 0.3436543345451355,
"learning_rate": 0.000425601166180758,
"loss": 3.3013,
"step": 49950
},
{
"epoch": 14.56876456876457,
"grad_norm": 0.3576006293296814,
"learning_rate": 0.00042542623906705534,
"loss": 3.313,
"step": 50000
},
{
"epoch": 14.56876456876457,
"eval_accuracy": 0.3715792523619889,
"eval_loss": 3.5501418113708496,
"eval_runtime": 180.1333,
"eval_samples_per_second": 92.387,
"eval_steps_per_second": 5.779,
"step": 50000
},
{
"epoch": 14.583333333333334,
"grad_norm": 0.36011427640914917,
"learning_rate": 0.0004252513119533527,
"loss": 3.308,
"step": 50050
},
{
"epoch": 14.597902097902098,
"grad_norm": 0.3479318916797638,
"learning_rate": 0.0004250763848396501,
"loss": 3.3094,
"step": 50100
},
{
"epoch": 14.612470862470863,
"grad_norm": 0.3761695623397827,
"learning_rate": 0.00042490145772594747,
"loss": 3.3024,
"step": 50150
},
{
"epoch": 14.627039627039627,
"grad_norm": 0.3592838644981384,
"learning_rate": 0.00042472653061224484,
"loss": 3.3189,
"step": 50200
},
{
"epoch": 14.641608391608392,
"grad_norm": 0.38095730543136597,
"learning_rate": 0.00042455160349854227,
"loss": 3.2947,
"step": 50250
},
{
"epoch": 14.656177156177156,
"grad_norm": 0.3472435772418976,
"learning_rate": 0.00042437667638483965,
"loss": 3.3113,
"step": 50300
},
{
"epoch": 14.67074592074592,
"grad_norm": 0.3782998323440552,
"learning_rate": 0.000424201749271137,
"loss": 3.3189,
"step": 50350
},
{
"epoch": 14.685314685314685,
"grad_norm": 0.35338160395622253,
"learning_rate": 0.0004240268221574344,
"loss": 3.309,
"step": 50400
},
{
"epoch": 14.69988344988345,
"grad_norm": 0.35487911105155945,
"learning_rate": 0.0004238518950437317,
"loss": 3.2962,
"step": 50450
},
{
"epoch": 14.714452214452214,
"grad_norm": 0.38159093260765076,
"learning_rate": 0.0004236769679300291,
"loss": 3.3062,
"step": 50500
},
{
"epoch": 14.729020979020978,
"grad_norm": 0.3594988286495209,
"learning_rate": 0.00042350204081632647,
"loss": 3.3045,
"step": 50550
},
{
"epoch": 14.743589743589745,
"grad_norm": 0.3737533390522003,
"learning_rate": 0.00042332711370262385,
"loss": 3.317,
"step": 50600
},
{
"epoch": 14.758158508158509,
"grad_norm": 0.3727569282054901,
"learning_rate": 0.0004231521865889212,
"loss": 3.3045,
"step": 50650
},
{
"epoch": 14.772727272727273,
"grad_norm": 0.3523760437965393,
"learning_rate": 0.00042297725947521865,
"loss": 3.3158,
"step": 50700
},
{
"epoch": 14.787296037296038,
"grad_norm": 0.3428592383861542,
"learning_rate": 0.00042280233236151603,
"loss": 3.2976,
"step": 50750
},
{
"epoch": 14.801864801864802,
"grad_norm": 0.3490734100341797,
"learning_rate": 0.0004226274052478134,
"loss": 3.3028,
"step": 50800
},
{
"epoch": 14.816433566433567,
"grad_norm": 0.3931259512901306,
"learning_rate": 0.0004224524781341108,
"loss": 3.3063,
"step": 50850
},
{
"epoch": 14.831002331002331,
"grad_norm": 0.35205090045928955,
"learning_rate": 0.0004222775510204081,
"loss": 3.3259,
"step": 50900
},
{
"epoch": 14.845571095571096,
"grad_norm": 0.3528735637664795,
"learning_rate": 0.0004221026239067055,
"loss": 3.3185,
"step": 50950
},
{
"epoch": 14.86013986013986,
"grad_norm": 0.38072121143341064,
"learning_rate": 0.00042192769679300285,
"loss": 3.3186,
"step": 51000
},
{
"epoch": 14.86013986013986,
"eval_accuracy": 0.37217014645471774,
"eval_loss": 3.5429024696350098,
"eval_runtime": 180.2191,
"eval_samples_per_second": 92.343,
"eval_steps_per_second": 5.776,
"step": 51000
},
{
"epoch": 14.874708624708624,
"grad_norm": 0.3460468649864197,
"learning_rate": 0.00042175276967930023,
"loss": 3.3226,
"step": 51050
},
{
"epoch": 14.889277389277389,
"grad_norm": 0.3763303756713867,
"learning_rate": 0.00042157784256559766,
"loss": 3.3287,
"step": 51100
},
{
"epoch": 14.903846153846153,
"grad_norm": 0.36650410294532776,
"learning_rate": 0.00042140291545189504,
"loss": 3.3108,
"step": 51150
},
{
"epoch": 14.918414918414918,
"grad_norm": 0.34479567408561707,
"learning_rate": 0.0004212279883381924,
"loss": 3.3206,
"step": 51200
},
{
"epoch": 14.932983682983682,
"grad_norm": 0.36906757950782776,
"learning_rate": 0.0004210530612244898,
"loss": 3.3259,
"step": 51250
},
{
"epoch": 14.947552447552448,
"grad_norm": 0.35507652163505554,
"learning_rate": 0.0004208781341107871,
"loss": 3.3299,
"step": 51300
},
{
"epoch": 14.962121212121213,
"grad_norm": 0.3915591239929199,
"learning_rate": 0.0004207032069970845,
"loss": 3.3257,
"step": 51350
},
{
"epoch": 14.976689976689977,
"grad_norm": 0.37498611211776733,
"learning_rate": 0.00042052827988338186,
"loss": 3.327,
"step": 51400
},
{
"epoch": 14.991258741258742,
"grad_norm": 0.3631887435913086,
"learning_rate": 0.00042035335276967924,
"loss": 3.3221,
"step": 51450
},
{
"epoch": 15.005827505827506,
"grad_norm": 0.3742659091949463,
"learning_rate": 0.0004201784256559766,
"loss": 3.2863,
"step": 51500
},
{
"epoch": 15.02039627039627,
"grad_norm": 0.36948665976524353,
"learning_rate": 0.00042000349854227404,
"loss": 3.2117,
"step": 51550
},
{
"epoch": 15.034965034965035,
"grad_norm": 0.35238027572631836,
"learning_rate": 0.0004198285714285714,
"loss": 3.2095,
"step": 51600
},
{
"epoch": 15.0495337995338,
"grad_norm": 0.36405453085899353,
"learning_rate": 0.0004196536443148688,
"loss": 3.2157,
"step": 51650
},
{
"epoch": 15.064102564102564,
"grad_norm": 0.3587688207626343,
"learning_rate": 0.00041947871720116617,
"loss": 3.2258,
"step": 51700
},
{
"epoch": 15.078671328671328,
"grad_norm": 0.38245049118995667,
"learning_rate": 0.0004193037900874635,
"loss": 3.2297,
"step": 51750
},
{
"epoch": 15.093240093240093,
"grad_norm": 0.3895741105079651,
"learning_rate": 0.00041912886297376087,
"loss": 3.2331,
"step": 51800
},
{
"epoch": 15.107808857808857,
"grad_norm": 0.36344262957572937,
"learning_rate": 0.00041895393586005824,
"loss": 3.246,
"step": 51850
},
{
"epoch": 15.122377622377622,
"grad_norm": 0.3528017997741699,
"learning_rate": 0.0004187790087463556,
"loss": 3.2407,
"step": 51900
},
{
"epoch": 15.136946386946388,
"grad_norm": 0.3691173791885376,
"learning_rate": 0.000418604081632653,
"loss": 3.2275,
"step": 51950
},
{
"epoch": 15.151515151515152,
"grad_norm": 0.3804149031639099,
"learning_rate": 0.0004184291545189504,
"loss": 3.2468,
"step": 52000
},
{
"epoch": 15.151515151515152,
"eval_accuracy": 0.37138205248268413,
"eval_loss": 3.5521976947784424,
"eval_runtime": 180.2046,
"eval_samples_per_second": 92.351,
"eval_steps_per_second": 5.777,
"step": 52000
},
{
"epoch": 15.166083916083917,
"grad_norm": 0.36506277322769165,
"learning_rate": 0.0004182542274052478,
"loss": 3.2333,
"step": 52050
},
{
"epoch": 15.180652680652681,
"grad_norm": 0.36347970366477966,
"learning_rate": 0.0004180793002915452,
"loss": 3.2548,
"step": 52100
},
{
"epoch": 15.195221445221446,
"grad_norm": 0.3790263831615448,
"learning_rate": 0.00041790437317784255,
"loss": 3.2381,
"step": 52150
},
{
"epoch": 15.20979020979021,
"grad_norm": 0.35307011008262634,
"learning_rate": 0.0004177294460641399,
"loss": 3.2399,
"step": 52200
},
{
"epoch": 15.224358974358974,
"grad_norm": 0.38718825578689575,
"learning_rate": 0.00041755451895043725,
"loss": 3.2514,
"step": 52250
},
{
"epoch": 15.238927738927739,
"grad_norm": 0.3450746238231659,
"learning_rate": 0.0004173795918367346,
"loss": 3.2618,
"step": 52300
},
{
"epoch": 15.253496503496503,
"grad_norm": 0.3694665729999542,
"learning_rate": 0.000417204664723032,
"loss": 3.2559,
"step": 52350
},
{
"epoch": 15.268065268065268,
"grad_norm": 0.3642129600048065,
"learning_rate": 0.00041702973760932943,
"loss": 3.2472,
"step": 52400
},
{
"epoch": 15.282634032634032,
"grad_norm": 0.36244091391563416,
"learning_rate": 0.0004168548104956268,
"loss": 3.2634,
"step": 52450
},
{
"epoch": 15.297202797202797,
"grad_norm": 0.3727215826511383,
"learning_rate": 0.0004166798833819242,
"loss": 3.2626,
"step": 52500
},
{
"epoch": 15.311771561771561,
"grad_norm": 0.3629833459854126,
"learning_rate": 0.00041650495626822156,
"loss": 3.2536,
"step": 52550
},
{
"epoch": 15.326340326340326,
"grad_norm": 0.39329707622528076,
"learning_rate": 0.00041633002915451893,
"loss": 3.2716,
"step": 52600
},
{
"epoch": 15.340909090909092,
"grad_norm": 0.38280272483825684,
"learning_rate": 0.00041615510204081626,
"loss": 3.2711,
"step": 52650
},
{
"epoch": 15.355477855477856,
"grad_norm": 0.38633573055267334,
"learning_rate": 0.00041598017492711363,
"loss": 3.2793,
"step": 52700
},
{
"epoch": 15.37004662004662,
"grad_norm": 0.3785382807254791,
"learning_rate": 0.000415805247813411,
"loss": 3.2758,
"step": 52750
},
{
"epoch": 15.384615384615385,
"grad_norm": 0.3668574094772339,
"learning_rate": 0.0004156303206997084,
"loss": 3.2892,
"step": 52800
},
{
"epoch": 15.39918414918415,
"grad_norm": 0.37236839532852173,
"learning_rate": 0.0004154553935860058,
"loss": 3.2606,
"step": 52850
},
{
"epoch": 15.413752913752914,
"grad_norm": 0.35925427079200745,
"learning_rate": 0.0004152804664723032,
"loss": 3.2751,
"step": 52900
},
{
"epoch": 15.428321678321678,
"grad_norm": 0.3505415618419647,
"learning_rate": 0.00041510553935860056,
"loss": 3.2763,
"step": 52950
},
{
"epoch": 15.442890442890443,
"grad_norm": 0.3869665563106537,
"learning_rate": 0.00041493061224489794,
"loss": 3.2767,
"step": 53000
},
{
"epoch": 15.442890442890443,
"eval_accuracy": 0.3715333919249413,
"eval_loss": 3.549450635910034,
"eval_runtime": 180.2062,
"eval_samples_per_second": 92.35,
"eval_steps_per_second": 5.777,
"step": 53000
},
{
"epoch": 15.457459207459207,
"grad_norm": 0.3662952780723572,
"learning_rate": 0.0004147556851311953,
"loss": 3.2921,
"step": 53050
},
{
"epoch": 15.472027972027972,
"grad_norm": 0.3680059313774109,
"learning_rate": 0.00041458075801749264,
"loss": 3.2848,
"step": 53100
},
{
"epoch": 15.486596736596736,
"grad_norm": 0.3896573781967163,
"learning_rate": 0.00041440583090379,
"loss": 3.2915,
"step": 53150
},
{
"epoch": 15.5011655011655,
"grad_norm": 0.35986337065696716,
"learning_rate": 0.0004142309037900874,
"loss": 3.2853,
"step": 53200
},
{
"epoch": 15.515734265734265,
"grad_norm": 0.3684069514274597,
"learning_rate": 0.0004140559766763848,
"loss": 3.283,
"step": 53250
},
{
"epoch": 15.530303030303031,
"grad_norm": 0.3770783841609955,
"learning_rate": 0.0004138810495626822,
"loss": 3.2694,
"step": 53300
},
{
"epoch": 15.544871794871796,
"grad_norm": 0.370087593793869,
"learning_rate": 0.00041370612244897957,
"loss": 3.2819,
"step": 53350
},
{
"epoch": 15.55944055944056,
"grad_norm": 0.42769378423690796,
"learning_rate": 0.00041353119533527695,
"loss": 3.2821,
"step": 53400
},
{
"epoch": 15.574009324009324,
"grad_norm": 0.3953687250614166,
"learning_rate": 0.0004133562682215743,
"loss": 3.2944,
"step": 53450
},
{
"epoch": 15.588578088578089,
"grad_norm": 0.38645675778388977,
"learning_rate": 0.0004131813411078717,
"loss": 3.289,
"step": 53500
},
{
"epoch": 15.603146853146853,
"grad_norm": 0.34176626801490784,
"learning_rate": 0.000413006413994169,
"loss": 3.2847,
"step": 53550
},
{
"epoch": 15.617715617715618,
"grad_norm": 0.3668389320373535,
"learning_rate": 0.0004128314868804664,
"loss": 3.2873,
"step": 53600
},
{
"epoch": 15.632284382284382,
"grad_norm": 0.38792678713798523,
"learning_rate": 0.00041265655976676377,
"loss": 3.2895,
"step": 53650
},
{
"epoch": 15.646853146853147,
"grad_norm": 0.3726765811443329,
"learning_rate": 0.0004124816326530612,
"loss": 3.2879,
"step": 53700
},
{
"epoch": 15.661421911421911,
"grad_norm": 0.38674214482307434,
"learning_rate": 0.0004123067055393586,
"loss": 3.2967,
"step": 53750
},
{
"epoch": 15.675990675990676,
"grad_norm": 0.3406377136707306,
"learning_rate": 0.00041213177842565595,
"loss": 3.2872,
"step": 53800
},
{
"epoch": 15.69055944055944,
"grad_norm": 0.368076354265213,
"learning_rate": 0.00041195685131195333,
"loss": 3.2957,
"step": 53850
},
{
"epoch": 15.705128205128204,
"grad_norm": 0.3570297956466675,
"learning_rate": 0.0004117819241982507,
"loss": 3.2905,
"step": 53900
},
{
"epoch": 15.719696969696969,
"grad_norm": 0.37296998500823975,
"learning_rate": 0.0004116069970845481,
"loss": 3.308,
"step": 53950
},
{
"epoch": 15.734265734265735,
"grad_norm": 0.35054120421409607,
"learning_rate": 0.0004114320699708454,
"loss": 3.2932,
"step": 54000
},
{
"epoch": 15.734265734265735,
"eval_accuracy": 0.3724629477066371,
"eval_loss": 3.539015054702759,
"eval_runtime": 179.9569,
"eval_samples_per_second": 92.478,
"eval_steps_per_second": 5.785,
"step": 54000
},
{
"epoch": 15.7488344988345,
"grad_norm": 0.3462640345096588,
"learning_rate": 0.0004112571428571428,
"loss": 3.3019,
"step": 54050
},
{
"epoch": 15.763403263403264,
"grad_norm": 0.38437676429748535,
"learning_rate": 0.00041108221574344015,
"loss": 3.3003,
"step": 54100
},
{
"epoch": 15.777972027972028,
"grad_norm": 0.37581923604011536,
"learning_rate": 0.0004109072886297376,
"loss": 3.2989,
"step": 54150
},
{
"epoch": 15.792540792540793,
"grad_norm": 0.4052378237247467,
"learning_rate": 0.00041073236151603496,
"loss": 3.2972,
"step": 54200
},
{
"epoch": 15.807109557109557,
"grad_norm": 0.4042377471923828,
"learning_rate": 0.00041055743440233234,
"loss": 3.2974,
"step": 54250
},
{
"epoch": 15.821678321678322,
"grad_norm": 0.3874850869178772,
"learning_rate": 0.0004103825072886297,
"loss": 3.3083,
"step": 54300
},
{
"epoch": 15.836247086247086,
"grad_norm": 0.3619961440563202,
"learning_rate": 0.0004102075801749271,
"loss": 3.2968,
"step": 54350
},
{
"epoch": 15.85081585081585,
"grad_norm": 0.3616982400417328,
"learning_rate": 0.00041003265306122446,
"loss": 3.305,
"step": 54400
},
{
"epoch": 15.865384615384615,
"grad_norm": 0.39646637439727783,
"learning_rate": 0.0004098577259475218,
"loss": 3.3048,
"step": 54450
},
{
"epoch": 15.87995337995338,
"grad_norm": 0.3735204041004181,
"learning_rate": 0.00040968279883381916,
"loss": 3.3097,
"step": 54500
},
{
"epoch": 15.894522144522144,
"grad_norm": 0.38079050183296204,
"learning_rate": 0.0004095078717201166,
"loss": 3.321,
"step": 54550
},
{
"epoch": 15.909090909090908,
"grad_norm": 0.3562343120574951,
"learning_rate": 0.00040933294460641397,
"loss": 3.312,
"step": 54600
},
{
"epoch": 15.923659673659674,
"grad_norm": 0.3720053434371948,
"learning_rate": 0.00040915801749271134,
"loss": 3.3141,
"step": 54650
},
{
"epoch": 15.938228438228439,
"grad_norm": 0.35950398445129395,
"learning_rate": 0.0004089830903790087,
"loss": 3.3188,
"step": 54700
},
{
"epoch": 15.952797202797203,
"grad_norm": 0.35162949562072754,
"learning_rate": 0.0004088081632653061,
"loss": 3.3067,
"step": 54750
},
{
"epoch": 15.967365967365968,
"grad_norm": 0.368741512298584,
"learning_rate": 0.00040863323615160347,
"loss": 3.2972,
"step": 54800
},
{
"epoch": 15.981934731934732,
"grad_norm": 0.3791120648384094,
"learning_rate": 0.00040845830903790085,
"loss": 3.2925,
"step": 54850
},
{
"epoch": 15.996503496503497,
"grad_norm": 0.38239946961402893,
"learning_rate": 0.00040828338192419817,
"loss": 3.31,
"step": 54900
},
{
"epoch": 16.01107226107226,
"grad_norm": 0.3918370306491852,
"learning_rate": 0.00040810845481049554,
"loss": 3.2242,
"step": 54950
},
{
"epoch": 16.025641025641026,
"grad_norm": 0.36443474888801575,
"learning_rate": 0.00040793352769679297,
"loss": 3.2057,
"step": 55000
},
{
"epoch": 16.025641025641026,
"eval_accuracy": 0.3720653729946936,
"eval_loss": 3.5517778396606445,
"eval_runtime": 179.9903,
"eval_samples_per_second": 92.461,
"eval_steps_per_second": 5.784,
"step": 55000
},
{
"epoch": 16.04020979020979,
"grad_norm": 0.3803104758262634,
"learning_rate": 0.00040775860058309035,
"loss": 3.2087,
"step": 55050
},
{
"epoch": 16.054778554778554,
"grad_norm": 0.37503015995025635,
"learning_rate": 0.0004075836734693877,
"loss": 3.2064,
"step": 55100
},
{
"epoch": 16.06934731934732,
"grad_norm": 0.422076553106308,
"learning_rate": 0.0004074087463556851,
"loss": 3.2159,
"step": 55150
},
{
"epoch": 16.083916083916083,
"grad_norm": 0.3793075382709503,
"learning_rate": 0.0004072338192419825,
"loss": 3.2157,
"step": 55200
},
{
"epoch": 16.098484848484848,
"grad_norm": 0.35776856541633606,
"learning_rate": 0.00040705889212827985,
"loss": 3.2258,
"step": 55250
},
{
"epoch": 16.113053613053612,
"grad_norm": 0.37177175283432007,
"learning_rate": 0.00040688396501457723,
"loss": 3.2206,
"step": 55300
},
{
"epoch": 16.127622377622377,
"grad_norm": 0.3849240243434906,
"learning_rate": 0.00040670903790087455,
"loss": 3.2165,
"step": 55350
},
{
"epoch": 16.14219114219114,
"grad_norm": 0.37367263436317444,
"learning_rate": 0.0004065341107871719,
"loss": 3.2369,
"step": 55400
},
{
"epoch": 16.156759906759905,
"grad_norm": 0.3775652050971985,
"learning_rate": 0.00040635918367346935,
"loss": 3.2323,
"step": 55450
},
{
"epoch": 16.17132867132867,
"grad_norm": 0.3845197260379791,
"learning_rate": 0.00040618425655976673,
"loss": 3.2327,
"step": 55500
},
{
"epoch": 16.185897435897434,
"grad_norm": 0.3939662575721741,
"learning_rate": 0.0004060093294460641,
"loss": 3.2396,
"step": 55550
},
{
"epoch": 16.2004662004662,
"grad_norm": 0.36457160115242004,
"learning_rate": 0.0004058344023323615,
"loss": 3.2348,
"step": 55600
},
{
"epoch": 16.215034965034967,
"grad_norm": 0.3628428876399994,
"learning_rate": 0.00040565947521865886,
"loss": 3.2387,
"step": 55650
},
{
"epoch": 16.22960372960373,
"grad_norm": 0.39392805099487305,
"learning_rate": 0.00040548454810495623,
"loss": 3.2455,
"step": 55700
},
{
"epoch": 16.244172494172496,
"grad_norm": 0.38273295760154724,
"learning_rate": 0.0004053096209912536,
"loss": 3.2404,
"step": 55750
},
{
"epoch": 16.25874125874126,
"grad_norm": 0.38725781440734863,
"learning_rate": 0.00040513469387755093,
"loss": 3.2394,
"step": 55800
},
{
"epoch": 16.273310023310025,
"grad_norm": 0.3701291084289551,
"learning_rate": 0.00040495976676384836,
"loss": 3.2477,
"step": 55850
},
{
"epoch": 16.28787878787879,
"grad_norm": 0.35124650597572327,
"learning_rate": 0.00040478483965014574,
"loss": 3.2511,
"step": 55900
},
{
"epoch": 16.302447552447553,
"grad_norm": 0.3882130980491638,
"learning_rate": 0.0004046099125364431,
"loss": 3.2493,
"step": 55950
},
{
"epoch": 16.317016317016318,
"grad_norm": 0.39306777715682983,
"learning_rate": 0.0004044349854227405,
"loss": 3.2652,
"step": 56000
},
{
"epoch": 16.317016317016318,
"eval_accuracy": 0.37204632327468923,
"eval_loss": 3.5508227348327637,
"eval_runtime": 180.1019,
"eval_samples_per_second": 92.403,
"eval_steps_per_second": 5.78,
"step": 56000
},
{
"epoch": 16.331585081585082,
"grad_norm": 0.36082884669303894,
"learning_rate": 0.00040426005830903786,
"loss": 3.2585,
"step": 56050
},
{
"epoch": 16.346153846153847,
"grad_norm": 0.38729801774024963,
"learning_rate": 0.00040408513119533524,
"loss": 3.2564,
"step": 56100
},
{
"epoch": 16.36072261072261,
"grad_norm": 0.35457730293273926,
"learning_rate": 0.0004039102040816326,
"loss": 3.2492,
"step": 56150
},
{
"epoch": 16.375291375291376,
"grad_norm": 0.3973914682865143,
"learning_rate": 0.00040373527696793005,
"loss": 3.2576,
"step": 56200
},
{
"epoch": 16.38986013986014,
"grad_norm": 0.40199264883995056,
"learning_rate": 0.0004035603498542273,
"loss": 3.2601,
"step": 56250
},
{
"epoch": 16.404428904428904,
"grad_norm": 0.3823801577091217,
"learning_rate": 0.00040338542274052474,
"loss": 3.2549,
"step": 56300
},
{
"epoch": 16.41899766899767,
"grad_norm": 0.3930020034313202,
"learning_rate": 0.0004032104956268221,
"loss": 3.2594,
"step": 56350
},
{
"epoch": 16.433566433566433,
"grad_norm": 0.3808688521385193,
"learning_rate": 0.0004030355685131195,
"loss": 3.2749,
"step": 56400
},
{
"epoch": 16.448135198135198,
"grad_norm": 0.36190223693847656,
"learning_rate": 0.00040286064139941687,
"loss": 3.2618,
"step": 56450
},
{
"epoch": 16.462703962703962,
"grad_norm": 0.37956106662750244,
"learning_rate": 0.00040268571428571425,
"loss": 3.2701,
"step": 56500
},
{
"epoch": 16.477272727272727,
"grad_norm": 0.349180668592453,
"learning_rate": 0.0004025107871720116,
"loss": 3.2726,
"step": 56550
},
{
"epoch": 16.49184149184149,
"grad_norm": 0.40453752875328064,
"learning_rate": 0.000402335860058309,
"loss": 3.2637,
"step": 56600
},
{
"epoch": 16.506410256410255,
"grad_norm": 0.3742610216140747,
"learning_rate": 0.00040216093294460643,
"loss": 3.2705,
"step": 56650
},
{
"epoch": 16.52097902097902,
"grad_norm": 0.35317522287368774,
"learning_rate": 0.00040198600583090375,
"loss": 3.2734,
"step": 56700
},
{
"epoch": 16.535547785547784,
"grad_norm": 0.3735811710357666,
"learning_rate": 0.0004018110787172011,
"loss": 3.2781,
"step": 56750
},
{
"epoch": 16.55011655011655,
"grad_norm": 0.37040281295776367,
"learning_rate": 0.0004016361516034985,
"loss": 3.2757,
"step": 56800
},
{
"epoch": 16.564685314685313,
"grad_norm": 0.3771756887435913,
"learning_rate": 0.0004014612244897959,
"loss": 3.2827,
"step": 56850
},
{
"epoch": 16.579254079254078,
"grad_norm": 0.4176678955554962,
"learning_rate": 0.00040128629737609325,
"loss": 3.2724,
"step": 56900
},
{
"epoch": 16.593822843822842,
"grad_norm": 0.38978853821754456,
"learning_rate": 0.00040111137026239063,
"loss": 3.2715,
"step": 56950
},
{
"epoch": 16.60839160839161,
"grad_norm": 0.3670736849308014,
"learning_rate": 0.000400936443148688,
"loss": 3.2886,
"step": 57000
},
{
"epoch": 16.60839160839161,
"eval_accuracy": 0.3724863482886178,
"eval_loss": 3.5458335876464844,
"eval_runtime": 180.2316,
"eval_samples_per_second": 92.337,
"eval_steps_per_second": 5.776,
"step": 57000
},
{
"epoch": 16.622960372960375,
"grad_norm": 0.33764857053756714,
"learning_rate": 0.00040076151603498543,
"loss": 3.2677,
"step": 57050
},
{
"epoch": 16.63752913752914,
"grad_norm": 0.3740048408508301,
"learning_rate": 0.0004005865889212828,
"loss": 3.29,
"step": 57100
},
{
"epoch": 16.652097902097903,
"grad_norm": 0.5407046675682068,
"learning_rate": 0.00040041166180758013,
"loss": 3.2744,
"step": 57150
},
{
"epoch": 16.666666666666668,
"grad_norm": 0.34932196140289307,
"learning_rate": 0.0004002367346938775,
"loss": 3.2688,
"step": 57200
},
{
"epoch": 16.681235431235432,
"grad_norm": 0.4044182300567627,
"learning_rate": 0.0004000618075801749,
"loss": 3.2732,
"step": 57250
},
{
"epoch": 16.695804195804197,
"grad_norm": 0.3779875338077545,
"learning_rate": 0.00039988688046647226,
"loss": 3.2768,
"step": 57300
},
{
"epoch": 16.71037296037296,
"grad_norm": 0.3720763027667999,
"learning_rate": 0.00039971195335276963,
"loss": 3.2873,
"step": 57350
},
{
"epoch": 16.724941724941726,
"grad_norm": 0.3826303482055664,
"learning_rate": 0.000399537026239067,
"loss": 3.2898,
"step": 57400
},
{
"epoch": 16.73951048951049,
"grad_norm": 0.38142111897468567,
"learning_rate": 0.0003993620991253644,
"loss": 3.2731,
"step": 57450
},
{
"epoch": 16.754079254079254,
"grad_norm": 0.3760949373245239,
"learning_rate": 0.0003991871720116618,
"loss": 3.2895,
"step": 57500
},
{
"epoch": 16.76864801864802,
"grad_norm": 0.370598703622818,
"learning_rate": 0.0003990122448979592,
"loss": 3.2929,
"step": 57550
},
{
"epoch": 16.783216783216783,
"grad_norm": 0.37887001037597656,
"learning_rate": 0.0003988373177842565,
"loss": 3.2806,
"step": 57600
},
{
"epoch": 16.797785547785548,
"grad_norm": 0.37170445919036865,
"learning_rate": 0.0003986623906705539,
"loss": 3.3034,
"step": 57650
},
{
"epoch": 16.812354312354312,
"grad_norm": 0.37460795044898987,
"learning_rate": 0.00039848746355685127,
"loss": 3.2842,
"step": 57700
},
{
"epoch": 16.826923076923077,
"grad_norm": 0.36762702465057373,
"learning_rate": 0.00039831253644314864,
"loss": 3.2856,
"step": 57750
},
{
"epoch": 16.84149184149184,
"grad_norm": 0.3489190936088562,
"learning_rate": 0.000398137609329446,
"loss": 3.2822,
"step": 57800
},
{
"epoch": 16.856060606060606,
"grad_norm": 0.3669528663158417,
"learning_rate": 0.0003979626822157434,
"loss": 3.2862,
"step": 57850
},
{
"epoch": 16.87062937062937,
"grad_norm": 0.38453635573387146,
"learning_rate": 0.00039778775510204077,
"loss": 3.2791,
"step": 57900
},
{
"epoch": 16.885198135198134,
"grad_norm": 0.4126264750957489,
"learning_rate": 0.0003976128279883382,
"loss": 3.2916,
"step": 57950
},
{
"epoch": 16.8997668997669,
"grad_norm": 0.354192316532135,
"learning_rate": 0.0003974379008746356,
"loss": 3.2946,
"step": 58000
},
{
"epoch": 16.8997668997669,
"eval_accuracy": 0.37311945750160336,
"eval_loss": 3.5339043140411377,
"eval_runtime": 180.1703,
"eval_samples_per_second": 92.368,
"eval_steps_per_second": 5.778,
"step": 58000
},
{
"epoch": 16.914335664335663,
"grad_norm": 0.36521029472351074,
"learning_rate": 0.0003972629737609329,
"loss": 3.2874,
"step": 58050
},
{
"epoch": 16.928904428904428,
"grad_norm": 0.375409334897995,
"learning_rate": 0.00039708804664723027,
"loss": 3.2946,
"step": 58100
},
{
"epoch": 16.943473193473192,
"grad_norm": 0.3432232439517975,
"learning_rate": 0.00039691311953352765,
"loss": 3.2923,
"step": 58150
},
{
"epoch": 16.958041958041957,
"grad_norm": 0.35378479957580566,
"learning_rate": 0.000396738192419825,
"loss": 3.2955,
"step": 58200
},
{
"epoch": 16.97261072261072,
"grad_norm": 0.38814520835876465,
"learning_rate": 0.0003965632653061224,
"loss": 3.2975,
"step": 58250
},
{
"epoch": 16.98717948717949,
"grad_norm": 0.3771551251411438,
"learning_rate": 0.0003963883381924198,
"loss": 3.2997,
"step": 58300
},
{
"epoch": 17.001748251748253,
"grad_norm": 0.4017852246761322,
"learning_rate": 0.0003962134110787172,
"loss": 3.285,
"step": 58350
},
{
"epoch": 17.016317016317018,
"grad_norm": 0.3747924268245697,
"learning_rate": 0.0003960384839650146,
"loss": 3.1918,
"step": 58400
},
{
"epoch": 17.030885780885782,
"grad_norm": 0.3842487335205078,
"learning_rate": 0.00039586355685131196,
"loss": 3.1884,
"step": 58450
},
{
"epoch": 17.045454545454547,
"grad_norm": 0.38249558210372925,
"learning_rate": 0.0003956886297376093,
"loss": 3.1965,
"step": 58500
},
{
"epoch": 17.06002331002331,
"grad_norm": 0.3720807135105133,
"learning_rate": 0.00039551370262390665,
"loss": 3.2026,
"step": 58550
},
{
"epoch": 17.074592074592076,
"grad_norm": 0.36453333497047424,
"learning_rate": 0.00039533877551020403,
"loss": 3.1978,
"step": 58600
},
{
"epoch": 17.08916083916084,
"grad_norm": 0.37213221192359924,
"learning_rate": 0.0003951638483965014,
"loss": 3.193,
"step": 58650
},
{
"epoch": 17.103729603729604,
"grad_norm": 0.3810282051563263,
"learning_rate": 0.0003949889212827988,
"loss": 3.2161,
"step": 58700
},
{
"epoch": 17.11829836829837,
"grad_norm": 0.4005163311958313,
"learning_rate": 0.00039481399416909616,
"loss": 3.2006,
"step": 58750
},
{
"epoch": 17.132867132867133,
"grad_norm": 0.3493218719959259,
"learning_rate": 0.0003946390670553936,
"loss": 3.1989,
"step": 58800
},
{
"epoch": 17.147435897435898,
"grad_norm": 0.4085027575492859,
"learning_rate": 0.00039446413994169096,
"loss": 3.2141,
"step": 58850
},
{
"epoch": 17.162004662004662,
"grad_norm": 0.3723236918449402,
"learning_rate": 0.00039428921282798834,
"loss": 3.2122,
"step": 58900
},
{
"epoch": 17.176573426573427,
"grad_norm": 0.365602046251297,
"learning_rate": 0.00039411428571428566,
"loss": 3.2235,
"step": 58950
},
{
"epoch": 17.19114219114219,
"grad_norm": 0.3900095522403717,
"learning_rate": 0.00039393935860058304,
"loss": 3.2236,
"step": 59000
},
{
"epoch": 17.19114219114219,
"eval_accuracy": 0.37191403355243646,
"eval_loss": 3.5545146465301514,
"eval_runtime": 180.0902,
"eval_samples_per_second": 92.409,
"eval_steps_per_second": 5.78,
"step": 59000
},
{
"epoch": 17.205710955710956,
"grad_norm": 0.39965546131134033,
"learning_rate": 0.0003937644314868804,
"loss": 3.2383,
"step": 59050
},
{
"epoch": 17.22027972027972,
"grad_norm": 0.3811628818511963,
"learning_rate": 0.0003935895043731778,
"loss": 3.2326,
"step": 59100
},
{
"epoch": 17.234848484848484,
"grad_norm": 0.3733785152435303,
"learning_rate": 0.00039341457725947516,
"loss": 3.2351,
"step": 59150
},
{
"epoch": 17.24941724941725,
"grad_norm": 0.3800208568572998,
"learning_rate": 0.0003932396501457726,
"loss": 3.2306,
"step": 59200
},
{
"epoch": 17.263986013986013,
"grad_norm": 0.40144771337509155,
"learning_rate": 0.00039306472303206997,
"loss": 3.2401,
"step": 59250
},
{
"epoch": 17.278554778554778,
"grad_norm": 0.3901655375957489,
"learning_rate": 0.00039288979591836734,
"loss": 3.2329,
"step": 59300
},
{
"epoch": 17.293123543123542,
"grad_norm": 0.37534698843955994,
"learning_rate": 0.0003927148688046647,
"loss": 3.2392,
"step": 59350
},
{
"epoch": 17.307692307692307,
"grad_norm": 0.3891315460205078,
"learning_rate": 0.00039253994169096204,
"loss": 3.2275,
"step": 59400
},
{
"epoch": 17.32226107226107,
"grad_norm": 0.38900765776634216,
"learning_rate": 0.0003923650145772594,
"loss": 3.2505,
"step": 59450
},
{
"epoch": 17.336829836829835,
"grad_norm": 0.40090858936309814,
"learning_rate": 0.0003921900874635568,
"loss": 3.2417,
"step": 59500
},
{
"epoch": 17.3513986013986,
"grad_norm": 0.4910936653614044,
"learning_rate": 0.00039201516034985417,
"loss": 3.2483,
"step": 59550
},
{
"epoch": 17.365967365967364,
"grad_norm": 0.38942986726760864,
"learning_rate": 0.00039184023323615155,
"loss": 3.2518,
"step": 59600
},
{
"epoch": 17.38053613053613,
"grad_norm": 0.39612892270088196,
"learning_rate": 0.000391665306122449,
"loss": 3.2501,
"step": 59650
},
{
"epoch": 17.395104895104897,
"grad_norm": 0.37708619236946106,
"learning_rate": 0.00039149037900874635,
"loss": 3.2523,
"step": 59700
},
{
"epoch": 17.40967365967366,
"grad_norm": 0.3658439517021179,
"learning_rate": 0.0003913154518950437,
"loss": 3.2484,
"step": 59750
},
{
"epoch": 17.424242424242426,
"grad_norm": 0.37678831815719604,
"learning_rate": 0.0003911405247813411,
"loss": 3.2475,
"step": 59800
},
{
"epoch": 17.43881118881119,
"grad_norm": 0.375609815120697,
"learning_rate": 0.0003909655976676384,
"loss": 3.2691,
"step": 59850
},
{
"epoch": 17.453379953379955,
"grad_norm": 0.3910381495952606,
"learning_rate": 0.0003907906705539358,
"loss": 3.2618,
"step": 59900
},
{
"epoch": 17.46794871794872,
"grad_norm": 0.3733561038970947,
"learning_rate": 0.0003906157434402332,
"loss": 3.2444,
"step": 59950
},
{
"epoch": 17.482517482517483,
"grad_norm": 0.40049925446510315,
"learning_rate": 0.00039044081632653055,
"loss": 3.268,
"step": 60000
},
{
"epoch": 17.482517482517483,
"eval_accuracy": 0.37249316855874287,
"eval_loss": 3.5441601276397705,
"eval_runtime": 258.063,
"eval_samples_per_second": 64.488,
"eval_steps_per_second": 4.034,
"step": 60000
},
{
"epoch": 17.497086247086248,
"grad_norm": 0.3814074695110321,
"learning_rate": 0.00039026588921282793,
"loss": 3.2626,
"step": 60050
},
{
"epoch": 17.511655011655012,
"grad_norm": 0.39018502831459045,
"learning_rate": 0.00039009096209912536,
"loss": 3.2578,
"step": 60100
},
{
"epoch": 17.526223776223777,
"grad_norm": 0.41158032417297363,
"learning_rate": 0.00038991603498542273,
"loss": 3.2538,
"step": 60150
},
{
"epoch": 17.54079254079254,
"grad_norm": 0.3606468141078949,
"learning_rate": 0.0003897411078717201,
"loss": 3.2476,
"step": 60200
},
{
"epoch": 17.555361305361306,
"grad_norm": 0.3680814802646637,
"learning_rate": 0.0003895661807580175,
"loss": 3.2732,
"step": 60250
},
{
"epoch": 17.56993006993007,
"grad_norm": 0.4254201054573059,
"learning_rate": 0.0003893912536443148,
"loss": 3.2638,
"step": 60300
},
{
"epoch": 17.584498834498834,
"grad_norm": 0.3743131160736084,
"learning_rate": 0.0003892163265306122,
"loss": 3.2699,
"step": 60350
},
{
"epoch": 17.5990675990676,
"grad_norm": 0.40694037079811096,
"learning_rate": 0.00038904139941690956,
"loss": 3.2678,
"step": 60400
},
{
"epoch": 17.613636363636363,
"grad_norm": 0.3833000957965851,
"learning_rate": 0.00038886647230320693,
"loss": 3.2568,
"step": 60450
},
{
"epoch": 17.628205128205128,
"grad_norm": 0.38820087909698486,
"learning_rate": 0.00038869154518950436,
"loss": 3.2669,
"step": 60500
},
{
"epoch": 17.642773892773892,
"grad_norm": 0.3931123614311218,
"learning_rate": 0.00038851661807580174,
"loss": 3.2692,
"step": 60550
},
{
"epoch": 17.657342657342657,
"grad_norm": 0.4032643735408783,
"learning_rate": 0.0003883416909620991,
"loss": 3.2646,
"step": 60600
},
{
"epoch": 17.67191142191142,
"grad_norm": 0.3966495394706726,
"learning_rate": 0.0003881667638483965,
"loss": 3.2693,
"step": 60650
},
{
"epoch": 17.686480186480185,
"grad_norm": 0.3854696452617645,
"learning_rate": 0.00038799183673469387,
"loss": 3.2577,
"step": 60700
},
{
"epoch": 17.70104895104895,
"grad_norm": 0.4019714891910553,
"learning_rate": 0.0003878169096209912,
"loss": 3.2724,
"step": 60750
},
{
"epoch": 17.715617715617714,
"grad_norm": 0.3742389380931854,
"learning_rate": 0.00038764198250728856,
"loss": 3.2664,
"step": 60800
},
{
"epoch": 17.73018648018648,
"grad_norm": 0.40025609731674194,
"learning_rate": 0.00038746705539358594,
"loss": 3.2683,
"step": 60850
},
{
"epoch": 17.744755244755243,
"grad_norm": 0.36912432312965393,
"learning_rate": 0.0003872921282798833,
"loss": 3.2726,
"step": 60900
},
{
"epoch": 17.759324009324008,
"grad_norm": 0.37890687584877014,
"learning_rate": 0.00038711720116618075,
"loss": 3.2663,
"step": 60950
},
{
"epoch": 17.773892773892776,
"grad_norm": 0.38784855604171753,
"learning_rate": 0.0003869422740524781,
"loss": 3.2783,
"step": 61000
},
{
"epoch": 17.773892773892776,
"eval_accuracy": 0.37300621749935503,
"eval_loss": 3.5378918647766113,
"eval_runtime": 180.0537,
"eval_samples_per_second": 92.428,
"eval_steps_per_second": 5.782,
"step": 61000
},
{
"epoch": 17.78846153846154,
"grad_norm": 0.3923535943031311,
"learning_rate": 0.0003867673469387755,
"loss": 3.2814,
"step": 61050
},
{
"epoch": 17.803030303030305,
"grad_norm": 0.39278125762939453,
"learning_rate": 0.0003865924198250729,
"loss": 3.275,
"step": 61100
},
{
"epoch": 17.81759906759907,
"grad_norm": 0.3636791706085205,
"learning_rate": 0.00038641749271137025,
"loss": 3.2685,
"step": 61150
},
{
"epoch": 17.832167832167833,
"grad_norm": 0.3580476939678192,
"learning_rate": 0.00038624256559766757,
"loss": 3.2727,
"step": 61200
},
{
"epoch": 17.846736596736598,
"grad_norm": 0.3807421922683716,
"learning_rate": 0.00038606763848396495,
"loss": 3.2808,
"step": 61250
},
{
"epoch": 17.861305361305362,
"grad_norm": 0.4115810990333557,
"learning_rate": 0.0003858927113702623,
"loss": 3.2744,
"step": 61300
},
{
"epoch": 17.875874125874127,
"grad_norm": 0.3622573912143707,
"learning_rate": 0.00038571778425655975,
"loss": 3.2848,
"step": 61350
},
{
"epoch": 17.89044289044289,
"grad_norm": 0.41785427927970886,
"learning_rate": 0.00038554285714285713,
"loss": 3.2763,
"step": 61400
},
{
"epoch": 17.905011655011656,
"grad_norm": 0.40546101331710815,
"learning_rate": 0.0003853679300291545,
"loss": 3.2897,
"step": 61450
},
{
"epoch": 17.91958041958042,
"grad_norm": 0.36965155601501465,
"learning_rate": 0.0003851930029154519,
"loss": 3.2803,
"step": 61500
},
{
"epoch": 17.934149184149184,
"grad_norm": 0.38562914729118347,
"learning_rate": 0.00038501807580174926,
"loss": 3.2726,
"step": 61550
},
{
"epoch": 17.94871794871795,
"grad_norm": 0.4070318937301636,
"learning_rate": 0.00038484314868804663,
"loss": 3.2823,
"step": 61600
},
{
"epoch": 17.963286713286713,
"grad_norm": 0.36294493079185486,
"learning_rate": 0.00038466822157434395,
"loss": 3.2837,
"step": 61650
},
{
"epoch": 17.977855477855478,
"grad_norm": 0.402779757976532,
"learning_rate": 0.00038449329446064133,
"loss": 3.2822,
"step": 61700
},
{
"epoch": 17.992424242424242,
"grad_norm": 0.3932071626186371,
"learning_rate": 0.0003843183673469387,
"loss": 3.2756,
"step": 61750
},
{
"epoch": 18.006993006993007,
"grad_norm": 0.4182881712913513,
"learning_rate": 0.00038414344023323613,
"loss": 3.2423,
"step": 61800
},
{
"epoch": 18.02156177156177,
"grad_norm": 0.41027215123176575,
"learning_rate": 0.0003839685131195335,
"loss": 3.1628,
"step": 61850
},
{
"epoch": 18.036130536130536,
"grad_norm": 0.3935529291629791,
"learning_rate": 0.0003837935860058309,
"loss": 3.1793,
"step": 61900
},
{
"epoch": 18.0506993006993,
"grad_norm": 0.4026806950569153,
"learning_rate": 0.00038361865889212826,
"loss": 3.1907,
"step": 61950
},
{
"epoch": 18.065268065268064,
"grad_norm": 0.39838963747024536,
"learning_rate": 0.00038344373177842564,
"loss": 3.1921,
"step": 62000
},
{
"epoch": 18.065268065268064,
"eval_accuracy": 0.3720812477613639,
"eval_loss": 3.5565507411956787,
"eval_runtime": 180.0846,
"eval_samples_per_second": 92.412,
"eval_steps_per_second": 5.781,
"step": 62000
},
{
"epoch": 18.07983682983683,
"grad_norm": 0.3916713297367096,
"learning_rate": 0.000383268804664723,
"loss": 3.1856,
"step": 62050
},
{
"epoch": 18.094405594405593,
"grad_norm": 0.3870500922203064,
"learning_rate": 0.00038309387755102034,
"loss": 3.1768,
"step": 62100
},
{
"epoch": 18.108974358974358,
"grad_norm": 0.43737277388572693,
"learning_rate": 0.0003829189504373177,
"loss": 3.2043,
"step": 62150
},
{
"epoch": 18.123543123543122,
"grad_norm": 0.3725016415119171,
"learning_rate": 0.0003827440233236151,
"loss": 3.1973,
"step": 62200
},
{
"epoch": 18.138111888111887,
"grad_norm": 0.39706695079803467,
"learning_rate": 0.0003825690962099125,
"loss": 3.2012,
"step": 62250
},
{
"epoch": 18.15268065268065,
"grad_norm": 0.4028768241405487,
"learning_rate": 0.0003823941690962099,
"loss": 3.2017,
"step": 62300
},
{
"epoch": 18.16724941724942,
"grad_norm": 0.386943519115448,
"learning_rate": 0.00038221924198250727,
"loss": 3.2188,
"step": 62350
},
{
"epoch": 18.181818181818183,
"grad_norm": 0.37898704409599304,
"learning_rate": 0.00038204431486880464,
"loss": 3.2086,
"step": 62400
},
{
"epoch": 18.196386946386948,
"grad_norm": 0.39674118161201477,
"learning_rate": 0.000381869387755102,
"loss": 3.226,
"step": 62450
},
{
"epoch": 18.210955710955712,
"grad_norm": 0.38656550645828247,
"learning_rate": 0.0003816944606413994,
"loss": 3.2116,
"step": 62500
},
{
"epoch": 18.225524475524477,
"grad_norm": 0.434226393699646,
"learning_rate": 0.0003815195335276967,
"loss": 3.2205,
"step": 62550
},
{
"epoch": 18.24009324009324,
"grad_norm": 0.38330623507499695,
"learning_rate": 0.0003813446064139941,
"loss": 3.2152,
"step": 62600
},
{
"epoch": 18.254662004662006,
"grad_norm": 0.3713931739330292,
"learning_rate": 0.0003811696793002915,
"loss": 3.2287,
"step": 62650
},
{
"epoch": 18.26923076923077,
"grad_norm": 0.4108925759792328,
"learning_rate": 0.0003809947521865889,
"loss": 3.2264,
"step": 62700
},
{
"epoch": 18.283799533799534,
"grad_norm": 0.40365439653396606,
"learning_rate": 0.0003808198250728863,
"loss": 3.2186,
"step": 62750
},
{
"epoch": 18.2983682983683,
"grad_norm": 0.36954963207244873,
"learning_rate": 0.00038064489795918365,
"loss": 3.2303,
"step": 62800
},
{
"epoch": 18.312937062937063,
"grad_norm": 0.3932103216648102,
"learning_rate": 0.000380469970845481,
"loss": 3.2229,
"step": 62850
},
{
"epoch": 18.327505827505828,
"grad_norm": 0.3911256194114685,
"learning_rate": 0.0003802950437317784,
"loss": 3.2326,
"step": 62900
},
{
"epoch": 18.342074592074592,
"grad_norm": 0.3954258859157562,
"learning_rate": 0.0003801201166180758,
"loss": 3.2245,
"step": 62950
},
{
"epoch": 18.356643356643357,
"grad_norm": 0.39383465051651,
"learning_rate": 0.0003799451895043731,
"loss": 3.2344,
"step": 63000
},
{
"epoch": 18.356643356643357,
"eval_accuracy": 0.372702597887927,
"eval_loss": 3.5481696128845215,
"eval_runtime": 180.1418,
"eval_samples_per_second": 92.383,
"eval_steps_per_second": 5.779,
"step": 63000
},
{
"epoch": 18.37121212121212,
"grad_norm": 0.3588085472583771,
"learning_rate": 0.0003797702623906705,
"loss": 3.241,
"step": 63050
},
{
"epoch": 18.385780885780886,
"grad_norm": 0.3684539198875427,
"learning_rate": 0.0003795953352769679,
"loss": 3.2332,
"step": 63100
},
{
"epoch": 18.40034965034965,
"grad_norm": 0.3738718032836914,
"learning_rate": 0.0003794204081632653,
"loss": 3.2348,
"step": 63150
},
{
"epoch": 18.414918414918414,
"grad_norm": 0.41179683804512024,
"learning_rate": 0.00037924548104956266,
"loss": 3.2366,
"step": 63200
},
{
"epoch": 18.42948717948718,
"grad_norm": 0.3886624574661255,
"learning_rate": 0.00037907055393586003,
"loss": 3.2426,
"step": 63250
},
{
"epoch": 18.444055944055943,
"grad_norm": 0.40785521268844604,
"learning_rate": 0.0003788956268221574,
"loss": 3.2447,
"step": 63300
},
{
"epoch": 18.458624708624708,
"grad_norm": 0.36944928765296936,
"learning_rate": 0.0003787206997084548,
"loss": 3.2366,
"step": 63350
},
{
"epoch": 18.473193473193472,
"grad_norm": 0.4326910078525543,
"learning_rate": 0.00037854577259475216,
"loss": 3.2408,
"step": 63400
},
{
"epoch": 18.487762237762237,
"grad_norm": 0.3922458291053772,
"learning_rate": 0.0003783708454810495,
"loss": 3.2536,
"step": 63450
},
{
"epoch": 18.502331002331,
"grad_norm": 0.404136598110199,
"learning_rate": 0.00037819591836734686,
"loss": 3.2394,
"step": 63500
},
{
"epoch": 18.516899766899765,
"grad_norm": 0.37804481387138367,
"learning_rate": 0.0003780209912536443,
"loss": 3.2318,
"step": 63550
},
{
"epoch": 18.53146853146853,
"grad_norm": 0.38064321875572205,
"learning_rate": 0.00037784606413994166,
"loss": 3.25,
"step": 63600
},
{
"epoch": 18.546037296037294,
"grad_norm": 0.4081238806247711,
"learning_rate": 0.00037767113702623904,
"loss": 3.2407,
"step": 63650
},
{
"epoch": 18.560606060606062,
"grad_norm": 0.4321068227291107,
"learning_rate": 0.0003774962099125364,
"loss": 3.2478,
"step": 63700
},
{
"epoch": 18.575174825174827,
"grad_norm": 0.38897502422332764,
"learning_rate": 0.0003773212827988338,
"loss": 3.24,
"step": 63750
},
{
"epoch": 18.58974358974359,
"grad_norm": 0.39090046286582947,
"learning_rate": 0.00037714635568513117,
"loss": 3.2659,
"step": 63800
},
{
"epoch": 18.604312354312356,
"grad_norm": 0.3871738016605377,
"learning_rate": 0.00037697142857142854,
"loss": 3.2508,
"step": 63850
},
{
"epoch": 18.61888111888112,
"grad_norm": 0.4042744040489197,
"learning_rate": 0.00037679650145772586,
"loss": 3.2437,
"step": 63900
},
{
"epoch": 18.633449883449885,
"grad_norm": 0.3949747085571289,
"learning_rate": 0.0003766215743440233,
"loss": 3.2632,
"step": 63950
},
{
"epoch": 18.64801864801865,
"grad_norm": 0.39369118213653564,
"learning_rate": 0.00037644664723032067,
"loss": 3.2715,
"step": 64000
},
{
"epoch": 18.64801864801865,
"eval_accuracy": 0.3731554403060561,
"eval_loss": 3.5396413803100586,
"eval_runtime": 180.0063,
"eval_samples_per_second": 92.452,
"eval_steps_per_second": 5.783,
"step": 64000
},
{
"epoch": 18.662587412587413,
"grad_norm": 0.3882533609867096,
"learning_rate": 0.00037627172011661805,
"loss": 3.248,
"step": 64050
},
{
"epoch": 18.677156177156178,
"grad_norm": 0.38155344128608704,
"learning_rate": 0.0003760967930029154,
"loss": 3.258,
"step": 64100
},
{
"epoch": 18.691724941724942,
"grad_norm": 0.4137474298477173,
"learning_rate": 0.0003759218658892128,
"loss": 3.259,
"step": 64150
},
{
"epoch": 18.706293706293707,
"grad_norm": 0.38933834433555603,
"learning_rate": 0.00037574693877551017,
"loss": 3.2653,
"step": 64200
},
{
"epoch": 18.72086247086247,
"grad_norm": 0.3694373071193695,
"learning_rate": 0.00037557201166180755,
"loss": 3.2588,
"step": 64250
},
{
"epoch": 18.735431235431236,
"grad_norm": 0.36894160509109497,
"learning_rate": 0.000375397084548105,
"loss": 3.2735,
"step": 64300
},
{
"epoch": 18.75,
"grad_norm": 0.38931140303611755,
"learning_rate": 0.00037522215743440225,
"loss": 3.2714,
"step": 64350
},
{
"epoch": 18.764568764568764,
"grad_norm": 0.38522276282310486,
"learning_rate": 0.0003750472303206997,
"loss": 3.2577,
"step": 64400
},
{
"epoch": 18.77913752913753,
"grad_norm": 0.38880297541618347,
"learning_rate": 0.00037487230320699705,
"loss": 3.251,
"step": 64450
},
{
"epoch": 18.793706293706293,
"grad_norm": 0.3946090340614319,
"learning_rate": 0.00037469737609329443,
"loss": 3.2674,
"step": 64500
},
{
"epoch": 18.808275058275058,
"grad_norm": 0.38031676411628723,
"learning_rate": 0.0003745224489795918,
"loss": 3.2692,
"step": 64550
},
{
"epoch": 18.822843822843822,
"grad_norm": 0.4162115156650543,
"learning_rate": 0.0003743475218658892,
"loss": 3.2606,
"step": 64600
},
{
"epoch": 18.837412587412587,
"grad_norm": 0.37284985184669495,
"learning_rate": 0.00037417259475218655,
"loss": 3.2631,
"step": 64650
},
{
"epoch": 18.85198135198135,
"grad_norm": 0.37252700328826904,
"learning_rate": 0.00037399766763848393,
"loss": 3.2592,
"step": 64700
},
{
"epoch": 18.866550116550115,
"grad_norm": 0.39544638991355896,
"learning_rate": 0.00037382274052478136,
"loss": 3.2621,
"step": 64750
},
{
"epoch": 18.88111888111888,
"grad_norm": 0.3822038769721985,
"learning_rate": 0.0003736478134110787,
"loss": 3.2757,
"step": 64800
},
{
"epoch": 18.895687645687644,
"grad_norm": 0.37755459547042847,
"learning_rate": 0.00037347288629737606,
"loss": 3.2773,
"step": 64850
},
{
"epoch": 18.91025641025641,
"grad_norm": 0.37911224365234375,
"learning_rate": 0.00037329795918367343,
"loss": 3.2818,
"step": 64900
},
{
"epoch": 18.924825174825173,
"grad_norm": 0.380943238735199,
"learning_rate": 0.0003731230320699708,
"loss": 3.2727,
"step": 64950
},
{
"epoch": 18.939393939393938,
"grad_norm": 0.3881559669971466,
"learning_rate": 0.0003729481049562682,
"loss": 3.2683,
"step": 65000
},
{
"epoch": 18.939393939393938,
"eval_accuracy": 0.373618630720237,
"eval_loss": 3.5344443321228027,
"eval_runtime": 180.0973,
"eval_samples_per_second": 92.406,
"eval_steps_per_second": 5.78,
"step": 65000
},
{
"epoch": 18.953962703962706,
"grad_norm": 0.37991875410079956,
"learning_rate": 0.00037277317784256556,
"loss": 3.2692,
"step": 65050
},
{
"epoch": 18.96853146853147,
"grad_norm": 0.38167208433151245,
"learning_rate": 0.00037259825072886294,
"loss": 3.2683,
"step": 65100
},
{
"epoch": 18.983100233100235,
"grad_norm": 0.4169837236404419,
"learning_rate": 0.00037242332361516037,
"loss": 3.2738,
"step": 65150
},
{
"epoch": 18.997668997669,
"grad_norm": 0.377718985080719,
"learning_rate": 0.00037224839650145774,
"loss": 3.2681,
"step": 65200
},
{
"epoch": 19.012237762237763,
"grad_norm": 0.3844805061817169,
"learning_rate": 0.00037207346938775506,
"loss": 3.1864,
"step": 65250
},
{
"epoch": 19.026806526806528,
"grad_norm": 0.3732326626777649,
"learning_rate": 0.00037189854227405244,
"loss": 3.1594,
"step": 65300
},
{
"epoch": 19.041375291375292,
"grad_norm": 0.4207264184951782,
"learning_rate": 0.0003717236151603498,
"loss": 3.1827,
"step": 65350
},
{
"epoch": 19.055944055944057,
"grad_norm": 0.3737161457538605,
"learning_rate": 0.0003715486880466472,
"loss": 3.1722,
"step": 65400
},
{
"epoch": 19.07051282051282,
"grad_norm": 0.39451220631599426,
"learning_rate": 0.00037137376093294457,
"loss": 3.1706,
"step": 65450
},
{
"epoch": 19.085081585081586,
"grad_norm": 0.3852495849132538,
"learning_rate": 0.00037119883381924194,
"loss": 3.18,
"step": 65500
},
{
"epoch": 19.09965034965035,
"grad_norm": 0.39817318320274353,
"learning_rate": 0.0003710239067055393,
"loss": 3.1732,
"step": 65550
},
{
"epoch": 19.114219114219114,
"grad_norm": 0.3873465955257416,
"learning_rate": 0.00037084897959183675,
"loss": 3.1912,
"step": 65600
},
{
"epoch": 19.12878787878788,
"grad_norm": 0.4169006049633026,
"learning_rate": 0.0003706740524781341,
"loss": 3.1931,
"step": 65650
},
{
"epoch": 19.143356643356643,
"grad_norm": 0.39785993099212646,
"learning_rate": 0.00037049912536443145,
"loss": 3.1855,
"step": 65700
},
{
"epoch": 19.157925407925408,
"grad_norm": 0.40643396973609924,
"learning_rate": 0.0003703241982507288,
"loss": 3.1954,
"step": 65750
},
{
"epoch": 19.172494172494172,
"grad_norm": 0.37612253427505493,
"learning_rate": 0.0003701492711370262,
"loss": 3.1991,
"step": 65800
},
{
"epoch": 19.187062937062937,
"grad_norm": 0.3855384290218353,
"learning_rate": 0.0003699743440233236,
"loss": 3.1975,
"step": 65850
},
{
"epoch": 19.2016317016317,
"grad_norm": 0.3889921009540558,
"learning_rate": 0.00036979941690962095,
"loss": 3.2026,
"step": 65900
},
{
"epoch": 19.216200466200466,
"grad_norm": 0.3912866413593292,
"learning_rate": 0.0003696244897959183,
"loss": 3.2027,
"step": 65950
},
{
"epoch": 19.23076923076923,
"grad_norm": 0.38603660464286804,
"learning_rate": 0.0003694495626822157,
"loss": 3.2061,
"step": 66000
},
{
"epoch": 19.23076923076923,
"eval_accuracy": 0.37278585221979804,
"eval_loss": 3.546868085861206,
"eval_runtime": 180.1726,
"eval_samples_per_second": 92.367,
"eval_steps_per_second": 5.778,
"step": 66000
},
{
"epoch": 19.245337995337994,
"grad_norm": 0.4257270395755768,
"learning_rate": 0.00036927463556851313,
"loss": 3.1954,
"step": 66050
},
{
"epoch": 19.25990675990676,
"grad_norm": 0.3850635290145874,
"learning_rate": 0.0003690997084548105,
"loss": 3.1969,
"step": 66100
},
{
"epoch": 19.274475524475523,
"grad_norm": 0.3987645208835602,
"learning_rate": 0.00036892478134110783,
"loss": 3.21,
"step": 66150
},
{
"epoch": 19.289044289044288,
"grad_norm": 0.3949647843837738,
"learning_rate": 0.0003687498542274052,
"loss": 3.2137,
"step": 66200
},
{
"epoch": 19.303613053613052,
"grad_norm": 0.3926634192466736,
"learning_rate": 0.0003685749271137026,
"loss": 3.2339,
"step": 66250
},
{
"epoch": 19.318181818181817,
"grad_norm": 0.379131942987442,
"learning_rate": 0.00036839999999999996,
"loss": 3.2059,
"step": 66300
},
{
"epoch": 19.33275058275058,
"grad_norm": 0.39936667680740356,
"learning_rate": 0.00036822507288629733,
"loss": 3.2171,
"step": 66350
},
{
"epoch": 19.34731934731935,
"grad_norm": 0.401324599981308,
"learning_rate": 0.0003680501457725947,
"loss": 3.2303,
"step": 66400
},
{
"epoch": 19.361888111888113,
"grad_norm": 0.4343133270740509,
"learning_rate": 0.00036787521865889214,
"loss": 3.2231,
"step": 66450
},
{
"epoch": 19.376456876456878,
"grad_norm": 0.408704549074173,
"learning_rate": 0.0003677002915451895,
"loss": 3.219,
"step": 66500
},
{
"epoch": 19.391025641025642,
"grad_norm": 0.4143206775188446,
"learning_rate": 0.0003675253644314869,
"loss": 3.23,
"step": 66550
},
{
"epoch": 19.405594405594407,
"grad_norm": 0.4262617528438568,
"learning_rate": 0.0003673504373177842,
"loss": 3.2392,
"step": 66600
},
{
"epoch": 19.42016317016317,
"grad_norm": 0.42696934938430786,
"learning_rate": 0.0003671755102040816,
"loss": 3.2251,
"step": 66650
},
{
"epoch": 19.434731934731936,
"grad_norm": 0.3789410889148712,
"learning_rate": 0.00036700058309037896,
"loss": 3.2156,
"step": 66700
},
{
"epoch": 19.4493006993007,
"grad_norm": 0.38625243306159973,
"learning_rate": 0.00036682565597667634,
"loss": 3.2374,
"step": 66750
},
{
"epoch": 19.463869463869464,
"grad_norm": 0.43200168013572693,
"learning_rate": 0.0003666507288629737,
"loss": 3.2323,
"step": 66800
},
{
"epoch": 19.47843822843823,
"grad_norm": 0.4356413185596466,
"learning_rate": 0.0003664758017492711,
"loss": 3.2489,
"step": 66850
},
{
"epoch": 19.493006993006993,
"grad_norm": 0.38407424092292786,
"learning_rate": 0.0003663008746355685,
"loss": 3.2306,
"step": 66900
},
{
"epoch": 19.507575757575758,
"grad_norm": 0.39188191294670105,
"learning_rate": 0.0003661259475218659,
"loss": 3.2295,
"step": 66950
},
{
"epoch": 19.522144522144522,
"grad_norm": 0.39774879813194275,
"learning_rate": 0.00036595102040816327,
"loss": 3.2432,
"step": 67000
},
{
"epoch": 19.522144522144522,
"eval_accuracy": 0.373129217543334,
"eval_loss": 3.542863368988037,
"eval_runtime": 180.0798,
"eval_samples_per_second": 92.415,
"eval_steps_per_second": 5.781,
"step": 67000
},
{
"epoch": 19.536713286713287,
"grad_norm": 0.41966238617897034,
"learning_rate": 0.0003657760932944606,
"loss": 3.2393,
"step": 67050
},
{
"epoch": 19.55128205128205,
"grad_norm": 0.3979637026786804,
"learning_rate": 0.00036560116618075797,
"loss": 3.2318,
"step": 67100
},
{
"epoch": 19.565850815850816,
"grad_norm": 0.3898719847202301,
"learning_rate": 0.00036542623906705534,
"loss": 3.2325,
"step": 67150
},
{
"epoch": 19.58041958041958,
"grad_norm": 0.3797769546508789,
"learning_rate": 0.0003652513119533527,
"loss": 3.2352,
"step": 67200
},
{
"epoch": 19.594988344988344,
"grad_norm": 0.4092084765434265,
"learning_rate": 0.0003650763848396501,
"loss": 3.2359,
"step": 67250
},
{
"epoch": 19.60955710955711,
"grad_norm": 0.3929194509983063,
"learning_rate": 0.0003649014577259475,
"loss": 3.2403,
"step": 67300
},
{
"epoch": 19.624125874125873,
"grad_norm": 0.3677727282047272,
"learning_rate": 0.0003647265306122449,
"loss": 3.2368,
"step": 67350
},
{
"epoch": 19.638694638694638,
"grad_norm": 0.40046238899230957,
"learning_rate": 0.0003645516034985423,
"loss": 3.2353,
"step": 67400
},
{
"epoch": 19.653263403263402,
"grad_norm": 0.41111862659454346,
"learning_rate": 0.00036437667638483965,
"loss": 3.2407,
"step": 67450
},
{
"epoch": 19.667832167832167,
"grad_norm": 0.39563626050949097,
"learning_rate": 0.000364201749271137,
"loss": 3.2616,
"step": 67500
},
{
"epoch": 19.68240093240093,
"grad_norm": 0.3748413324356079,
"learning_rate": 0.00036402682215743435,
"loss": 3.2507,
"step": 67550
},
{
"epoch": 19.696969696969695,
"grad_norm": 0.40953975915908813,
"learning_rate": 0.0003638518950437317,
"loss": 3.2504,
"step": 67600
},
{
"epoch": 19.71153846153846,
"grad_norm": 0.40304210782051086,
"learning_rate": 0.0003636769679300291,
"loss": 3.2546,
"step": 67650
},
{
"epoch": 19.726107226107224,
"grad_norm": 0.3886299133300781,
"learning_rate": 0.0003635020408163265,
"loss": 3.2447,
"step": 67700
},
{
"epoch": 19.740675990675992,
"grad_norm": 0.38597121834754944,
"learning_rate": 0.0003633271137026239,
"loss": 3.2565,
"step": 67750
},
{
"epoch": 19.755244755244757,
"grad_norm": 0.36891189217567444,
"learning_rate": 0.0003631521865889213,
"loss": 3.2651,
"step": 67800
},
{
"epoch": 19.76981351981352,
"grad_norm": 0.39409512281417847,
"learning_rate": 0.00036297725947521866,
"loss": 3.253,
"step": 67850
},
{
"epoch": 19.784382284382286,
"grad_norm": 0.4034635126590729,
"learning_rate": 0.00036280233236151604,
"loss": 3.2613,
"step": 67900
},
{
"epoch": 19.79895104895105,
"grad_norm": 0.3696857690811157,
"learning_rate": 0.00036262740524781336,
"loss": 3.2566,
"step": 67950
},
{
"epoch": 19.813519813519815,
"grad_norm": 0.4099896550178528,
"learning_rate": 0.00036245247813411073,
"loss": 3.2545,
"step": 68000
},
{
"epoch": 19.813519813519815,
"eval_accuracy": 0.3735982875007261,
"eval_loss": 3.5354998111724854,
"eval_runtime": 180.0799,
"eval_samples_per_second": 92.415,
"eval_steps_per_second": 5.781,
"step": 68000
},
{
"epoch": 19.82808857808858,
"grad_norm": 0.40911927819252014,
"learning_rate": 0.0003622775510204081,
"loss": 3.2756,
"step": 68050
},
{
"epoch": 19.842657342657343,
"grad_norm": 0.36278921365737915,
"learning_rate": 0.0003621026239067055,
"loss": 3.2448,
"step": 68100
},
{
"epoch": 19.857226107226108,
"grad_norm": 0.4001983404159546,
"learning_rate": 0.00036192769679300286,
"loss": 3.2533,
"step": 68150
},
{
"epoch": 19.871794871794872,
"grad_norm": 0.40843650698661804,
"learning_rate": 0.0003617527696793003,
"loss": 3.2475,
"step": 68200
},
{
"epoch": 19.886363636363637,
"grad_norm": 0.38101208209991455,
"learning_rate": 0.00036157784256559767,
"loss": 3.2538,
"step": 68250
},
{
"epoch": 19.9009324009324,
"grad_norm": 0.4274073541164398,
"learning_rate": 0.00036140291545189504,
"loss": 3.2605,
"step": 68300
},
{
"epoch": 19.915501165501166,
"grad_norm": 0.3906668424606323,
"learning_rate": 0.0003612279883381924,
"loss": 3.2643,
"step": 68350
},
{
"epoch": 19.93006993006993,
"grad_norm": 0.35871779918670654,
"learning_rate": 0.00036105306122448974,
"loss": 3.2659,
"step": 68400
},
{
"epoch": 19.944638694638694,
"grad_norm": 0.3644237518310547,
"learning_rate": 0.0003608781341107871,
"loss": 3.2685,
"step": 68450
},
{
"epoch": 19.95920745920746,
"grad_norm": 0.45362988114356995,
"learning_rate": 0.0003607032069970845,
"loss": 3.2506,
"step": 68500
},
{
"epoch": 19.973776223776223,
"grad_norm": 0.38506874442100525,
"learning_rate": 0.00036052827988338187,
"loss": 3.2658,
"step": 68550
},
{
"epoch": 19.988344988344988,
"grad_norm": 0.445112407207489,
"learning_rate": 0.0003603533527696793,
"loss": 3.2594,
"step": 68600
},
{
"epoch": 20.002913752913752,
"grad_norm": 0.3915383219718933,
"learning_rate": 0.00036017842565597667,
"loss": 3.2511,
"step": 68650
},
{
"epoch": 20.017482517482517,
"grad_norm": 0.39854714274406433,
"learning_rate": 0.00036000349854227405,
"loss": 3.156,
"step": 68700
},
{
"epoch": 20.03205128205128,
"grad_norm": 0.4197991192340851,
"learning_rate": 0.0003598285714285714,
"loss": 3.1582,
"step": 68750
},
{
"epoch": 20.046620046620045,
"grad_norm": 0.415543794631958,
"learning_rate": 0.0003596536443148688,
"loss": 3.1528,
"step": 68800
},
{
"epoch": 20.06118881118881,
"grad_norm": 0.3774547874927521,
"learning_rate": 0.0003594787172011661,
"loss": 3.1522,
"step": 68850
},
{
"epoch": 20.075757575757574,
"grad_norm": 0.37995073199272156,
"learning_rate": 0.0003593037900874635,
"loss": 3.1586,
"step": 68900
},
{
"epoch": 20.09032634032634,
"grad_norm": 0.3786678612232208,
"learning_rate": 0.00035912886297376087,
"loss": 3.1593,
"step": 68950
},
{
"epoch": 20.104895104895103,
"grad_norm": 0.4058922231197357,
"learning_rate": 0.00035895393586005825,
"loss": 3.1751,
"step": 69000
},
{
"epoch": 20.104895104895103,
"eval_accuracy": 0.373238106683606,
"eval_loss": 3.54990553855896,
"eval_runtime": 180.2945,
"eval_samples_per_second": 92.305,
"eval_steps_per_second": 5.774,
"step": 69000
},
{
"epoch": 20.11946386946387,
"grad_norm": 0.4163013994693756,
"learning_rate": 0.0003587790087463557,
"loss": 3.1827,
"step": 69050
},
{
"epoch": 20.134032634032636,
"grad_norm": 0.3935016691684723,
"learning_rate": 0.00035860408163265305,
"loss": 3.1895,
"step": 69100
},
{
"epoch": 20.1486013986014,
"grad_norm": 0.3995712399482727,
"learning_rate": 0.00035842915451895043,
"loss": 3.193,
"step": 69150
},
{
"epoch": 20.163170163170165,
"grad_norm": 0.38892659544944763,
"learning_rate": 0.0003582542274052478,
"loss": 3.1873,
"step": 69200
},
{
"epoch": 20.17773892773893,
"grad_norm": 0.4174294173717499,
"learning_rate": 0.0003580793002915452,
"loss": 3.1912,
"step": 69250
},
{
"epoch": 20.192307692307693,
"grad_norm": 0.4112229645252228,
"learning_rate": 0.0003579043731778425,
"loss": 3.1825,
"step": 69300
},
{
"epoch": 20.206876456876458,
"grad_norm": 0.405185729265213,
"learning_rate": 0.0003577294460641399,
"loss": 3.1985,
"step": 69350
},
{
"epoch": 20.221445221445222,
"grad_norm": 0.3787309527397156,
"learning_rate": 0.00035755451895043725,
"loss": 3.1956,
"step": 69400
},
{
"epoch": 20.236013986013987,
"grad_norm": 0.38647204637527466,
"learning_rate": 0.00035737959183673463,
"loss": 3.1993,
"step": 69450
},
{
"epoch": 20.25058275058275,
"grad_norm": 0.39363858103752136,
"learning_rate": 0.00035720466472303206,
"loss": 3.1986,
"step": 69500
},
{
"epoch": 20.265151515151516,
"grad_norm": 0.38450032472610474,
"learning_rate": 0.00035702973760932944,
"loss": 3.2011,
"step": 69550
},
{
"epoch": 20.27972027972028,
"grad_norm": 0.4013044834136963,
"learning_rate": 0.0003568548104956268,
"loss": 3.2051,
"step": 69600
},
{
"epoch": 20.294289044289044,
"grad_norm": 0.385503888130188,
"learning_rate": 0.0003566798833819242,
"loss": 3.2117,
"step": 69650
},
{
"epoch": 20.30885780885781,
"grad_norm": 0.39147505164146423,
"learning_rate": 0.00035650495626822156,
"loss": 3.2079,
"step": 69700
},
{
"epoch": 20.323426573426573,
"grad_norm": 0.4091346561908722,
"learning_rate": 0.0003563300291545189,
"loss": 3.2091,
"step": 69750
},
{
"epoch": 20.337995337995338,
"grad_norm": 0.38685789704322815,
"learning_rate": 0.00035615510204081626,
"loss": 3.2058,
"step": 69800
},
{
"epoch": 20.352564102564102,
"grad_norm": 0.3918103873729706,
"learning_rate": 0.00035598017492711364,
"loss": 3.201,
"step": 69850
},
{
"epoch": 20.367132867132867,
"grad_norm": 0.37603530287742615,
"learning_rate": 0.00035580524781341107,
"loss": 3.2128,
"step": 69900
},
{
"epoch": 20.38170163170163,
"grad_norm": 0.3842366933822632,
"learning_rate": 0.00035563032069970844,
"loss": 3.212,
"step": 69950
},
{
"epoch": 20.396270396270396,
"grad_norm": 0.3946133852005005,
"learning_rate": 0.0003554553935860058,
"loss": 3.2055,
"step": 70000
},
{
"epoch": 20.396270396270396,
"eval_accuracy": 0.3729368388894625,
"eval_loss": 3.549053192138672,
"eval_runtime": 180.0767,
"eval_samples_per_second": 92.416,
"eval_steps_per_second": 5.781,
"step": 70000
},
{
"epoch": 20.41083916083916,
"grad_norm": 0.4046531915664673,
"learning_rate": 0.0003552804664723032,
"loss": 3.2123,
"step": 70050
},
{
"epoch": 20.425407925407924,
"grad_norm": 0.39396217465400696,
"learning_rate": 0.00035510553935860057,
"loss": 3.2179,
"step": 70100
},
{
"epoch": 20.43997668997669,
"grad_norm": 0.4202251434326172,
"learning_rate": 0.00035493061224489795,
"loss": 3.2314,
"step": 70150
},
{
"epoch": 20.454545454545453,
"grad_norm": 0.4146822392940521,
"learning_rate": 0.00035475568513119527,
"loss": 3.2219,
"step": 70200
},
{
"epoch": 20.469114219114218,
"grad_norm": 0.44110000133514404,
"learning_rate": 0.00035458075801749264,
"loss": 3.2281,
"step": 70250
},
{
"epoch": 20.483682983682982,
"grad_norm": 0.43832480907440186,
"learning_rate": 0.00035440583090379,
"loss": 3.2258,
"step": 70300
},
{
"epoch": 20.498251748251747,
"grad_norm": 0.4042571783065796,
"learning_rate": 0.00035423090379008745,
"loss": 3.2261,
"step": 70350
},
{
"epoch": 20.51282051282051,
"grad_norm": 0.4489721953868866,
"learning_rate": 0.0003540559766763848,
"loss": 3.2331,
"step": 70400
},
{
"epoch": 20.52738927738928,
"grad_norm": 0.4016249179840088,
"learning_rate": 0.0003538810495626822,
"loss": 3.2284,
"step": 70450
},
{
"epoch": 20.541958041958043,
"grad_norm": 0.5365473628044128,
"learning_rate": 0.0003537061224489796,
"loss": 3.2255,
"step": 70500
},
{
"epoch": 20.556526806526808,
"grad_norm": 0.4111057221889496,
"learning_rate": 0.00035353119533527695,
"loss": 3.2203,
"step": 70550
},
{
"epoch": 20.571095571095572,
"grad_norm": 0.4217143654823303,
"learning_rate": 0.00035335626822157433,
"loss": 3.2344,
"step": 70600
},
{
"epoch": 20.585664335664337,
"grad_norm": 0.41283461451530457,
"learning_rate": 0.00035318134110787165,
"loss": 3.2391,
"step": 70650
},
{
"epoch": 20.6002331002331,
"grad_norm": 0.4114687442779541,
"learning_rate": 0.000353006413994169,
"loss": 3.2305,
"step": 70700
},
{
"epoch": 20.614801864801866,
"grad_norm": 0.4181511402130127,
"learning_rate": 0.00035283148688046646,
"loss": 3.2326,
"step": 70750
},
{
"epoch": 20.62937062937063,
"grad_norm": 0.4368336796760559,
"learning_rate": 0.00035265655976676383,
"loss": 3.2398,
"step": 70800
},
{
"epoch": 20.643939393939394,
"grad_norm": 0.38254714012145996,
"learning_rate": 0.0003524816326530612,
"loss": 3.2424,
"step": 70850
},
{
"epoch": 20.65850815850816,
"grad_norm": 0.3811262249946594,
"learning_rate": 0.0003523067055393586,
"loss": 3.2214,
"step": 70900
},
{
"epoch": 20.673076923076923,
"grad_norm": 0.38296961784362793,
"learning_rate": 0.00035213177842565596,
"loss": 3.2346,
"step": 70950
},
{
"epoch": 20.687645687645688,
"grad_norm": 0.42211851477622986,
"learning_rate": 0.00035195685131195333,
"loss": 3.2364,
"step": 71000
},
{
"epoch": 20.687645687645688,
"eval_accuracy": 0.37363732766764873,
"eval_loss": 3.5381784439086914,
"eval_runtime": 180.1083,
"eval_samples_per_second": 92.4,
"eval_steps_per_second": 5.78,
"step": 71000
},
{
"epoch": 20.702214452214452,
"grad_norm": 0.39245641231536865,
"learning_rate": 0.0003517819241982507,
"loss": 3.2294,
"step": 71050
},
{
"epoch": 20.716783216783217,
"grad_norm": 0.4119685888290405,
"learning_rate": 0.00035160699708454803,
"loss": 3.2396,
"step": 71100
},
{
"epoch": 20.73135198135198,
"grad_norm": 0.41568517684936523,
"learning_rate": 0.0003514320699708454,
"loss": 3.2331,
"step": 71150
},
{
"epoch": 20.745920745920746,
"grad_norm": 0.4322853684425354,
"learning_rate": 0.00035125714285714284,
"loss": 3.2411,
"step": 71200
},
{
"epoch": 20.76048951048951,
"grad_norm": 0.41902053356170654,
"learning_rate": 0.0003510822157434402,
"loss": 3.2489,
"step": 71250
},
{
"epoch": 20.775058275058274,
"grad_norm": 0.4021133482456207,
"learning_rate": 0.0003509072886297376,
"loss": 3.2422,
"step": 71300
},
{
"epoch": 20.78962703962704,
"grad_norm": 0.3975924849510193,
"learning_rate": 0.00035073236151603497,
"loss": 3.2368,
"step": 71350
},
{
"epoch": 20.804195804195803,
"grad_norm": 0.38075828552246094,
"learning_rate": 0.00035055743440233234,
"loss": 3.2414,
"step": 71400
},
{
"epoch": 20.818764568764568,
"grad_norm": 0.4107215702533722,
"learning_rate": 0.0003503825072886297,
"loss": 3.2484,
"step": 71450
},
{
"epoch": 20.833333333333332,
"grad_norm": 0.39930886030197144,
"learning_rate": 0.0003502075801749271,
"loss": 3.2399,
"step": 71500
},
{
"epoch": 20.847902097902097,
"grad_norm": 0.43924176692962646,
"learning_rate": 0.0003500326530612244,
"loss": 3.238,
"step": 71550
},
{
"epoch": 20.86247086247086,
"grad_norm": 0.4071701467037201,
"learning_rate": 0.0003498577259475218,
"loss": 3.2547,
"step": 71600
},
{
"epoch": 20.877039627039625,
"grad_norm": 0.3740857243537903,
"learning_rate": 0.0003496827988338192,
"loss": 3.2425,
"step": 71650
},
{
"epoch": 20.89160839160839,
"grad_norm": 0.4022287428379059,
"learning_rate": 0.0003495078717201166,
"loss": 3.2539,
"step": 71700
},
{
"epoch": 20.906177156177158,
"grad_norm": 0.43108558654785156,
"learning_rate": 0.00034933294460641397,
"loss": 3.2453,
"step": 71750
},
{
"epoch": 20.920745920745922,
"grad_norm": 0.38944077491760254,
"learning_rate": 0.00034915801749271135,
"loss": 3.2474,
"step": 71800
},
{
"epoch": 20.935314685314687,
"grad_norm": 0.3937079906463623,
"learning_rate": 0.0003489830903790087,
"loss": 3.255,
"step": 71850
},
{
"epoch": 20.94988344988345,
"grad_norm": 0.45583540201187134,
"learning_rate": 0.0003488081632653061,
"loss": 3.2412,
"step": 71900
},
{
"epoch": 20.964452214452216,
"grad_norm": 0.40046775341033936,
"learning_rate": 0.0003486332361516035,
"loss": 3.2441,
"step": 71950
},
{
"epoch": 20.97902097902098,
"grad_norm": 0.4097670018672943,
"learning_rate": 0.0003484583090379008,
"loss": 3.2603,
"step": 72000
},
{
"epoch": 20.97902097902098,
"eval_accuracy": 0.374076647136392,
"eval_loss": 3.532677173614502,
"eval_runtime": 180.1507,
"eval_samples_per_second": 92.378,
"eval_steps_per_second": 5.778,
"step": 72000
},
{
"epoch": 20.993589743589745,
"grad_norm": 0.4118557274341583,
"learning_rate": 0.0003482833819241982,
"loss": 3.2535,
"step": 72050
},
{
"epoch": 21.00815850815851,
"grad_norm": 0.4162599444389343,
"learning_rate": 0.0003481084548104956,
"loss": 3.1958,
"step": 72100
},
{
"epoch": 21.022727272727273,
"grad_norm": 0.40431153774261475,
"learning_rate": 0.000347933527696793,
"loss": 3.1384,
"step": 72150
},
{
"epoch": 21.037296037296038,
"grad_norm": 0.40858593583106995,
"learning_rate": 0.00034775860058309035,
"loss": 3.1558,
"step": 72200
},
{
"epoch": 21.051864801864802,
"grad_norm": 0.4380705654621124,
"learning_rate": 0.00034758367346938773,
"loss": 3.1546,
"step": 72250
},
{
"epoch": 21.066433566433567,
"grad_norm": 0.39787471294403076,
"learning_rate": 0.0003474087463556851,
"loss": 3.1561,
"step": 72300
},
{
"epoch": 21.08100233100233,
"grad_norm": 0.38657239079475403,
"learning_rate": 0.0003472338192419825,
"loss": 3.1574,
"step": 72350
},
{
"epoch": 21.095571095571096,
"grad_norm": 0.38952600955963135,
"learning_rate": 0.0003470588921282799,
"loss": 3.1672,
"step": 72400
},
{
"epoch": 21.11013986013986,
"grad_norm": 0.41245877742767334,
"learning_rate": 0.0003468839650145772,
"loss": 3.175,
"step": 72450
},
{
"epoch": 21.124708624708624,
"grad_norm": 0.40463095903396606,
"learning_rate": 0.0003467090379008746,
"loss": 3.1695,
"step": 72500
},
{
"epoch": 21.13927738927739,
"grad_norm": 0.41737812757492065,
"learning_rate": 0.000346534110787172,
"loss": 3.1721,
"step": 72550
},
{
"epoch": 21.153846153846153,
"grad_norm": 0.5267341136932373,
"learning_rate": 0.00034635918367346936,
"loss": 3.1717,
"step": 72600
},
{
"epoch": 21.168414918414918,
"grad_norm": 0.37345781922340393,
"learning_rate": 0.00034618425655976674,
"loss": 3.1725,
"step": 72650
},
{
"epoch": 21.182983682983682,
"grad_norm": 0.4185006618499756,
"learning_rate": 0.0003460093294460641,
"loss": 3.1749,
"step": 72700
},
{
"epoch": 21.197552447552447,
"grad_norm": 0.3790314495563507,
"learning_rate": 0.0003458344023323615,
"loss": 3.1786,
"step": 72750
},
{
"epoch": 21.21212121212121,
"grad_norm": 0.39847561717033386,
"learning_rate": 0.00034565947521865886,
"loss": 3.1873,
"step": 72800
},
{
"epoch": 21.226689976689975,
"grad_norm": 0.4208274781703949,
"learning_rate": 0.0003454845481049562,
"loss": 3.1832,
"step": 72850
},
{
"epoch": 21.24125874125874,
"grad_norm": 0.411825567483902,
"learning_rate": 0.0003453096209912536,
"loss": 3.1955,
"step": 72900
},
{
"epoch": 21.255827505827504,
"grad_norm": 0.42742517590522766,
"learning_rate": 0.000345134693877551,
"loss": 3.1811,
"step": 72950
},
{
"epoch": 21.27039627039627,
"grad_norm": 0.40830036997795105,
"learning_rate": 0.00034495976676384837,
"loss": 3.1853,
"step": 73000
},
{
"epoch": 21.27039627039627,
"eval_accuracy": 0.37328361434806095,
"eval_loss": 3.5470807552337646,
"eval_runtime": 180.3065,
"eval_samples_per_second": 92.298,
"eval_steps_per_second": 5.774,
"step": 73000
},
{
"epoch": 21.284965034965033,
"grad_norm": 0.39452317357063293,
"learning_rate": 0.00034478483965014574,
"loss": 3.1955,
"step": 73050
},
{
"epoch": 21.2995337995338,
"grad_norm": 0.39910146594047546,
"learning_rate": 0.0003446099125364431,
"loss": 3.1821,
"step": 73100
},
{
"epoch": 21.314102564102566,
"grad_norm": 0.3868776559829712,
"learning_rate": 0.0003444349854227405,
"loss": 3.2021,
"step": 73150
},
{
"epoch": 21.32867132867133,
"grad_norm": 0.3973531126976013,
"learning_rate": 0.00034426005830903787,
"loss": 3.1984,
"step": 73200
},
{
"epoch": 21.343240093240095,
"grad_norm": 0.40514352917671204,
"learning_rate": 0.0003440851311953353,
"loss": 3.2046,
"step": 73250
},
{
"epoch": 21.35780885780886,
"grad_norm": 0.41221529245376587,
"learning_rate": 0.00034391020408163257,
"loss": 3.2093,
"step": 73300
},
{
"epoch": 21.372377622377623,
"grad_norm": 0.4092659056186676,
"learning_rate": 0.00034373527696793,
"loss": 3.2081,
"step": 73350
},
{
"epoch": 21.386946386946388,
"grad_norm": 0.38245689868927,
"learning_rate": 0.00034356034985422737,
"loss": 3.1956,
"step": 73400
},
{
"epoch": 21.401515151515152,
"grad_norm": 0.4299090802669525,
"learning_rate": 0.00034338542274052475,
"loss": 3.2053,
"step": 73450
},
{
"epoch": 21.416083916083917,
"grad_norm": 0.37501996755599976,
"learning_rate": 0.0003432104956268221,
"loss": 3.2119,
"step": 73500
},
{
"epoch": 21.43065268065268,
"grad_norm": 0.4129965305328369,
"learning_rate": 0.0003430355685131195,
"loss": 3.2079,
"step": 73550
},
{
"epoch": 21.445221445221446,
"grad_norm": 0.4159983694553375,
"learning_rate": 0.0003428606413994169,
"loss": 3.2124,
"step": 73600
},
{
"epoch": 21.45979020979021,
"grad_norm": 0.4268476665019989,
"learning_rate": 0.00034268571428571425,
"loss": 3.2133,
"step": 73650
},
{
"epoch": 21.474358974358974,
"grad_norm": 0.40658941864967346,
"learning_rate": 0.0003425107871720117,
"loss": 3.2121,
"step": 73700
},
{
"epoch": 21.48892773892774,
"grad_norm": 0.4047592580318451,
"learning_rate": 0.00034233586005830895,
"loss": 3.204,
"step": 73750
},
{
"epoch": 21.503496503496503,
"grad_norm": 0.39788851141929626,
"learning_rate": 0.0003421609329446064,
"loss": 3.212,
"step": 73800
},
{
"epoch": 21.518065268065268,
"grad_norm": 0.4571487307548523,
"learning_rate": 0.00034198600583090375,
"loss": 3.2139,
"step": 73850
},
{
"epoch": 21.532634032634032,
"grad_norm": 0.4046323597431183,
"learning_rate": 0.00034181107871720113,
"loss": 3.2159,
"step": 73900
},
{
"epoch": 21.547202797202797,
"grad_norm": 0.4000145196914673,
"learning_rate": 0.0003416361516034985,
"loss": 3.2115,
"step": 73950
},
{
"epoch": 21.56177156177156,
"grad_norm": 0.39644888043403625,
"learning_rate": 0.0003414612244897959,
"loss": 3.2229,
"step": 74000
},
{
"epoch": 21.56177156177156,
"eval_accuracy": 0.3736430896199957,
"eval_loss": 3.542844772338867,
"eval_runtime": 180.201,
"eval_samples_per_second": 92.352,
"eval_steps_per_second": 5.777,
"step": 74000
},
{
"epoch": 21.576340326340326,
"grad_norm": 0.38864246010780334,
"learning_rate": 0.00034128629737609326,
"loss": 3.2182,
"step": 74050
},
{
"epoch": 21.59090909090909,
"grad_norm": 0.39571088552474976,
"learning_rate": 0.00034111137026239063,
"loss": 3.2163,
"step": 74100
},
{
"epoch": 21.605477855477854,
"grad_norm": 0.40646892786026,
"learning_rate": 0.00034093644314868806,
"loss": 3.2255,
"step": 74150
},
{
"epoch": 21.62004662004662,
"grad_norm": 0.41352179646492004,
"learning_rate": 0.0003407615160349854,
"loss": 3.2206,
"step": 74200
},
{
"epoch": 21.634615384615383,
"grad_norm": 0.4505552053451538,
"learning_rate": 0.00034058658892128276,
"loss": 3.2268,
"step": 74250
},
{
"epoch": 21.649184149184148,
"grad_norm": 0.41021692752838135,
"learning_rate": 0.00034041166180758014,
"loss": 3.2119,
"step": 74300
},
{
"epoch": 21.663752913752912,
"grad_norm": 0.42527827620506287,
"learning_rate": 0.0003402367346938775,
"loss": 3.2275,
"step": 74350
},
{
"epoch": 21.67832167832168,
"grad_norm": 0.3895317018032074,
"learning_rate": 0.0003400618075801749,
"loss": 3.2306,
"step": 74400
},
{
"epoch": 21.692890442890445,
"grad_norm": 0.3877924978733063,
"learning_rate": 0.00033988688046647226,
"loss": 3.2244,
"step": 74450
},
{
"epoch": 21.70745920745921,
"grad_norm": 0.3887571394443512,
"learning_rate": 0.00033971195335276964,
"loss": 3.2274,
"step": 74500
},
{
"epoch": 21.722027972027973,
"grad_norm": 0.401986300945282,
"learning_rate": 0.00033953702623906707,
"loss": 3.2308,
"step": 74550
},
{
"epoch": 21.736596736596738,
"grad_norm": 0.421645849943161,
"learning_rate": 0.00033936209912536445,
"loss": 3.2348,
"step": 74600
},
{
"epoch": 21.751165501165502,
"grad_norm": 0.40199247002601624,
"learning_rate": 0.00033918717201166177,
"loss": 3.2305,
"step": 74650
},
{
"epoch": 21.765734265734267,
"grad_norm": 0.456487238407135,
"learning_rate": 0.00033901224489795914,
"loss": 3.2305,
"step": 74700
},
{
"epoch": 21.78030303030303,
"grad_norm": 0.4060775935649872,
"learning_rate": 0.0003388373177842565,
"loss": 3.2375,
"step": 74750
},
{
"epoch": 21.794871794871796,
"grad_norm": 0.40119364857673645,
"learning_rate": 0.0003386623906705539,
"loss": 3.2358,
"step": 74800
},
{
"epoch": 21.80944055944056,
"grad_norm": 0.42308083176612854,
"learning_rate": 0.00033848746355685127,
"loss": 3.2385,
"step": 74850
},
{
"epoch": 21.824009324009324,
"grad_norm": 0.42308709025382996,
"learning_rate": 0.00033831253644314865,
"loss": 3.2376,
"step": 74900
},
{
"epoch": 21.83857808857809,
"grad_norm": 0.42288511991500854,
"learning_rate": 0.000338137609329446,
"loss": 3.2356,
"step": 74950
},
{
"epoch": 21.853146853146853,
"grad_norm": 0.4140841066837311,
"learning_rate": 0.00033796268221574345,
"loss": 3.2503,
"step": 75000
},
{
"epoch": 21.853146853146853,
"eval_accuracy": 0.3742927791448369,
"eval_loss": 3.533069610595703,
"eval_runtime": 180.2744,
"eval_samples_per_second": 92.315,
"eval_steps_per_second": 5.775,
"step": 75000
},
{
"epoch": 21.867715617715618,
"grad_norm": 0.4126766622066498,
"learning_rate": 0.00033778775510204083,
"loss": 3.2405,
"step": 75050
},
{
"epoch": 21.882284382284382,
"grad_norm": 0.3706146776676178,
"learning_rate": 0.00033761282798833815,
"loss": 3.2475,
"step": 75100
},
{
"epoch": 21.896853146853147,
"grad_norm": 0.40189969539642334,
"learning_rate": 0.0003374379008746355,
"loss": 3.2266,
"step": 75150
},
{
"epoch": 21.91142191142191,
"grad_norm": 0.39839887619018555,
"learning_rate": 0.0003372629737609329,
"loss": 3.24,
"step": 75200
},
{
"epoch": 21.925990675990676,
"grad_norm": 0.4024837911128998,
"learning_rate": 0.0003370880466472303,
"loss": 3.2304,
"step": 75250
},
{
"epoch": 21.94055944055944,
"grad_norm": 0.4002145230770111,
"learning_rate": 0.00033691311953352765,
"loss": 3.2409,
"step": 75300
},
{
"epoch": 21.955128205128204,
"grad_norm": 0.4018450975418091,
"learning_rate": 0.00033673819241982503,
"loss": 3.2406,
"step": 75350
},
{
"epoch": 21.96969696969697,
"grad_norm": 0.39322006702423096,
"learning_rate": 0.00033656326530612246,
"loss": 3.2375,
"step": 75400
},
{
"epoch": 21.984265734265733,
"grad_norm": 0.40275838971138,
"learning_rate": 0.00033638833819241983,
"loss": 3.236,
"step": 75450
},
{
"epoch": 21.998834498834498,
"grad_norm": 0.4216180741786957,
"learning_rate": 0.0003362134110787172,
"loss": 3.2382,
"step": 75500
},
{
"epoch": 22.013403263403262,
"grad_norm": 0.41659364104270935,
"learning_rate": 0.00033603848396501453,
"loss": 3.1556,
"step": 75550
},
{
"epoch": 22.027972027972027,
"grad_norm": 0.4074558615684509,
"learning_rate": 0.0003358635568513119,
"loss": 3.1399,
"step": 75600
},
{
"epoch": 22.04254079254079,
"grad_norm": 0.4153924584388733,
"learning_rate": 0.0003356886297376093,
"loss": 3.1437,
"step": 75650
},
{
"epoch": 22.057109557109555,
"grad_norm": 0.4084555506706238,
"learning_rate": 0.00033551370262390666,
"loss": 3.1462,
"step": 75700
},
{
"epoch": 22.071678321678323,
"grad_norm": 0.4191485047340393,
"learning_rate": 0.00033533877551020403,
"loss": 3.1581,
"step": 75750
},
{
"epoch": 22.086247086247088,
"grad_norm": 0.4273689091205597,
"learning_rate": 0.0003351638483965014,
"loss": 3.1518,
"step": 75800
},
{
"epoch": 22.100815850815852,
"grad_norm": 0.413409948348999,
"learning_rate": 0.00033498892128279884,
"loss": 3.1435,
"step": 75850
},
{
"epoch": 22.115384615384617,
"grad_norm": 0.39928290247917175,
"learning_rate": 0.0003348139941690962,
"loss": 3.1549,
"step": 75900
},
{
"epoch": 22.12995337995338,
"grad_norm": 0.41116443276405334,
"learning_rate": 0.0003346390670553936,
"loss": 3.1695,
"step": 75950
},
{
"epoch": 22.144522144522146,
"grad_norm": 0.4264084994792938,
"learning_rate": 0.0003344641399416909,
"loss": 3.1733,
"step": 76000
},
{
"epoch": 22.144522144522146,
"eval_accuracy": 0.3729999851835511,
"eval_loss": 3.551360607147217,
"eval_runtime": 181.2907,
"eval_samples_per_second": 91.797,
"eval_steps_per_second": 5.742,
"step": 76000
},
{
"epoch": 22.15909090909091,
"grad_norm": 0.4089738428592682,
"learning_rate": 0.0003342892128279883,
"loss": 3.1636,
"step": 76050
},
{
"epoch": 22.173659673659674,
"grad_norm": 0.4810738265514374,
"learning_rate": 0.00033411428571428567,
"loss": 3.1689,
"step": 76100
},
{
"epoch": 22.18822843822844,
"grad_norm": 0.43918928503990173,
"learning_rate": 0.00033393935860058304,
"loss": 3.1725,
"step": 76150
},
{
"epoch": 22.202797202797203,
"grad_norm": 0.4179990291595459,
"learning_rate": 0.0003337644314868804,
"loss": 3.1798,
"step": 76200
},
{
"epoch": 22.217365967365968,
"grad_norm": 0.4197443127632141,
"learning_rate": 0.0003335895043731778,
"loss": 3.1672,
"step": 76250
},
{
"epoch": 22.231934731934732,
"grad_norm": 0.3869195282459259,
"learning_rate": 0.0003334145772594752,
"loss": 3.1915,
"step": 76300
},
{
"epoch": 22.246503496503497,
"grad_norm": 0.3932842016220093,
"learning_rate": 0.0003332396501457726,
"loss": 3.169,
"step": 76350
},
{
"epoch": 22.26107226107226,
"grad_norm": 0.4352983832359314,
"learning_rate": 0.00033306472303207,
"loss": 3.1863,
"step": 76400
},
{
"epoch": 22.275641025641026,
"grad_norm": 0.3932501971721649,
"learning_rate": 0.0003328897959183673,
"loss": 3.1677,
"step": 76450
},
{
"epoch": 22.29020979020979,
"grad_norm": 0.4138025939464569,
"learning_rate": 0.00033271486880466467,
"loss": 3.1914,
"step": 76500
},
{
"epoch": 22.304778554778554,
"grad_norm": 0.4232082664966583,
"learning_rate": 0.00033253994169096205,
"loss": 3.1857,
"step": 76550
},
{
"epoch": 22.31934731934732,
"grad_norm": 0.39019161462783813,
"learning_rate": 0.0003323650145772594,
"loss": 3.1815,
"step": 76600
},
{
"epoch": 22.333916083916083,
"grad_norm": 0.3833393454551697,
"learning_rate": 0.0003321900874635568,
"loss": 3.1873,
"step": 76650
},
{
"epoch": 22.348484848484848,
"grad_norm": 0.4164026975631714,
"learning_rate": 0.00033201516034985423,
"loss": 3.1863,
"step": 76700
},
{
"epoch": 22.363053613053612,
"grad_norm": 0.41754239797592163,
"learning_rate": 0.0003318402332361516,
"loss": 3.1991,
"step": 76750
},
{
"epoch": 22.377622377622377,
"grad_norm": 0.41156837344169617,
"learning_rate": 0.000331665306122449,
"loss": 3.207,
"step": 76800
},
{
"epoch": 22.39219114219114,
"grad_norm": 0.4196581244468689,
"learning_rate": 0.00033149037900874636,
"loss": 3.1954,
"step": 76850
},
{
"epoch": 22.406759906759905,
"grad_norm": 0.41179725527763367,
"learning_rate": 0.0003313154518950437,
"loss": 3.2009,
"step": 76900
},
{
"epoch": 22.42132867132867,
"grad_norm": 0.41211971640586853,
"learning_rate": 0.00033114052478134105,
"loss": 3.1883,
"step": 76950
},
{
"epoch": 22.435897435897434,
"grad_norm": 0.4655284583568573,
"learning_rate": 0.00033096559766763843,
"loss": 3.1996,
"step": 77000
},
{
"epoch": 22.435897435897434,
"eval_accuracy": 0.3736674309288902,
"eval_loss": 3.5417394638061523,
"eval_runtime": 179.8191,
"eval_samples_per_second": 92.549,
"eval_steps_per_second": 5.789,
"step": 77000
},
{
"epoch": 22.4504662004662,
"grad_norm": 0.38444826006889343,
"learning_rate": 0.0003307906705539358,
"loss": 3.2122,
"step": 77050
},
{
"epoch": 22.465034965034967,
"grad_norm": 0.4191334545612335,
"learning_rate": 0.0003306157434402332,
"loss": 3.1952,
"step": 77100
},
{
"epoch": 22.47960372960373,
"grad_norm": 0.4504798352718353,
"learning_rate": 0.0003304408163265306,
"loss": 3.2073,
"step": 77150
},
{
"epoch": 22.494172494172496,
"grad_norm": 0.4140982925891876,
"learning_rate": 0.000330265889212828,
"loss": 3.1971,
"step": 77200
},
{
"epoch": 22.50874125874126,
"grad_norm": 0.4141624867916107,
"learning_rate": 0.00033009096209912536,
"loss": 3.2065,
"step": 77250
},
{
"epoch": 22.523310023310025,
"grad_norm": 0.39601314067840576,
"learning_rate": 0.00032991603498542274,
"loss": 3.206,
"step": 77300
},
{
"epoch": 22.53787878787879,
"grad_norm": 0.3927428126335144,
"learning_rate": 0.00032974110787172006,
"loss": 3.1968,
"step": 77350
},
{
"epoch": 22.552447552447553,
"grad_norm": 0.42265579104423523,
"learning_rate": 0.00032956618075801744,
"loss": 3.2056,
"step": 77400
},
{
"epoch": 22.567016317016318,
"grad_norm": 0.39944592118263245,
"learning_rate": 0.0003293912536443148,
"loss": 3.213,
"step": 77450
},
{
"epoch": 22.581585081585082,
"grad_norm": 0.4446455240249634,
"learning_rate": 0.0003292163265306122,
"loss": 3.2058,
"step": 77500
},
{
"epoch": 22.596153846153847,
"grad_norm": 0.41780418157577515,
"learning_rate": 0.00032904139941690956,
"loss": 3.2082,
"step": 77550
},
{
"epoch": 22.61072261072261,
"grad_norm": 0.3988536596298218,
"learning_rate": 0.000328866472303207,
"loss": 3.2213,
"step": 77600
},
{
"epoch": 22.625291375291376,
"grad_norm": 0.4125400185585022,
"learning_rate": 0.00032869154518950437,
"loss": 3.2192,
"step": 77650
},
{
"epoch": 22.63986013986014,
"grad_norm": 0.43208998441696167,
"learning_rate": 0.00032851661807580174,
"loss": 3.2108,
"step": 77700
},
{
"epoch": 22.654428904428904,
"grad_norm": 0.4006592631340027,
"learning_rate": 0.0003283416909620991,
"loss": 3.2152,
"step": 77750
},
{
"epoch": 22.66899766899767,
"grad_norm": 0.40716752409935,
"learning_rate": 0.00032816676384839644,
"loss": 3.2174,
"step": 77800
},
{
"epoch": 22.683566433566433,
"grad_norm": 0.4060366451740265,
"learning_rate": 0.0003279918367346938,
"loss": 3.2187,
"step": 77850
},
{
"epoch": 22.698135198135198,
"grad_norm": 0.38326430320739746,
"learning_rate": 0.0003278169096209912,
"loss": 3.2087,
"step": 77900
},
{
"epoch": 22.712703962703962,
"grad_norm": 0.3733854591846466,
"learning_rate": 0.00032764198250728857,
"loss": 3.2059,
"step": 77950
},
{
"epoch": 22.727272727272727,
"grad_norm": 0.4183722138404846,
"learning_rate": 0.000327467055393586,
"loss": 3.2241,
"step": 78000
},
{
"epoch": 22.727272727272727,
"eval_accuracy": 0.3741929445011102,
"eval_loss": 3.53352952003479,
"eval_runtime": 179.8425,
"eval_samples_per_second": 92.537,
"eval_steps_per_second": 5.788,
"step": 78000
},
{
"epoch": 22.74184149184149,
"grad_norm": 0.41641032695770264,
"learning_rate": 0.0003272921282798834,
"loss": 3.2211,
"step": 78050
},
{
"epoch": 22.756410256410255,
"grad_norm": 0.40354758501052856,
"learning_rate": 0.00032711720116618075,
"loss": 3.2277,
"step": 78100
},
{
"epoch": 22.77097902097902,
"grad_norm": 0.41052916646003723,
"learning_rate": 0.0003269422740524781,
"loss": 3.2228,
"step": 78150
},
{
"epoch": 22.785547785547784,
"grad_norm": 0.38687601685523987,
"learning_rate": 0.0003267673469387755,
"loss": 3.2047,
"step": 78200
},
{
"epoch": 22.80011655011655,
"grad_norm": 0.42369911074638367,
"learning_rate": 0.0003265924198250728,
"loss": 3.2269,
"step": 78250
},
{
"epoch": 22.814685314685313,
"grad_norm": 0.43428516387939453,
"learning_rate": 0.0003264174927113702,
"loss": 3.2192,
"step": 78300
},
{
"epoch": 22.829254079254078,
"grad_norm": 0.3913278877735138,
"learning_rate": 0.0003262425655976676,
"loss": 3.2241,
"step": 78350
},
{
"epoch": 22.843822843822842,
"grad_norm": 0.39617830514907837,
"learning_rate": 0.00032606763848396495,
"loss": 3.2347,
"step": 78400
},
{
"epoch": 22.85839160839161,
"grad_norm": 0.4081778824329376,
"learning_rate": 0.0003258927113702624,
"loss": 3.2338,
"step": 78450
},
{
"epoch": 22.872960372960375,
"grad_norm": 0.41960251331329346,
"learning_rate": 0.00032571778425655976,
"loss": 3.2207,
"step": 78500
},
{
"epoch": 22.88752913752914,
"grad_norm": 0.39189615845680237,
"learning_rate": 0.00032554285714285713,
"loss": 3.2217,
"step": 78550
},
{
"epoch": 22.902097902097903,
"grad_norm": 0.41321077942848206,
"learning_rate": 0.0003253679300291545,
"loss": 3.2203,
"step": 78600
},
{
"epoch": 22.916666666666668,
"grad_norm": 0.38586828112602234,
"learning_rate": 0.0003251930029154519,
"loss": 3.2301,
"step": 78650
},
{
"epoch": 22.931235431235432,
"grad_norm": 0.40068861842155457,
"learning_rate": 0.0003250180758017492,
"loss": 3.2131,
"step": 78700
},
{
"epoch": 22.945804195804197,
"grad_norm": 0.40055349469184875,
"learning_rate": 0.0003248431486880466,
"loss": 3.2253,
"step": 78750
},
{
"epoch": 22.96037296037296,
"grad_norm": 0.416927695274353,
"learning_rate": 0.00032466822157434396,
"loss": 3.2276,
"step": 78800
},
{
"epoch": 22.974941724941726,
"grad_norm": 0.39231857657432556,
"learning_rate": 0.0003244932944606414,
"loss": 3.2387,
"step": 78850
},
{
"epoch": 22.98951048951049,
"grad_norm": 0.39868852496147156,
"learning_rate": 0.00032431836734693876,
"loss": 3.2249,
"step": 78900
},
{
"epoch": 23.004079254079254,
"grad_norm": 0.4433835744857788,
"learning_rate": 0.00032414344023323614,
"loss": 3.2051,
"step": 78950
},
{
"epoch": 23.01864801864802,
"grad_norm": 0.43138596415519714,
"learning_rate": 0.0003239685131195335,
"loss": 3.1207,
"step": 79000
},
{
"epoch": 23.01864801864802,
"eval_accuracy": 0.37376632484570316,
"eval_loss": 3.548415184020996,
"eval_runtime": 216.5967,
"eval_samples_per_second": 76.834,
"eval_steps_per_second": 4.806,
"step": 79000
},
{
"epoch": 23.033216783216783,
"grad_norm": 0.4261009097099304,
"learning_rate": 0.0003237935860058309,
"loss": 3.1319,
"step": 79050
},
{
"epoch": 23.047785547785548,
"grad_norm": 0.3840440809726715,
"learning_rate": 0.00032361865889212827,
"loss": 3.1392,
"step": 79100
},
{
"epoch": 23.062354312354312,
"grad_norm": 0.40722066164016724,
"learning_rate": 0.0003234437317784256,
"loss": 3.142,
"step": 79150
},
{
"epoch": 23.076923076923077,
"grad_norm": 0.4106970727443695,
"learning_rate": 0.00032326880466472296,
"loss": 3.1438,
"step": 79200
},
{
"epoch": 23.09149184149184,
"grad_norm": 0.42081838846206665,
"learning_rate": 0.00032309387755102034,
"loss": 3.146,
"step": 79250
},
{
"epoch": 23.106060606060606,
"grad_norm": 0.41058728098869324,
"learning_rate": 0.00032291895043731777,
"loss": 3.1509,
"step": 79300
},
{
"epoch": 23.12062937062937,
"grad_norm": 0.45436742901802063,
"learning_rate": 0.00032274402332361515,
"loss": 3.1516,
"step": 79350
},
{
"epoch": 23.135198135198134,
"grad_norm": 0.4210570752620697,
"learning_rate": 0.0003225690962099125,
"loss": 3.1577,
"step": 79400
},
{
"epoch": 23.1497668997669,
"grad_norm": 0.3977571129798889,
"learning_rate": 0.0003223941690962099,
"loss": 3.1589,
"step": 79450
},
{
"epoch": 23.164335664335663,
"grad_norm": 0.4383814036846161,
"learning_rate": 0.0003222192419825073,
"loss": 3.1622,
"step": 79500
},
{
"epoch": 23.178904428904428,
"grad_norm": 0.40101122856140137,
"learning_rate": 0.00032204431486880465,
"loss": 3.1485,
"step": 79550
},
{
"epoch": 23.193473193473192,
"grad_norm": 0.4000990390777588,
"learning_rate": 0.00032186938775510197,
"loss": 3.1623,
"step": 79600
},
{
"epoch": 23.208041958041957,
"grad_norm": 0.3868107199668884,
"learning_rate": 0.00032169446064139935,
"loss": 3.1657,
"step": 79650
},
{
"epoch": 23.22261072261072,
"grad_norm": 0.43203461170196533,
"learning_rate": 0.0003215195335276967,
"loss": 3.1601,
"step": 79700
},
{
"epoch": 23.237179487179485,
"grad_norm": 0.4248724579811096,
"learning_rate": 0.00032134460641399415,
"loss": 3.1676,
"step": 79750
},
{
"epoch": 23.251748251748253,
"grad_norm": 0.4065852761268616,
"learning_rate": 0.00032116967930029153,
"loss": 3.1693,
"step": 79800
},
{
"epoch": 23.266317016317018,
"grad_norm": 0.43213963508605957,
"learning_rate": 0.0003209947521865889,
"loss": 3.1721,
"step": 79850
},
{
"epoch": 23.280885780885782,
"grad_norm": 0.4326905310153961,
"learning_rate": 0.0003208198250728863,
"loss": 3.1854,
"step": 79900
},
{
"epoch": 23.295454545454547,
"grad_norm": 0.4199202060699463,
"learning_rate": 0.00032064489795918366,
"loss": 3.1652,
"step": 79950
},
{
"epoch": 23.31002331002331,
"grad_norm": 0.40087878704071045,
"learning_rate": 0.00032046997084548103,
"loss": 3.1867,
"step": 80000
},
{
"epoch": 23.31002331002331,
"eval_accuracy": 0.3736945944185261,
"eval_loss": 3.550523519515991,
"eval_runtime": 242.1111,
"eval_samples_per_second": 68.737,
"eval_steps_per_second": 4.3,
"step": 80000
},
{
"epoch": 23.324592074592076,
"grad_norm": 0.39655831456184387,
"learning_rate": 0.00032029504373177835,
"loss": 3.1418,
"step": 80050
},
{
"epoch": 23.33916083916084,
"grad_norm": 0.4348229467868805,
"learning_rate": 0.00032012011661807573,
"loss": 3.1364,
"step": 80100
},
{
"epoch": 23.353729603729604,
"grad_norm": 0.42798683047294617,
"learning_rate": 0.00031994518950437316,
"loss": 3.1377,
"step": 80150
},
{
"epoch": 23.36829836829837,
"grad_norm": 0.4182716906070709,
"learning_rate": 0.00031977026239067053,
"loss": 3.1419,
"step": 80200
},
{
"epoch": 23.382867132867133,
"grad_norm": 0.4539523720741272,
"learning_rate": 0.0003195953352769679,
"loss": 3.157,
"step": 80250
},
{
"epoch": 23.397435897435898,
"grad_norm": 0.42409515380859375,
"learning_rate": 0.0003194204081632653,
"loss": 3.1638,
"step": 80300
},
{
"epoch": 23.412004662004662,
"grad_norm": 0.4075915813446045,
"learning_rate": 0.00031924548104956266,
"loss": 3.1426,
"step": 80350
},
{
"epoch": 23.426573426573427,
"grad_norm": 0.4082591235637665,
"learning_rate": 0.00031907055393586004,
"loss": 3.1518,
"step": 80400
},
{
"epoch": 23.44114219114219,
"grad_norm": 0.40404626727104187,
"learning_rate": 0.0003188956268221574,
"loss": 3.1526,
"step": 80450
},
{
"epoch": 23.455710955710956,
"grad_norm": 0.4418063461780548,
"learning_rate": 0.00031872069970845474,
"loss": 3.1499,
"step": 80500
},
{
"epoch": 23.47027972027972,
"grad_norm": 0.39059650897979736,
"learning_rate": 0.0003185457725947521,
"loss": 3.1609,
"step": 80550
},
{
"epoch": 23.484848484848484,
"grad_norm": 0.4021977186203003,
"learning_rate": 0.00031837084548104954,
"loss": 3.165,
"step": 80600
},
{
"epoch": 23.49941724941725,
"grad_norm": 0.4297217130661011,
"learning_rate": 0.0003181959183673469,
"loss": 3.17,
"step": 80650
},
{
"epoch": 23.513986013986013,
"grad_norm": 0.41383132338523865,
"learning_rate": 0.0003180209912536443,
"loss": 3.1562,
"step": 80700
},
{
"epoch": 23.528554778554778,
"grad_norm": 0.4195021986961365,
"learning_rate": 0.00031784606413994167,
"loss": 3.1654,
"step": 80750
},
{
"epoch": 23.543123543123542,
"grad_norm": 0.40288063883781433,
"learning_rate": 0.00031767113702623904,
"loss": 3.1678,
"step": 80800
},
{
"epoch": 23.557692307692307,
"grad_norm": 0.43353042006492615,
"learning_rate": 0.0003174962099125364,
"loss": 3.1641,
"step": 80850
},
{
"epoch": 23.57226107226107,
"grad_norm": 0.44536879658699036,
"learning_rate": 0.0003173212827988338,
"loss": 3.1672,
"step": 80900
},
{
"epoch": 23.586829836829835,
"grad_norm": 0.4158283472061157,
"learning_rate": 0.0003171463556851311,
"loss": 3.1782,
"step": 80950
},
{
"epoch": 23.6013986013986,
"grad_norm": 0.40795543789863586,
"learning_rate": 0.00031697142857142855,
"loss": 3.1736,
"step": 81000
},
{
"epoch": 23.6013986013986,
"eval_accuracy": 0.37375009730644015,
"eval_loss": 3.5509092807769775,
"eval_runtime": 181.2172,
"eval_samples_per_second": 91.835,
"eval_steps_per_second": 5.744,
"step": 81000
},
{
"epoch": 23.615967365967364,
"grad_norm": 0.44809621572494507,
"learning_rate": 0.0003167965014577259,
"loss": 3.1694,
"step": 81050
},
{
"epoch": 23.63053613053613,
"grad_norm": 0.43004634976387024,
"learning_rate": 0.0003166215743440233,
"loss": 3.1852,
"step": 81100
},
{
"epoch": 23.645104895104897,
"grad_norm": 0.4156273901462555,
"learning_rate": 0.0003164466472303207,
"loss": 3.1741,
"step": 81150
},
{
"epoch": 23.65967365967366,
"grad_norm": 0.40418311953544617,
"learning_rate": 0.00031627172011661805,
"loss": 3.1762,
"step": 81200
},
{
"epoch": 23.674242424242426,
"grad_norm": 0.42274951934814453,
"learning_rate": 0.0003160967930029154,
"loss": 3.1866,
"step": 81250
},
{
"epoch": 23.68881118881119,
"grad_norm": 0.43228334188461304,
"learning_rate": 0.0003159218658892128,
"loss": 3.1837,
"step": 81300
},
{
"epoch": 23.703379953379955,
"grad_norm": 0.40298640727996826,
"learning_rate": 0.00031574693877551023,
"loss": 3.1876,
"step": 81350
},
{
"epoch": 23.71794871794872,
"grad_norm": 0.3895956873893738,
"learning_rate": 0.0003155720116618075,
"loss": 3.1807,
"step": 81400
},
{
"epoch": 23.732517482517483,
"grad_norm": 0.40143319964408875,
"learning_rate": 0.00031539708454810493,
"loss": 3.1806,
"step": 81450
},
{
"epoch": 23.747086247086248,
"grad_norm": 0.4256174564361572,
"learning_rate": 0.0003152221574344023,
"loss": 3.1951,
"step": 81500
},
{
"epoch": 23.761655011655012,
"grad_norm": 0.43005815148353577,
"learning_rate": 0.0003150472303206997,
"loss": 3.1962,
"step": 81550
},
{
"epoch": 23.776223776223777,
"grad_norm": 0.42560434341430664,
"learning_rate": 0.00031487230320699706,
"loss": 3.1905,
"step": 81600
},
{
"epoch": 23.79079254079254,
"grad_norm": 0.4113454520702362,
"learning_rate": 0.00031469737609329443,
"loss": 3.2006,
"step": 81650
},
{
"epoch": 23.805361305361306,
"grad_norm": 0.41583073139190674,
"learning_rate": 0.0003145224489795918,
"loss": 3.2035,
"step": 81700
},
{
"epoch": 23.81993006993007,
"grad_norm": 0.46412721276283264,
"learning_rate": 0.0003143475218658892,
"loss": 3.1954,
"step": 81750
},
{
"epoch": 23.834498834498834,
"grad_norm": 0.4346821904182434,
"learning_rate": 0.0003141725947521866,
"loss": 3.1997,
"step": 81800
},
{
"epoch": 23.8490675990676,
"grad_norm": 0.4530637562274933,
"learning_rate": 0.0003139976676384839,
"loss": 3.1965,
"step": 81850
},
{
"epoch": 23.863636363636363,
"grad_norm": 0.39855870604515076,
"learning_rate": 0.0003138227405247813,
"loss": 3.1931,
"step": 81900
},
{
"epoch": 23.878205128205128,
"grad_norm": 0.41262272000312805,
"learning_rate": 0.0003136478134110787,
"loss": 3.2012,
"step": 81950
},
{
"epoch": 23.892773892773892,
"grad_norm": 0.39622053503990173,
"learning_rate": 0.00031347288629737606,
"loss": 3.1804,
"step": 82000
},
{
"epoch": 23.892773892773892,
"eval_accuracy": 0.37398633735266745,
"eval_loss": 3.5458834171295166,
"eval_runtime": 179.9531,
"eval_samples_per_second": 92.48,
"eval_steps_per_second": 5.785,
"step": 82000
},
{
"epoch": 23.907342657342657,
"grad_norm": 0.520991861820221,
"learning_rate": 0.00031329795918367344,
"loss": 3.1926,
"step": 82050
},
{
"epoch": 23.92191142191142,
"grad_norm": 0.4120796322822571,
"learning_rate": 0.0003131230320699708,
"loss": 3.1929,
"step": 82100
},
{
"epoch": 23.936480186480185,
"grad_norm": 0.3996407389640808,
"learning_rate": 0.0003129481049562682,
"loss": 3.1937,
"step": 82150
},
{
"epoch": 23.95104895104895,
"grad_norm": 0.4584140181541443,
"learning_rate": 0.00031277317784256557,
"loss": 3.1978,
"step": 82200
},
{
"epoch": 23.965617715617714,
"grad_norm": 0.41645756363868713,
"learning_rate": 0.000312598250728863,
"loss": 3.1941,
"step": 82250
},
{
"epoch": 23.98018648018648,
"grad_norm": 0.3965514898300171,
"learning_rate": 0.0003124233236151603,
"loss": 3.2015,
"step": 82300
},
{
"epoch": 23.994755244755243,
"grad_norm": 0.4470701217651367,
"learning_rate": 0.0003122483965014577,
"loss": 3.2066,
"step": 82350
},
{
"epoch": 24.009324009324008,
"grad_norm": 0.4205404818058014,
"learning_rate": 0.00031207346938775507,
"loss": 3.1394,
"step": 82400
},
{
"epoch": 24.023892773892776,
"grad_norm": 0.3919592499732971,
"learning_rate": 0.00031189854227405245,
"loss": 3.1313,
"step": 82450
},
{
"epoch": 24.03846153846154,
"grad_norm": 0.420136034488678,
"learning_rate": 0.0003117236151603498,
"loss": 3.1255,
"step": 82500
},
{
"epoch": 24.053030303030305,
"grad_norm": 0.4368348717689514,
"learning_rate": 0.0003115486880466472,
"loss": 3.1299,
"step": 82550
},
{
"epoch": 24.06759906759907,
"grad_norm": 0.43539777398109436,
"learning_rate": 0.00031137376093294457,
"loss": 3.1331,
"step": 82600
},
{
"epoch": 24.082167832167833,
"grad_norm": 0.3938283920288086,
"learning_rate": 0.000311198833819242,
"loss": 3.1423,
"step": 82650
},
{
"epoch": 24.096736596736598,
"grad_norm": 0.43436935544013977,
"learning_rate": 0.0003110239067055394,
"loss": 3.1361,
"step": 82700
},
{
"epoch": 24.111305361305362,
"grad_norm": 0.40530428290367126,
"learning_rate": 0.0003108489795918367,
"loss": 3.1474,
"step": 82750
},
{
"epoch": 24.125874125874127,
"grad_norm": 0.42183616757392883,
"learning_rate": 0.0003106740524781341,
"loss": 3.1602,
"step": 82800
},
{
"epoch": 24.14044289044289,
"grad_norm": 0.4034741520881653,
"learning_rate": 0.00031049912536443145,
"loss": 3.1481,
"step": 82850
},
{
"epoch": 24.155011655011656,
"grad_norm": 0.4140460789203644,
"learning_rate": 0.00031032419825072883,
"loss": 3.1438,
"step": 82900
},
{
"epoch": 24.16958041958042,
"grad_norm": 0.4160782992839813,
"learning_rate": 0.0003101492711370262,
"loss": 3.1535,
"step": 82950
},
{
"epoch": 24.184149184149184,
"grad_norm": 0.4291095435619354,
"learning_rate": 0.0003099743440233236,
"loss": 3.166,
"step": 83000
},
{
"epoch": 24.184149184149184,
"eval_accuracy": 0.37344577214982677,
"eval_loss": 3.5539300441741943,
"eval_runtime": 187.6495,
"eval_samples_per_second": 88.687,
"eval_steps_per_second": 5.548,
"step": 83000
},
{
"epoch": 24.19871794871795,
"grad_norm": 0.4372076988220215,
"learning_rate": 0.00030979941690962095,
"loss": 3.1622,
"step": 83050
},
{
"epoch": 24.213286713286713,
"grad_norm": 0.43101027607917786,
"learning_rate": 0.0003096244897959184,
"loss": 3.1568,
"step": 83100
},
{
"epoch": 24.227855477855478,
"grad_norm": 0.43224048614501953,
"learning_rate": 0.00030944956268221576,
"loss": 3.1681,
"step": 83150
},
{
"epoch": 24.242424242424242,
"grad_norm": 0.3828674852848053,
"learning_rate": 0.0003092746355685131,
"loss": 3.1676,
"step": 83200
},
{
"epoch": 24.256993006993007,
"grad_norm": 0.41439077258110046,
"learning_rate": 0.00030909970845481046,
"loss": 3.1635,
"step": 83250
},
{
"epoch": 24.27156177156177,
"grad_norm": 0.4204312562942505,
"learning_rate": 0.00030892478134110783,
"loss": 3.1724,
"step": 83300
},
{
"epoch": 24.286130536130536,
"grad_norm": 0.42064785957336426,
"learning_rate": 0.0003087498542274052,
"loss": 3.1548,
"step": 83350
},
{
"epoch": 24.3006993006993,
"grad_norm": 0.42346125841140747,
"learning_rate": 0.0003085749271137026,
"loss": 3.1665,
"step": 83400
},
{
"epoch": 24.315268065268064,
"grad_norm": 0.41940489411354065,
"learning_rate": 0.00030839999999999996,
"loss": 3.1784,
"step": 83450
},
{
"epoch": 24.32983682983683,
"grad_norm": 0.40471509099006653,
"learning_rate": 0.0003082250728862974,
"loss": 3.1763,
"step": 83500
},
{
"epoch": 24.344405594405593,
"grad_norm": 0.401151567697525,
"learning_rate": 0.00030805014577259477,
"loss": 3.1704,
"step": 83550
},
{
"epoch": 24.358974358974358,
"grad_norm": 0.4783649742603302,
"learning_rate": 0.00030787521865889214,
"loss": 3.1734,
"step": 83600
},
{
"epoch": 24.373543123543122,
"grad_norm": 0.4198293089866638,
"learning_rate": 0.00030770029154518946,
"loss": 3.182,
"step": 83650
},
{
"epoch": 24.388111888111887,
"grad_norm": 0.42297860980033875,
"learning_rate": 0.00030752536443148684,
"loss": 3.1696,
"step": 83700
},
{
"epoch": 24.40268065268065,
"grad_norm": 0.45142224431037903,
"learning_rate": 0.0003073504373177842,
"loss": 3.1804,
"step": 83750
},
{
"epoch": 24.41724941724942,
"grad_norm": 0.40544140338897705,
"learning_rate": 0.0003071755102040816,
"loss": 3.1888,
"step": 83800
},
{
"epoch": 24.431818181818183,
"grad_norm": 0.42886918783187866,
"learning_rate": 0.00030700058309037897,
"loss": 3.1823,
"step": 83850
},
{
"epoch": 24.446386946386948,
"grad_norm": 0.41986921429634094,
"learning_rate": 0.00030682565597667634,
"loss": 3.1728,
"step": 83900
},
{
"epoch": 24.460955710955712,
"grad_norm": 0.4673488140106201,
"learning_rate": 0.0003066507288629738,
"loss": 3.1857,
"step": 83950
},
{
"epoch": 24.475524475524477,
"grad_norm": 0.4163152873516083,
"learning_rate": 0.00030647580174927115,
"loss": 3.193,
"step": 84000
},
{
"epoch": 24.475524475524477,
"eval_accuracy": 0.37401114902501886,
"eval_loss": 3.5467607975006104,
"eval_runtime": 180.4483,
"eval_samples_per_second": 92.226,
"eval_steps_per_second": 5.769,
"step": 84000
},
{
"epoch": 24.49009324009324,
"grad_norm": 0.4326895475387573,
"learning_rate": 0.0003063008746355685,
"loss": 3.1831,
"step": 84050
},
{
"epoch": 24.504662004662006,
"grad_norm": 0.4216519892215729,
"learning_rate": 0.00030612594752186585,
"loss": 3.1873,
"step": 84100
},
{
"epoch": 24.51923076923077,
"grad_norm": 0.4140453636646271,
"learning_rate": 0.0003059510204081632,
"loss": 3.1893,
"step": 84150
},
{
"epoch": 24.533799533799534,
"grad_norm": 0.41393721103668213,
"learning_rate": 0.0003057760932944606,
"loss": 3.1888,
"step": 84200
},
{
"epoch": 24.5483682983683,
"grad_norm": 0.40341731905937195,
"learning_rate": 0.000305601166180758,
"loss": 3.1879,
"step": 84250
},
{
"epoch": 24.562937062937063,
"grad_norm": 0.4132257103919983,
"learning_rate": 0.00030542623906705535,
"loss": 3.1901,
"step": 84300
},
{
"epoch": 24.577505827505828,
"grad_norm": 0.42835623025894165,
"learning_rate": 0.0003052513119533527,
"loss": 3.1866,
"step": 84350
},
{
"epoch": 24.592074592074592,
"grad_norm": 0.43141621351242065,
"learning_rate": 0.00030507638483965016,
"loss": 3.2013,
"step": 84400
},
{
"epoch": 24.606643356643357,
"grad_norm": 0.41042980551719666,
"learning_rate": 0.00030490145772594753,
"loss": 3.1867,
"step": 84450
},
{
"epoch": 24.62121212121212,
"grad_norm": 0.4320753216743469,
"learning_rate": 0.0003047265306122449,
"loss": 3.1914,
"step": 84500
},
{
"epoch": 24.635780885780886,
"grad_norm": 0.43569475412368774,
"learning_rate": 0.00030455160349854223,
"loss": 3.2025,
"step": 84550
},
{
"epoch": 24.65034965034965,
"grad_norm": 0.4299875795841217,
"learning_rate": 0.0003043766763848396,
"loss": 3.1958,
"step": 84600
},
{
"epoch": 24.664918414918414,
"grad_norm": 0.40073278546333313,
"learning_rate": 0.000304201749271137,
"loss": 3.2059,
"step": 84650
},
{
"epoch": 24.67948717948718,
"grad_norm": 0.43635815382003784,
"learning_rate": 0.00030402682215743436,
"loss": 3.1934,
"step": 84700
},
{
"epoch": 24.694055944055943,
"grad_norm": 0.438896507024765,
"learning_rate": 0.00030385189504373173,
"loss": 3.2093,
"step": 84750
},
{
"epoch": 24.708624708624708,
"grad_norm": 0.4136168360710144,
"learning_rate": 0.00030367696793002916,
"loss": 3.2136,
"step": 84800
},
{
"epoch": 24.723193473193472,
"grad_norm": 0.3988734483718872,
"learning_rate": 0.00030350204081632654,
"loss": 3.1998,
"step": 84850
},
{
"epoch": 24.737762237762237,
"grad_norm": 0.4054337441921234,
"learning_rate": 0.0003033271137026239,
"loss": 3.2049,
"step": 84900
},
{
"epoch": 24.752331002331,
"grad_norm": 0.4305388927459717,
"learning_rate": 0.0003031521865889213,
"loss": 3.2119,
"step": 84950
},
{
"epoch": 24.766899766899765,
"grad_norm": 0.4009264409542084,
"learning_rate": 0.0003029772594752186,
"loss": 3.2055,
"step": 85000
},
{
"epoch": 24.766899766899765,
"eval_accuracy": 0.3745446587760061,
"eval_loss": 3.5372345447540283,
"eval_runtime": 180.1149,
"eval_samples_per_second": 92.397,
"eval_steps_per_second": 5.78,
"step": 85000
},
{
"epoch": 24.78146853146853,
"grad_norm": 0.429372102022171,
"learning_rate": 0.000302802332361516,
"loss": 3.2005,
"step": 85050
},
{
"epoch": 24.796037296037294,
"grad_norm": 0.43524783849716187,
"learning_rate": 0.00030262740524781336,
"loss": 3.2103,
"step": 85100
},
{
"epoch": 24.810606060606062,
"grad_norm": 0.39188215136528015,
"learning_rate": 0.00030245247813411074,
"loss": 3.2086,
"step": 85150
},
{
"epoch": 24.825174825174827,
"grad_norm": 0.43128883838653564,
"learning_rate": 0.0003022775510204081,
"loss": 3.1966,
"step": 85200
},
{
"epoch": 24.83974358974359,
"grad_norm": 0.42142587900161743,
"learning_rate": 0.00030210262390670554,
"loss": 3.2024,
"step": 85250
},
{
"epoch": 24.854312354312356,
"grad_norm": 0.41260868310928345,
"learning_rate": 0.0003019276967930029,
"loss": 3.2134,
"step": 85300
},
{
"epoch": 24.86888111888112,
"grad_norm": 0.420828253030777,
"learning_rate": 0.0003017527696793003,
"loss": 3.2118,
"step": 85350
},
{
"epoch": 24.883449883449885,
"grad_norm": 0.40663793683052063,
"learning_rate": 0.00030157784256559767,
"loss": 3.2062,
"step": 85400
},
{
"epoch": 24.89801864801865,
"grad_norm": 0.4167376756668091,
"learning_rate": 0.000301402915451895,
"loss": 3.2122,
"step": 85450
},
{
"epoch": 24.912587412587413,
"grad_norm": 0.41143983602523804,
"learning_rate": 0.00030122798833819237,
"loss": 3.2072,
"step": 85500
},
{
"epoch": 24.927156177156178,
"grad_norm": 0.43470826745033264,
"learning_rate": 0.00030105306122448974,
"loss": 3.2142,
"step": 85550
},
{
"epoch": 24.941724941724942,
"grad_norm": 0.44198209047317505,
"learning_rate": 0.0003008781341107871,
"loss": 3.2156,
"step": 85600
},
{
"epoch": 24.956293706293707,
"grad_norm": 0.40351226925849915,
"learning_rate": 0.0003007032069970845,
"loss": 3.2137,
"step": 85650
},
{
"epoch": 24.97086247086247,
"grad_norm": 0.3993280529975891,
"learning_rate": 0.0003005282798833819,
"loss": 3.2199,
"step": 85700
},
{
"epoch": 24.985431235431236,
"grad_norm": 0.42026591300964355,
"learning_rate": 0.0003003533527696793,
"loss": 3.2142,
"step": 85750
},
{
"epoch": 25.0,
"grad_norm": 0.4102586805820465,
"learning_rate": 0.0003001784256559767,
"loss": 3.2235,
"step": 85800
},
{
"epoch": 25.014568764568764,
"grad_norm": 0.43756303191185,
"learning_rate": 0.00030000349854227405,
"loss": 3.1136,
"step": 85850
},
{
"epoch": 25.02913752913753,
"grad_norm": 0.472674697637558,
"learning_rate": 0.00029982857142857143,
"loss": 3.1127,
"step": 85900
},
{
"epoch": 25.043706293706293,
"grad_norm": 0.40704378485679626,
"learning_rate": 0.0002996536443148688,
"loss": 3.1196,
"step": 85950
},
{
"epoch": 25.058275058275058,
"grad_norm": 0.42157062888145447,
"learning_rate": 0.0002994787172011661,
"loss": 3.125,
"step": 86000
},
{
"epoch": 25.058275058275058,
"eval_accuracy": 0.3741105133052887,
"eval_loss": 3.549722194671631,
"eval_runtime": 179.5998,
"eval_samples_per_second": 92.662,
"eval_steps_per_second": 5.796,
"step": 86000
},
{
"epoch": 25.072843822843822,
"grad_norm": 0.38535892963409424,
"learning_rate": 0.0002993037900874635,
"loss": 3.1172,
"step": 86050
},
{
"epoch": 25.087412587412587,
"grad_norm": 0.39269933104515076,
"learning_rate": 0.00029912886297376093,
"loss": 3.1283,
"step": 86100
},
{
"epoch": 25.10198135198135,
"grad_norm": 0.4526817500591278,
"learning_rate": 0.0002989539358600583,
"loss": 3.1342,
"step": 86150
},
{
"epoch": 25.116550116550115,
"grad_norm": 0.4041959047317505,
"learning_rate": 0.0002987790087463557,
"loss": 3.1267,
"step": 86200
},
{
"epoch": 25.13111888111888,
"grad_norm": 0.4244619905948639,
"learning_rate": 0.000298604081632653,
"loss": 3.1408,
"step": 86250
},
{
"epoch": 25.145687645687644,
"grad_norm": 0.42524653673171997,
"learning_rate": 0.00029842915451895044,
"loss": 3.138,
"step": 86300
},
{
"epoch": 25.16025641025641,
"grad_norm": 0.41081714630126953,
"learning_rate": 0.0002982542274052478,
"loss": 3.1426,
"step": 86350
},
{
"epoch": 25.174825174825173,
"grad_norm": 0.42908602952957153,
"learning_rate": 0.0002980793002915452,
"loss": 3.1439,
"step": 86400
},
{
"epoch": 25.189393939393938,
"grad_norm": 0.4357958734035492,
"learning_rate": 0.0002979043731778425,
"loss": 3.1352,
"step": 86450
},
{
"epoch": 25.203962703962706,
"grad_norm": 0.404898077249527,
"learning_rate": 0.0002977294460641399,
"loss": 3.1462,
"step": 86500
},
{
"epoch": 25.21853146853147,
"grad_norm": 0.4236783981323242,
"learning_rate": 0.0002975545189504373,
"loss": 3.1655,
"step": 86550
},
{
"epoch": 25.233100233100235,
"grad_norm": 0.4077592194080353,
"learning_rate": 0.0002973795918367347,
"loss": 3.1619,
"step": 86600
},
{
"epoch": 25.247668997669,
"grad_norm": 0.41832277178764343,
"learning_rate": 0.00029720466472303207,
"loss": 3.1489,
"step": 86650
},
{
"epoch": 25.262237762237763,
"grad_norm": 0.446024090051651,
"learning_rate": 0.0002970297376093294,
"loss": 3.1604,
"step": 86700
},
{
"epoch": 25.276806526806528,
"grad_norm": 0.4226219952106476,
"learning_rate": 0.0002968548104956268,
"loss": 3.1565,
"step": 86750
},
{
"epoch": 25.291375291375292,
"grad_norm": 0.4449532628059387,
"learning_rate": 0.0002966798833819242,
"loss": 3.1703,
"step": 86800
},
{
"epoch": 25.305944055944057,
"grad_norm": 0.4353093206882477,
"learning_rate": 0.00029650495626822157,
"loss": 3.1603,
"step": 86850
},
{
"epoch": 25.32051282051282,
"grad_norm": 0.41738250851631165,
"learning_rate": 0.0002963300291545189,
"loss": 3.1538,
"step": 86900
},
{
"epoch": 25.335081585081586,
"grad_norm": 0.4185565412044525,
"learning_rate": 0.0002961551020408163,
"loss": 3.1657,
"step": 86950
},
{
"epoch": 25.34965034965035,
"grad_norm": 0.438228577375412,
"learning_rate": 0.0002959801749271137,
"loss": 3.1831,
"step": 87000
},
{
"epoch": 25.34965034965035,
"eval_accuracy": 0.3737184653639637,
"eval_loss": 3.5500197410583496,
"eval_runtime": 179.576,
"eval_samples_per_second": 92.674,
"eval_steps_per_second": 5.797,
"step": 87000
},
{
"epoch": 25.364219114219114,
"grad_norm": 0.3924265205860138,
"learning_rate": 0.00029580524781341107,
"loss": 3.1609,
"step": 87050
},
{
"epoch": 25.37878787878788,
"grad_norm": 0.41711100935935974,
"learning_rate": 0.00029563032069970845,
"loss": 3.1553,
"step": 87100
},
{
"epoch": 25.393356643356643,
"grad_norm": 0.4473123550415039,
"learning_rate": 0.00029545539358600577,
"loss": 3.1595,
"step": 87150
},
{
"epoch": 25.407925407925408,
"grad_norm": 0.42639219760894775,
"learning_rate": 0.0002952804664723032,
"loss": 3.1708,
"step": 87200
},
{
"epoch": 25.422494172494172,
"grad_norm": 0.4454313814640045,
"learning_rate": 0.0002951055393586006,
"loss": 3.1641,
"step": 87250
},
{
"epoch": 25.437062937062937,
"grad_norm": 0.4151012599468231,
"learning_rate": 0.00029493061224489795,
"loss": 3.1709,
"step": 87300
},
{
"epoch": 25.4516317016317,
"grad_norm": 0.4197435975074768,
"learning_rate": 0.00029475568513119527,
"loss": 3.1739,
"step": 87350
},
{
"epoch": 25.466200466200466,
"grad_norm": 0.4248720705509186,
"learning_rate": 0.0002945807580174927,
"loss": 3.1689,
"step": 87400
},
{
"epoch": 25.48076923076923,
"grad_norm": 0.4041939079761505,
"learning_rate": 0.0002944058309037901,
"loss": 3.1715,
"step": 87450
},
{
"epoch": 25.495337995337994,
"grad_norm": 0.4290682077407837,
"learning_rate": 0.00029423090379008745,
"loss": 3.1731,
"step": 87500
},
{
"epoch": 25.50990675990676,
"grad_norm": 0.4535789489746094,
"learning_rate": 0.00029405597667638483,
"loss": 3.1684,
"step": 87550
},
{
"epoch": 25.524475524475523,
"grad_norm": 0.41663211584091187,
"learning_rate": 0.0002938810495626822,
"loss": 3.1749,
"step": 87600
},
{
"epoch": 25.539044289044288,
"grad_norm": 0.39012715220451355,
"learning_rate": 0.0002937061224489796,
"loss": 3.1681,
"step": 87650
},
{
"epoch": 25.553613053613052,
"grad_norm": 0.39378008246421814,
"learning_rate": 0.00029353119533527696,
"loss": 3.1925,
"step": 87700
},
{
"epoch": 25.568181818181817,
"grad_norm": 0.40518417954444885,
"learning_rate": 0.00029335626822157433,
"loss": 3.1886,
"step": 87750
},
{
"epoch": 25.582750582750585,
"grad_norm": 0.4220457673072815,
"learning_rate": 0.00029318134110787166,
"loss": 3.1855,
"step": 87800
},
{
"epoch": 25.59731934731935,
"grad_norm": 0.4504868686199188,
"learning_rate": 0.0002930064139941691,
"loss": 3.1812,
"step": 87850
},
{
"epoch": 25.611888111888113,
"grad_norm": 0.41079261898994446,
"learning_rate": 0.00029283148688046646,
"loss": 3.1858,
"step": 87900
},
{
"epoch": 25.626456876456878,
"grad_norm": 0.39636367559432983,
"learning_rate": 0.00029265655976676384,
"loss": 3.1923,
"step": 87950
},
{
"epoch": 25.641025641025642,
"grad_norm": 0.4257923364639282,
"learning_rate": 0.0002924816326530612,
"loss": 3.1967,
"step": 88000
},
{
"epoch": 25.641025641025642,
"eval_accuracy": 0.37449691688513087,
"eval_loss": 3.5418219566345215,
"eval_runtime": 181.8935,
"eval_samples_per_second": 91.493,
"eval_steps_per_second": 5.723,
"step": 88000
},
{
"epoch": 25.655594405594407,
"grad_norm": 0.42990806698799133,
"learning_rate": 0.0002923067055393586,
"loss": 3.1933,
"step": 88050
},
{
"epoch": 25.67016317016317,
"grad_norm": 0.41336822509765625,
"learning_rate": 0.00029213177842565596,
"loss": 3.1863,
"step": 88100
},
{
"epoch": 25.684731934731936,
"grad_norm": 0.42269113659858704,
"learning_rate": 0.00029195685131195334,
"loss": 3.1963,
"step": 88150
},
{
"epoch": 25.6993006993007,
"grad_norm": 0.42279052734375,
"learning_rate": 0.0002917819241982507,
"loss": 3.1905,
"step": 88200
},
{
"epoch": 25.713869463869464,
"grad_norm": 0.4442167580127716,
"learning_rate": 0.0002916069970845481,
"loss": 3.1855,
"step": 88250
},
{
"epoch": 25.72843822843823,
"grad_norm": 0.44783613085746765,
"learning_rate": 0.00029143206997084547,
"loss": 3.1926,
"step": 88300
},
{
"epoch": 25.743006993006993,
"grad_norm": 0.4339323341846466,
"learning_rate": 0.00029125714285714284,
"loss": 3.2009,
"step": 88350
},
{
"epoch": 25.757575757575758,
"grad_norm": 0.4134320616722107,
"learning_rate": 0.0002910822157434402,
"loss": 3.1883,
"step": 88400
},
{
"epoch": 25.772144522144522,
"grad_norm": 0.40038609504699707,
"learning_rate": 0.0002909072886297376,
"loss": 3.2002,
"step": 88450
},
{
"epoch": 25.786713286713287,
"grad_norm": 0.40860867500305176,
"learning_rate": 0.00029073236151603497,
"loss": 3.1973,
"step": 88500
},
{
"epoch": 25.80128205128205,
"grad_norm": 0.4466297924518585,
"learning_rate": 0.00029055743440233235,
"loss": 3.1971,
"step": 88550
},
{
"epoch": 25.815850815850816,
"grad_norm": 0.3885636329650879,
"learning_rate": 0.0002903825072886297,
"loss": 3.1982,
"step": 88600
},
{
"epoch": 25.83041958041958,
"grad_norm": 0.4263664186000824,
"learning_rate": 0.0002902075801749271,
"loss": 3.1932,
"step": 88650
},
{
"epoch": 25.844988344988344,
"grad_norm": 0.4211662709712982,
"learning_rate": 0.0002900326530612245,
"loss": 3.1943,
"step": 88700
},
{
"epoch": 25.85955710955711,
"grad_norm": 0.408115953207016,
"learning_rate": 0.00028985772594752185,
"loss": 3.1981,
"step": 88750
},
{
"epoch": 25.874125874125873,
"grad_norm": 0.41231685876846313,
"learning_rate": 0.0002896827988338192,
"loss": 3.2069,
"step": 88800
},
{
"epoch": 25.888694638694638,
"grad_norm": 0.42815014719963074,
"learning_rate": 0.0002895078717201166,
"loss": 3.2032,
"step": 88850
},
{
"epoch": 25.903263403263402,
"grad_norm": 0.3903138339519501,
"learning_rate": 0.000289332944606414,
"loss": 3.2072,
"step": 88900
},
{
"epoch": 25.917832167832167,
"grad_norm": 0.4448038339614868,
"learning_rate": 0.00028915801749271135,
"loss": 3.2024,
"step": 88950
},
{
"epoch": 25.93240093240093,
"grad_norm": 0.4322460889816284,
"learning_rate": 0.00028898309037900873,
"loss": 3.198,
"step": 89000
},
{
"epoch": 25.93240093240093,
"eval_accuracy": 0.3751864697129442,
"eval_loss": 3.5322518348693848,
"eval_runtime": 181.8154,
"eval_samples_per_second": 91.532,
"eval_steps_per_second": 5.726,
"step": 89000
},
{
"epoch": 25.946969696969695,
"grad_norm": 0.40827956795692444,
"learning_rate": 0.0002888081632653061,
"loss": 3.1979,
"step": 89050
},
{
"epoch": 25.96153846153846,
"grad_norm": 0.4115292727947235,
"learning_rate": 0.0002886332361516035,
"loss": 3.1955,
"step": 89100
},
{
"epoch": 25.976107226107224,
"grad_norm": 0.42888209223747253,
"learning_rate": 0.00028845830903790086,
"loss": 3.2055,
"step": 89150
},
{
"epoch": 25.990675990675992,
"grad_norm": 0.411482572555542,
"learning_rate": 0.00028828338192419823,
"loss": 3.2077,
"step": 89200
},
{
"epoch": 26.005244755244757,
"grad_norm": 0.3968612551689148,
"learning_rate": 0.0002881084548104956,
"loss": 3.1665,
"step": 89250
},
{
"epoch": 26.01981351981352,
"grad_norm": 0.42154043912887573,
"learning_rate": 0.000287933527696793,
"loss": 3.1057,
"step": 89300
},
{
"epoch": 26.034382284382286,
"grad_norm": 0.4301404356956482,
"learning_rate": 0.00028775860058309036,
"loss": 3.0975,
"step": 89350
},
{
"epoch": 26.04895104895105,
"grad_norm": 0.4084946811199188,
"learning_rate": 0.00028758367346938773,
"loss": 3.1097,
"step": 89400
},
{
"epoch": 26.063519813519815,
"grad_norm": 0.41912487149238586,
"learning_rate": 0.0002874087463556851,
"loss": 3.1278,
"step": 89450
},
{
"epoch": 26.07808857808858,
"grad_norm": 0.4204862713813782,
"learning_rate": 0.0002872338192419825,
"loss": 3.1172,
"step": 89500
},
{
"epoch": 26.092657342657343,
"grad_norm": 0.41860130429267883,
"learning_rate": 0.00028705889212827986,
"loss": 3.124,
"step": 89550
},
{
"epoch": 26.107226107226108,
"grad_norm": 0.41904857754707336,
"learning_rate": 0.00028688396501457724,
"loss": 3.1286,
"step": 89600
},
{
"epoch": 26.121794871794872,
"grad_norm": 0.418357789516449,
"learning_rate": 0.0002867090379008746,
"loss": 3.1165,
"step": 89650
},
{
"epoch": 26.136363636363637,
"grad_norm": 0.4240556061267853,
"learning_rate": 0.000286534110787172,
"loss": 3.1232,
"step": 89700
},
{
"epoch": 26.1509324009324,
"grad_norm": 0.41585004329681396,
"learning_rate": 0.00028635918367346937,
"loss": 3.1326,
"step": 89750
},
{
"epoch": 26.165501165501166,
"grad_norm": 0.4257110357284546,
"learning_rate": 0.00028618425655976674,
"loss": 3.1437,
"step": 89800
},
{
"epoch": 26.18006993006993,
"grad_norm": 0.41216740012168884,
"learning_rate": 0.0002860093294460641,
"loss": 3.1363,
"step": 89850
},
{
"epoch": 26.194638694638694,
"grad_norm": 0.4310924708843231,
"learning_rate": 0.0002858344023323615,
"loss": 3.1374,
"step": 89900
},
{
"epoch": 26.20920745920746,
"grad_norm": 0.44105252623558044,
"learning_rate": 0.00028565947521865887,
"loss": 3.1325,
"step": 89950
},
{
"epoch": 26.223776223776223,
"grad_norm": 0.4135071933269501,
"learning_rate": 0.00028548454810495624,
"loss": 3.1476,
"step": 90000
},
{
"epoch": 26.223776223776223,
"eval_accuracy": 0.37405395209959663,
"eval_loss": 3.5517945289611816,
"eval_runtime": 181.7859,
"eval_samples_per_second": 91.547,
"eval_steps_per_second": 5.727,
"step": 90000
},
{
"epoch": 26.238344988344988,
"grad_norm": 0.3974347412586212,
"learning_rate": 0.0002853096209912536,
"loss": 3.1361,
"step": 90050
},
{
"epoch": 26.252913752913752,
"grad_norm": 0.42860710620880127,
"learning_rate": 0.000285134693877551,
"loss": 3.1546,
"step": 90100
},
{
"epoch": 26.267482517482517,
"grad_norm": 0.41412895917892456,
"learning_rate": 0.00028495976676384837,
"loss": 3.1557,
"step": 90150
},
{
"epoch": 26.28205128205128,
"grad_norm": 0.41339340806007385,
"learning_rate": 0.00028478483965014575,
"loss": 3.1351,
"step": 90200
},
{
"epoch": 26.296620046620045,
"grad_norm": 0.42824968695640564,
"learning_rate": 0.0002846099125364431,
"loss": 3.1567,
"step": 90250
},
{
"epoch": 26.31118881118881,
"grad_norm": 0.40359318256378174,
"learning_rate": 0.0002844349854227405,
"loss": 3.1551,
"step": 90300
},
{
"epoch": 26.325757575757574,
"grad_norm": 0.4394044578075409,
"learning_rate": 0.0002842600583090379,
"loss": 3.157,
"step": 90350
},
{
"epoch": 26.34032634032634,
"grad_norm": 0.4745209515094757,
"learning_rate": 0.00028408513119533525,
"loss": 3.1352,
"step": 90400
},
{
"epoch": 26.354895104895103,
"grad_norm": 0.4229678213596344,
"learning_rate": 0.0002839102040816326,
"loss": 3.1542,
"step": 90450
},
{
"epoch": 26.36946386946387,
"grad_norm": 0.44974929094314575,
"learning_rate": 0.00028373527696793,
"loss": 3.1574,
"step": 90500
},
{
"epoch": 26.384032634032636,
"grad_norm": 0.4348454177379608,
"learning_rate": 0.0002835603498542274,
"loss": 3.162,
"step": 90550
},
{
"epoch": 26.3986013986014,
"grad_norm": 0.42422547936439514,
"learning_rate": 0.00028338542274052475,
"loss": 3.1615,
"step": 90600
},
{
"epoch": 26.413170163170165,
"grad_norm": 0.4377591609954834,
"learning_rate": 0.00028321049562682213,
"loss": 3.171,
"step": 90650
},
{
"epoch": 26.42773892773893,
"grad_norm": 0.4380994737148285,
"learning_rate": 0.0002830355685131195,
"loss": 3.1649,
"step": 90700
},
{
"epoch": 26.442307692307693,
"grad_norm": 0.4348939061164856,
"learning_rate": 0.0002828606413994169,
"loss": 3.1701,
"step": 90750
},
{
"epoch": 26.456876456876458,
"grad_norm": 0.4366742968559265,
"learning_rate": 0.00028268571428571426,
"loss": 3.1557,
"step": 90800
},
{
"epoch": 26.471445221445222,
"grad_norm": 0.44443434476852417,
"learning_rate": 0.00028251078717201163,
"loss": 3.1722,
"step": 90850
},
{
"epoch": 26.486013986013987,
"grad_norm": 0.4162462055683136,
"learning_rate": 0.000282335860058309,
"loss": 3.1655,
"step": 90900
},
{
"epoch": 26.50058275058275,
"grad_norm": 0.4146740138530731,
"learning_rate": 0.0002821609329446064,
"loss": 3.1597,
"step": 90950
},
{
"epoch": 26.515151515151516,
"grad_norm": 0.4478785991668701,
"learning_rate": 0.00028198600583090376,
"loss": 3.1699,
"step": 91000
},
{
"epoch": 26.515151515151516,
"eval_accuracy": 0.37451432033303617,
"eval_loss": 3.5433578491210938,
"eval_runtime": 181.9511,
"eval_samples_per_second": 91.464,
"eval_steps_per_second": 5.721,
"step": 91000
},
{
"epoch": 26.52972027972028,
"grad_norm": 0.43394702672958374,
"learning_rate": 0.00028181107871720114,
"loss": 3.1615,
"step": 91050
},
{
"epoch": 26.544289044289044,
"grad_norm": 0.4158869683742523,
"learning_rate": 0.0002816361516034985,
"loss": 3.1802,
"step": 91100
},
{
"epoch": 26.55885780885781,
"grad_norm": 0.43913793563842773,
"learning_rate": 0.0002814612244897959,
"loss": 3.1835,
"step": 91150
},
{
"epoch": 26.573426573426573,
"grad_norm": 0.4561229944229126,
"learning_rate": 0.00028128629737609326,
"loss": 3.1617,
"step": 91200
},
{
"epoch": 26.587995337995338,
"grad_norm": 0.4013047218322754,
"learning_rate": 0.00028111137026239064,
"loss": 3.1696,
"step": 91250
},
{
"epoch": 26.602564102564102,
"grad_norm": 0.4297322928905487,
"learning_rate": 0.000280936443148688,
"loss": 3.1684,
"step": 91300
},
{
"epoch": 26.617132867132867,
"grad_norm": 0.44311001896858215,
"learning_rate": 0.0002807615160349854,
"loss": 3.1867,
"step": 91350
},
{
"epoch": 26.63170163170163,
"grad_norm": 0.4296179413795471,
"learning_rate": 0.00028058658892128277,
"loss": 3.1862,
"step": 91400
},
{
"epoch": 26.646270396270396,
"grad_norm": 0.41797104477882385,
"learning_rate": 0.00028041166180758014,
"loss": 3.1884,
"step": 91450
},
{
"epoch": 26.66083916083916,
"grad_norm": 0.4100271165370941,
"learning_rate": 0.0002802367346938775,
"loss": 3.1727,
"step": 91500
},
{
"epoch": 26.675407925407924,
"grad_norm": 0.4347117245197296,
"learning_rate": 0.0002800618075801749,
"loss": 3.1848,
"step": 91550
},
{
"epoch": 26.68997668997669,
"grad_norm": 0.4233982563018799,
"learning_rate": 0.00027988688046647227,
"loss": 3.1763,
"step": 91600
},
{
"epoch": 26.704545454545453,
"grad_norm": 0.4804072976112366,
"learning_rate": 0.00027971195335276965,
"loss": 3.1964,
"step": 91650
},
{
"epoch": 26.719114219114218,
"grad_norm": 0.4588761627674103,
"learning_rate": 0.000279537026239067,
"loss": 3.185,
"step": 91700
},
{
"epoch": 26.733682983682982,
"grad_norm": 0.45718541741371155,
"learning_rate": 0.0002793620991253644,
"loss": 3.1868,
"step": 91750
},
{
"epoch": 26.748251748251747,
"grad_norm": 0.43570733070373535,
"learning_rate": 0.00027918717201166177,
"loss": 3.1869,
"step": 91800
},
{
"epoch": 26.76282051282051,
"grad_norm": 0.4332759976387024,
"learning_rate": 0.00027901224489795915,
"loss": 3.1816,
"step": 91850
},
{
"epoch": 26.77738927738928,
"grad_norm": 0.42579400539398193,
"learning_rate": 0.0002788373177842565,
"loss": 3.1852,
"step": 91900
},
{
"epoch": 26.791958041958043,
"grad_norm": 0.41861170530319214,
"learning_rate": 0.0002786623906705539,
"loss": 3.1825,
"step": 91950
},
{
"epoch": 26.806526806526808,
"grad_norm": 0.4397330582141876,
"learning_rate": 0.0002784874635568513,
"loss": 3.1932,
"step": 92000
},
{
"epoch": 26.806526806526808,
"eval_accuracy": 0.3745991033461421,
"eval_loss": 3.5361201763153076,
"eval_runtime": 182.0117,
"eval_samples_per_second": 91.434,
"eval_steps_per_second": 5.719,
"step": 92000
},
{
"epoch": 26.821095571095572,
"grad_norm": 0.4330262839794159,
"learning_rate": 0.00027831253644314865,
"loss": 3.1821,
"step": 92050
},
{
"epoch": 26.835664335664337,
"grad_norm": 0.4355219602584839,
"learning_rate": 0.00027813760932944603,
"loss": 3.1778,
"step": 92100
},
{
"epoch": 26.8502331002331,
"grad_norm": 0.4293540418148041,
"learning_rate": 0.0002779626822157434,
"loss": 3.1707,
"step": 92150
},
{
"epoch": 26.864801864801866,
"grad_norm": 0.41315534710884094,
"learning_rate": 0.0002777877551020408,
"loss": 3.1928,
"step": 92200
},
{
"epoch": 26.87937062937063,
"grad_norm": 0.42750847339630127,
"learning_rate": 0.0002776128279883382,
"loss": 3.1829,
"step": 92250
},
{
"epoch": 26.893939393939394,
"grad_norm": 0.4304085671901703,
"learning_rate": 0.00027743790087463553,
"loss": 3.193,
"step": 92300
},
{
"epoch": 26.90850815850816,
"grad_norm": 0.4276728332042694,
"learning_rate": 0.0002772629737609329,
"loss": 3.1917,
"step": 92350
},
{
"epoch": 26.923076923076923,
"grad_norm": 0.4612192213535309,
"learning_rate": 0.0002770880466472303,
"loss": 3.1975,
"step": 92400
},
{
"epoch": 26.937645687645688,
"grad_norm": 0.4436318874359131,
"learning_rate": 0.00027691311953352766,
"loss": 3.1836,
"step": 92450
},
{
"epoch": 26.952214452214452,
"grad_norm": 0.4181881844997406,
"learning_rate": 0.00027673819241982503,
"loss": 3.1878,
"step": 92500
},
{
"epoch": 26.966783216783217,
"grad_norm": 0.420480340719223,
"learning_rate": 0.0002765632653061224,
"loss": 3.1883,
"step": 92550
},
{
"epoch": 26.98135198135198,
"grad_norm": 0.3930075764656067,
"learning_rate": 0.0002763883381924198,
"loss": 3.1982,
"step": 92600
},
{
"epoch": 26.995920745920746,
"grad_norm": 0.43089744448661804,
"learning_rate": 0.00027621341107871716,
"loss": 3.1935,
"step": 92650
},
{
"epoch": 27.01048951048951,
"grad_norm": 0.4508107006549835,
"learning_rate": 0.0002760384839650146,
"loss": 3.1204,
"step": 92700
},
{
"epoch": 27.025058275058274,
"grad_norm": 0.4305284321308136,
"learning_rate": 0.0002758635568513119,
"loss": 3.1035,
"step": 92750
},
{
"epoch": 27.03962703962704,
"grad_norm": 0.42484739422798157,
"learning_rate": 0.0002756886297376093,
"loss": 3.0986,
"step": 92800
},
{
"epoch": 27.054195804195803,
"grad_norm": 0.44576549530029297,
"learning_rate": 0.00027551370262390666,
"loss": 3.102,
"step": 92850
},
{
"epoch": 27.068764568764568,
"grad_norm": 0.42311638593673706,
"learning_rate": 0.0002753387755102041,
"loss": 3.1047,
"step": 92900
},
{
"epoch": 27.083333333333332,
"grad_norm": 0.43278342485427856,
"learning_rate": 0.0002751638483965014,
"loss": 3.1077,
"step": 92950
},
{
"epoch": 27.097902097902097,
"grad_norm": 0.4202318787574768,
"learning_rate": 0.0002749889212827988,
"loss": 3.129,
"step": 93000
},
{
"epoch": 27.097902097902097,
"eval_accuracy": 0.37368542233111657,
"eval_loss": 3.5532684326171875,
"eval_runtime": 179.8211,
"eval_samples_per_second": 92.548,
"eval_steps_per_second": 5.789,
"step": 93000
},
{
"epoch": 27.11247086247086,
"grad_norm": 0.4148189127445221,
"learning_rate": 0.00027481399416909617,
"loss": 3.1146,
"step": 93050
},
{
"epoch": 27.127039627039625,
"grad_norm": 0.4284904897212982,
"learning_rate": 0.00027463906705539354,
"loss": 3.1226,
"step": 93100
},
{
"epoch": 27.14160839160839,
"grad_norm": 0.4357090890407562,
"learning_rate": 0.000274464139941691,
"loss": 3.1164,
"step": 93150
},
{
"epoch": 27.156177156177158,
"grad_norm": 0.4211357831954956,
"learning_rate": 0.0002742892128279883,
"loss": 3.1247,
"step": 93200
},
{
"epoch": 27.170745920745922,
"grad_norm": 0.4347440302371979,
"learning_rate": 0.00027411428571428567,
"loss": 3.1199,
"step": 93250
},
{
"epoch": 27.185314685314687,
"grad_norm": 0.4083610475063324,
"learning_rate": 0.00027393935860058305,
"loss": 3.1151,
"step": 93300
},
{
"epoch": 27.19988344988345,
"grad_norm": 0.451709121465683,
"learning_rate": 0.0002737644314868805,
"loss": 3.1266,
"step": 93350
},
{
"epoch": 27.214452214452216,
"grad_norm": 0.44590309262275696,
"learning_rate": 0.0002735895043731778,
"loss": 3.1305,
"step": 93400
},
{
"epoch": 27.22902097902098,
"grad_norm": 0.4306364357471466,
"learning_rate": 0.0002734145772594752,
"loss": 3.127,
"step": 93450
},
{
"epoch": 27.243589743589745,
"grad_norm": 0.4174492359161377,
"learning_rate": 0.00027323965014577255,
"loss": 3.1401,
"step": 93500
},
{
"epoch": 27.25815850815851,
"grad_norm": 0.4140374958515167,
"learning_rate": 0.00027306472303207,
"loss": 3.1344,
"step": 93550
},
{
"epoch": 27.272727272727273,
"grad_norm": 0.4186951518058777,
"learning_rate": 0.00027288979591836736,
"loss": 3.1323,
"step": 93600
},
{
"epoch": 27.287296037296038,
"grad_norm": 0.4227970838546753,
"learning_rate": 0.0002727148688046647,
"loss": 3.1298,
"step": 93650
},
{
"epoch": 27.301864801864802,
"grad_norm": 0.4474739730358124,
"learning_rate": 0.00027253994169096205,
"loss": 3.1429,
"step": 93700
},
{
"epoch": 27.316433566433567,
"grad_norm": 0.399031400680542,
"learning_rate": 0.00027236501457725943,
"loss": 3.1415,
"step": 93750
},
{
"epoch": 27.33100233100233,
"grad_norm": 0.4408739507198334,
"learning_rate": 0.00027219008746355686,
"loss": 3.1598,
"step": 93800
},
{
"epoch": 27.345571095571096,
"grad_norm": 0.4364926218986511,
"learning_rate": 0.0002720151603498542,
"loss": 3.1415,
"step": 93850
},
{
"epoch": 27.36013986013986,
"grad_norm": 0.43380430340766907,
"learning_rate": 0.00027184023323615156,
"loss": 3.1496,
"step": 93900
},
{
"epoch": 27.374708624708624,
"grad_norm": 0.4094713032245636,
"learning_rate": 0.00027166530612244893,
"loss": 3.152,
"step": 93950
},
{
"epoch": 27.38927738927739,
"grad_norm": 0.43629366159439087,
"learning_rate": 0.00027149037900874636,
"loss": 3.1585,
"step": 94000
},
{
"epoch": 27.38927738927739,
"eval_accuracy": 0.374065123231698,
"eval_loss": 3.546992301940918,
"eval_runtime": 179.6616,
"eval_samples_per_second": 92.63,
"eval_steps_per_second": 5.794,
"step": 94000
},
{
"epoch": 27.403846153846153,
"grad_norm": 0.4074999690055847,
"learning_rate": 0.00027131545189504374,
"loss": 3.1506,
"step": 94050
},
{
"epoch": 27.418414918414918,
"grad_norm": 0.45839694142341614,
"learning_rate": 0.00027114052478134106,
"loss": 3.1496,
"step": 94100
},
{
"epoch": 27.432983682983682,
"grad_norm": 0.4555986821651459,
"learning_rate": 0.00027096559766763843,
"loss": 3.1427,
"step": 94150
},
{
"epoch": 27.447552447552447,
"grad_norm": 0.4265489876270294,
"learning_rate": 0.00027079067055393586,
"loss": 3.1546,
"step": 94200
},
{
"epoch": 27.46212121212121,
"grad_norm": 0.4634106159210205,
"learning_rate": 0.00027061574344023324,
"loss": 3.1474,
"step": 94250
},
{
"epoch": 27.476689976689975,
"grad_norm": 0.46226605772972107,
"learning_rate": 0.00027044081632653056,
"loss": 3.16,
"step": 94300
},
{
"epoch": 27.49125874125874,
"grad_norm": 0.4321894347667694,
"learning_rate": 0.00027026588921282794,
"loss": 3.1583,
"step": 94350
},
{
"epoch": 27.505827505827504,
"grad_norm": 0.44746580719947815,
"learning_rate": 0.0002700909620991253,
"loss": 3.1516,
"step": 94400
},
{
"epoch": 27.52039627039627,
"grad_norm": 0.4335636794567108,
"learning_rate": 0.00026991603498542274,
"loss": 3.1583,
"step": 94450
},
{
"epoch": 27.534965034965033,
"grad_norm": 0.4515124559402466,
"learning_rate": 0.0002697411078717201,
"loss": 3.1625,
"step": 94500
},
{
"epoch": 27.5495337995338,
"grad_norm": 0.47119760513305664,
"learning_rate": 0.00026956618075801744,
"loss": 3.1551,
"step": 94550
},
{
"epoch": 27.564102564102566,
"grad_norm": 0.42898598313331604,
"learning_rate": 0.0002693912536443148,
"loss": 3.1534,
"step": 94600
},
{
"epoch": 27.57867132867133,
"grad_norm": 0.4289074242115021,
"learning_rate": 0.00026921632653061225,
"loss": 3.1637,
"step": 94650
},
{
"epoch": 27.593240093240095,
"grad_norm": 0.4532720148563385,
"learning_rate": 0.0002690413994169096,
"loss": 3.1754,
"step": 94700
},
{
"epoch": 27.60780885780886,
"grad_norm": 0.4546985626220703,
"learning_rate": 0.00026886647230320694,
"loss": 3.1657,
"step": 94750
},
{
"epoch": 27.622377622377623,
"grad_norm": 0.42070233821868896,
"learning_rate": 0.0002686915451895043,
"loss": 3.1667,
"step": 94800
},
{
"epoch": 27.636946386946388,
"grad_norm": 0.4291580021381378,
"learning_rate": 0.00026851661807580175,
"loss": 3.1627,
"step": 94850
},
{
"epoch": 27.651515151515152,
"grad_norm": 0.4327968657016754,
"learning_rate": 0.0002683416909620991,
"loss": 3.1766,
"step": 94900
},
{
"epoch": 27.666083916083917,
"grad_norm": 0.4629499912261963,
"learning_rate": 0.0002681667638483965,
"loss": 3.1685,
"step": 94950
},
{
"epoch": 27.68065268065268,
"grad_norm": 0.4146984815597534,
"learning_rate": 0.0002679918367346938,
"loss": 3.1692,
"step": 95000
},
{
"epoch": 27.68065268065268,
"eval_accuracy": 0.37507440561933814,
"eval_loss": 3.5368077754974365,
"eval_runtime": 179.6334,
"eval_samples_per_second": 92.644,
"eval_steps_per_second": 5.795,
"step": 95000
},
{
"epoch": 27.695221445221446,
"grad_norm": 0.41717877984046936,
"learning_rate": 0.00026781690962099125,
"loss": 3.1761,
"step": 95050
},
{
"epoch": 27.70979020979021,
"grad_norm": 0.43753674626350403,
"learning_rate": 0.00026764198250728863,
"loss": 3.1782,
"step": 95100
},
{
"epoch": 27.724358974358974,
"grad_norm": 0.42679542303085327,
"learning_rate": 0.000267467055393586,
"loss": 3.1722,
"step": 95150
},
{
"epoch": 27.73892773892774,
"grad_norm": 0.42229217290878296,
"learning_rate": 0.0002672921282798833,
"loss": 3.1785,
"step": 95200
},
{
"epoch": 27.753496503496503,
"grad_norm": 0.40234848856925964,
"learning_rate": 0.0002671172011661807,
"loss": 3.1675,
"step": 95250
},
{
"epoch": 27.768065268065268,
"grad_norm": 0.4103754758834839,
"learning_rate": 0.00026694227405247813,
"loss": 3.1787,
"step": 95300
},
{
"epoch": 27.782634032634032,
"grad_norm": 0.4386822581291199,
"learning_rate": 0.0002667673469387755,
"loss": 3.1718,
"step": 95350
},
{
"epoch": 27.797202797202797,
"grad_norm": 0.41249123215675354,
"learning_rate": 0.0002665924198250729,
"loss": 3.1797,
"step": 95400
},
{
"epoch": 27.81177156177156,
"grad_norm": 0.41915491223335266,
"learning_rate": 0.0002664174927113702,
"loss": 3.1828,
"step": 95450
},
{
"epoch": 27.826340326340326,
"grad_norm": 0.42128297686576843,
"learning_rate": 0.00026624256559766764,
"loss": 3.1821,
"step": 95500
},
{
"epoch": 27.84090909090909,
"grad_norm": 0.42073553800582886,
"learning_rate": 0.000266067638483965,
"loss": 3.2004,
"step": 95550
},
{
"epoch": 27.855477855477854,
"grad_norm": 0.4234536588191986,
"learning_rate": 0.0002658927113702624,
"loss": 3.1694,
"step": 95600
},
{
"epoch": 27.87004662004662,
"grad_norm": 0.4323881268501282,
"learning_rate": 0.0002657177842565597,
"loss": 3.183,
"step": 95650
},
{
"epoch": 27.884615384615383,
"grad_norm": 0.4549892544746399,
"learning_rate": 0.00026554285714285714,
"loss": 3.1858,
"step": 95700
},
{
"epoch": 27.899184149184148,
"grad_norm": 0.4438720941543579,
"learning_rate": 0.0002653679300291545,
"loss": 3.1879,
"step": 95750
},
{
"epoch": 27.913752913752912,
"grad_norm": 0.42953628301620483,
"learning_rate": 0.0002651930029154519,
"loss": 3.1892,
"step": 95800
},
{
"epoch": 27.92832167832168,
"grad_norm": 0.4279513657093048,
"learning_rate": 0.00026501807580174927,
"loss": 3.1857,
"step": 95850
},
{
"epoch": 27.942890442890445,
"grad_norm": 0.44362860918045044,
"learning_rate": 0.0002648431486880466,
"loss": 3.1813,
"step": 95900
},
{
"epoch": 27.95745920745921,
"grad_norm": 0.4112043082714081,
"learning_rate": 0.000264668221574344,
"loss": 3.1902,
"step": 95950
},
{
"epoch": 27.972027972027973,
"grad_norm": 0.4335898756980896,
"learning_rate": 0.0002644932944606414,
"loss": 3.2099,
"step": 96000
},
{
"epoch": 27.972027972027973,
"eval_accuracy": 0.37546492487942823,
"eval_loss": 3.533925771713257,
"eval_runtime": 179.0974,
"eval_samples_per_second": 92.922,
"eval_steps_per_second": 5.812,
"step": 96000
},
{
"epoch": 27.986596736596738,
"grad_norm": 0.3973131477832794,
"learning_rate": 0.00026431836734693877,
"loss": 3.1847,
"step": 96050
},
{
"epoch": 28.001165501165502,
"grad_norm": 0.4298485219478607,
"learning_rate": 0.0002641434402332361,
"loss": 3.1814,
"step": 96100
},
{
"epoch": 28.015734265734267,
"grad_norm": 0.4456147253513336,
"learning_rate": 0.0002639685131195335,
"loss": 3.0832,
"step": 96150
},
{
"epoch": 28.03030303030303,
"grad_norm": 0.4482695162296295,
"learning_rate": 0.0002637935860058309,
"loss": 3.0932,
"step": 96200
},
{
"epoch": 28.044871794871796,
"grad_norm": 0.4353080689907074,
"learning_rate": 0.00026361865889212827,
"loss": 3.0925,
"step": 96250
},
{
"epoch": 28.05944055944056,
"grad_norm": 0.438823938369751,
"learning_rate": 0.00026344373177842565,
"loss": 3.0935,
"step": 96300
},
{
"epoch": 28.074009324009324,
"grad_norm": 0.43021658062934875,
"learning_rate": 0.000263268804664723,
"loss": 3.1006,
"step": 96350
},
{
"epoch": 28.08857808857809,
"grad_norm": 0.4441169202327728,
"learning_rate": 0.0002630938775510204,
"loss": 3.101,
"step": 96400
},
{
"epoch": 28.103146853146853,
"grad_norm": 0.4555242359638214,
"learning_rate": 0.0002629189504373178,
"loss": 3.1125,
"step": 96450
},
{
"epoch": 28.117715617715618,
"grad_norm": 0.47698360681533813,
"learning_rate": 0.00026274402332361515,
"loss": 3.1132,
"step": 96500
},
{
"epoch": 28.132284382284382,
"grad_norm": 0.46771177649497986,
"learning_rate": 0.0002625690962099125,
"loss": 3.0982,
"step": 96550
},
{
"epoch": 28.146853146853147,
"grad_norm": 0.3934149146080017,
"learning_rate": 0.0002623941690962099,
"loss": 3.1162,
"step": 96600
},
{
"epoch": 28.16142191142191,
"grad_norm": 0.4455907642841339,
"learning_rate": 0.0002622192419825073,
"loss": 3.1177,
"step": 96650
},
{
"epoch": 28.175990675990676,
"grad_norm": 0.42018404603004456,
"learning_rate": 0.00026204431486880465,
"loss": 3.1113,
"step": 96700
},
{
"epoch": 28.19055944055944,
"grad_norm": 0.4048352837562561,
"learning_rate": 0.00026186938775510203,
"loss": 3.1236,
"step": 96750
},
{
"epoch": 28.205128205128204,
"grad_norm": 0.4386017620563507,
"learning_rate": 0.0002616944606413994,
"loss": 3.1282,
"step": 96800
},
{
"epoch": 28.21969696969697,
"grad_norm": 0.4359148144721985,
"learning_rate": 0.0002615195335276968,
"loss": 3.1246,
"step": 96850
},
{
"epoch": 28.234265734265733,
"grad_norm": 0.44523295760154724,
"learning_rate": 0.00026134460641399416,
"loss": 3.123,
"step": 96900
},
{
"epoch": 28.248834498834498,
"grad_norm": 0.43056222796440125,
"learning_rate": 0.00026116967930029153,
"loss": 3.1311,
"step": 96950
},
{
"epoch": 28.263403263403262,
"grad_norm": 0.42719486355781555,
"learning_rate": 0.0002609947521865889,
"loss": 3.1312,
"step": 97000
},
{
"epoch": 28.263403263403262,
"eval_accuracy": 0.37481805753532843,
"eval_loss": 3.547030448913574,
"eval_runtime": 179.1721,
"eval_samples_per_second": 92.883,
"eval_steps_per_second": 5.81,
"step": 97000
},
{
"epoch": 28.277972027972027,
"grad_norm": 0.437563955783844,
"learning_rate": 0.0002608198250728863,
"loss": 3.1513,
"step": 97050
},
{
"epoch": 28.29254079254079,
"grad_norm": 0.4412456452846527,
"learning_rate": 0.00026064489795918366,
"loss": 3.1316,
"step": 97100
},
{
"epoch": 28.307109557109555,
"grad_norm": 0.418857216835022,
"learning_rate": 0.00026046997084548104,
"loss": 3.1284,
"step": 97150
},
{
"epoch": 28.32167832167832,
"grad_norm": 0.42892611026763916,
"learning_rate": 0.0002602950437317784,
"loss": 3.1356,
"step": 97200
},
{
"epoch": 28.336247086247088,
"grad_norm": 0.42652612924575806,
"learning_rate": 0.0002601201166180758,
"loss": 3.1433,
"step": 97250
},
{
"epoch": 28.350815850815852,
"grad_norm": 0.45473426580429077,
"learning_rate": 0.00025994518950437316,
"loss": 3.146,
"step": 97300
},
{
"epoch": 28.365384615384617,
"grad_norm": 0.47524523735046387,
"learning_rate": 0.00025977026239067054,
"loss": 3.1339,
"step": 97350
},
{
"epoch": 28.37995337995338,
"grad_norm": 0.4518192708492279,
"learning_rate": 0.0002595953352769679,
"loss": 3.1333,
"step": 97400
},
{
"epoch": 28.394522144522146,
"grad_norm": 0.46246474981307983,
"learning_rate": 0.0002594204081632653,
"loss": 3.1494,
"step": 97450
},
{
"epoch": 28.40909090909091,
"grad_norm": 0.40630584955215454,
"learning_rate": 0.00025924548104956267,
"loss": 3.1528,
"step": 97500
},
{
"epoch": 28.423659673659674,
"grad_norm": 0.45706915855407715,
"learning_rate": 0.00025907055393586004,
"loss": 3.1553,
"step": 97550
},
{
"epoch": 28.43822843822844,
"grad_norm": 0.4858422875404358,
"learning_rate": 0.0002588956268221574,
"loss": 3.1473,
"step": 97600
},
{
"epoch": 28.452797202797203,
"grad_norm": 0.4400869905948639,
"learning_rate": 0.0002587206997084548,
"loss": 3.1509,
"step": 97650
},
{
"epoch": 28.467365967365968,
"grad_norm": 0.4365074336528778,
"learning_rate": 0.00025854577259475217,
"loss": 3.1461,
"step": 97700
},
{
"epoch": 28.481934731934732,
"grad_norm": 0.4489445686340332,
"learning_rate": 0.00025837084548104955,
"loss": 3.1589,
"step": 97750
},
{
"epoch": 28.496503496503497,
"grad_norm": 0.45960038900375366,
"learning_rate": 0.0002581959183673469,
"loss": 3.1521,
"step": 97800
},
{
"epoch": 28.51107226107226,
"grad_norm": 0.4620950222015381,
"learning_rate": 0.0002580209912536443,
"loss": 3.1591,
"step": 97850
},
{
"epoch": 28.525641025641026,
"grad_norm": 0.4320875108242035,
"learning_rate": 0.0002578460641399417,
"loss": 3.155,
"step": 97900
},
{
"epoch": 28.54020979020979,
"grad_norm": 0.42469337582588196,
"learning_rate": 0.00025767113702623905,
"loss": 3.1591,
"step": 97950
},
{
"epoch": 28.554778554778554,
"grad_norm": 0.44742336869239807,
"learning_rate": 0.0002574962099125364,
"loss": 3.1501,
"step": 98000
},
{
"epoch": 28.554778554778554,
"eval_accuracy": 0.3747974791340891,
"eval_loss": 3.5462825298309326,
"eval_runtime": 179.092,
"eval_samples_per_second": 92.924,
"eval_steps_per_second": 5.813,
"step": 98000
},
{
"epoch": 28.56934731934732,
"grad_norm": 0.45411232113838196,
"learning_rate": 0.0002573212827988338,
"loss": 3.1527,
"step": 98050
},
{
"epoch": 28.583916083916083,
"grad_norm": 0.43855708837509155,
"learning_rate": 0.0002571463556851312,
"loss": 3.1614,
"step": 98100
},
{
"epoch": 28.598484848484848,
"grad_norm": 0.4427168369293213,
"learning_rate": 0.00025697142857142855,
"loss": 3.1581,
"step": 98150
},
{
"epoch": 28.613053613053612,
"grad_norm": 0.4582144618034363,
"learning_rate": 0.00025679650145772593,
"loss": 3.1501,
"step": 98200
},
{
"epoch": 28.627622377622377,
"grad_norm": 0.43789151310920715,
"learning_rate": 0.0002566215743440233,
"loss": 3.1521,
"step": 98250
},
{
"epoch": 28.64219114219114,
"grad_norm": 0.4331110417842865,
"learning_rate": 0.0002564466472303207,
"loss": 3.1604,
"step": 98300
},
{
"epoch": 28.656759906759905,
"grad_norm": 0.4316922128200531,
"learning_rate": 0.00025627172011661806,
"loss": 3.1523,
"step": 98350
},
{
"epoch": 28.67132867132867,
"grad_norm": 0.4386139512062073,
"learning_rate": 0.00025609679300291543,
"loss": 3.1586,
"step": 98400
},
{
"epoch": 28.685897435897434,
"grad_norm": 0.41735491156578064,
"learning_rate": 0.0002559218658892128,
"loss": 3.1663,
"step": 98450
},
{
"epoch": 28.7004662004662,
"grad_norm": 0.4343891441822052,
"learning_rate": 0.0002557469387755102,
"loss": 3.1526,
"step": 98500
},
{
"epoch": 28.715034965034967,
"grad_norm": 0.4205450415611267,
"learning_rate": 0.00025557201166180756,
"loss": 3.1696,
"step": 98550
},
{
"epoch": 28.72960372960373,
"grad_norm": 0.416477769613266,
"learning_rate": 0.00025539708454810493,
"loss": 3.1599,
"step": 98600
},
{
"epoch": 28.744172494172496,
"grad_norm": 0.4437694549560547,
"learning_rate": 0.0002552221574344023,
"loss": 3.1663,
"step": 98650
},
{
"epoch": 28.75874125874126,
"grad_norm": 0.4323379695415497,
"learning_rate": 0.0002550472303206997,
"loss": 3.1713,
"step": 98700
},
{
"epoch": 28.773310023310025,
"grad_norm": 0.4636428952217102,
"learning_rate": 0.00025487230320699706,
"loss": 3.1653,
"step": 98750
},
{
"epoch": 28.78787878787879,
"grad_norm": 0.45252299308776855,
"learning_rate": 0.00025469737609329444,
"loss": 3.1766,
"step": 98800
},
{
"epoch": 28.802447552447553,
"grad_norm": 0.44343459606170654,
"learning_rate": 0.0002545224489795918,
"loss": 3.157,
"step": 98850
},
{
"epoch": 28.817016317016318,
"grad_norm": 0.5136388540267944,
"learning_rate": 0.0002543475218658892,
"loss": 3.1567,
"step": 98900
},
{
"epoch": 28.831585081585082,
"grad_norm": 0.42482438683509827,
"learning_rate": 0.00025417259475218657,
"loss": 3.1711,
"step": 98950
},
{
"epoch": 28.846153846153847,
"grad_norm": 0.40038368105888367,
"learning_rate": 0.00025399766763848394,
"loss": 3.1644,
"step": 99000
},
{
"epoch": 28.846153846153847,
"eval_accuracy": 0.37489166941633306,
"eval_loss": 3.5355660915374756,
"eval_runtime": 181.4571,
"eval_samples_per_second": 91.713,
"eval_steps_per_second": 5.737,
"step": 99000
},
{
"epoch": 28.86072261072261,
"grad_norm": 0.4373658001422882,
"learning_rate": 0.0002538227405247813,
"loss": 3.1621,
"step": 99050
},
{
"epoch": 28.875291375291376,
"grad_norm": 0.40323692560195923,
"learning_rate": 0.0002536478134110787,
"loss": 3.1735,
"step": 99100
},
{
"epoch": 28.88986013986014,
"grad_norm": 0.4565911293029785,
"learning_rate": 0.00025347288629737607,
"loss": 3.1726,
"step": 99150
},
{
"epoch": 28.904428904428904,
"grad_norm": 0.41919973492622375,
"learning_rate": 0.00025329795918367344,
"loss": 3.1786,
"step": 99200
},
{
"epoch": 28.91899766899767,
"grad_norm": 0.4275195002555847,
"learning_rate": 0.0002531230320699708,
"loss": 3.1762,
"step": 99250
},
{
"epoch": 28.933566433566433,
"grad_norm": 0.41187289357185364,
"learning_rate": 0.0002529481049562682,
"loss": 3.175,
"step": 99300
},
{
"epoch": 28.948135198135198,
"grad_norm": 0.4252161383628845,
"learning_rate": 0.00025277317784256557,
"loss": 3.1768,
"step": 99350
},
{
"epoch": 28.962703962703962,
"grad_norm": 0.44886091351509094,
"learning_rate": 0.00025259825072886295,
"loss": 3.1724,
"step": 99400
},
{
"epoch": 28.977272727272727,
"grad_norm": 0.4043419361114502,
"learning_rate": 0.0002524233236151603,
"loss": 3.1809,
"step": 99450
},
{
"epoch": 28.99184149184149,
"grad_norm": 0.44792184233665466,
"learning_rate": 0.0002522483965014577,
"loss": 3.1773,
"step": 99500
},
{
"epoch": 29.006410256410255,
"grad_norm": 0.4384615421295166,
"learning_rate": 0.0002520734693877551,
"loss": 3.127,
"step": 99550
},
{
"epoch": 29.02097902097902,
"grad_norm": 0.453096479177475,
"learning_rate": 0.00025189854227405245,
"loss": 3.0883,
"step": 99600
},
{
"epoch": 29.035547785547784,
"grad_norm": 0.4151376187801361,
"learning_rate": 0.0002517236151603498,
"loss": 3.0826,
"step": 99650
},
{
"epoch": 29.05011655011655,
"grad_norm": 0.44585660099983215,
"learning_rate": 0.0002515486880466472,
"loss": 3.0854,
"step": 99700
},
{
"epoch": 29.064685314685313,
"grad_norm": 0.42738720774650574,
"learning_rate": 0.0002513737609329446,
"loss": 3.0954,
"step": 99750
},
{
"epoch": 29.079254079254078,
"grad_norm": 0.42302823066711426,
"learning_rate": 0.00025119883381924195,
"loss": 3.0988,
"step": 99800
},
{
"epoch": 29.093822843822842,
"grad_norm": 0.4460339844226837,
"learning_rate": 0.00025102390670553933,
"loss": 3.1087,
"step": 99850
},
{
"epoch": 29.10839160839161,
"grad_norm": 0.4409368634223938,
"learning_rate": 0.0002508489795918367,
"loss": 3.109,
"step": 99900
},
{
"epoch": 29.122960372960375,
"grad_norm": 0.429755836725235,
"learning_rate": 0.0002506740524781341,
"loss": 3.0967,
"step": 99950
},
{
"epoch": 29.13752913752914,
"grad_norm": 0.41378381848335266,
"learning_rate": 0.00025049912536443146,
"loss": 3.1114,
"step": 100000
},
{
"epoch": 29.13752913752914,
"eval_accuracy": 0.37418765291222006,
"eval_loss": 3.5503756999969482,
"eval_runtime": 180.604,
"eval_samples_per_second": 92.146,
"eval_steps_per_second": 5.764,
"step": 100000
},
{
"epoch": 29.152097902097903,
"grad_norm": 0.4601365327835083,
"learning_rate": 0.00025032419825072883,
"loss": 3.1114,
"step": 100050
},
{
"epoch": 29.166666666666668,
"grad_norm": 0.4332452714443207,
"learning_rate": 0.0002501492711370262,
"loss": 3.1138,
"step": 100100
},
{
"epoch": 29.181235431235432,
"grad_norm": 0.44746899604797363,
"learning_rate": 0.0002499743440233236,
"loss": 3.1031,
"step": 100150
},
{
"epoch": 29.195804195804197,
"grad_norm": 0.4199642539024353,
"learning_rate": 0.00024979941690962096,
"loss": 3.1175,
"step": 100200
},
{
"epoch": 29.21037296037296,
"grad_norm": 0.4360141456127167,
"learning_rate": 0.00024962448979591834,
"loss": 3.1169,
"step": 100250
},
{
"epoch": 29.224941724941726,
"grad_norm": 0.41213443875312805,
"learning_rate": 0.0002494495626822157,
"loss": 3.1143,
"step": 100300
},
{
"epoch": 29.23951048951049,
"grad_norm": 0.46815258264541626,
"learning_rate": 0.0002492746355685131,
"loss": 3.116,
"step": 100350
},
{
"epoch": 29.254079254079254,
"grad_norm": 0.4512742757797241,
"learning_rate": 0.00024909970845481046,
"loss": 3.1304,
"step": 100400
},
{
"epoch": 29.26864801864802,
"grad_norm": 0.4723387658596039,
"learning_rate": 0.00024892478134110784,
"loss": 3.1248,
"step": 100450
},
{
"epoch": 29.283216783216783,
"grad_norm": 0.4225069284439087,
"learning_rate": 0.0002487498542274052,
"loss": 3.1198,
"step": 100500
},
{
"epoch": 29.297785547785548,
"grad_norm": 0.4604575037956238,
"learning_rate": 0.0002485749271137026,
"loss": 3.125,
"step": 100550
},
{
"epoch": 29.312354312354312,
"grad_norm": 0.4449054002761841,
"learning_rate": 0.00024839999999999997,
"loss": 3.1151,
"step": 100600
},
{
"epoch": 29.326923076923077,
"grad_norm": 0.4241639971733093,
"learning_rate": 0.00024822507288629734,
"loss": 3.1222,
"step": 100650
},
{
"epoch": 29.34149184149184,
"grad_norm": 0.4389256238937378,
"learning_rate": 0.0002480501457725947,
"loss": 3.1388,
"step": 100700
},
{
"epoch": 29.356060606060606,
"grad_norm": 0.4419405460357666,
"learning_rate": 0.0002478752186588921,
"loss": 3.1262,
"step": 100750
},
{
"epoch": 29.37062937062937,
"grad_norm": 0.4478071630001068,
"learning_rate": 0.00024770029154518947,
"loss": 3.1314,
"step": 100800
},
{
"epoch": 29.385198135198134,
"grad_norm": 0.4380553960800171,
"learning_rate": 0.00024752536443148685,
"loss": 3.1442,
"step": 100850
},
{
"epoch": 29.3997668997669,
"grad_norm": 0.45655930042266846,
"learning_rate": 0.0002473504373177842,
"loss": 3.1258,
"step": 100900
},
{
"epoch": 29.414335664335663,
"grad_norm": 0.4610624611377716,
"learning_rate": 0.0002471755102040816,
"loss": 3.1336,
"step": 100950
},
{
"epoch": 29.428904428904428,
"grad_norm": 0.4348837733268738,
"learning_rate": 0.000247000583090379,
"loss": 3.1421,
"step": 101000
},
{
"epoch": 29.428904428904428,
"eval_accuracy": 0.37460063202737703,
"eval_loss": 3.5466175079345703,
"eval_runtime": 180.3672,
"eval_samples_per_second": 92.267,
"eval_steps_per_second": 5.772,
"step": 101000
},
{
"epoch": 29.443473193473192,
"grad_norm": 0.42182600498199463,
"learning_rate": 0.00024682565597667635,
"loss": 3.1319,
"step": 101050
},
{
"epoch": 29.458041958041957,
"grad_norm": 0.4220890402793884,
"learning_rate": 0.0002466507288629737,
"loss": 3.1315,
"step": 101100
},
{
"epoch": 29.47261072261072,
"grad_norm": 0.45002591609954834,
"learning_rate": 0.0002464758017492711,
"loss": 3.1372,
"step": 101150
},
{
"epoch": 29.487179487179485,
"grad_norm": 0.44506534934043884,
"learning_rate": 0.0002463008746355685,
"loss": 3.1375,
"step": 101200
},
{
"epoch": 29.501748251748253,
"grad_norm": 0.44931530952453613,
"learning_rate": 0.00024612594752186585,
"loss": 3.1525,
"step": 101250
},
{
"epoch": 29.516317016317018,
"grad_norm": 0.4678424298763275,
"learning_rate": 0.00024595102040816323,
"loss": 3.1535,
"step": 101300
},
{
"epoch": 29.530885780885782,
"grad_norm": 0.4373933672904968,
"learning_rate": 0.0002457760932944606,
"loss": 3.1546,
"step": 101350
},
{
"epoch": 29.545454545454547,
"grad_norm": 0.4528273940086365,
"learning_rate": 0.000245601166180758,
"loss": 3.1348,
"step": 101400
},
{
"epoch": 29.56002331002331,
"grad_norm": 0.44172629714012146,
"learning_rate": 0.0002454262390670554,
"loss": 3.1459,
"step": 101450
},
{
"epoch": 29.574592074592076,
"grad_norm": 0.43117034435272217,
"learning_rate": 0.00024525131195335273,
"loss": 3.1472,
"step": 101500
},
{
"epoch": 29.58916083916084,
"grad_norm": 0.42502203583717346,
"learning_rate": 0.0002450763848396501,
"loss": 3.1463,
"step": 101550
},
{
"epoch": 29.603729603729604,
"grad_norm": 0.4515135586261749,
"learning_rate": 0.0002449014577259475,
"loss": 3.1579,
"step": 101600
},
{
"epoch": 29.61829836829837,
"grad_norm": 0.47481200098991394,
"learning_rate": 0.0002447265306122449,
"loss": 3.1538,
"step": 101650
},
{
"epoch": 29.632867132867133,
"grad_norm": 0.43279948830604553,
"learning_rate": 0.00024455160349854223,
"loss": 3.1372,
"step": 101700
},
{
"epoch": 29.647435897435898,
"grad_norm": 0.4356144964694977,
"learning_rate": 0.0002443766763848396,
"loss": 3.1568,
"step": 101750
},
{
"epoch": 29.662004662004662,
"grad_norm": 0.43911242485046387,
"learning_rate": 0.000244201749271137,
"loss": 3.1467,
"step": 101800
},
{
"epoch": 29.676573426573427,
"grad_norm": 0.420939564704895,
"learning_rate": 0.0002440268221574344,
"loss": 3.1472,
"step": 101850
},
{
"epoch": 29.69114219114219,
"grad_norm": 0.433510959148407,
"learning_rate": 0.00024385189504373176,
"loss": 3.1612,
"step": 101900
},
{
"epoch": 29.705710955710956,
"grad_norm": 0.4388565719127655,
"learning_rate": 0.0002436769679300291,
"loss": 3.1629,
"step": 101950
},
{
"epoch": 29.72027972027972,
"grad_norm": 0.43283745646476746,
"learning_rate": 0.00024350204081632652,
"loss": 3.1544,
"step": 102000
},
{
"epoch": 29.72027972027972,
"eval_accuracy": 0.37507475839193083,
"eval_loss": 3.542076826095581,
"eval_runtime": 180.2077,
"eval_samples_per_second": 92.349,
"eval_steps_per_second": 5.777,
"step": 102000
},
{
"epoch": 29.734848484848484,
"grad_norm": 0.43232461810112,
"learning_rate": 0.0002433271137026239,
"loss": 3.1606,
"step": 102050
},
{
"epoch": 29.74941724941725,
"grad_norm": 0.4560996890068054,
"learning_rate": 0.00024315218658892127,
"loss": 3.1453,
"step": 102100
},
{
"epoch": 29.763986013986013,
"grad_norm": 0.4297953248023987,
"learning_rate": 0.00024297725947521862,
"loss": 3.1618,
"step": 102150
},
{
"epoch": 29.778554778554778,
"grad_norm": 0.4410262405872345,
"learning_rate": 0.00024280233236151602,
"loss": 3.1584,
"step": 102200
},
{
"epoch": 29.793123543123542,
"grad_norm": 0.45969489216804504,
"learning_rate": 0.0002426274052478134,
"loss": 3.1546,
"step": 102250
},
{
"epoch": 29.807692307692307,
"grad_norm": 0.432098388671875,
"learning_rate": 0.00024245247813411077,
"loss": 3.159,
"step": 102300
},
{
"epoch": 29.82226107226107,
"grad_norm": 0.43258893489837646,
"learning_rate": 0.00024227755102040815,
"loss": 3.1638,
"step": 102350
},
{
"epoch": 29.836829836829835,
"grad_norm": 0.42827901244163513,
"learning_rate": 0.0002421026239067055,
"loss": 3.1682,
"step": 102400
},
{
"epoch": 29.8513986013986,
"grad_norm": 0.42459243535995483,
"learning_rate": 0.0002419276967930029,
"loss": 3.168,
"step": 102450
},
{
"epoch": 29.865967365967364,
"grad_norm": 0.4142208993434906,
"learning_rate": 0.00024175276967930027,
"loss": 3.1613,
"step": 102500
},
{
"epoch": 29.88053613053613,
"grad_norm": 0.4322669804096222,
"learning_rate": 0.00024157784256559765,
"loss": 3.1545,
"step": 102550
},
{
"epoch": 29.895104895104897,
"grad_norm": 0.47667908668518066,
"learning_rate": 0.000241402915451895,
"loss": 3.1543,
"step": 102600
},
{
"epoch": 29.90967365967366,
"grad_norm": 0.41840967535972595,
"learning_rate": 0.0002412279883381924,
"loss": 3.16,
"step": 102650
},
{
"epoch": 29.924242424242426,
"grad_norm": 0.41584140062332153,
"learning_rate": 0.00024105306122448978,
"loss": 3.1566,
"step": 102700
},
{
"epoch": 29.93881118881119,
"grad_norm": 0.43997734785079956,
"learning_rate": 0.00024087813411078715,
"loss": 3.1648,
"step": 102750
},
{
"epoch": 29.953379953379955,
"grad_norm": 0.44305679202079773,
"learning_rate": 0.00024070320699708453,
"loss": 3.174,
"step": 102800
},
{
"epoch": 29.96794871794872,
"grad_norm": 0.4263700246810913,
"learning_rate": 0.0002405282798833819,
"loss": 3.1631,
"step": 102850
},
{
"epoch": 29.982517482517483,
"grad_norm": 0.4472276270389557,
"learning_rate": 0.00024035335276967928,
"loss": 3.1648,
"step": 102900
},
{
"epoch": 29.997086247086248,
"grad_norm": 0.44121333956718445,
"learning_rate": 0.00024017842565597666,
"loss": 3.1706,
"step": 102950
},
{
"epoch": 30.011655011655012,
"grad_norm": 0.44513824582099915,
"learning_rate": 0.00024000349854227403,
"loss": 3.1002,
"step": 103000
},
{
"epoch": 30.011655011655012,
"eval_accuracy": 0.3748071215849555,
"eval_loss": 3.5492284297943115,
"eval_runtime": 180.2511,
"eval_samples_per_second": 92.327,
"eval_steps_per_second": 5.775,
"step": 103000
},
{
"epoch": 30.026223776223777,
"grad_norm": 0.45389237999916077,
"learning_rate": 0.00023982857142857138,
"loss": 3.0733,
"step": 103050
},
{
"epoch": 30.04079254079254,
"grad_norm": 0.45321691036224365,
"learning_rate": 0.00023965364431486878,
"loss": 3.0796,
"step": 103100
},
{
"epoch": 30.055361305361306,
"grad_norm": 0.43791627883911133,
"learning_rate": 0.00023947871720116616,
"loss": 3.0821,
"step": 103150
},
{
"epoch": 30.06993006993007,
"grad_norm": 0.43642351031303406,
"learning_rate": 0.00023930379008746353,
"loss": 3.0828,
"step": 103200
},
{
"epoch": 30.084498834498834,
"grad_norm": 0.45049965381622314,
"learning_rate": 0.00023912886297376094,
"loss": 3.0869,
"step": 103250
},
{
"epoch": 30.0990675990676,
"grad_norm": 0.42755648493766785,
"learning_rate": 0.00023895393586005829,
"loss": 3.0856,
"step": 103300
},
{
"epoch": 30.113636363636363,
"grad_norm": 0.43771249055862427,
"learning_rate": 0.00023877900874635566,
"loss": 3.0936,
"step": 103350
},
{
"epoch": 30.128205128205128,
"grad_norm": 0.44602829217910767,
"learning_rate": 0.00023860408163265304,
"loss": 3.1002,
"step": 103400
},
{
"epoch": 30.142773892773892,
"grad_norm": 0.46566757559776306,
"learning_rate": 0.00023842915451895041,
"loss": 3.099,
"step": 103450
},
{
"epoch": 30.157342657342657,
"grad_norm": 0.42642471194267273,
"learning_rate": 0.0002382542274052478,
"loss": 3.1071,
"step": 103500
},
{
"epoch": 30.17191142191142,
"grad_norm": 0.43897151947021484,
"learning_rate": 0.00023807930029154517,
"loss": 3.1069,
"step": 103550
},
{
"epoch": 30.186480186480185,
"grad_norm": 0.4233187139034271,
"learning_rate": 0.00023790437317784254,
"loss": 3.1073,
"step": 103600
},
{
"epoch": 30.20104895104895,
"grad_norm": 0.44739827513694763,
"learning_rate": 0.00023772944606413992,
"loss": 3.1098,
"step": 103650
},
{
"epoch": 30.215617715617714,
"grad_norm": 0.42843517661094666,
"learning_rate": 0.00023755451895043732,
"loss": 3.1015,
"step": 103700
},
{
"epoch": 30.23018648018648,
"grad_norm": 0.44605836272239685,
"learning_rate": 0.00023737959183673467,
"loss": 3.1081,
"step": 103750
},
{
"epoch": 30.244755244755243,
"grad_norm": 0.42383548617362976,
"learning_rate": 0.00023720466472303204,
"loss": 3.1034,
"step": 103800
},
{
"epoch": 30.259324009324008,
"grad_norm": 0.4467232823371887,
"learning_rate": 0.00023702973760932942,
"loss": 3.1198,
"step": 103850
},
{
"epoch": 30.273892773892776,
"grad_norm": 0.44068634510040283,
"learning_rate": 0.00023685481049562682,
"loss": 3.1312,
"step": 103900
},
{
"epoch": 30.28846153846154,
"grad_norm": 0.43057772517204285,
"learning_rate": 0.00023667988338192417,
"loss": 3.124,
"step": 103950
},
{
"epoch": 30.303030303030305,
"grad_norm": 0.461153507232666,
"learning_rate": 0.00023650495626822155,
"loss": 3.1165,
"step": 104000
},
{
"epoch": 30.303030303030305,
"eval_accuracy": 0.3746876492669033,
"eval_loss": 3.5488526821136475,
"eval_runtime": 180.1429,
"eval_samples_per_second": 92.382,
"eval_steps_per_second": 5.779,
"step": 104000
},
{
"epoch": 30.31759906759907,
"grad_norm": 0.44713693857192993,
"learning_rate": 0.00023633002915451892,
"loss": 3.1326,
"step": 104050
},
{
"epoch": 30.332167832167833,
"grad_norm": 0.44030696153640747,
"learning_rate": 0.00023615510204081633,
"loss": 3.121,
"step": 104100
},
{
"epoch": 30.346736596736598,
"grad_norm": 0.43711525201797485,
"learning_rate": 0.0002359801749271137,
"loss": 3.1182,
"step": 104150
},
{
"epoch": 30.361305361305362,
"grad_norm": 0.4294222295284271,
"learning_rate": 0.00023580524781341105,
"loss": 3.1187,
"step": 104200
},
{
"epoch": 30.375874125874127,
"grad_norm": 0.43076273798942566,
"learning_rate": 0.00023563032069970843,
"loss": 3.12,
"step": 104250
},
{
"epoch": 30.39044289044289,
"grad_norm": 0.4725896120071411,
"learning_rate": 0.0002354553935860058,
"loss": 3.1244,
"step": 104300
},
{
"epoch": 30.405011655011656,
"grad_norm": 0.45083412528038025,
"learning_rate": 0.0002352804664723032,
"loss": 3.1204,
"step": 104350
},
{
"epoch": 30.41958041958042,
"grad_norm": 0.4314779043197632,
"learning_rate": 0.00023510553935860055,
"loss": 3.1339,
"step": 104400
},
{
"epoch": 30.434149184149184,
"grad_norm": 0.4679701626300812,
"learning_rate": 0.00023493061224489793,
"loss": 3.1318,
"step": 104450
},
{
"epoch": 30.44871794871795,
"grad_norm": 0.47724515199661255,
"learning_rate": 0.0002347556851311953,
"loss": 3.1144,
"step": 104500
},
{
"epoch": 30.463286713286713,
"grad_norm": 0.46651652455329895,
"learning_rate": 0.0002345807580174927,
"loss": 3.1216,
"step": 104550
},
{
"epoch": 30.477855477855478,
"grad_norm": 0.3993292450904846,
"learning_rate": 0.00023440583090379008,
"loss": 3.1345,
"step": 104600
},
{
"epoch": 30.492424242424242,
"grad_norm": 0.4892179071903229,
"learning_rate": 0.00023423090379008743,
"loss": 3.1346,
"step": 104650
},
{
"epoch": 30.506993006993007,
"grad_norm": 0.4761413633823395,
"learning_rate": 0.0002340559766763848,
"loss": 3.1327,
"step": 104700
},
{
"epoch": 30.52156177156177,
"grad_norm": 0.4577150344848633,
"learning_rate": 0.0002338810495626822,
"loss": 3.1338,
"step": 104750
},
{
"epoch": 30.536130536130536,
"grad_norm": 0.4509866535663605,
"learning_rate": 0.0002337061224489796,
"loss": 3.134,
"step": 104800
},
{
"epoch": 30.5506993006993,
"grad_norm": 0.4465436339378357,
"learning_rate": 0.00023353119533527694,
"loss": 3.1374,
"step": 104850
},
{
"epoch": 30.565268065268064,
"grad_norm": 0.4383067786693573,
"learning_rate": 0.0002333562682215743,
"loss": 3.1411,
"step": 104900
},
{
"epoch": 30.57983682983683,
"grad_norm": 0.4641270041465759,
"learning_rate": 0.0002331813411078717,
"loss": 3.1356,
"step": 104950
},
{
"epoch": 30.594405594405593,
"grad_norm": 0.47275739908218384,
"learning_rate": 0.0002330064139941691,
"loss": 3.1392,
"step": 105000
},
{
"epoch": 30.594405594405593,
"eval_accuracy": 0.37494376216918457,
"eval_loss": 3.54292893409729,
"eval_runtime": 179.8448,
"eval_samples_per_second": 92.535,
"eval_steps_per_second": 5.788,
"step": 105000
},
{
"epoch": 30.608974358974358,
"grad_norm": 0.4609619081020355,
"learning_rate": 0.00023283148688046647,
"loss": 3.1408,
"step": 105050
},
{
"epoch": 30.623543123543122,
"grad_norm": 0.4698541462421417,
"learning_rate": 0.00023265655976676381,
"loss": 3.1478,
"step": 105100
},
{
"epoch": 30.638111888111887,
"grad_norm": 0.42247274518013,
"learning_rate": 0.0002324816326530612,
"loss": 3.1404,
"step": 105150
},
{
"epoch": 30.65268065268065,
"grad_norm": 0.4531615078449249,
"learning_rate": 0.0002323067055393586,
"loss": 3.1442,
"step": 105200
},
{
"epoch": 30.667249417249415,
"grad_norm": 0.44655829668045044,
"learning_rate": 0.00023213177842565597,
"loss": 3.1468,
"step": 105250
},
{
"epoch": 30.681818181818183,
"grad_norm": 0.45186808705329895,
"learning_rate": 0.00023195685131195332,
"loss": 3.1486,
"step": 105300
},
{
"epoch": 30.696386946386948,
"grad_norm": 0.43957528471946716,
"learning_rate": 0.0002317819241982507,
"loss": 3.1417,
"step": 105350
},
{
"epoch": 30.710955710955712,
"grad_norm": 0.46366026997566223,
"learning_rate": 0.0002316069970845481,
"loss": 3.1572,
"step": 105400
},
{
"epoch": 30.725524475524477,
"grad_norm": 0.4653265178203583,
"learning_rate": 0.00023143206997084547,
"loss": 3.1405,
"step": 105450
},
{
"epoch": 30.74009324009324,
"grad_norm": 0.46687573194503784,
"learning_rate": 0.00023125714285714285,
"loss": 3.1464,
"step": 105500
},
{
"epoch": 30.754662004662006,
"grad_norm": 0.4489416480064392,
"learning_rate": 0.0002310822157434402,
"loss": 3.15,
"step": 105550
},
{
"epoch": 30.76923076923077,
"grad_norm": 0.45375025272369385,
"learning_rate": 0.00023090728862973757,
"loss": 3.1586,
"step": 105600
},
{
"epoch": 30.783799533799534,
"grad_norm": 0.44260141253471375,
"learning_rate": 0.00023073236151603498,
"loss": 3.143,
"step": 105650
},
{
"epoch": 30.7983682983683,
"grad_norm": 0.42545315623283386,
"learning_rate": 0.00023055743440233235,
"loss": 3.1566,
"step": 105700
},
{
"epoch": 30.812937062937063,
"grad_norm": 0.4167237877845764,
"learning_rate": 0.0002303825072886297,
"loss": 3.1505,
"step": 105750
},
{
"epoch": 30.827505827505828,
"grad_norm": 0.414233922958374,
"learning_rate": 0.00023020758017492708,
"loss": 3.1588,
"step": 105800
},
{
"epoch": 30.842074592074592,
"grad_norm": 0.4713982939720154,
"learning_rate": 0.00023003265306122448,
"loss": 3.1509,
"step": 105850
},
{
"epoch": 30.856643356643357,
"grad_norm": 0.4120693504810333,
"learning_rate": 0.00022985772594752185,
"loss": 3.1542,
"step": 105900
},
{
"epoch": 30.87121212121212,
"grad_norm": 0.4330739974975586,
"learning_rate": 0.00022968279883381923,
"loss": 3.1513,
"step": 105950
},
{
"epoch": 30.885780885780886,
"grad_norm": 0.48839515447616577,
"learning_rate": 0.00022950787172011658,
"loss": 3.1409,
"step": 106000
},
{
"epoch": 30.885780885780886,
"eval_accuracy": 0.3753570940569342,
"eval_loss": 3.541912794113159,
"eval_runtime": 179.7698,
"eval_samples_per_second": 92.574,
"eval_steps_per_second": 5.791,
"step": 106000
},
{
"epoch": 30.90034965034965,
"grad_norm": 0.4531843662261963,
"learning_rate": 0.00022933294460641398,
"loss": 3.1547,
"step": 106050
},
{
"epoch": 30.914918414918414,
"grad_norm": 0.4466068744659424,
"learning_rate": 0.00022915801749271136,
"loss": 3.1528,
"step": 106100
},
{
"epoch": 30.92948717948718,
"grad_norm": 0.4656772017478943,
"learning_rate": 0.00022898309037900873,
"loss": 3.1565,
"step": 106150
},
{
"epoch": 30.944055944055943,
"grad_norm": 0.43916213512420654,
"learning_rate": 0.00022880816326530608,
"loss": 3.1612,
"step": 106200
},
{
"epoch": 30.958624708624708,
"grad_norm": 0.4404742419719696,
"learning_rate": 0.00022863323615160349,
"loss": 3.1599,
"step": 106250
},
{
"epoch": 30.973193473193472,
"grad_norm": 0.46946898102760315,
"learning_rate": 0.00022845830903790086,
"loss": 3.1581,
"step": 106300
},
{
"epoch": 30.987762237762237,
"grad_norm": 0.46753212809562683,
"learning_rate": 0.00022828338192419824,
"loss": 3.1671,
"step": 106350
},
{
"epoch": 31.002331002331,
"grad_norm": 0.4661099910736084,
"learning_rate": 0.0002281084548104956,
"loss": 3.1437,
"step": 106400
},
{
"epoch": 31.016899766899765,
"grad_norm": 0.49687930941581726,
"learning_rate": 0.00022793352769679296,
"loss": 3.0715,
"step": 106450
},
{
"epoch": 31.03146853146853,
"grad_norm": 0.44716677069664,
"learning_rate": 0.00022775860058309036,
"loss": 3.0721,
"step": 106500
},
{
"epoch": 31.046037296037294,
"grad_norm": 0.4831984341144562,
"learning_rate": 0.00022758367346938774,
"loss": 3.0636,
"step": 106550
},
{
"epoch": 31.060606060606062,
"grad_norm": 0.42915815114974976,
"learning_rate": 0.00022740874635568512,
"loss": 3.0796,
"step": 106600
},
{
"epoch": 31.075174825174827,
"grad_norm": 0.46241822838783264,
"learning_rate": 0.00022723381924198246,
"loss": 3.0771,
"step": 106650
},
{
"epoch": 31.08974358974359,
"grad_norm": 0.447934627532959,
"learning_rate": 0.00022705889212827987,
"loss": 3.0837,
"step": 106700
},
{
"epoch": 31.104312354312356,
"grad_norm": 0.44860297441482544,
"learning_rate": 0.00022688396501457724,
"loss": 3.0931,
"step": 106750
},
{
"epoch": 31.11888111888112,
"grad_norm": 0.41188955307006836,
"learning_rate": 0.00022670903790087462,
"loss": 3.0743,
"step": 106800
},
{
"epoch": 31.133449883449885,
"grad_norm": 0.4889490604400635,
"learning_rate": 0.000226534110787172,
"loss": 3.0849,
"step": 106850
},
{
"epoch": 31.14801864801865,
"grad_norm": 0.4424610435962677,
"learning_rate": 0.00022635918367346937,
"loss": 3.0869,
"step": 106900
},
{
"epoch": 31.162587412587413,
"grad_norm": 0.4582122564315796,
"learning_rate": 0.00022618425655976675,
"loss": 3.1015,
"step": 106950
},
{
"epoch": 31.177156177156178,
"grad_norm": 0.44879457354545593,
"learning_rate": 0.00022600932944606412,
"loss": 3.101,
"step": 107000
},
{
"epoch": 31.177156177156178,
"eval_accuracy": 0.37432429349644913,
"eval_loss": 3.5554325580596924,
"eval_runtime": 179.6724,
"eval_samples_per_second": 92.624,
"eval_steps_per_second": 5.794,
"step": 107000
},
{
"epoch": 31.191724941724942,
"grad_norm": 0.4423729181289673,
"learning_rate": 0.0002258344023323615,
"loss": 3.1023,
"step": 107050
},
{
"epoch": 31.206293706293707,
"grad_norm": 0.4656815528869629,
"learning_rate": 0.00022565947521865885,
"loss": 3.0914,
"step": 107100
},
{
"epoch": 31.22086247086247,
"grad_norm": 0.4330950975418091,
"learning_rate": 0.00022548454810495625,
"loss": 3.105,
"step": 107150
},
{
"epoch": 31.235431235431236,
"grad_norm": 0.48956480622291565,
"learning_rate": 0.00022530962099125363,
"loss": 3.1062,
"step": 107200
},
{
"epoch": 31.25,
"grad_norm": 0.4430369436740875,
"learning_rate": 0.000225134693877551,
"loss": 3.0958,
"step": 107250
},
{
"epoch": 31.264568764568764,
"grad_norm": 0.43993812799453735,
"learning_rate": 0.0002249597667638484,
"loss": 3.1047,
"step": 107300
},
{
"epoch": 31.27913752913753,
"grad_norm": 0.4541182518005371,
"learning_rate": 0.00022478483965014575,
"loss": 3.1035,
"step": 107350
},
{
"epoch": 31.293706293706293,
"grad_norm": 0.4626857340335846,
"learning_rate": 0.00022460991253644313,
"loss": 3.1051,
"step": 107400
},
{
"epoch": 31.308275058275058,
"grad_norm": 0.46208930015563965,
"learning_rate": 0.0002244349854227405,
"loss": 3.1121,
"step": 107450
},
{
"epoch": 31.322843822843822,
"grad_norm": 0.4782877266407013,
"learning_rate": 0.00022426005830903788,
"loss": 3.1049,
"step": 107500
},
{
"epoch": 31.337412587412587,
"grad_norm": 0.498509019613266,
"learning_rate": 0.00022408513119533526,
"loss": 3.1114,
"step": 107550
},
{
"epoch": 31.35198135198135,
"grad_norm": 0.44582095742225647,
"learning_rate": 0.00022391020408163263,
"loss": 3.1155,
"step": 107600
},
{
"epoch": 31.366550116550115,
"grad_norm": 0.4745688736438751,
"learning_rate": 0.00022373527696793,
"loss": 3.1096,
"step": 107650
},
{
"epoch": 31.38111888111888,
"grad_norm": 0.4498319625854492,
"learning_rate": 0.00022356034985422738,
"loss": 3.1138,
"step": 107700
},
{
"epoch": 31.395687645687644,
"grad_norm": 0.4679555594921112,
"learning_rate": 0.00022338542274052479,
"loss": 3.1141,
"step": 107750
},
{
"epoch": 31.41025641025641,
"grad_norm": 0.44934287667274475,
"learning_rate": 0.00022321049562682213,
"loss": 3.1055,
"step": 107800
},
{
"epoch": 31.424825174825173,
"grad_norm": 0.4695224165916443,
"learning_rate": 0.0002230355685131195,
"loss": 3.1201,
"step": 107850
},
{
"epoch": 31.439393939393938,
"grad_norm": 0.46642690896987915,
"learning_rate": 0.00022286064139941689,
"loss": 3.1202,
"step": 107900
},
{
"epoch": 31.453962703962706,
"grad_norm": 0.4324790835380554,
"learning_rate": 0.0002226857142857143,
"loss": 3.131,
"step": 107950
},
{
"epoch": 31.46853146853147,
"grad_norm": 0.4190344214439392,
"learning_rate": 0.00022251078717201164,
"loss": 3.1215,
"step": 108000
},
{
"epoch": 31.46853146853147,
"eval_accuracy": 0.37509957006428224,
"eval_loss": 3.5474321842193604,
"eval_runtime": 179.0221,
"eval_samples_per_second": 92.961,
"eval_steps_per_second": 5.815,
"step": 108000
},
{
"epoch": 31.483100233100235,
"grad_norm": 0.45015689730644226,
"learning_rate": 0.00022233586005830901,
"loss": 3.1334,
"step": 108050
},
{
"epoch": 31.497668997669,
"grad_norm": 0.45660534501075745,
"learning_rate": 0.0002221609329446064,
"loss": 3.1244,
"step": 108100
},
{
"epoch": 31.512237762237763,
"grad_norm": 0.4564882218837738,
"learning_rate": 0.0002219860058309038,
"loss": 3.1267,
"step": 108150
},
{
"epoch": 31.526806526806528,
"grad_norm": 0.4678419828414917,
"learning_rate": 0.00022181107871720117,
"loss": 3.1248,
"step": 108200
},
{
"epoch": 31.541375291375292,
"grad_norm": 0.4461950957775116,
"learning_rate": 0.00022163615160349852,
"loss": 3.138,
"step": 108250
},
{
"epoch": 31.555944055944057,
"grad_norm": 0.4868103563785553,
"learning_rate": 0.0002214612244897959,
"loss": 3.1406,
"step": 108300
},
{
"epoch": 31.57051282051282,
"grad_norm": 0.4507032036781311,
"learning_rate": 0.00022128629737609327,
"loss": 3.1398,
"step": 108350
},
{
"epoch": 31.585081585081586,
"grad_norm": 0.44092482328414917,
"learning_rate": 0.00022111137026239067,
"loss": 3.1261,
"step": 108400
},
{
"epoch": 31.59965034965035,
"grad_norm": 0.43679946660995483,
"learning_rate": 0.00022093644314868802,
"loss": 3.1314,
"step": 108450
},
{
"epoch": 31.614219114219114,
"grad_norm": 0.4783121645450592,
"learning_rate": 0.0002207615160349854,
"loss": 3.1387,
"step": 108500
},
{
"epoch": 31.62878787878788,
"grad_norm": 0.4393003284931183,
"learning_rate": 0.00022058658892128277,
"loss": 3.1268,
"step": 108550
},
{
"epoch": 31.643356643356643,
"grad_norm": 0.45997878909111023,
"learning_rate": 0.00022041166180758017,
"loss": 3.1362,
"step": 108600
},
{
"epoch": 31.657925407925408,
"grad_norm": 0.4729020297527313,
"learning_rate": 0.00022023673469387755,
"loss": 3.1391,
"step": 108650
},
{
"epoch": 31.672494172494172,
"grad_norm": 0.43747490644454956,
"learning_rate": 0.0002200618075801749,
"loss": 3.1279,
"step": 108700
},
{
"epoch": 31.687062937062937,
"grad_norm": 0.45657721161842346,
"learning_rate": 0.00021988688046647227,
"loss": 3.1444,
"step": 108750
},
{
"epoch": 31.7016317016317,
"grad_norm": 0.43069857358932495,
"learning_rate": 0.00021971195335276968,
"loss": 3.1376,
"step": 108800
},
{
"epoch": 31.716200466200466,
"grad_norm": 0.4645999073982239,
"learning_rate": 0.00021953702623906705,
"loss": 3.1327,
"step": 108850
},
{
"epoch": 31.73076923076923,
"grad_norm": 0.44925370812416077,
"learning_rate": 0.0002193620991253644,
"loss": 3.1401,
"step": 108900
},
{
"epoch": 31.745337995337994,
"grad_norm": 0.49291902780532837,
"learning_rate": 0.00021918717201166178,
"loss": 3.1433,
"step": 108950
},
{
"epoch": 31.75990675990676,
"grad_norm": 0.4575839042663574,
"learning_rate": 0.00021901224489795915,
"loss": 3.1317,
"step": 109000
},
{
"epoch": 31.75990675990676,
"eval_accuracy": 0.37544023079794103,
"eval_loss": 3.538872003555298,
"eval_runtime": 179.1437,
"eval_samples_per_second": 92.898,
"eval_steps_per_second": 5.811,
"step": 109000
},
{
"epoch": 31.75990675990676,
"step": 109000,
"total_flos": 2.278434118828032e+18,
"train_loss": 0.8388565721424348,
"train_runtime": 57880.4577,
"train_samples_per_second": 237.175,
"train_steps_per_second": 2.965
}
],
"logging_steps": 50,
"max_steps": 171600,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 10000,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 20,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 11
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.278434118828032e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}