{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 19.942103570279833,
"eval_steps": 500,
"global_step": 62000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0321646831778707,
"grad_norm": 1.2931797504425049,
"learning_rate": 4.992039240913477e-05,
"loss": 1.7938,
"step": 100
},
{
"epoch": 0.0643293663557414,
"grad_norm": 1.2261773347854614,
"learning_rate": 4.98399807011901e-05,
"loss": 1.4694,
"step": 200
},
{
"epoch": 0.0964940495336121,
"grad_norm": 1.1642584800720215,
"learning_rate": 4.975956899324542e-05,
"loss": 1.3725,
"step": 300
},
{
"epoch": 0.1286587327114828,
"grad_norm": 1.1396938562393188,
"learning_rate": 4.967915728530074e-05,
"loss": 1.3302,
"step": 400
},
{
"epoch": 0.16082341588935348,
"grad_norm": 1.0271061658859253,
"learning_rate": 4.959874557735607e-05,
"loss": 1.2981,
"step": 500
},
{
"epoch": 0.1929880990672242,
"grad_norm": 1.0276116132736206,
"learning_rate": 4.951833386941139e-05,
"loss": 1.2702,
"step": 600
},
{
"epoch": 0.22515278224509488,
"grad_norm": 0.9277088046073914,
"learning_rate": 4.943792216146671e-05,
"loss": 1.2354,
"step": 700
},
{
"epoch": 0.2573174654229656,
"grad_norm": 0.8870278596878052,
"learning_rate": 4.935751045352204e-05,
"loss": 1.2262,
"step": 800
},
{
"epoch": 0.28948214860083626,
"grad_norm": 0.9166188836097717,
"learning_rate": 4.927709874557736e-05,
"loss": 1.2211,
"step": 900
},
{
"epoch": 0.32164683177870695,
"grad_norm": 0.9699078798294067,
"learning_rate": 4.9196687037632685e-05,
"loss": 1.1927,
"step": 1000
},
{
"epoch": 0.3538115149565777,
"grad_norm": 0.8429589867591858,
"learning_rate": 4.9116275329688006e-05,
"loss": 1.1822,
"step": 1100
},
{
"epoch": 0.3859761981344484,
"grad_norm": 0.921406090259552,
"learning_rate": 4.9035863621743327e-05,
"loss": 1.1754,
"step": 1200
},
{
"epoch": 0.4181408813123191,
"grad_norm": 0.9048774242401123,
"learning_rate": 4.895545191379865e-05,
"loss": 1.1757,
"step": 1300
},
{
"epoch": 0.45030556449018977,
"grad_norm": 0.8832495808601379,
"learning_rate": 4.8875040205853975e-05,
"loss": 1.156,
"step": 1400
},
{
"epoch": 0.48247024766806046,
"grad_norm": 1.008846640586853,
"learning_rate": 4.87946284979093e-05,
"loss": 1.1307,
"step": 1500
},
{
"epoch": 0.5146349308459311,
"grad_norm": 0.9244402647018433,
"learning_rate": 4.871421678996462e-05,
"loss": 1.1405,
"step": 1600
},
{
"epoch": 0.5467996140238018,
"grad_norm": 0.8623345494270325,
"learning_rate": 4.8633805082019944e-05,
"loss": 1.117,
"step": 1700
},
{
"epoch": 0.5789642972016725,
"grad_norm": 0.8600289821624756,
"learning_rate": 4.8553393374075265e-05,
"loss": 1.1364,
"step": 1800
},
{
"epoch": 0.6111289803795432,
"grad_norm": 0.8522710800170898,
"learning_rate": 4.847298166613059e-05,
"loss": 1.1196,
"step": 1900
},
{
"epoch": 0.6432936635574139,
"grad_norm": 0.8213996291160583,
"learning_rate": 4.839256995818591e-05,
"loss": 1.0908,
"step": 2000
},
{
"epoch": 0.6754583467352847,
"grad_norm": 0.8777344226837158,
"learning_rate": 4.8312158250241234e-05,
"loss": 1.0863,
"step": 2100
},
{
"epoch": 0.7076230299131554,
"grad_norm": 0.8957362174987793,
"learning_rate": 4.823174654229656e-05,
"loss": 1.1033,
"step": 2200
},
{
"epoch": 0.7397877130910261,
"grad_norm": 0.8989800810813904,
"learning_rate": 4.815133483435189e-05,
"loss": 1.1242,
"step": 2300
},
{
"epoch": 0.7719523962688968,
"grad_norm": 0.8211784362792969,
"learning_rate": 4.807092312640721e-05,
"loss": 1.0763,
"step": 2400
},
{
"epoch": 0.8041170794467675,
"grad_norm": 0.8380767107009888,
"learning_rate": 4.799051141846253e-05,
"loss": 1.115,
"step": 2500
},
{
"epoch": 0.8362817626246382,
"grad_norm": 0.8601460456848145,
"learning_rate": 4.791009971051785e-05,
"loss": 1.0721,
"step": 2600
},
{
"epoch": 0.8684464458025088,
"grad_norm": 0.7620469927787781,
"learning_rate": 4.782968800257317e-05,
"loss": 1.0923,
"step": 2700
},
{
"epoch": 0.9006111289803795,
"grad_norm": 0.7965347170829773,
"learning_rate": 4.77492762946285e-05,
"loss": 1.0701,
"step": 2800
},
{
"epoch": 0.9327758121582502,
"grad_norm": 0.8240953087806702,
"learning_rate": 4.766886458668383e-05,
"loss": 1.0559,
"step": 2900
},
{
"epoch": 0.9649404953361209,
"grad_norm": 0.8731254935264587,
"learning_rate": 4.758845287873915e-05,
"loss": 1.0658,
"step": 3000
},
{
"epoch": 0.9971051785139916,
"grad_norm": 0.8962804675102234,
"learning_rate": 4.750804117079447e-05,
"loss": 1.0566,
"step": 3100
},
{
"epoch": 1.0292698616918623,
"grad_norm": 0.8816267251968384,
"learning_rate": 4.7427629462849796e-05,
"loss": 0.9773,
"step": 3200
},
{
"epoch": 1.061434544869733,
"grad_norm": 0.8294162154197693,
"learning_rate": 4.7348021871984565e-05,
"loss": 0.9802,
"step": 3300
},
{
"epoch": 1.0935992280476037,
"grad_norm": 0.789862334728241,
"learning_rate": 4.7267610164039886e-05,
"loss": 1.0045,
"step": 3400
},
{
"epoch": 1.1257639112254745,
"grad_norm": 0.9009740948677063,
"learning_rate": 4.718719845609521e-05,
"loss": 0.9923,
"step": 3500
},
{
"epoch": 1.157928594403345,
"grad_norm": 0.9688478112220764,
"learning_rate": 4.7106786748150534e-05,
"loss": 0.9924,
"step": 3600
},
{
"epoch": 1.1900932775812159,
"grad_norm": 0.9292545318603516,
"learning_rate": 4.7026375040205855e-05,
"loss": 0.9685,
"step": 3700
},
{
"epoch": 1.2222579607590864,
"grad_norm": 0.8641929030418396,
"learning_rate": 4.6945963332261175e-05,
"loss": 0.9894,
"step": 3800
},
{
"epoch": 1.2544226439369572,
"grad_norm": 0.9717555046081543,
"learning_rate": 4.68655516243165e-05,
"loss": 0.9758,
"step": 3900
},
{
"epoch": 1.286587327114828,
"grad_norm": 0.8688477277755737,
"learning_rate": 4.678513991637183e-05,
"loss": 0.962,
"step": 4000
},
{
"epoch": 1.3187520102926986,
"grad_norm": 0.9667224287986755,
"learning_rate": 4.670472820842715e-05,
"loss": 0.9509,
"step": 4100
},
{
"epoch": 1.3509166934705692,
"grad_norm": 0.9459453225135803,
"learning_rate": 4.662431650048247e-05,
"loss": 0.9901,
"step": 4200
},
{
"epoch": 1.38308137664844,
"grad_norm": 0.7960919141769409,
"learning_rate": 4.654390479253779e-05,
"loss": 0.9448,
"step": 4300
},
{
"epoch": 1.4152460598263108,
"grad_norm": 0.9302664399147034,
"learning_rate": 4.646349308459312e-05,
"loss": 0.9773,
"step": 4400
},
{
"epoch": 1.4474107430041814,
"grad_norm": 0.8294656276702881,
"learning_rate": 4.638308137664844e-05,
"loss": 0.9551,
"step": 4500
},
{
"epoch": 1.4795754261820522,
"grad_norm": 0.8560650944709778,
"learning_rate": 4.630266966870376e-05,
"loss": 0.979,
"step": 4600
},
{
"epoch": 1.5117401093599228,
"grad_norm": 0.896445631980896,
"learning_rate": 4.622225796075909e-05,
"loss": 0.9666,
"step": 4700
},
{
"epoch": 1.5439047925377936,
"grad_norm": 0.7956883907318115,
"learning_rate": 4.614184625281442e-05,
"loss": 0.9691,
"step": 4800
},
{
"epoch": 1.5760694757156641,
"grad_norm": 0.8232746124267578,
"learning_rate": 4.606143454486974e-05,
"loss": 0.9729,
"step": 4900
},
{
"epoch": 1.608234158893535,
"grad_norm": 0.9488968849182129,
"learning_rate": 4.598102283692506e-05,
"loss": 0.9719,
"step": 5000
},
{
"epoch": 1.6403988420714057,
"grad_norm": 0.8127655386924744,
"learning_rate": 4.590061112898038e-05,
"loss": 0.9576,
"step": 5100
},
{
"epoch": 1.6725635252492763,
"grad_norm": 0.8098297119140625,
"learning_rate": 4.582019942103571e-05,
"loss": 0.9717,
"step": 5200
},
{
"epoch": 1.704728208427147,
"grad_norm": 0.798922598361969,
"learning_rate": 4.573978771309103e-05,
"loss": 0.946,
"step": 5300
},
{
"epoch": 1.7368928916050177,
"grad_norm": 0.827904224395752,
"learning_rate": 4.5659376005146355e-05,
"loss": 0.994,
"step": 5400
},
{
"epoch": 1.7690575747828885,
"grad_norm": 0.8054550290107727,
"learning_rate": 4.5578964297201676e-05,
"loss": 0.9506,
"step": 5500
},
{
"epoch": 1.801222257960759,
"grad_norm": 0.787251889705658,
"learning_rate": 4.5498552589256996e-05,
"loss": 0.937,
"step": 5600
},
{
"epoch": 1.8333869411386297,
"grad_norm": 0.8783938884735107,
"learning_rate": 4.5418140881312324e-05,
"loss": 0.9576,
"step": 5700
},
{
"epoch": 1.8655516243165005,
"grad_norm": 0.7947699427604675,
"learning_rate": 4.5337729173367645e-05,
"loss": 0.9586,
"step": 5800
},
{
"epoch": 1.8977163074943713,
"grad_norm": 0.8789839744567871,
"learning_rate": 4.5257317465422966e-05,
"loss": 0.9852,
"step": 5900
},
{
"epoch": 1.9298809906722418,
"grad_norm": 0.7798435091972351,
"learning_rate": 4.5176905757478286e-05,
"loss": 0.9536,
"step": 6000
},
{
"epoch": 1.9620456738501124,
"grad_norm": 0.8084624409675598,
"learning_rate": 4.5096494049533614e-05,
"loss": 0.9886,
"step": 6100
},
{
"epoch": 1.9942103570279834,
"grad_norm": 0.7713519334793091,
"learning_rate": 4.501608234158894e-05,
"loss": 0.9737,
"step": 6200
},
{
"epoch": 2.026375040205854,
"grad_norm": 0.9505152106285095,
"learning_rate": 4.493567063364426e-05,
"loss": 0.8649,
"step": 6300
},
{
"epoch": 2.0585397233837246,
"grad_norm": 1.0130715370178223,
"learning_rate": 4.485525892569958e-05,
"loss": 0.8337,
"step": 6400
},
{
"epoch": 2.090704406561595,
"grad_norm": 0.8994264006614685,
"learning_rate": 4.4774847217754904e-05,
"loss": 0.8414,
"step": 6500
},
{
"epoch": 2.122869089739466,
"grad_norm": 0.8061904311180115,
"learning_rate": 4.469443550981023e-05,
"loss": 0.8375,
"step": 6600
},
{
"epoch": 2.1550337729173368,
"grad_norm": 0.8046594262123108,
"learning_rate": 4.461402380186556e-05,
"loss": 0.857,
"step": 6700
},
{
"epoch": 2.1871984560952074,
"grad_norm": 0.8733768463134766,
"learning_rate": 4.453441621100032e-05,
"loss": 0.8693,
"step": 6800
},
{
"epoch": 2.219363139273078,
"grad_norm": 0.8263835906982422,
"learning_rate": 4.445400450305565e-05,
"loss": 0.8408,
"step": 6900
},
{
"epoch": 2.251527822450949,
"grad_norm": 0.7763977646827698,
"learning_rate": 4.437359279511097e-05,
"loss": 0.87,
"step": 7000
},
{
"epoch": 2.2836925056288195,
"grad_norm": 0.8745999932289124,
"learning_rate": 4.429318108716629e-05,
"loss": 0.8538,
"step": 7100
},
{
"epoch": 2.31585718880669,
"grad_norm": 0.8505412936210632,
"learning_rate": 4.421276937922162e-05,
"loss": 0.8431,
"step": 7200
},
{
"epoch": 2.348021871984561,
"grad_norm": 0.8500067591667175,
"learning_rate": 4.4132357671276945e-05,
"loss": 0.841,
"step": 7300
},
{
"epoch": 2.3801865551624317,
"grad_norm": 0.9537181854248047,
"learning_rate": 4.4051945963332266e-05,
"loss": 0.8473,
"step": 7400
},
{
"epoch": 2.4123512383403023,
"grad_norm": 0.889831006526947,
"learning_rate": 4.3971534255387586e-05,
"loss": 0.8653,
"step": 7500
},
{
"epoch": 2.444515921518173,
"grad_norm": 0.9562848210334778,
"learning_rate": 4.389112254744291e-05,
"loss": 0.8451,
"step": 7600
},
{
"epoch": 2.476680604696044,
"grad_norm": 0.869602382183075,
"learning_rate": 4.3810710839498235e-05,
"loss": 0.8589,
"step": 7700
},
{
"epoch": 2.5088452878739145,
"grad_norm": 0.9119632244110107,
"learning_rate": 4.3730299131553555e-05,
"loss": 0.8468,
"step": 7800
},
{
"epoch": 2.541009971051785,
"grad_norm": 0.8096926212310791,
"learning_rate": 4.364988742360888e-05,
"loss": 0.8495,
"step": 7900
},
{
"epoch": 2.573174654229656,
"grad_norm": 0.9040408134460449,
"learning_rate": 4.3569475715664204e-05,
"loss": 0.8627,
"step": 8000
},
{
"epoch": 2.6053393374075267,
"grad_norm": 0.7748663425445557,
"learning_rate": 4.3489064007719524e-05,
"loss": 0.8717,
"step": 8100
},
{
"epoch": 2.6375040205853972,
"grad_norm": 0.9192063212394714,
"learning_rate": 4.340865229977485e-05,
"loss": 0.8845,
"step": 8200
},
{
"epoch": 2.669668703763268,
"grad_norm": 0.7993588447570801,
"learning_rate": 4.332824059183017e-05,
"loss": 0.8686,
"step": 8300
},
{
"epoch": 2.7018333869411384,
"grad_norm": 0.9893975257873535,
"learning_rate": 4.3247828883885494e-05,
"loss": 0.8655,
"step": 8400
},
{
"epoch": 2.7339980701190094,
"grad_norm": 0.6633499264717102,
"learning_rate": 4.3167417175940814e-05,
"loss": 0.8764,
"step": 8500
},
{
"epoch": 2.76616275329688,
"grad_norm": 0.9091640114784241,
"learning_rate": 4.308700546799614e-05,
"loss": 0.8751,
"step": 8600
},
{
"epoch": 2.7983274364747506,
"grad_norm": 0.820894181728363,
"learning_rate": 4.300659376005147e-05,
"loss": 0.875,
"step": 8700
},
{
"epoch": 2.8304921196526216,
"grad_norm": 0.7544598579406738,
"learning_rate": 4.292618205210679e-05,
"loss": 0.8731,
"step": 8800
},
{
"epoch": 2.862656802830492,
"grad_norm": 0.8050575256347656,
"learning_rate": 4.284577034416211e-05,
"loss": 0.8787,
"step": 8900
},
{
"epoch": 2.8948214860083628,
"grad_norm": 0.8130388855934143,
"learning_rate": 4.276535863621743e-05,
"loss": 0.8566,
"step": 9000
},
{
"epoch": 2.9269861691862333,
"grad_norm": 0.861630380153656,
"learning_rate": 4.268494692827276e-05,
"loss": 0.8776,
"step": 9100
},
{
"epoch": 2.9591508523641044,
"grad_norm": 0.8607050776481628,
"learning_rate": 4.260453522032809e-05,
"loss": 0.8568,
"step": 9200
},
{
"epoch": 2.991315535541975,
"grad_norm": 0.8430234789848328,
"learning_rate": 4.252412351238341e-05,
"loss": 0.8723,
"step": 9300
},
{
"epoch": 3.0234802187198455,
"grad_norm": 0.7705909013748169,
"learning_rate": 4.244371180443873e-05,
"loss": 0.7587,
"step": 9400
},
{
"epoch": 3.055644901897716,
"grad_norm": 1.0884994268417358,
"learning_rate": 4.236330009649405e-05,
"loss": 0.7223,
"step": 9500
},
{
"epoch": 3.087809585075587,
"grad_norm": 0.9366634488105774,
"learning_rate": 4.2282888388549377e-05,
"loss": 0.7462,
"step": 9600
},
{
"epoch": 3.1199742682534577,
"grad_norm": 0.8941793441772461,
"learning_rate": 4.22024766806047e-05,
"loss": 0.7351,
"step": 9700
},
{
"epoch": 3.1521389514313283,
"grad_norm": 0.8178213834762573,
"learning_rate": 4.212206497266002e-05,
"loss": 0.7465,
"step": 9800
},
{
"epoch": 3.1843036346091993,
"grad_norm": 1.0509120225906372,
"learning_rate": 4.2041653264715346e-05,
"loss": 0.7434,
"step": 9900
},
{
"epoch": 3.21646831778707,
"grad_norm": 0.9800424575805664,
"learning_rate": 4.1961241556770666e-05,
"loss": 0.7469,
"step": 10000
},
{
"epoch": 3.2486330009649405,
"grad_norm": 0.9956560134887695,
"learning_rate": 4.1880829848825994e-05,
"loss": 0.7559,
"step": 10100
},
{
"epoch": 3.280797684142811,
"grad_norm": 0.877349317073822,
"learning_rate": 4.1800418140881315e-05,
"loss": 0.7538,
"step": 10200
},
{
"epoch": 3.312962367320682,
"grad_norm": 0.935783326625824,
"learning_rate": 4.1720006432936635e-05,
"loss": 0.7359,
"step": 10300
},
{
"epoch": 3.3451270504985526,
"grad_norm": 1.0015981197357178,
"learning_rate": 4.1639594724991956e-05,
"loss": 0.7665,
"step": 10400
},
{
"epoch": 3.377291733676423,
"grad_norm": 0.9721918702125549,
"learning_rate": 4.1559183017047284e-05,
"loss": 0.7776,
"step": 10500
},
{
"epoch": 3.409456416854294,
"grad_norm": 0.8627845644950867,
"learning_rate": 4.147877130910261e-05,
"loss": 0.7536,
"step": 10600
},
{
"epoch": 3.441621100032165,
"grad_norm": 0.9311351180076599,
"learning_rate": 4.139835960115793e-05,
"loss": 0.7614,
"step": 10700
},
{
"epoch": 3.4737857832100354,
"grad_norm": 1.148760437965393,
"learning_rate": 4.131794789321325e-05,
"loss": 0.7663,
"step": 10800
},
{
"epoch": 3.505950466387906,
"grad_norm": 0.7732231020927429,
"learning_rate": 4.123753618526858e-05,
"loss": 0.742,
"step": 10900
},
{
"epoch": 3.538115149565777,
"grad_norm": 1.0244348049163818,
"learning_rate": 4.11571244773239e-05,
"loss": 0.7509,
"step": 11000
},
{
"epoch": 3.5702798327436476,
"grad_norm": 0.8451341390609741,
"learning_rate": 4.107671276937922e-05,
"loss": 0.755,
"step": 11100
},
{
"epoch": 3.602444515921518,
"grad_norm": 0.9181256890296936,
"learning_rate": 4.099630106143454e-05,
"loss": 0.7556,
"step": 11200
},
{
"epoch": 3.6346091990993887,
"grad_norm": 0.9541407227516174,
"learning_rate": 4.091588935348987e-05,
"loss": 0.7595,
"step": 11300
},
{
"epoch": 3.6667738822772593,
"grad_norm": 0.9781204462051392,
"learning_rate": 4.08354776455452e-05,
"loss": 0.7621,
"step": 11400
},
{
"epoch": 3.6989385654551303,
"grad_norm": 0.9283120632171631,
"learning_rate": 4.075506593760052e-05,
"loss": 0.7531,
"step": 11500
},
{
"epoch": 3.731103248633001,
"grad_norm": 1.1551364660263062,
"learning_rate": 4.067465422965584e-05,
"loss": 0.7578,
"step": 11600
},
{
"epoch": 3.763267931810872,
"grad_norm": 0.9374954104423523,
"learning_rate": 4.059424252171116e-05,
"loss": 0.7624,
"step": 11700
},
{
"epoch": 3.7954326149887425,
"grad_norm": 0.9222885966300964,
"learning_rate": 4.051383081376649e-05,
"loss": 0.7703,
"step": 11800
},
{
"epoch": 3.827597298166613,
"grad_norm": 0.9635455012321472,
"learning_rate": 4.043341910582181e-05,
"loss": 0.7714,
"step": 11900
},
{
"epoch": 3.8597619813444837,
"grad_norm": 0.8468224406242371,
"learning_rate": 4.0353007397877136e-05,
"loss": 0.7624,
"step": 12000
},
{
"epoch": 3.8919266645223543,
"grad_norm": 1.1616623401641846,
"learning_rate": 4.0272595689932457e-05,
"loss": 0.7553,
"step": 12100
},
{
"epoch": 3.9240913477002253,
"grad_norm": 0.9972304105758667,
"learning_rate": 4.019218398198778e-05,
"loss": 0.7446,
"step": 12200
},
{
"epoch": 3.956256030878096,
"grad_norm": 0.9060898423194885,
"learning_rate": 4.0111772274043105e-05,
"loss": 0.7645,
"step": 12300
},
{
"epoch": 3.9884207140559664,
"grad_norm": 0.876514732837677,
"learning_rate": 4.0031360566098426e-05,
"loss": 0.7724,
"step": 12400
},
{
"epoch": 4.0205853972338375,
"grad_norm": 0.9708064198493958,
"learning_rate": 3.9950948858153746e-05,
"loss": 0.6719,
"step": 12500
},
{
"epoch": 4.052750080411708,
"grad_norm": 1.1206649541854858,
"learning_rate": 3.987053715020907e-05,
"loss": 0.6307,
"step": 12600
},
{
"epoch": 4.084914763589579,
"grad_norm": 0.9474230408668518,
"learning_rate": 3.9790125442264395e-05,
"loss": 0.626,
"step": 12700
},
{
"epoch": 4.117079446767449,
"grad_norm": 0.938599169254303,
"learning_rate": 3.970971373431972e-05,
"loss": 0.6249,
"step": 12800
},
{
"epoch": 4.14924412994532,
"grad_norm": 1.1760433912277222,
"learning_rate": 3.962930202637504e-05,
"loss": 0.6387,
"step": 12900
},
{
"epoch": 4.18140881312319,
"grad_norm": 1.1726824045181274,
"learning_rate": 3.9548890318430364e-05,
"loss": 0.6458,
"step": 13000
},
{
"epoch": 4.213573496301062,
"grad_norm": 0.961887538433075,
"learning_rate": 3.9468478610485685e-05,
"loss": 0.6361,
"step": 13100
},
{
"epoch": 4.245738179478932,
"grad_norm": 1.0183879137039185,
"learning_rate": 3.938806690254101e-05,
"loss": 0.6288,
"step": 13200
},
{
"epoch": 4.277902862656803,
"grad_norm": 1.2366642951965332,
"learning_rate": 3.930765519459634e-05,
"loss": 0.6378,
"step": 13300
},
{
"epoch": 4.3100675458346736,
"grad_norm": 0.9713104963302612,
"learning_rate": 3.922724348665166e-05,
"loss": 0.6389,
"step": 13400
},
{
"epoch": 4.342232229012544,
"grad_norm": 0.9488447904586792,
"learning_rate": 3.914683177870698e-05,
"loss": 0.6491,
"step": 13500
},
{
"epoch": 4.374396912190415,
"grad_norm": 1.1355348825454712,
"learning_rate": 3.90664200707623e-05,
"loss": 0.659,
"step": 13600
},
{
"epoch": 4.406561595368285,
"grad_norm": 1.10310959815979,
"learning_rate": 3.898600836281763e-05,
"loss": 0.6617,
"step": 13700
},
{
"epoch": 4.438726278546156,
"grad_norm": 1.0456990003585815,
"learning_rate": 3.890559665487295e-05,
"loss": 0.6541,
"step": 13800
},
{
"epoch": 4.470890961724027,
"grad_norm": 1.2041975259780884,
"learning_rate": 3.882518494692827e-05,
"loss": 0.6553,
"step": 13900
},
{
"epoch": 4.503055644901898,
"grad_norm": 0.860864520072937,
"learning_rate": 3.87447732389836e-05,
"loss": 0.6497,
"step": 14000
},
{
"epoch": 4.5352203280797685,
"grad_norm": 1.1725019216537476,
"learning_rate": 3.8664361531038926e-05,
"loss": 0.6533,
"step": 14100
},
{
"epoch": 4.567385011257639,
"grad_norm": 1.0299824476242065,
"learning_rate": 3.858394982309425e-05,
"loss": 0.6472,
"step": 14200
},
{
"epoch": 4.59954969443551,
"grad_norm": 1.1528712511062622,
"learning_rate": 3.850353811514957e-05,
"loss": 0.6508,
"step": 14300
},
{
"epoch": 4.63171437761338,
"grad_norm": 1.106142520904541,
"learning_rate": 3.842312640720489e-05,
"loss": 0.655,
"step": 14400
},
{
"epoch": 4.663879060791251,
"grad_norm": 1.019434928894043,
"learning_rate": 3.8342714699260216e-05,
"loss": 0.6419,
"step": 14500
},
{
"epoch": 4.696043743969122,
"grad_norm": 1.1405712366104126,
"learning_rate": 3.826230299131554e-05,
"loss": 0.6535,
"step": 14600
},
{
"epoch": 4.728208427146993,
"grad_norm": 1.1748415231704712,
"learning_rate": 3.8181891283370864e-05,
"loss": 0.6609,
"step": 14700
},
{
"epoch": 4.760373110324863,
"grad_norm": 1.0870500802993774,
"learning_rate": 3.810228369250563e-05,
"loss": 0.6582,
"step": 14800
},
{
"epoch": 4.792537793502734,
"grad_norm": 1.2334747314453125,
"learning_rate": 3.8021871984560954e-05,
"loss": 0.655,
"step": 14900
},
{
"epoch": 4.824702476680605,
"grad_norm": 1.0857813358306885,
"learning_rate": 3.7941460276616274e-05,
"loss": 0.6568,
"step": 15000
},
{
"epoch": 4.856867159858475,
"grad_norm": 1.1353741884231567,
"learning_rate": 3.7861048568671595e-05,
"loss": 0.6652,
"step": 15100
},
{
"epoch": 4.889031843036346,
"grad_norm": 1.1487704515457153,
"learning_rate": 3.778063686072692e-05,
"loss": 0.6696,
"step": 15200
},
{
"epoch": 4.921196526214217,
"grad_norm": 1.0605279207229614,
"learning_rate": 3.770022515278225e-05,
"loss": 0.6561,
"step": 15300
},
{
"epoch": 4.953361209392088,
"grad_norm": 1.0910602807998657,
"learning_rate": 3.761981344483757e-05,
"loss": 0.6794,
"step": 15400
},
{
"epoch": 4.985525892569958,
"grad_norm": 1.147965431213379,
"learning_rate": 3.753940173689289e-05,
"loss": 0.6717,
"step": 15500
},
{
"epoch": 5.017690575747829,
"grad_norm": 1.1085681915283203,
"learning_rate": 3.745899002894821e-05,
"loss": 0.5856,
"step": 15600
},
{
"epoch": 5.0498552589256995,
"grad_norm": 1.1322296857833862,
"learning_rate": 3.737857832100354e-05,
"loss": 0.5282,
"step": 15700
},
{
"epoch": 5.08201994210357,
"grad_norm": 1.0307468175888062,
"learning_rate": 3.729816661305887e-05,
"loss": 0.538,
"step": 15800
},
{
"epoch": 5.114184625281441,
"grad_norm": 1.3094079494476318,
"learning_rate": 3.7218559022193636e-05,
"loss": 0.5269,
"step": 15900
},
{
"epoch": 5.146349308459311,
"grad_norm": 0.9686407446861267,
"learning_rate": 3.713814731424896e-05,
"loss": 0.5398,
"step": 16000
},
{
"epoch": 5.178513991637183,
"grad_norm": 1.1006598472595215,
"learning_rate": 3.705773560630428e-05,
"loss": 0.5419,
"step": 16100
},
{
"epoch": 5.210678674815053,
"grad_norm": 1.0708500146865845,
"learning_rate": 3.69773238983596e-05,
"loss": 0.5339,
"step": 16200
},
{
"epoch": 5.242843357992924,
"grad_norm": 0.994500994682312,
"learning_rate": 3.6896912190414926e-05,
"loss": 0.5446,
"step": 16300
},
{
"epoch": 5.2750080411707945,
"grad_norm": 1.0130538940429688,
"learning_rate": 3.6816500482470254e-05,
"loss": 0.5457,
"step": 16400
},
{
"epoch": 5.307172724348665,
"grad_norm": 1.1695197820663452,
"learning_rate": 3.6736088774525574e-05,
"loss": 0.5556,
"step": 16500
},
{
"epoch": 5.339337407526536,
"grad_norm": 1.2486307621002197,
"learning_rate": 3.6655677066580895e-05,
"loss": 0.5478,
"step": 16600
},
{
"epoch": 5.371502090704406,
"grad_norm": 1.3196483850479126,
"learning_rate": 3.6575265358636216e-05,
"loss": 0.5479,
"step": 16700
},
{
"epoch": 5.403666773882278,
"grad_norm": 1.1621615886688232,
"learning_rate": 3.6494853650691544e-05,
"loss": 0.5507,
"step": 16800
},
{
"epoch": 5.435831457060148,
"grad_norm": 1.1101694107055664,
"learning_rate": 3.6414441942746864e-05,
"loss": 0.5541,
"step": 16900
},
{
"epoch": 5.467996140238019,
"grad_norm": 1.0300599336624146,
"learning_rate": 3.633403023480219e-05,
"loss": 0.5595,
"step": 17000
},
{
"epoch": 5.500160823415889,
"grad_norm": 1.133117914199829,
"learning_rate": 3.625361852685751e-05,
"loss": 0.5461,
"step": 17100
},
{
"epoch": 5.53232550659376,
"grad_norm": 1.1690670251846313,
"learning_rate": 3.617320681891283e-05,
"loss": 0.5648,
"step": 17200
},
{
"epoch": 5.564490189771631,
"grad_norm": 0.9958963990211487,
"learning_rate": 3.609279511096816e-05,
"loss": 0.5679,
"step": 17300
},
{
"epoch": 5.596654872949501,
"grad_norm": 1.1210240125656128,
"learning_rate": 3.601238340302348e-05,
"loss": 0.5478,
"step": 17400
},
{
"epoch": 5.628819556127372,
"grad_norm": 1.1824806928634644,
"learning_rate": 3.59319716950788e-05,
"loss": 0.5611,
"step": 17500
},
{
"epoch": 5.660984239305243,
"grad_norm": 1.201159119606018,
"learning_rate": 3.585155998713413e-05,
"loss": 0.5513,
"step": 17600
},
{
"epoch": 5.693148922483114,
"grad_norm": 1.264073371887207,
"learning_rate": 3.577114827918945e-05,
"loss": 0.5693,
"step": 17700
},
{
"epoch": 5.725313605660984,
"grad_norm": 1.2534087896347046,
"learning_rate": 3.569073657124478e-05,
"loss": 0.5673,
"step": 17800
},
{
"epoch": 5.757478288838855,
"grad_norm": 1.2071319818496704,
"learning_rate": 3.56103248633001e-05,
"loss": 0.5592,
"step": 17900
},
{
"epoch": 5.7896429720167255,
"grad_norm": 1.3951480388641357,
"learning_rate": 3.552991315535542e-05,
"loss": 0.5576,
"step": 18000
},
{
"epoch": 5.821807655194596,
"grad_norm": 1.1245003938674927,
"learning_rate": 3.544950144741074e-05,
"loss": 0.5608,
"step": 18100
},
{
"epoch": 5.853972338372467,
"grad_norm": 1.0020116567611694,
"learning_rate": 3.536908973946607e-05,
"loss": 0.5676,
"step": 18200
},
{
"epoch": 5.886137021550338,
"grad_norm": 1.2194671630859375,
"learning_rate": 3.5288678031521396e-05,
"loss": 0.5619,
"step": 18300
},
{
"epoch": 5.918301704728209,
"grad_norm": 1.162885308265686,
"learning_rate": 3.5208266323576716e-05,
"loss": 0.5673,
"step": 18400
},
{
"epoch": 5.950466387906079,
"grad_norm": 1.1328020095825195,
"learning_rate": 3.5128658732711485e-05,
"loss": 0.5744,
"step": 18500
},
{
"epoch": 5.98263107108395,
"grad_norm": 1.0173712968826294,
"learning_rate": 3.5048247024766806e-05,
"loss": 0.5727,
"step": 18600
},
{
"epoch": 6.0147957542618204,
"grad_norm": 1.096663475036621,
"learning_rate": 3.4967835316822127e-05,
"loss": 0.5169,
"step": 18700
},
{
"epoch": 6.046960437439691,
"grad_norm": 1.217439889907837,
"learning_rate": 3.4887423608877454e-05,
"loss": 0.4442,
"step": 18800
},
{
"epoch": 6.079125120617562,
"grad_norm": 1.2011640071868896,
"learning_rate": 3.480701190093278e-05,
"loss": 0.4432,
"step": 18900
},
{
"epoch": 6.111289803795432,
"grad_norm": 1.1648433208465576,
"learning_rate": 3.47266001929881e-05,
"loss": 0.4448,
"step": 19000
},
{
"epoch": 6.143454486973304,
"grad_norm": 1.0002985000610352,
"learning_rate": 3.464618848504342e-05,
"loss": 0.4422,
"step": 19100
},
{
"epoch": 6.175619170151174,
"grad_norm": 1.227947473526001,
"learning_rate": 3.4565776777098744e-05,
"loss": 0.4448,
"step": 19200
},
{
"epoch": 6.207783853329045,
"grad_norm": 1.3668509721755981,
"learning_rate": 3.448536506915407e-05,
"loss": 0.458,
"step": 19300
},
{
"epoch": 6.239948536506915,
"grad_norm": 1.1402783393859863,
"learning_rate": 3.440495336120939e-05,
"loss": 0.4489,
"step": 19400
},
{
"epoch": 6.272113219684786,
"grad_norm": 1.2867486476898193,
"learning_rate": 3.432454165326472e-05,
"loss": 0.4611,
"step": 19500
},
{
"epoch": 6.3042779028626565,
"grad_norm": 1.3049463033676147,
"learning_rate": 3.424412994532004e-05,
"loss": 0.4554,
"step": 19600
},
{
"epoch": 6.336442586040527,
"grad_norm": 1.2762079238891602,
"learning_rate": 3.416371823737536e-05,
"loss": 0.4651,
"step": 19700
},
{
"epoch": 6.368607269218399,
"grad_norm": 1.0911743640899658,
"learning_rate": 3.408330652943069e-05,
"loss": 0.46,
"step": 19800
},
{
"epoch": 6.400771952396269,
"grad_norm": 1.2043310403823853,
"learning_rate": 3.400289482148601e-05,
"loss": 0.4689,
"step": 19900
},
{
"epoch": 6.43293663557414,
"grad_norm": 1.1901593208312988,
"learning_rate": 3.392248311354133e-05,
"loss": 0.4689,
"step": 20000
},
{
"epoch": 6.46510131875201,
"grad_norm": 1.3777788877487183,
"learning_rate": 3.384207140559666e-05,
"loss": 0.4646,
"step": 20100
},
{
"epoch": 6.497266001929881,
"grad_norm": 1.2720263004302979,
"learning_rate": 3.376165969765198e-05,
"loss": 0.4684,
"step": 20200
},
{
"epoch": 6.5294306851077515,
"grad_norm": 1.0634431838989258,
"learning_rate": 3.3681247989707306e-05,
"loss": 0.4699,
"step": 20300
},
{
"epoch": 6.561595368285622,
"grad_norm": 1.3379625082015991,
"learning_rate": 3.360083628176263e-05,
"loss": 0.4807,
"step": 20400
},
{
"epoch": 6.593760051463493,
"grad_norm": 1.379611849784851,
"learning_rate": 3.352042457381795e-05,
"loss": 0.4743,
"step": 20500
},
{
"epoch": 6.625924734641364,
"grad_norm": 1.2262407541275024,
"learning_rate": 3.344001286587327e-05,
"loss": 0.4699,
"step": 20600
},
{
"epoch": 6.658089417819235,
"grad_norm": 1.2290821075439453,
"learning_rate": 3.3360405275008044e-05,
"loss": 0.4728,
"step": 20700
},
{
"epoch": 6.690254100997105,
"grad_norm": 1.286399006843567,
"learning_rate": 3.3279993567063365e-05,
"loss": 0.4811,
"step": 20800
},
{
"epoch": 6.722418784174976,
"grad_norm": 1.3331542015075684,
"learning_rate": 3.319958185911869e-05,
"loss": 0.4795,
"step": 20900
},
{
"epoch": 6.754583467352846,
"grad_norm": 0.9238734841346741,
"learning_rate": 3.311917015117401e-05,
"loss": 0.4719,
"step": 21000
},
{
"epoch": 6.786748150530717,
"grad_norm": 1.0958573818206787,
"learning_rate": 3.3038758443229334e-05,
"loss": 0.4787,
"step": 21100
},
{
"epoch": 6.818912833708588,
"grad_norm": 1.3021626472473145,
"learning_rate": 3.2958346735284655e-05,
"loss": 0.481,
"step": 21200
},
{
"epoch": 6.851077516886459,
"grad_norm": 1.2894419431686401,
"learning_rate": 3.287793502733998e-05,
"loss": 0.4799,
"step": 21300
},
{
"epoch": 6.88324220006433,
"grad_norm": 1.2276169061660767,
"learning_rate": 3.279752331939531e-05,
"loss": 0.4791,
"step": 21400
},
{
"epoch": 6.9154068832422,
"grad_norm": 1.2167209386825562,
"learning_rate": 3.271711161145063e-05,
"loss": 0.4777,
"step": 21500
},
{
"epoch": 6.947571566420071,
"grad_norm": 1.0899590253829956,
"learning_rate": 3.263669990350595e-05,
"loss": 0.4807,
"step": 21600
},
{
"epoch": 6.979736249597941,
"grad_norm": 1.0014961957931519,
"learning_rate": 3.255628819556127e-05,
"loss": 0.4809,
"step": 21700
},
{
"epoch": 7.011900932775812,
"grad_norm": 1.2910642623901367,
"learning_rate": 3.24758764876166e-05,
"loss": 0.4473,
"step": 21800
},
{
"epoch": 7.0440656159536825,
"grad_norm": 1.2702795267105103,
"learning_rate": 3.239546477967192e-05,
"loss": 0.3631,
"step": 21900
},
{
"epoch": 7.076230299131554,
"grad_norm": 1.1640880107879639,
"learning_rate": 3.231505307172725e-05,
"loss": 0.3648,
"step": 22000
},
{
"epoch": 7.108394982309425,
"grad_norm": 1.3022829294204712,
"learning_rate": 3.223464136378257e-05,
"loss": 0.372,
"step": 22100
},
{
"epoch": 7.140559665487295,
"grad_norm": 1.1453783512115479,
"learning_rate": 3.215422965583789e-05,
"loss": 0.3713,
"step": 22200
},
{
"epoch": 7.172724348665166,
"grad_norm": 1.2497502565383911,
"learning_rate": 3.207381794789322e-05,
"loss": 0.3744,
"step": 22300
},
{
"epoch": 7.204889031843036,
"grad_norm": 1.0671898126602173,
"learning_rate": 3.199340623994854e-05,
"loss": 0.3818,
"step": 22400
},
{
"epoch": 7.237053715020907,
"grad_norm": 1.3146332502365112,
"learning_rate": 3.191299453200386e-05,
"loss": 0.3818,
"step": 22500
},
{
"epoch": 7.2692183981987775,
"grad_norm": 1.155218482017517,
"learning_rate": 3.1832582824059186e-05,
"loss": 0.3849,
"step": 22600
},
{
"epoch": 7.301383081376648,
"grad_norm": 1.493849515914917,
"learning_rate": 3.175217111611451e-05,
"loss": 0.3914,
"step": 22700
},
{
"epoch": 7.3335477645545195,
"grad_norm": 1.4199835062026978,
"learning_rate": 3.1671759408169834e-05,
"loss": 0.3877,
"step": 22800
},
{
"epoch": 7.36571244773239,
"grad_norm": 1.3320192098617554,
"learning_rate": 3.1591347700225155e-05,
"loss": 0.3898,
"step": 22900
},
{
"epoch": 7.397877130910261,
"grad_norm": 1.2950165271759033,
"learning_rate": 3.1510935992280476e-05,
"loss": 0.3915,
"step": 23000
},
{
"epoch": 7.430041814088131,
"grad_norm": 1.2923767566680908,
"learning_rate": 3.1430524284335796e-05,
"loss": 0.3866,
"step": 23100
},
{
"epoch": 7.462206497266002,
"grad_norm": 1.2137949466705322,
"learning_rate": 3.1350112576391124e-05,
"loss": 0.3848,
"step": 23200
},
{
"epoch": 7.494371180443872,
"grad_norm": 1.4630000591278076,
"learning_rate": 3.126970086844645e-05,
"loss": 0.3965,
"step": 23300
},
{
"epoch": 7.526535863621743,
"grad_norm": 1.2107799053192139,
"learning_rate": 3.118928916050177e-05,
"loss": 0.3922,
"step": 23400
},
{
"epoch": 7.558700546799614,
"grad_norm": 1.1131380796432495,
"learning_rate": 3.110887745255709e-05,
"loss": 0.3892,
"step": 23500
},
{
"epoch": 7.590865229977485,
"grad_norm": 1.2040220499038696,
"learning_rate": 3.1028465744612414e-05,
"loss": 0.3931,
"step": 23600
},
{
"epoch": 7.623029913155356,
"grad_norm": 1.1216152906417847,
"learning_rate": 3.094805403666774e-05,
"loss": 0.3901,
"step": 23700
},
{
"epoch": 7.655194596333226,
"grad_norm": 1.2603133916854858,
"learning_rate": 3.086764232872306e-05,
"loss": 0.3976,
"step": 23800
},
{
"epoch": 7.687359279511097,
"grad_norm": 1.2045493125915527,
"learning_rate": 3.078723062077838e-05,
"loss": 0.3983,
"step": 23900
},
{
"epoch": 7.719523962688967,
"grad_norm": 1.3943126201629639,
"learning_rate": 3.070681891283371e-05,
"loss": 0.3905,
"step": 24000
},
{
"epoch": 7.751688645866838,
"grad_norm": 1.204542875289917,
"learning_rate": 3.062640720488904e-05,
"loss": 0.3996,
"step": 24100
},
{
"epoch": 7.7838533290447085,
"grad_norm": 1.377310037612915,
"learning_rate": 3.054599549694436e-05,
"loss": 0.3996,
"step": 24200
},
{
"epoch": 7.81601801222258,
"grad_norm": 1.1735265254974365,
"learning_rate": 3.046558378899968e-05,
"loss": 0.4043,
"step": 24300
},
{
"epoch": 7.8481826954004505,
"grad_norm": 1.2886712551116943,
"learning_rate": 3.0385172081055004e-05,
"loss": 0.4006,
"step": 24400
},
{
"epoch": 7.880347378578321,
"grad_norm": 1.1405675411224365,
"learning_rate": 3.0304760373110324e-05,
"loss": 0.4031,
"step": 24500
},
{
"epoch": 7.912512061756192,
"grad_norm": 1.6100174188613892,
"learning_rate": 3.022434866516565e-05,
"loss": 0.3999,
"step": 24600
},
{
"epoch": 7.944676744934062,
"grad_norm": 1.123673677444458,
"learning_rate": 3.0143936957220976e-05,
"loss": 0.4096,
"step": 24700
},
{
"epoch": 7.976841428111933,
"grad_norm": 1.3668479919433594,
"learning_rate": 3.006432936635574e-05,
"loss": 0.4084,
"step": 24800
},
{
"epoch": 8.009006111289803,
"grad_norm": 1.191008448600769,
"learning_rate": 2.9983917658411066e-05,
"loss": 0.3732,
"step": 24900
},
{
"epoch": 8.041170794467675,
"grad_norm": 1.3444154262542725,
"learning_rate": 2.9903505950466386e-05,
"loss": 0.3036,
"step": 25000
},
{
"epoch": 8.073335477645545,
"grad_norm": 1.2217003107070923,
"learning_rate": 2.9823094242521714e-05,
"loss": 0.3044,
"step": 25100
},
{
"epoch": 8.105500160823416,
"grad_norm": 1.3392952680587769,
"learning_rate": 2.9742682534577038e-05,
"loss": 0.3051,
"step": 25200
},
{
"epoch": 8.137664844001286,
"grad_norm": 1.4353630542755127,
"learning_rate": 2.966227082663236e-05,
"loss": 0.3116,
"step": 25300
},
{
"epoch": 8.169829527179157,
"grad_norm": 1.2575156688690186,
"learning_rate": 2.9581859118687683e-05,
"loss": 0.3154,
"step": 25400
},
{
"epoch": 8.201994210357029,
"grad_norm": 1.276118278503418,
"learning_rate": 2.9501447410743004e-05,
"loss": 0.3137,
"step": 25500
},
{
"epoch": 8.234158893534898,
"grad_norm": 1.2896833419799805,
"learning_rate": 2.9421035702798328e-05,
"loss": 0.3132,
"step": 25600
},
{
"epoch": 8.26632357671277,
"grad_norm": 1.390541672706604,
"learning_rate": 2.934062399485365e-05,
"loss": 0.3174,
"step": 25700
},
{
"epoch": 8.29848825989064,
"grad_norm": 1.0846691131591797,
"learning_rate": 2.926021228690898e-05,
"loss": 0.3258,
"step": 25800
},
{
"epoch": 8.330652943068511,
"grad_norm": 1.1695352792739868,
"learning_rate": 2.91798005789643e-05,
"loss": 0.3188,
"step": 25900
},
{
"epoch": 8.36281762624638,
"grad_norm": 1.36068856716156,
"learning_rate": 2.909938887101962e-05,
"loss": 0.3233,
"step": 26000
},
{
"epoch": 8.394982309424252,
"grad_norm": 1.087651252746582,
"learning_rate": 2.9018977163074945e-05,
"loss": 0.3195,
"step": 26100
},
{
"epoch": 8.427146992602124,
"grad_norm": 1.1580371856689453,
"learning_rate": 2.8938565455130266e-05,
"loss": 0.3187,
"step": 26200
},
{
"epoch": 8.459311675779993,
"grad_norm": 1.2364060878753662,
"learning_rate": 2.885815374718559e-05,
"loss": 0.3225,
"step": 26300
},
{
"epoch": 8.491476358957865,
"grad_norm": 1.0226277112960815,
"learning_rate": 2.877774203924091e-05,
"loss": 0.3213,
"step": 26400
},
{
"epoch": 8.523641042135734,
"grad_norm": 1.1340548992156982,
"learning_rate": 2.8697330331296242e-05,
"loss": 0.3137,
"step": 26500
},
{
"epoch": 8.555805725313606,
"grad_norm": 1.1343051195144653,
"learning_rate": 2.8616918623351563e-05,
"loss": 0.3219,
"step": 26600
},
{
"epoch": 8.587970408491476,
"grad_norm": 1.0772162675857544,
"learning_rate": 2.8536506915406887e-05,
"loss": 0.329,
"step": 26700
},
{
"epoch": 8.620135091669347,
"grad_norm": 1.3438891172409058,
"learning_rate": 2.8456899324541652e-05,
"loss": 0.3299,
"step": 26800
},
{
"epoch": 8.652299774847219,
"grad_norm": 1.4512920379638672,
"learning_rate": 2.837648761659698e-05,
"loss": 0.3348,
"step": 26900
},
{
"epoch": 8.684464458025088,
"grad_norm": 1.2788150310516357,
"learning_rate": 2.8296075908652304e-05,
"loss": 0.3338,
"step": 27000
},
{
"epoch": 8.71662914120296,
"grad_norm": 1.142204999923706,
"learning_rate": 2.8215664200707625e-05,
"loss": 0.3309,
"step": 27100
},
{
"epoch": 8.74879382438083,
"grad_norm": 1.4097461700439453,
"learning_rate": 2.813525249276295e-05,
"loss": 0.3326,
"step": 27200
},
{
"epoch": 8.7809585075587,
"grad_norm": 1.1304296255111694,
"learning_rate": 2.805484078481827e-05,
"loss": 0.333,
"step": 27300
},
{
"epoch": 8.81312319073657,
"grad_norm": 1.2657917737960815,
"learning_rate": 2.7974429076873594e-05,
"loss": 0.3383,
"step": 27400
},
{
"epoch": 8.845287873914442,
"grad_norm": 1.2562810182571411,
"learning_rate": 2.7894017368928914e-05,
"loss": 0.3331,
"step": 27500
},
{
"epoch": 8.877452557092312,
"grad_norm": 1.2090494632720947,
"learning_rate": 2.7813605660984242e-05,
"loss": 0.3335,
"step": 27600
},
{
"epoch": 8.909617240270183,
"grad_norm": 1.055282711982727,
"learning_rate": 2.7733193953039566e-05,
"loss": 0.3345,
"step": 27700
},
{
"epoch": 8.941781923448055,
"grad_norm": 1.3975212574005127,
"learning_rate": 2.7652782245094887e-05,
"loss": 0.3374,
"step": 27800
},
{
"epoch": 8.973946606625924,
"grad_norm": 1.4486628770828247,
"learning_rate": 2.757237053715021e-05,
"loss": 0.3353,
"step": 27900
},
{
"epoch": 9.006111289803796,
"grad_norm": 1.0647599697113037,
"learning_rate": 2.7491958829205532e-05,
"loss": 0.3258,
"step": 28000
},
{
"epoch": 9.038275972981666,
"grad_norm": 1.1231446266174316,
"learning_rate": 2.7411547121260856e-05,
"loss": 0.2498,
"step": 28100
},
{
"epoch": 9.070440656159537,
"grad_norm": 1.1911687850952148,
"learning_rate": 2.7331135413316177e-05,
"loss": 0.2507,
"step": 28200
},
{
"epoch": 9.102605339337407,
"grad_norm": 1.1804819107055664,
"learning_rate": 2.7250723705371508e-05,
"loss": 0.2538,
"step": 28300
},
{
"epoch": 9.134770022515278,
"grad_norm": 1.2449610233306885,
"learning_rate": 2.7170311997426828e-05,
"loss": 0.254,
"step": 28400
},
{
"epoch": 9.16693470569315,
"grad_norm": 1.3183308839797974,
"learning_rate": 2.708990028948215e-05,
"loss": 0.2569,
"step": 28500
},
{
"epoch": 9.19909938887102,
"grad_norm": 1.2077155113220215,
"learning_rate": 2.7009488581537473e-05,
"loss": 0.264,
"step": 28600
},
{
"epoch": 9.23126407204889,
"grad_norm": 1.2201523780822754,
"learning_rate": 2.6929076873592794e-05,
"loss": 0.2618,
"step": 28700
},
{
"epoch": 9.26342875522676,
"grad_norm": 1.010611653327942,
"learning_rate": 2.6848665165648118e-05,
"loss": 0.2586,
"step": 28800
},
{
"epoch": 9.295593438404632,
"grad_norm": 1.4196604490280151,
"learning_rate": 2.676825345770344e-05,
"loss": 0.2645,
"step": 28900
},
{
"epoch": 9.327758121582502,
"grad_norm": 1.0224769115447998,
"learning_rate": 2.668784174975877e-05,
"loss": 0.2677,
"step": 29000
},
{
"epoch": 9.359922804760373,
"grad_norm": 1.0215541124343872,
"learning_rate": 2.6608234158893535e-05,
"loss": 0.2662,
"step": 29100
},
{
"epoch": 9.392087487938245,
"grad_norm": 1.1498533487319946,
"learning_rate": 2.652782245094886e-05,
"loss": 0.2647,
"step": 29200
},
{
"epoch": 9.424252171116114,
"grad_norm": 1.1814836263656616,
"learning_rate": 2.644741074300418e-05,
"loss": 0.2681,
"step": 29300
},
{
"epoch": 9.456416854293986,
"grad_norm": 1.3575364351272583,
"learning_rate": 2.6366999035059508e-05,
"loss": 0.2659,
"step": 29400
},
{
"epoch": 9.488581537471855,
"grad_norm": 1.4364129304885864,
"learning_rate": 2.6286587327114832e-05,
"loss": 0.2701,
"step": 29500
},
{
"epoch": 9.520746220649727,
"grad_norm": 1.4229729175567627,
"learning_rate": 2.6206175619170152e-05,
"loss": 0.2744,
"step": 29600
},
{
"epoch": 9.552910903827597,
"grad_norm": 1.2050132751464844,
"learning_rate": 2.6125763911225477e-05,
"loss": 0.2678,
"step": 29700
},
{
"epoch": 9.585075587005468,
"grad_norm": 1.360245704650879,
"learning_rate": 2.6045352203280797e-05,
"loss": 0.2668,
"step": 29800
},
{
"epoch": 9.61724027018334,
"grad_norm": 1.2902683019638062,
"learning_rate": 2.596494049533612e-05,
"loss": 0.2749,
"step": 29900
},
{
"epoch": 9.64940495336121,
"grad_norm": 1.3184953927993774,
"learning_rate": 2.5884528787391442e-05,
"loss": 0.2729,
"step": 30000
},
{
"epoch": 9.68156963653908,
"grad_norm": 1.1212196350097656,
"learning_rate": 2.580411707944677e-05,
"loss": 0.2711,
"step": 30100
},
{
"epoch": 9.71373431971695,
"grad_norm": 1.1325451135635376,
"learning_rate": 2.5723705371502094e-05,
"loss": 0.2739,
"step": 30200
},
{
"epoch": 9.745899002894822,
"grad_norm": 1.2344636917114258,
"learning_rate": 2.5643293663557415e-05,
"loss": 0.276,
"step": 30300
},
{
"epoch": 9.778063686072691,
"grad_norm": 1.3600369691848755,
"learning_rate": 2.556288195561274e-05,
"loss": 0.2788,
"step": 30400
},
{
"epoch": 9.810228369250563,
"grad_norm": 1.3456294536590576,
"learning_rate": 2.548247024766806e-05,
"loss": 0.2768,
"step": 30500
},
{
"epoch": 9.842393052428434,
"grad_norm": 1.316645622253418,
"learning_rate": 2.5402058539723384e-05,
"loss": 0.2756,
"step": 30600
},
{
"epoch": 9.874557735606304,
"grad_norm": 1.4233603477478027,
"learning_rate": 2.5321646831778705e-05,
"loss": 0.2767,
"step": 30700
},
{
"epoch": 9.906722418784176,
"grad_norm": 1.3680518865585327,
"learning_rate": 2.5241235123834036e-05,
"loss": 0.2746,
"step": 30800
},
{
"epoch": 9.938887101962045,
"grad_norm": 1.4857224225997925,
"learning_rate": 2.5160823415889356e-05,
"loss": 0.2775,
"step": 30900
},
{
"epoch": 9.971051785139917,
"grad_norm": 1.3762352466583252,
"learning_rate": 2.508041170794468e-05,
"loss": 0.2806,
"step": 31000
},
{
"epoch": 10.003216468317786,
"grad_norm": 0.9549392461776733,
"learning_rate": 2.5e-05,
"loss": 0.2723,
"step": 31100
},
{
"epoch": 10.035381151495658,
"grad_norm": 1.2942768335342407,
"learning_rate": 2.4919588292055322e-05,
"loss": 0.2038,
"step": 31200
},
{
"epoch": 10.067545834673528,
"grad_norm": 1.1192700862884521,
"learning_rate": 2.483917658411065e-05,
"loss": 0.208,
"step": 31300
},
{
"epoch": 10.099710517851399,
"grad_norm": 1.0974397659301758,
"learning_rate": 2.475876487616597e-05,
"loss": 0.2106,
"step": 31400
},
{
"epoch": 10.13187520102927,
"grad_norm": 1.3536230325698853,
"learning_rate": 2.4678353168221294e-05,
"loss": 0.2099,
"step": 31500
},
{
"epoch": 10.16403988420714,
"grad_norm": 1.1772140264511108,
"learning_rate": 2.4597941460276615e-05,
"loss": 0.2205,
"step": 31600
},
{
"epoch": 10.196204567385012,
"grad_norm": 1.053501844406128,
"learning_rate": 2.4517529752331943e-05,
"loss": 0.2118,
"step": 31700
},
{
"epoch": 10.228369250562881,
"grad_norm": 1.1771047115325928,
"learning_rate": 2.4437118044387263e-05,
"loss": 0.2169,
"step": 31800
},
{
"epoch": 10.260533933740753,
"grad_norm": 1.249473214149475,
"learning_rate": 2.4356706336442588e-05,
"loss": 0.2184,
"step": 31900
},
{
"epoch": 10.292698616918623,
"grad_norm": 1.2319327592849731,
"learning_rate": 2.4276294628497912e-05,
"loss": 0.2186,
"step": 32000
},
{
"epoch": 10.324863300096494,
"grad_norm": 1.2124056816101074,
"learning_rate": 2.419668703763268e-05,
"loss": 0.2165,
"step": 32100
},
{
"epoch": 10.357027983274365,
"grad_norm": 1.0940320491790771,
"learning_rate": 2.4116275329688005e-05,
"loss": 0.2196,
"step": 32200
},
{
"epoch": 10.389192666452235,
"grad_norm": 1.3821940422058105,
"learning_rate": 2.4035863621743325e-05,
"loss": 0.2201,
"step": 32300
},
{
"epoch": 10.421357349630107,
"grad_norm": 1.4705811738967896,
"learning_rate": 2.3955451913798653e-05,
"loss": 0.2237,
"step": 32400
},
{
"epoch": 10.453522032807976,
"grad_norm": 0.9724491834640503,
"learning_rate": 2.3875040205853974e-05,
"loss": 0.2202,
"step": 32500
},
{
"epoch": 10.485686715985848,
"grad_norm": 1.2271063327789307,
"learning_rate": 2.3794628497909298e-05,
"loss": 0.2248,
"step": 32600
},
{
"epoch": 10.517851399163717,
"grad_norm": 1.049058437347412,
"learning_rate": 2.371421678996462e-05,
"loss": 0.2261,
"step": 32700
},
{
"epoch": 10.550016082341589,
"grad_norm": 1.324881911277771,
"learning_rate": 2.3633805082019943e-05,
"loss": 0.2241,
"step": 32800
},
{
"epoch": 10.58218076551946,
"grad_norm": 1.2529538869857788,
"learning_rate": 2.3553393374075267e-05,
"loss": 0.2206,
"step": 32900
},
{
"epoch": 10.61434544869733,
"grad_norm": 1.1270591020584106,
"learning_rate": 2.3472981666130588e-05,
"loss": 0.2264,
"step": 33000
},
{
"epoch": 10.646510131875202,
"grad_norm": 1.256332278251648,
"learning_rate": 2.3392569958185915e-05,
"loss": 0.2227,
"step": 33100
},
{
"epoch": 10.678674815053071,
"grad_norm": 1.2265748977661133,
"learning_rate": 2.3312158250241236e-05,
"loss": 0.227,
"step": 33200
},
{
"epoch": 10.710839498230943,
"grad_norm": 1.367353081703186,
"learning_rate": 2.323174654229656e-05,
"loss": 0.2312,
"step": 33300
},
{
"epoch": 10.743004181408812,
"grad_norm": 1.185103416442871,
"learning_rate": 2.315133483435188e-05,
"loss": 0.2298,
"step": 33400
},
{
"epoch": 10.775168864586684,
"grad_norm": 1.3100920915603638,
"learning_rate": 2.307092312640721e-05,
"loss": 0.2265,
"step": 33500
},
{
"epoch": 10.807333547764555,
"grad_norm": 1.1452741622924805,
"learning_rate": 2.299051141846253e-05,
"loss": 0.2296,
"step": 33600
},
{
"epoch": 10.839498230942425,
"grad_norm": 1.3622289896011353,
"learning_rate": 2.2910099710517853e-05,
"loss": 0.2318,
"step": 33700
},
{
"epoch": 10.871662914120297,
"grad_norm": 1.2668938636779785,
"learning_rate": 2.2829688002573177e-05,
"loss": 0.227,
"step": 33800
},
{
"epoch": 10.903827597298166,
"grad_norm": 1.139683723449707,
"learning_rate": 2.2749276294628498e-05,
"loss": 0.23,
"step": 33900
},
{
"epoch": 10.935992280476038,
"grad_norm": 1.293151617050171,
"learning_rate": 2.2668864586683822e-05,
"loss": 0.2325,
"step": 34000
},
{
"epoch": 10.968156963653907,
"grad_norm": 1.6205676794052124,
"learning_rate": 2.2588452878739143e-05,
"loss": 0.2348,
"step": 34100
},
{
"epoch": 11.000321646831779,
"grad_norm": 0.9748951196670532,
"learning_rate": 2.2508845287873915e-05,
"loss": 0.2312,
"step": 34200
},
{
"epoch": 11.03248633000965,
"grad_norm": 1.0078290700912476,
"learning_rate": 2.242843357992924e-05,
"loss": 0.1733,
"step": 34300
},
{
"epoch": 11.06465101318752,
"grad_norm": 1.0165376663208008,
"learning_rate": 2.2348021871984564e-05,
"loss": 0.1744,
"step": 34400
},
{
"epoch": 11.096815696365391,
"grad_norm": 1.2080796957015991,
"learning_rate": 2.2267610164039884e-05,
"loss": 0.1752,
"step": 34500
},
{
"epoch": 11.128980379543261,
"grad_norm": 1.0622116327285767,
"learning_rate": 2.218719845609521e-05,
"loss": 0.1746,
"step": 34600
},
{
"epoch": 11.161145062721133,
"grad_norm": 1.2261426448822021,
"learning_rate": 2.2106786748150533e-05,
"loss": 0.1792,
"step": 34700
},
{
"epoch": 11.193309745899002,
"grad_norm": 1.2450960874557495,
"learning_rate": 2.2026375040205853e-05,
"loss": 0.1769,
"step": 34800
},
{
"epoch": 11.225474429076874,
"grad_norm": 1.2167742252349854,
"learning_rate": 2.194596333226118e-05,
"loss": 0.1807,
"step": 34900
},
{
"epoch": 11.257639112254743,
"grad_norm": 1.187880277633667,
"learning_rate": 2.18655516243165e-05,
"loss": 0.1799,
"step": 35000
},
{
"epoch": 11.289803795432615,
"grad_norm": 1.1898218393325806,
"learning_rate": 2.1785139916371826e-05,
"loss": 0.1778,
"step": 35100
},
{
"epoch": 11.321968478610486,
"grad_norm": 1.105224609375,
"learning_rate": 2.1705532325506595e-05,
"loss": 0.1821,
"step": 35200
},
{
"epoch": 11.354133161788356,
"grad_norm": 1.1716156005859375,
"learning_rate": 2.1625120617561915e-05,
"loss": 0.1817,
"step": 35300
},
{
"epoch": 11.386297844966228,
"grad_norm": 0.952513575553894,
"learning_rate": 2.1544708909617243e-05,
"loss": 0.1848,
"step": 35400
},
{
"epoch": 11.418462528144097,
"grad_norm": 1.2010760307312012,
"learning_rate": 2.1464297201672564e-05,
"loss": 0.1813,
"step": 35500
},
{
"epoch": 11.450627211321969,
"grad_norm": 1.2307448387145996,
"learning_rate": 2.1383885493727888e-05,
"loss": 0.1842,
"step": 35600
},
{
"epoch": 11.482791894499838,
"grad_norm": 1.3126641511917114,
"learning_rate": 2.1303473785783212e-05,
"loss": 0.1849,
"step": 35700
},
{
"epoch": 11.51495657767771,
"grad_norm": 1.1805635690689087,
"learning_rate": 2.1223062077838536e-05,
"loss": 0.1862,
"step": 35800
},
{
"epoch": 11.547121260855581,
"grad_norm": 1.1975961923599243,
"learning_rate": 2.1142650369893857e-05,
"loss": 0.1897,
"step": 35900
},
{
"epoch": 11.579285944033451,
"grad_norm": 1.1553934812545776,
"learning_rate": 2.106223866194918e-05,
"loss": 0.1842,
"step": 36000
},
{
"epoch": 11.611450627211322,
"grad_norm": 1.3459019660949707,
"learning_rate": 2.0981826954004505e-05,
"loss": 0.1879,
"step": 36100
},
{
"epoch": 11.643615310389192,
"grad_norm": 1.1215953826904297,
"learning_rate": 2.0901415246059826e-05,
"loss": 0.1882,
"step": 36200
},
{
"epoch": 11.675779993567064,
"grad_norm": 1.3726115226745605,
"learning_rate": 2.082100353811515e-05,
"loss": 0.1888,
"step": 36300
},
{
"epoch": 11.707944676744933,
"grad_norm": 1.240684151649475,
"learning_rate": 2.0740591830170474e-05,
"loss": 0.1866,
"step": 36400
},
{
"epoch": 11.740109359922805,
"grad_norm": 1.291636347770691,
"learning_rate": 2.06601801222258e-05,
"loss": 0.1888,
"step": 36500
},
{
"epoch": 11.772274043100676,
"grad_norm": 1.172780156135559,
"learning_rate": 2.057976841428112e-05,
"loss": 0.1934,
"step": 36600
},
{
"epoch": 11.804438726278546,
"grad_norm": 1.1808074712753296,
"learning_rate": 2.0499356706336443e-05,
"loss": 0.1912,
"step": 36700
},
{
"epoch": 11.836603409456417,
"grad_norm": 1.2383092641830444,
"learning_rate": 2.0418944998391767e-05,
"loss": 0.1911,
"step": 36800
},
{
"epoch": 11.868768092634287,
"grad_norm": 1.0541281700134277,
"learning_rate": 2.033853329044709e-05,
"loss": 0.1948,
"step": 36900
},
{
"epoch": 11.900932775812159,
"grad_norm": 1.151892066001892,
"learning_rate": 2.0258121582502412e-05,
"loss": 0.1905,
"step": 37000
},
{
"epoch": 11.933097458990028,
"grad_norm": 1.2144666910171509,
"learning_rate": 2.0177709874557736e-05,
"loss": 0.1923,
"step": 37100
},
{
"epoch": 11.9652621421679,
"grad_norm": 1.1869224309921265,
"learning_rate": 2.009729816661306e-05,
"loss": 0.1878,
"step": 37200
},
{
"epoch": 11.99742682534577,
"grad_norm": 1.4024932384490967,
"learning_rate": 2.001688645866838e-05,
"loss": 0.1954,
"step": 37300
},
{
"epoch": 12.029591508523641,
"grad_norm": 1.0352271795272827,
"learning_rate": 1.993647475072371e-05,
"loss": 0.1495,
"step": 37400
},
{
"epoch": 12.061756191701512,
"grad_norm": 1.0901597738265991,
"learning_rate": 1.985606304277903e-05,
"loss": 0.1474,
"step": 37500
},
{
"epoch": 12.093920874879382,
"grad_norm": 0.9055734872817993,
"learning_rate": 1.9775651334834354e-05,
"loss": 0.1494,
"step": 37600
},
{
"epoch": 12.126085558057254,
"grad_norm": 1.3142735958099365,
"learning_rate": 1.9695239626889675e-05,
"loss": 0.1494,
"step": 37700
},
{
"epoch": 12.158250241235123,
"grad_norm": 1.0741150379180908,
"learning_rate": 1.9614827918945e-05,
"loss": 0.1496,
"step": 37800
},
{
"epoch": 12.190414924412995,
"grad_norm": 1.259490728378296,
"learning_rate": 1.9534416211000323e-05,
"loss": 0.1496,
"step": 37900
},
{
"epoch": 12.222579607590864,
"grad_norm": 1.353095531463623,
"learning_rate": 1.9454004503055644e-05,
"loss": 0.1503,
"step": 38000
},
{
"epoch": 12.254744290768736,
"grad_norm": 1.0767805576324463,
"learning_rate": 1.937359279511097e-05,
"loss": 0.1513,
"step": 38100
},
{
"epoch": 12.286908973946607,
"grad_norm": 1.162312388420105,
"learning_rate": 1.9293181087166292e-05,
"loss": 0.153,
"step": 38200
},
{
"epoch": 12.319073657124477,
"grad_norm": 1.1145883798599243,
"learning_rate": 1.9212769379221616e-05,
"loss": 0.1561,
"step": 38300
},
{
"epoch": 12.351238340302348,
"grad_norm": 1.1558126211166382,
"learning_rate": 1.9132357671276937e-05,
"loss": 0.1535,
"step": 38400
},
{
"epoch": 12.383403023480218,
"grad_norm": 0.838792085647583,
"learning_rate": 1.9051945963332264e-05,
"loss": 0.1518,
"step": 38500
},
{
"epoch": 12.41556770665809,
"grad_norm": 1.1334800720214844,
"learning_rate": 1.8971534255387585e-05,
"loss": 0.1522,
"step": 38600
},
{
"epoch": 12.44773238983596,
"grad_norm": 1.1585274934768677,
"learning_rate": 1.889112254744291e-05,
"loss": 0.1535,
"step": 38700
},
{
"epoch": 12.47989707301383,
"grad_norm": 1.1669962406158447,
"learning_rate": 1.8810710839498233e-05,
"loss": 0.1556,
"step": 38800
},
{
"epoch": 12.512061756191702,
"grad_norm": 1.205310344696045,
"learning_rate": 1.8730299131553554e-05,
"loss": 0.1585,
"step": 38900
},
{
"epoch": 12.544226439369572,
"grad_norm": 0.9617523550987244,
"learning_rate": 1.864988742360888e-05,
"loss": 0.1571,
"step": 39000
},
{
"epoch": 12.576391122547443,
"grad_norm": 1.259700894355774,
"learning_rate": 1.8569475715664203e-05,
"loss": 0.1563,
"step": 39100
},
{
"epoch": 12.608555805725313,
"grad_norm": 1.0180591344833374,
"learning_rate": 1.8489064007719527e-05,
"loss": 0.1553,
"step": 39200
},
{
"epoch": 12.640720488903185,
"grad_norm": 1.2915213108062744,
"learning_rate": 1.8408652299774847e-05,
"loss": 0.1599,
"step": 39300
},
{
"epoch": 12.672885172081054,
"grad_norm": 1.1494709253311157,
"learning_rate": 1.832824059183017e-05,
"loss": 0.1575,
"step": 39400
},
{
"epoch": 12.705049855258926,
"grad_norm": 1.1564290523529053,
"learning_rate": 1.8247828883885496e-05,
"loss": 0.1563,
"step": 39500
},
{
"epoch": 12.737214538436797,
"grad_norm": 1.2692680358886719,
"learning_rate": 1.8167417175940816e-05,
"loss": 0.1577,
"step": 39600
},
{
"epoch": 12.769379221614667,
"grad_norm": 1.0520470142364502,
"learning_rate": 1.808780958507559e-05,
"loss": 0.1603,
"step": 39700
},
{
"epoch": 12.801543904792538,
"grad_norm": 1.1396217346191406,
"learning_rate": 1.800739787713091e-05,
"loss": 0.1601,
"step": 39800
},
{
"epoch": 12.833708587970408,
"grad_norm": 1.1824222803115845,
"learning_rate": 1.7926986169186237e-05,
"loss": 0.1597,
"step": 39900
},
{
"epoch": 12.86587327114828,
"grad_norm": 1.0760473012924194,
"learning_rate": 1.7846574461241558e-05,
"loss": 0.1557,
"step": 40000
},
{
"epoch": 12.89803795432615,
"grad_norm": 1.2268940210342407,
"learning_rate": 1.7766162753296882e-05,
"loss": 0.1574,
"step": 40100
},
{
"epoch": 12.93020263750402,
"grad_norm": 1.4867218732833862,
"learning_rate": 1.7685751045352203e-05,
"loss": 0.1599,
"step": 40200
},
{
"epoch": 12.962367320681892,
"grad_norm": 1.271653413772583,
"learning_rate": 1.7605339337407527e-05,
"loss": 0.1577,
"step": 40300
},
{
"epoch": 12.994532003859762,
"grad_norm": 1.1374136209487915,
"learning_rate": 1.752492762946285e-05,
"loss": 0.1612,
"step": 40400
},
{
"epoch": 13.026696687037633,
"grad_norm": 1.023424744606018,
"learning_rate": 1.744451592151817e-05,
"loss": 0.1305,
"step": 40500
},
{
"epoch": 13.058861370215503,
"grad_norm": 1.096927523612976,
"learning_rate": 1.73641042135735e-05,
"loss": 0.1237,
"step": 40600
},
{
"epoch": 13.091026053393374,
"grad_norm": 1.0655758380889893,
"learning_rate": 1.7284496622708268e-05,
"loss": 0.1263,
"step": 40700
},
{
"epoch": 13.123190736571244,
"grad_norm": 1.0864534378051758,
"learning_rate": 1.7204084914763592e-05,
"loss": 0.1234,
"step": 40800
},
{
"epoch": 13.155355419749116,
"grad_norm": 1.1215620040893555,
"learning_rate": 1.7123673206818913e-05,
"loss": 0.1271,
"step": 40900
},
{
"epoch": 13.187520102926985,
"grad_norm": 1.2142025232315063,
"learning_rate": 1.7043261498874237e-05,
"loss": 0.1266,
"step": 41000
},
{
"epoch": 13.219684786104857,
"grad_norm": 1.1056804656982422,
"learning_rate": 1.696284979092956e-05,
"loss": 0.1296,
"step": 41100
},
{
"epoch": 13.251849469282728,
"grad_norm": 0.9434877038002014,
"learning_rate": 1.6882438082984882e-05,
"loss": 0.1288,
"step": 41200
},
{
"epoch": 13.284014152460598,
"grad_norm": 0.9666481018066406,
"learning_rate": 1.6802026375040206e-05,
"loss": 0.1297,
"step": 41300
},
{
"epoch": 13.31617883563847,
"grad_norm": 1.1036096811294556,
"learning_rate": 1.672161466709553e-05,
"loss": 0.1267,
"step": 41400
},
{
"epoch": 13.348343518816339,
"grad_norm": 1.0307798385620117,
"learning_rate": 1.6641202959150854e-05,
"loss": 0.1297,
"step": 41500
},
{
"epoch": 13.38050820199421,
"grad_norm": 1.0009799003601074,
"learning_rate": 1.6560791251206175e-05,
"loss": 0.1286,
"step": 41600
},
{
"epoch": 13.41267288517208,
"grad_norm": 1.098833680152893,
"learning_rate": 1.6480379543261503e-05,
"loss": 0.131,
"step": 41700
},
{
"epoch": 13.444837568349952,
"grad_norm": 1.3332812786102295,
"learning_rate": 1.6399967835316823e-05,
"loss": 0.1311,
"step": 41800
},
{
"epoch": 13.477002251527823,
"grad_norm": 1.133938193321228,
"learning_rate": 1.6319556127372148e-05,
"loss": 0.1291,
"step": 41900
},
{
"epoch": 13.509166934705693,
"grad_norm": 1.1072946786880493,
"learning_rate": 1.6239144419427468e-05,
"loss": 0.1311,
"step": 42000
},
{
"epoch": 13.541331617883564,
"grad_norm": 1.088657021522522,
"learning_rate": 1.6158732711482792e-05,
"loss": 0.1301,
"step": 42100
},
{
"epoch": 13.573496301061434,
"grad_norm": 1.2645858526229858,
"learning_rate": 1.6078321003538117e-05,
"loss": 0.13,
"step": 42200
},
{
"epoch": 13.605660984239305,
"grad_norm": 1.117727518081665,
"learning_rate": 1.5997909295593437e-05,
"loss": 0.1316,
"step": 42300
},
{
"epoch": 13.637825667417175,
"grad_norm": 1.268072485923767,
"learning_rate": 1.5917497587648765e-05,
"loss": 0.1292,
"step": 42400
},
{
"epoch": 13.669990350595047,
"grad_norm": 1.0627697706222534,
"learning_rate": 1.5837085879704086e-05,
"loss": 0.1325,
"step": 42500
},
{
"epoch": 13.702155033772918,
"grad_norm": 1.2271528244018555,
"learning_rate": 1.575667417175941e-05,
"loss": 0.1326,
"step": 42600
},
{
"epoch": 13.734319716950788,
"grad_norm": 1.1797391176223755,
"learning_rate": 1.567626246381473e-05,
"loss": 0.1361,
"step": 42700
},
{
"epoch": 13.76648440012866,
"grad_norm": 1.1041834354400635,
"learning_rate": 1.5595850755870055e-05,
"loss": 0.1362,
"step": 42800
},
{
"epoch": 13.798649083306529,
"grad_norm": 1.1098655462265015,
"learning_rate": 1.551543904792538e-05,
"loss": 0.1349,
"step": 42900
},
{
"epoch": 13.8308137664844,
"grad_norm": 1.1673643589019775,
"learning_rate": 1.54350273399807e-05,
"loss": 0.1372,
"step": 43000
},
{
"epoch": 13.86297844966227,
"grad_norm": 1.078134298324585,
"learning_rate": 1.5354615632036027e-05,
"loss": 0.1326,
"step": 43100
},
{
"epoch": 13.895143132840142,
"grad_norm": 1.3180980682373047,
"learning_rate": 1.5274203924091348e-05,
"loss": 0.1319,
"step": 43200
},
{
"epoch": 13.927307816018013,
"grad_norm": 1.16948664188385,
"learning_rate": 1.519379221614667e-05,
"loss": 0.1316,
"step": 43300
},
{
"epoch": 13.959472499195883,
"grad_norm": 1.0479273796081543,
"learning_rate": 1.5113380508201993e-05,
"loss": 0.1361,
"step": 43400
},
{
"epoch": 13.991637182373754,
"grad_norm": 1.2957446575164795,
"learning_rate": 1.5032968800257319e-05,
"loss": 0.1348,
"step": 43500
},
{
"epoch": 14.023801865551624,
"grad_norm": 0.9477380514144897,
"learning_rate": 1.4952557092312641e-05,
"loss": 0.1132,
"step": 43600
},
{
"epoch": 14.055966548729495,
"grad_norm": 1.090921401977539,
"learning_rate": 1.4872145384367964e-05,
"loss": 0.1055,
"step": 43700
},
{
"epoch": 14.088131231907365,
"grad_norm": 0.9108441472053528,
"learning_rate": 1.479173367642329e-05,
"loss": 0.107,
"step": 43800
},
{
"epoch": 14.120295915085237,
"grad_norm": 0.9658478498458862,
"learning_rate": 1.4711321968478612e-05,
"loss": 0.1074,
"step": 43900
},
{
"epoch": 14.152460598263108,
"grad_norm": 1.125514030456543,
"learning_rate": 1.4630910260533934e-05,
"loss": 0.1069,
"step": 44000
},
{
"epoch": 14.184625281440978,
"grad_norm": 1.137123942375183,
"learning_rate": 1.4550498552589258e-05,
"loss": 0.108,
"step": 44100
},
{
"epoch": 14.21678996461885,
"grad_norm": 1.2043941020965576,
"learning_rate": 1.4470086844644581e-05,
"loss": 0.1086,
"step": 44200
},
{
"epoch": 14.248954647796719,
"grad_norm": 1.0090394020080566,
"learning_rate": 1.4389675136699903e-05,
"loss": 0.1088,
"step": 44300
},
{
"epoch": 14.28111933097459,
"grad_norm": 1.0625391006469727,
"learning_rate": 1.4309263428755226e-05,
"loss": 0.1098,
"step": 44400
},
{
"epoch": 14.31328401415246,
"grad_norm": 0.9002700448036194,
"learning_rate": 1.4228851720810552e-05,
"loss": 0.109,
"step": 44500
},
{
"epoch": 14.345448697330331,
"grad_norm": 1.1506325006484985,
"learning_rate": 1.4148440012865874e-05,
"loss": 0.1095,
"step": 44600
},
{
"epoch": 14.377613380508201,
"grad_norm": 0.9931826591491699,
"learning_rate": 1.4068028304921197e-05,
"loss": 0.1105,
"step": 44700
},
{
"epoch": 14.409778063686073,
"grad_norm": 1.258078932762146,
"learning_rate": 1.398761659697652e-05,
"loss": 0.1107,
"step": 44800
},
{
"epoch": 14.441942746863944,
"grad_norm": 1.1100337505340576,
"learning_rate": 1.3907204889031843e-05,
"loss": 0.1119,
"step": 44900
},
{
"epoch": 14.474107430041814,
"grad_norm": 1.0397074222564697,
"learning_rate": 1.3826793181087166e-05,
"loss": 0.1104,
"step": 45000
},
{
"epoch": 14.506272113219685,
"grad_norm": 1.0517898797988892,
"learning_rate": 1.3746381473142488e-05,
"loss": 0.1132,
"step": 45100
},
{
"epoch": 14.538436796397555,
"grad_norm": 0.9665665030479431,
"learning_rate": 1.3665969765197814e-05,
"loss": 0.1124,
"step": 45200
},
{
"epoch": 14.570601479575426,
"grad_norm": 1.032689094543457,
"learning_rate": 1.3585558057253136e-05,
"loss": 0.1108,
"step": 45300
},
{
"epoch": 14.602766162753296,
"grad_norm": 1.0238643884658813,
"learning_rate": 1.3505146349308459e-05,
"loss": 0.1109,
"step": 45400
},
{
"epoch": 14.634930845931168,
"grad_norm": 0.9571881890296936,
"learning_rate": 1.3424734641363785e-05,
"loss": 0.1124,
"step": 45500
},
{
"epoch": 14.667095529109039,
"grad_norm": 0.9984620809555054,
"learning_rate": 1.3345127050498553e-05,
"loss": 0.1108,
"step": 45600
},
{
"epoch": 14.699260212286909,
"grad_norm": 1.1483553647994995,
"learning_rate": 1.3264715342553876e-05,
"loss": 0.1127,
"step": 45700
},
{
"epoch": 14.73142489546478,
"grad_norm": 1.1123391389846802,
"learning_rate": 1.3184303634609198e-05,
"loss": 0.1125,
"step": 45800
},
{
"epoch": 14.76358957864265,
"grad_norm": 1.0833643674850464,
"learning_rate": 1.3103891926664524e-05,
"loss": 0.1124,
"step": 45900
},
{
"epoch": 14.795754261820521,
"grad_norm": 0.8864045143127441,
"learning_rate": 1.3023480218719847e-05,
"loss": 0.1117,
"step": 46000
},
{
"epoch": 14.827918944998391,
"grad_norm": 1.0828707218170166,
"learning_rate": 1.2943068510775169e-05,
"loss": 0.1135,
"step": 46100
},
{
"epoch": 14.860083628176262,
"grad_norm": 0.9754586815834045,
"learning_rate": 1.2862656802830492e-05,
"loss": 0.1146,
"step": 46200
},
{
"epoch": 14.892248311354134,
"grad_norm": 1.0156996250152588,
"learning_rate": 1.2782245094885817e-05,
"loss": 0.1117,
"step": 46300
},
{
"epoch": 14.924412994532004,
"grad_norm": 1.1564407348632812,
"learning_rate": 1.270183338694114e-05,
"loss": 0.113,
"step": 46400
},
{
"epoch": 14.956577677709875,
"grad_norm": 1.151871919631958,
"learning_rate": 1.2621421678996462e-05,
"loss": 0.1156,
"step": 46500
},
{
"epoch": 14.988742360887745,
"grad_norm": 1.0806056261062622,
"learning_rate": 1.2541009971051786e-05,
"loss": 0.1115,
"step": 46600
},
{
"epoch": 15.020907044065616,
"grad_norm": 1.0419082641601562,
"learning_rate": 1.2460598263107109e-05,
"loss": 0.0983,
"step": 46700
},
{
"epoch": 15.053071727243486,
"grad_norm": 1.059793472290039,
"learning_rate": 1.2380186555162431e-05,
"loss": 0.0914,
"step": 46800
},
{
"epoch": 15.085236410421357,
"grad_norm": 0.9557967782020569,
"learning_rate": 1.2299774847217756e-05,
"loss": 0.0898,
"step": 46900
},
{
"epoch": 15.117401093599229,
"grad_norm": 0.9494596719741821,
"learning_rate": 1.2219363139273078e-05,
"loss": 0.0923,
"step": 47000
},
{
"epoch": 15.149565776777099,
"grad_norm": 0.9562474489212036,
"learning_rate": 1.2138951431328402e-05,
"loss": 0.0951,
"step": 47100
},
{
"epoch": 15.18173045995497,
"grad_norm": 0.9924389123916626,
"learning_rate": 1.2058539723383726e-05,
"loss": 0.0924,
"step": 47200
},
{
"epoch": 15.21389514313284,
"grad_norm": 0.9415188431739807,
"learning_rate": 1.1978128015439049e-05,
"loss": 0.0946,
"step": 47300
},
{
"epoch": 15.246059826310711,
"grad_norm": 1.003299593925476,
"learning_rate": 1.1897716307494373e-05,
"loss": 0.0938,
"step": 47400
},
{
"epoch": 15.278224509488581,
"grad_norm": 0.977337121963501,
"learning_rate": 1.1817304599549694e-05,
"loss": 0.0949,
"step": 47500
},
{
"epoch": 15.310389192666452,
"grad_norm": 0.9806488156318665,
"learning_rate": 1.1736892891605018e-05,
"loss": 0.0949,
"step": 47600
},
{
"epoch": 15.342553875844324,
"grad_norm": 0.9360681176185608,
"learning_rate": 1.1656481183660342e-05,
"loss": 0.0927,
"step": 47700
},
{
"epoch": 15.374718559022194,
"grad_norm": 1.0371599197387695,
"learning_rate": 1.1576069475715664e-05,
"loss": 0.0928,
"step": 47800
},
{
"epoch": 15.406883242200065,
"grad_norm": 1.0855216979980469,
"learning_rate": 1.1495657767770989e-05,
"loss": 0.0937,
"step": 47900
},
{
"epoch": 15.439047925377935,
"grad_norm": 0.9102591276168823,
"learning_rate": 1.1415246059826311e-05,
"loss": 0.0954,
"step": 48000
},
{
"epoch": 15.471212608555806,
"grad_norm": 0.8920603394508362,
"learning_rate": 1.1334834351881635e-05,
"loss": 0.0944,
"step": 48100
},
{
"epoch": 15.503377291733676,
"grad_norm": 1.0157232284545898,
"learning_rate": 1.1254422643936958e-05,
"loss": 0.0941,
"step": 48200
},
{
"epoch": 15.535541974911547,
"grad_norm": 1.0315204858779907,
"learning_rate": 1.1174010935992282e-05,
"loss": 0.0943,
"step": 48300
},
{
"epoch": 15.567706658089417,
"grad_norm": 0.9432384371757507,
"learning_rate": 1.109440334512705e-05,
"loss": 0.0942,
"step": 48400
},
{
"epoch": 15.599871341267288,
"grad_norm": 1.0601836442947388,
"learning_rate": 1.1013991637182375e-05,
"loss": 0.0959,
"step": 48500
},
{
"epoch": 15.63203602444516,
"grad_norm": 1.0262197256088257,
"learning_rate": 1.0933579929237697e-05,
"loss": 0.0959,
"step": 48600
},
{
"epoch": 15.66420070762303,
"grad_norm": 1.1089009046554565,
"learning_rate": 1.0853168221293021e-05,
"loss": 0.0927,
"step": 48700
},
{
"epoch": 15.696365390800901,
"grad_norm": 0.9688426852226257,
"learning_rate": 1.0772756513348344e-05,
"loss": 0.0943,
"step": 48800
},
{
"epoch": 15.72853007397877,
"grad_norm": 0.9896754026412964,
"learning_rate": 1.0692344805403668e-05,
"loss": 0.0943,
"step": 48900
},
{
"epoch": 15.760694757156642,
"grad_norm": 1.1106398105621338,
"learning_rate": 1.0611933097458992e-05,
"loss": 0.0978,
"step": 49000
},
{
"epoch": 15.792859440334512,
"grad_norm": 0.9831110835075378,
"learning_rate": 1.0531521389514314e-05,
"loss": 0.0968,
"step": 49100
},
{
"epoch": 15.825024123512383,
"grad_norm": 0.9822130799293518,
"learning_rate": 1.0451109681569637e-05,
"loss": 0.0961,
"step": 49200
},
{
"epoch": 15.857188806690255,
"grad_norm": 0.9292415380477905,
"learning_rate": 1.037069797362496e-05,
"loss": 0.0954,
"step": 49300
},
{
"epoch": 15.889353489868125,
"grad_norm": 1.1434463262557983,
"learning_rate": 1.0290286265680284e-05,
"loss": 0.0971,
"step": 49400
},
{
"epoch": 15.921518173045996,
"grad_norm": 0.9617388844490051,
"learning_rate": 1.0209874557735606e-05,
"loss": 0.0958,
"step": 49500
},
{
"epoch": 15.953682856223866,
"grad_norm": 1.0446373224258423,
"learning_rate": 1.012946284979093e-05,
"loss": 0.0946,
"step": 49600
},
{
"epoch": 15.985847539401737,
"grad_norm": 1.1061015129089355,
"learning_rate": 1.0049051141846254e-05,
"loss": 0.0983,
"step": 49700
},
{
"epoch": 16.018012222579607,
"grad_norm": 0.9545947909355164,
"learning_rate": 9.968639433901577e-06,
"loss": 0.0845,
"step": 49800
},
{
"epoch": 16.05017690575748,
"grad_norm": 0.9615588188171387,
"learning_rate": 9.888227725956901e-06,
"loss": 0.0786,
"step": 49900
},
{
"epoch": 16.08234158893535,
"grad_norm": 0.9855136275291443,
"learning_rate": 9.807816018012223e-06,
"loss": 0.0802,
"step": 50000
},
{
"epoch": 16.11450627211322,
"grad_norm": 1.1269737482070923,
"learning_rate": 9.727404310067546e-06,
"loss": 0.0775,
"step": 50100
},
{
"epoch": 16.14667095529109,
"grad_norm": 0.8828549385070801,
"learning_rate": 9.64699260212287e-06,
"loss": 0.0805,
"step": 50200
},
{
"epoch": 16.17883563846896,
"grad_norm": 0.9752237200737,
"learning_rate": 9.566580894178192e-06,
"loss": 0.0791,
"step": 50300
},
{
"epoch": 16.211000321646832,
"grad_norm": 1.014918565750122,
"learning_rate": 9.486169186233517e-06,
"loss": 0.0802,
"step": 50400
},
{
"epoch": 16.243165004824704,
"grad_norm": 0.9082208871841431,
"learning_rate": 9.405757478288839e-06,
"loss": 0.0805,
"step": 50500
},
{
"epoch": 16.27532968800257,
"grad_norm": 0.793496310710907,
"learning_rate": 9.325345770344163e-06,
"loss": 0.0791,
"step": 50600
},
{
"epoch": 16.307494371180443,
"grad_norm": 0.922303318977356,
"learning_rate": 9.244934062399486e-06,
"loss": 0.0823,
"step": 50700
},
{
"epoch": 16.339659054358314,
"grad_norm": 0.9956395626068115,
"learning_rate": 9.16452235445481e-06,
"loss": 0.0808,
"step": 50800
},
{
"epoch": 16.371823737536186,
"grad_norm": 1.1846128702163696,
"learning_rate": 9.084110646510132e-06,
"loss": 0.0811,
"step": 50900
},
{
"epoch": 16.403988420714057,
"grad_norm": 0.8967270255088806,
"learning_rate": 9.003698938565455e-06,
"loss": 0.0812,
"step": 51000
},
{
"epoch": 16.436153103891925,
"grad_norm": 1.0155181884765625,
"learning_rate": 8.923287230620779e-06,
"loss": 0.0809,
"step": 51100
},
{
"epoch": 16.468317787069797,
"grad_norm": 1.014799952507019,
"learning_rate": 8.842875522676101e-06,
"loss": 0.0819,
"step": 51200
},
{
"epoch": 16.50048247024767,
"grad_norm": 1.1782442331314087,
"learning_rate": 8.763267931810872e-06,
"loss": 0.0815,
"step": 51300
},
{
"epoch": 16.53264715342554,
"grad_norm": 0.9318787455558777,
"learning_rate": 8.682856223866196e-06,
"loss": 0.0814,
"step": 51400
},
{
"epoch": 16.56481183660341,
"grad_norm": 0.9740914702415466,
"learning_rate": 8.60244451592152e-06,
"loss": 0.0809,
"step": 51500
},
{
"epoch": 16.59697651978128,
"grad_norm": 1.0369396209716797,
"learning_rate": 8.522032807976842e-06,
"loss": 0.0808,
"step": 51600
},
{
"epoch": 16.62914120295915,
"grad_norm": 0.8972563743591309,
"learning_rate": 8.441621100032165e-06,
"loss": 0.079,
"step": 51700
},
{
"epoch": 16.661305886137022,
"grad_norm": 1.1047544479370117,
"learning_rate": 8.361209392087487e-06,
"loss": 0.0803,
"step": 51800
},
{
"epoch": 16.693470569314893,
"grad_norm": 1.0711828470230103,
"learning_rate": 8.280797684142811e-06,
"loss": 0.0832,
"step": 51900
},
{
"epoch": 16.72563525249276,
"grad_norm": 0.8830587267875671,
"learning_rate": 8.200385976198136e-06,
"loss": 0.0815,
"step": 52000
},
{
"epoch": 16.757799935670633,
"grad_norm": 0.8716908693313599,
"learning_rate": 8.119974268253458e-06,
"loss": 0.0808,
"step": 52100
},
{
"epoch": 16.789964618848504,
"grad_norm": 1.0468723773956299,
"learning_rate": 8.039562560308782e-06,
"loss": 0.0825,
"step": 52200
},
{
"epoch": 16.822129302026376,
"grad_norm": 1.0085501670837402,
"learning_rate": 7.959150852364105e-06,
"loss": 0.0817,
"step": 52300
},
{
"epoch": 16.854293985204247,
"grad_norm": 0.8367465138435364,
"learning_rate": 7.878739144419429e-06,
"loss": 0.0819,
"step": 52400
},
{
"epoch": 16.886458668382115,
"grad_norm": 0.9274358749389648,
"learning_rate": 7.798327436474751e-06,
"loss": 0.0818,
"step": 52500
},
{
"epoch": 16.918623351559987,
"grad_norm": 1.1087638139724731,
"learning_rate": 7.717915728530074e-06,
"loss": 0.083,
"step": 52600
},
{
"epoch": 16.950788034737858,
"grad_norm": 1.0249075889587402,
"learning_rate": 7.637504020585398e-06,
"loss": 0.0818,
"step": 52700
},
{
"epoch": 16.98295271791573,
"grad_norm": 0.9565846920013428,
"learning_rate": 7.55709231264072e-06,
"loss": 0.0802,
"step": 52800
},
{
"epoch": 17.015117401093597,
"grad_norm": 0.8502324819564819,
"learning_rate": 7.4766806046960445e-06,
"loss": 0.0762,
"step": 52900
},
{
"epoch": 17.04728208427147,
"grad_norm": 0.998939037322998,
"learning_rate": 7.396268896751367e-06,
"loss": 0.0693,
"step": 53000
},
{
"epoch": 17.07944676744934,
"grad_norm": 0.8634834289550781,
"learning_rate": 7.31585718880669e-06,
"loss": 0.069,
"step": 53100
},
{
"epoch": 17.111611450627212,
"grad_norm": 0.8927751779556274,
"learning_rate": 7.2354454808620144e-06,
"loss": 0.0686,
"step": 53200
},
{
"epoch": 17.143776133805083,
"grad_norm": 0.7095975875854492,
"learning_rate": 7.155033772917337e-06,
"loss": 0.0688,
"step": 53300
},
{
"epoch": 17.17594081698295,
"grad_norm": 1.0496543645858765,
"learning_rate": 7.074622064972661e-06,
"loss": 0.0709,
"step": 53400
},
{
"epoch": 17.208105500160823,
"grad_norm": 0.8392481803894043,
"learning_rate": 6.9942103570279835e-06,
"loss": 0.0692,
"step": 53500
},
{
"epoch": 17.240270183338694,
"grad_norm": 0.9044252634048462,
"learning_rate": 6.913798649083307e-06,
"loss": 0.0706,
"step": 53600
},
{
"epoch": 17.272434866516566,
"grad_norm": 0.8854852914810181,
"learning_rate": 6.833386941138629e-06,
"loss": 0.0693,
"step": 53700
},
{
"epoch": 17.304599549694437,
"grad_norm": 0.8078619837760925,
"learning_rate": 6.752975233193953e-06,
"loss": 0.0702,
"step": 53800
},
{
"epoch": 17.336764232872305,
"grad_norm": 0.9859098196029663,
"learning_rate": 6.672563525249277e-06,
"loss": 0.07,
"step": 53900
},
{
"epoch": 17.368928916050177,
"grad_norm": 1.025367259979248,
"learning_rate": 6.592151817304599e-06,
"loss": 0.0699,
"step": 54000
},
{
"epoch": 17.401093599228048,
"grad_norm": 1.0151135921478271,
"learning_rate": 6.511740109359923e-06,
"loss": 0.0704,
"step": 54100
},
{
"epoch": 17.43325828240592,
"grad_norm": 1.0093625783920288,
"learning_rate": 6.431328401415246e-06,
"loss": 0.0694,
"step": 54200
},
{
"epoch": 17.465422965583787,
"grad_norm": 0.8962597846984863,
"learning_rate": 6.35091669347057e-06,
"loss": 0.0703,
"step": 54300
},
{
"epoch": 17.49758764876166,
"grad_norm": 0.8843083381652832,
"learning_rate": 6.270504985525893e-06,
"loss": 0.0709,
"step": 54400
},
{
"epoch": 17.52975233193953,
"grad_norm": 1.1030328273773193,
"learning_rate": 6.190093277581216e-06,
"loss": 0.0694,
"step": 54500
},
{
"epoch": 17.5619170151174,
"grad_norm": 0.9267283082008362,
"learning_rate": 6.109681569636539e-06,
"loss": 0.0708,
"step": 54600
},
{
"epoch": 17.594081698295273,
"grad_norm": 1.0376532077789307,
"learning_rate": 6.029269861691863e-06,
"loss": 0.0708,
"step": 54700
},
{
"epoch": 17.62624638147314,
"grad_norm": 0.8866332173347473,
"learning_rate": 5.9488581537471864e-06,
"loss": 0.0709,
"step": 54800
},
{
"epoch": 17.658411064651013,
"grad_norm": 0.9702731966972351,
"learning_rate": 5.868446445802509e-06,
"loss": 0.0711,
"step": 54900
},
{
"epoch": 17.690575747828884,
"grad_norm": 0.904787540435791,
"learning_rate": 5.788034737857832e-06,
"loss": 0.0719,
"step": 55000
},
{
"epoch": 17.722740431006756,
"grad_norm": 0.8797966241836548,
"learning_rate": 5.7076230299131555e-06,
"loss": 0.0712,
"step": 55100
},
{
"epoch": 17.754905114184623,
"grad_norm": 0.9212404489517212,
"learning_rate": 5.628015439047925e-06,
"loss": 0.0687,
"step": 55200
},
{
"epoch": 17.787069797362495,
"grad_norm": 0.9659159779548645,
"learning_rate": 5.547603731103248e-06,
"loss": 0.0716,
"step": 55300
},
{
"epoch": 17.819234480540366,
"grad_norm": 0.9781568050384521,
"learning_rate": 5.4671920231585725e-06,
"loss": 0.0712,
"step": 55400
},
{
"epoch": 17.851399163718238,
"grad_norm": 0.9969457387924194,
"learning_rate": 5.386780315213896e-06,
"loss": 0.0716,
"step": 55500
},
{
"epoch": 17.88356384689611,
"grad_norm": 0.8759910464286804,
"learning_rate": 5.306368607269218e-06,
"loss": 0.0687,
"step": 55600
},
{
"epoch": 17.915728530073977,
"grad_norm": 0.9174026250839233,
"learning_rate": 5.225956899324542e-06,
"loss": 0.0705,
"step": 55700
},
{
"epoch": 17.94789321325185,
"grad_norm": 1.0490466356277466,
"learning_rate": 5.145545191379865e-06,
"loss": 0.0701,
"step": 55800
},
{
"epoch": 17.98005789642972,
"grad_norm": 0.8875726461410522,
"learning_rate": 5.065133483435188e-06,
"loss": 0.0713,
"step": 55900
},
{
"epoch": 18.01222257960759,
"grad_norm": 0.7829542756080627,
"learning_rate": 4.984721775490512e-06,
"loss": 0.068,
"step": 56000
},
{
"epoch": 18.044387262785463,
"grad_norm": 0.9281951785087585,
"learning_rate": 4.904310067545835e-06,
"loss": 0.0615,
"step": 56100
},
{
"epoch": 18.07655194596333,
"grad_norm": 0.8894643783569336,
"learning_rate": 4.823898359601158e-06,
"loss": 0.0608,
"step": 56200
},
{
"epoch": 18.108716629141203,
"grad_norm": 0.7356093525886536,
"learning_rate": 4.743486651656481e-06,
"loss": 0.0616,
"step": 56300
},
{
"epoch": 18.140881312319074,
"grad_norm": 0.7402681708335876,
"learning_rate": 4.663074943711805e-06,
"loss": 0.0616,
"step": 56400
},
{
"epoch": 18.173045995496945,
"grad_norm": 0.8778253793716431,
"learning_rate": 4.582663235767128e-06,
"loss": 0.0613,
"step": 56500
},
{
"epoch": 18.205210678674813,
"grad_norm": 0.7177903652191162,
"learning_rate": 4.502251527822451e-06,
"loss": 0.061,
"step": 56600
},
{
"epoch": 18.237375361852685,
"grad_norm": 1.0053707361221313,
"learning_rate": 4.421839819877775e-06,
"loss": 0.0614,
"step": 56700
},
{
"epoch": 18.269540045030556,
"grad_norm": 0.8634065985679626,
"learning_rate": 4.341428111933098e-06,
"loss": 0.0609,
"step": 56800
},
{
"epoch": 18.301704728208428,
"grad_norm": 0.8410573601722717,
"learning_rate": 4.261016403988421e-06,
"loss": 0.0618,
"step": 56900
},
{
"epoch": 18.3338694113863,
"grad_norm": 0.9490267038345337,
"learning_rate": 4.180604696043744e-06,
"loss": 0.0617,
"step": 57000
},
{
"epoch": 18.366034094564167,
"grad_norm": 0.8697332739830017,
"learning_rate": 4.100192988099068e-06,
"loss": 0.0617,
"step": 57100
},
{
"epoch": 18.39819877774204,
"grad_norm": 1.0434635877609253,
"learning_rate": 4.019781280154391e-06,
"loss": 0.0613,
"step": 57200
},
{
"epoch": 18.43036346091991,
"grad_norm": 1.034525990486145,
"learning_rate": 3.9393695722097144e-06,
"loss": 0.0631,
"step": 57300
},
{
"epoch": 18.46252814409778,
"grad_norm": 0.8523834347724915,
"learning_rate": 3.858957864265037e-06,
"loss": 0.0619,
"step": 57400
},
{
"epoch": 18.494692827275653,
"grad_norm": 0.9277452826499939,
"learning_rate": 3.77854615632036e-06,
"loss": 0.0619,
"step": 57500
},
{
"epoch": 18.52685751045352,
"grad_norm": 0.8697579503059387,
"learning_rate": 3.6981344483756835e-06,
"loss": 0.0609,
"step": 57600
},
{
"epoch": 18.559022193631392,
"grad_norm": 1.1431069374084473,
"learning_rate": 3.6177227404310072e-06,
"loss": 0.0627,
"step": 57700
},
{
"epoch": 18.591186876809264,
"grad_norm": 0.8121923804283142,
"learning_rate": 3.5373110324863305e-06,
"loss": 0.0611,
"step": 57800
},
{
"epoch": 18.623351559987135,
"grad_norm": 0.8631033301353455,
"learning_rate": 3.4568993245416534e-06,
"loss": 0.0643,
"step": 57900
},
{
"epoch": 18.655516243165003,
"grad_norm": 0.8957691788673401,
"learning_rate": 3.3764876165969767e-06,
"loss": 0.0641,
"step": 58000
},
{
"epoch": 18.687680926342875,
"grad_norm": 0.8032559156417847,
"learning_rate": 3.2960759086522996e-06,
"loss": 0.0628,
"step": 58100
},
{
"epoch": 18.719845609520746,
"grad_norm": 0.8172631859779358,
"learning_rate": 3.215664200707623e-06,
"loss": 0.061,
"step": 58200
},
{
"epoch": 18.752010292698618,
"grad_norm": 0.8193603157997131,
"learning_rate": 3.1352524927629466e-06,
"loss": 0.0624,
"step": 58300
},
{
"epoch": 18.78417497587649,
"grad_norm": 0.8261486887931824,
"learning_rate": 3.0548407848182695e-06,
"loss": 0.0601,
"step": 58400
},
{
"epoch": 18.816339659054357,
"grad_norm": 0.9643902778625488,
"learning_rate": 2.9744290768735932e-06,
"loss": 0.0618,
"step": 58500
},
{
"epoch": 18.84850434223223,
"grad_norm": 0.919653058052063,
"learning_rate": 2.894017368928916e-06,
"loss": 0.0621,
"step": 58600
},
{
"epoch": 18.8806690254101,
"grad_norm": 0.99927818775177,
"learning_rate": 2.8136056609842394e-06,
"loss": 0.0626,
"step": 58700
},
{
"epoch": 18.91283370858797,
"grad_norm": 1.0389022827148438,
"learning_rate": 2.7331939530395627e-06,
"loss": 0.0607,
"step": 58800
},
{
"epoch": 18.944998391765843,
"grad_norm": 0.8360564708709717,
"learning_rate": 2.652782245094886e-06,
"loss": 0.0621,
"step": 58900
},
{
"epoch": 18.97716307494371,
"grad_norm": 0.7946717143058777,
"learning_rate": 2.572370537150209e-06,
"loss": 0.0613,
"step": 59000
},
{
"epoch": 19.009327758121582,
"grad_norm": 0.8464102149009705,
"learning_rate": 2.4919588292055326e-06,
"loss": 0.0598,
"step": 59100
},
{
"epoch": 19.041492441299454,
"grad_norm": 0.8916983008384705,
"learning_rate": 2.4115471212608555e-06,
"loss": 0.0553,
"step": 59200
},
{
"epoch": 19.073657124477325,
"grad_norm": 0.8545793294906616,
"learning_rate": 2.3311354133161792e-06,
"loss": 0.0561,
"step": 59300
},
{
"epoch": 19.105821807655193,
"grad_norm": 0.8248264789581299,
"learning_rate": 2.2515278224509488e-06,
"loss": 0.0551,
"step": 59400
},
{
"epoch": 19.137986490833065,
"grad_norm": 0.9491187334060669,
"learning_rate": 2.1711161145062725e-06,
"loss": 0.0569,
"step": 59500
},
{
"epoch": 19.170151174010936,
"grad_norm": 0.7597662806510925,
"learning_rate": 2.0907044065615954e-06,
"loss": 0.0554,
"step": 59600
},
{
"epoch": 19.202315857188808,
"grad_norm": 0.7805320024490356,
"learning_rate": 2.0102926986169187e-06,
"loss": 0.0567,
"step": 59700
},
{
"epoch": 19.23448054036668,
"grad_norm": 0.7836379408836365,
"learning_rate": 1.929880990672242e-06,
"loss": 0.0551,
"step": 59800
},
{
"epoch": 19.266645223544547,
"grad_norm": 0.8051148056983948,
"learning_rate": 1.849469282727565e-06,
"loss": 0.0565,
"step": 59900
},
{
"epoch": 19.29880990672242,
"grad_norm": 0.8301971554756165,
"learning_rate": 1.7690575747828884e-06,
"loss": 0.0567,
"step": 60000
},
{
"epoch": 19.33097458990029,
"grad_norm": 0.8996140956878662,
"learning_rate": 1.6886458668382117e-06,
"loss": 0.056,
"step": 60100
},
{
"epoch": 19.36313927307816,
"grad_norm": 0.8115219473838806,
"learning_rate": 1.608234158893535e-06,
"loss": 0.0575,
"step": 60200
},
{
"epoch": 19.39530395625603,
"grad_norm": 0.8823007345199585,
"learning_rate": 1.5278224509488583e-06,
"loss": 0.0552,
"step": 60300
},
{
"epoch": 19.4274686394339,
"grad_norm": 0.7915894389152527,
"learning_rate": 1.4474107430041816e-06,
"loss": 0.0561,
"step": 60400
},
{
"epoch": 19.459633322611772,
"grad_norm": 0.7833816409111023,
"learning_rate": 1.3669990350595047e-06,
"loss": 0.0563,
"step": 60500
},
{
"epoch": 19.491798005789644,
"grad_norm": 0.9442763924598694,
"learning_rate": 1.286587327114828e-06,
"loss": 0.0583,
"step": 60600
},
{
"epoch": 19.523962688967515,
"grad_norm": 0.8519619107246399,
"learning_rate": 1.2061756191701513e-06,
"loss": 0.0561,
"step": 60700
},
{
"epoch": 19.556127372145383,
"grad_norm": 0.8903068900108337,
"learning_rate": 1.1257639112254744e-06,
"loss": 0.0564,
"step": 60800
},
{
"epoch": 19.588292055323254,
"grad_norm": 0.9718756079673767,
"learning_rate": 1.0453522032807977e-06,
"loss": 0.0556,
"step": 60900
},
{
"epoch": 19.620456738501126,
"grad_norm": 0.7963365316390991,
"learning_rate": 9.64940495336121e-07,
"loss": 0.0561,
"step": 61000
},
{
"epoch": 19.652621421678997,
"grad_norm": 0.7003450989723206,
"learning_rate": 8.845287873914442e-07,
"loss": 0.0558,
"step": 61100
},
{
"epoch": 19.68478610485687,
"grad_norm": 0.8856065273284912,
"learning_rate": 8.041170794467675e-07,
"loss": 0.0558,
"step": 61200
},
{
"epoch": 19.716950788034737,
"grad_norm": 0.6978211998939514,
"learning_rate": 7.237053715020908e-07,
"loss": 0.0547,
"step": 61300
},
{
"epoch": 19.74911547121261,
"grad_norm": 0.6877977848052979,
"learning_rate": 6.440977806368608e-07,
"loss": 0.0566,
"step": 61400
},
{
"epoch": 19.78128015439048,
"grad_norm": 0.9814041256904602,
"learning_rate": 5.636860726921841e-07,
"loss": 0.0544,
"step": 61500
},
{
"epoch": 19.81344483756835,
"grad_norm": 0.8647675514221191,
"learning_rate": 4.832743647475073e-07,
"loss": 0.0561,
"step": 61600
},
{
"epoch": 19.84560952074622,
"grad_norm": 0.8660107851028442,
"learning_rate": 4.028626568028305e-07,
"loss": 0.0552,
"step": 61700
},
{
"epoch": 19.87777420392409,
"grad_norm": 0.7724593281745911,
"learning_rate": 3.224509488581538e-07,
"loss": 0.0563,
"step": 61800
},
{
"epoch": 19.909938887101962,
"grad_norm": 0.8536872863769531,
"learning_rate": 2.42039240913477e-07,
"loss": 0.0557,
"step": 61900
},
{
"epoch": 19.942103570279833,
"grad_norm": 0.8342153429985046,
"learning_rate": 1.6162753296880025e-07,
"loss": 0.0561,
"step": 62000
}
],
"logging_steps": 100,
"max_steps": 62180,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.396914765824e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}