{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.031181202890604,
"eval_steps": 500,
"global_step": 63000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0016368085506878688,
"grad_norm": 0.5328027606010437,
"learning_rate": 3.600654664484452e-07,
"loss": 1.6968,
"step": 100
},
{
"epoch": 0.0032736171013757376,
"grad_norm": 0.5594077706336975,
"learning_rate": 7.237679578105111e-07,
"loss": 1.6883,
"step": 200
},
{
"epoch": 0.004910425652063607,
"grad_norm": 0.6636043787002563,
"learning_rate": 1.087470449172577e-06,
"loss": 1.6196,
"step": 300
},
{
"epoch": 0.006547234202751475,
"grad_norm": 0.6200364828109741,
"learning_rate": 1.4511729405346428e-06,
"loss": 1.511,
"step": 400
},
{
"epoch": 0.008184042753439345,
"grad_norm": 0.4777531623840332,
"learning_rate": 1.8148754318967086e-06,
"loss": 1.342,
"step": 500
},
{
"epoch": 0.009820851304127213,
"grad_norm": 0.3041970133781433,
"learning_rate": 2.1785779232587743e-06,
"loss": 1.2154,
"step": 600
},
{
"epoch": 0.011457659854815082,
"grad_norm": 0.21760690212249756,
"learning_rate": 2.54228041462084e-06,
"loss": 1.1427,
"step": 700
},
{
"epoch": 0.01309446840550295,
"grad_norm": 0.22987280786037445,
"learning_rate": 2.9059829059829063e-06,
"loss": 1.0943,
"step": 800
},
{
"epoch": 0.014731276956190819,
"grad_norm": 0.24943482875823975,
"learning_rate": 3.269685397344972e-06,
"loss": 1.0696,
"step": 900
},
{
"epoch": 0.01636808550687869,
"grad_norm": 0.2619542181491852,
"learning_rate": 3.633387888707038e-06,
"loss": 1.0318,
"step": 1000
},
{
"epoch": 0.018004894057566556,
"grad_norm": 0.2811136841773987,
"learning_rate": 3.997090380069103e-06,
"loss": 1.0035,
"step": 1100
},
{
"epoch": 0.019641702608254426,
"grad_norm": 0.3045084476470947,
"learning_rate": 4.36079287143117e-06,
"loss": 0.9726,
"step": 1200
},
{
"epoch": 0.021278511158942293,
"grad_norm": 0.3168332278728485,
"learning_rate": 4.7244953627932355e-06,
"loss": 0.971,
"step": 1300
},
{
"epoch": 0.022915319709630164,
"grad_norm": 0.33685848116874695,
"learning_rate": 5.088197854155301e-06,
"loss": 0.952,
"step": 1400
},
{
"epoch": 0.02455212826031803,
"grad_norm": 0.3198516368865967,
"learning_rate": 5.451900345517367e-06,
"loss": 0.9385,
"step": 1500
},
{
"epoch": 0.0261889368110059,
"grad_norm": 0.3457159101963043,
"learning_rate": 5.815602836879432e-06,
"loss": 0.9291,
"step": 1600
},
{
"epoch": 0.02782574536169377,
"grad_norm": 0.3343696594238281,
"learning_rate": 6.179305328241499e-06,
"loss": 0.9251,
"step": 1700
},
{
"epoch": 0.029462553912381638,
"grad_norm": 0.4662475287914276,
"learning_rate": 6.543007819603565e-06,
"loss": 0.9328,
"step": 1800
},
{
"epoch": 0.03109936246306951,
"grad_norm": 0.3559871017932892,
"learning_rate": 6.906710310965631e-06,
"loss": 0.9126,
"step": 1900
},
{
"epoch": 0.03273617101375738,
"grad_norm": 0.3852447271347046,
"learning_rate": 7.270412802327696e-06,
"loss": 0.9024,
"step": 2000
},
{
"epoch": 0.034372979564445245,
"grad_norm": 0.36482807993888855,
"learning_rate": 7.634115293689762e-06,
"loss": 0.9086,
"step": 2100
},
{
"epoch": 0.03600978811513311,
"grad_norm": 0.39493420720100403,
"learning_rate": 7.997817785051828e-06,
"loss": 0.9144,
"step": 2200
},
{
"epoch": 0.03764659666582098,
"grad_norm": 0.4406372010707855,
"learning_rate": 8.361520276413894e-06,
"loss": 0.9067,
"step": 2300
},
{
"epoch": 0.03928340521650885,
"grad_norm": 0.43684300780296326,
"learning_rate": 8.72522276777596e-06,
"loss": 0.898,
"step": 2400
},
{
"epoch": 0.04092021376719672,
"grad_norm": 0.4949699342250824,
"learning_rate": 9.088925259138026e-06,
"loss": 0.8893,
"step": 2500
},
{
"epoch": 0.04255702231788459,
"grad_norm": 0.4759005308151245,
"learning_rate": 9.452627750500092e-06,
"loss": 0.9036,
"step": 2600
},
{
"epoch": 0.04419383086857246,
"grad_norm": 0.4733336567878723,
"learning_rate": 9.816330241862157e-06,
"loss": 0.9046,
"step": 2700
},
{
"epoch": 0.04583063941926033,
"grad_norm": 0.5515408515930176,
"learning_rate": 1.0180032733224223e-05,
"loss": 0.8899,
"step": 2800
},
{
"epoch": 0.047467447969948194,
"grad_norm": 0.5026727318763733,
"learning_rate": 1.054373522458629e-05,
"loss": 0.8868,
"step": 2900
},
{
"epoch": 0.04910425652063606,
"grad_norm": 0.5517929196357727,
"learning_rate": 1.0907437715948354e-05,
"loss": 0.8905,
"step": 3000
},
{
"epoch": 0.050741065071323935,
"grad_norm": 0.5139409899711609,
"learning_rate": 1.127114020731042e-05,
"loss": 0.8711,
"step": 3100
},
{
"epoch": 0.0523778736220118,
"grad_norm": 0.5762068033218384,
"learning_rate": 1.1634842698672486e-05,
"loss": 0.9,
"step": 3200
},
{
"epoch": 0.05401468217269967,
"grad_norm": 0.5540242791175842,
"learning_rate": 1.1998545190034552e-05,
"loss": 0.8854,
"step": 3300
},
{
"epoch": 0.05565149072338754,
"grad_norm": 0.6651942133903503,
"learning_rate": 1.236224768139662e-05,
"loss": 0.875,
"step": 3400
},
{
"epoch": 0.05728829927407541,
"grad_norm": 0.6157256364822388,
"learning_rate": 1.2725950172758685e-05,
"loss": 0.87,
"step": 3500
},
{
"epoch": 0.058925107824763276,
"grad_norm": 0.6638494729995728,
"learning_rate": 1.3089652664120751e-05,
"loss": 0.8666,
"step": 3600
},
{
"epoch": 0.06056191637545114,
"grad_norm": 0.6535647511482239,
"learning_rate": 1.3453355155482817e-05,
"loss": 0.8675,
"step": 3700
},
{
"epoch": 0.06219872492613902,
"grad_norm": 0.7346630692481995,
"learning_rate": 1.3817057646844883e-05,
"loss": 0.8724,
"step": 3800
},
{
"epoch": 0.06383553347682688,
"grad_norm": 0.7002882957458496,
"learning_rate": 1.4180760138206948e-05,
"loss": 0.8476,
"step": 3900
},
{
"epoch": 0.06547234202751476,
"grad_norm": 0.6632655262947083,
"learning_rate": 1.4544462629569014e-05,
"loss": 0.8641,
"step": 4000
},
{
"epoch": 0.06710915057820262,
"grad_norm": 0.7253566384315491,
"learning_rate": 1.490816512093108e-05,
"loss": 0.8611,
"step": 4100
},
{
"epoch": 0.06874595912889049,
"grad_norm": 0.7651970386505127,
"learning_rate": 1.5271867612293146e-05,
"loss": 0.8597,
"step": 4200
},
{
"epoch": 0.07038276767957836,
"grad_norm": 0.6781213879585266,
"learning_rate": 1.563557010365521e-05,
"loss": 0.844,
"step": 4300
},
{
"epoch": 0.07201957623026622,
"grad_norm": 0.7465602159500122,
"learning_rate": 1.5999272595017275e-05,
"loss": 0.8558,
"step": 4400
},
{
"epoch": 0.0736563847809541,
"grad_norm": 0.7796695828437805,
"learning_rate": 1.6362975086379343e-05,
"loss": 0.8533,
"step": 4500
},
{
"epoch": 0.07529319333164196,
"grad_norm": 0.7622010111808777,
"learning_rate": 1.6726677577741408e-05,
"loss": 0.8414,
"step": 4600
},
{
"epoch": 0.07693000188232983,
"grad_norm": 0.7499621510505676,
"learning_rate": 1.7090380069103472e-05,
"loss": 0.8459,
"step": 4700
},
{
"epoch": 0.0785668104330177,
"grad_norm": 0.7822730541229248,
"learning_rate": 1.745408256046554e-05,
"loss": 0.8468,
"step": 4800
},
{
"epoch": 0.08020361898370557,
"grad_norm": 0.7850978970527649,
"learning_rate": 1.7817785051827608e-05,
"loss": 0.8603,
"step": 4900
},
{
"epoch": 0.08184042753439344,
"grad_norm": 0.8370286822319031,
"learning_rate": 1.8181487543189672e-05,
"loss": 0.837,
"step": 5000
},
{
"epoch": 0.08347723608508131,
"grad_norm": 0.821024477481842,
"learning_rate": 1.854519003455174e-05,
"loss": 0.8464,
"step": 5100
},
{
"epoch": 0.08511404463576917,
"grad_norm": 0.8516008257865906,
"learning_rate": 1.8908892525913805e-05,
"loss": 0.837,
"step": 5200
},
{
"epoch": 0.08675085318645705,
"grad_norm": 0.7816336750984192,
"learning_rate": 1.927259501727587e-05,
"loss": 0.8471,
"step": 5300
},
{
"epoch": 0.08838766173714492,
"grad_norm": 0.8347124457359314,
"learning_rate": 1.9636297508637937e-05,
"loss": 0.8333,
"step": 5400
},
{
"epoch": 0.09002447028783278,
"grad_norm": 0.8995541334152222,
"learning_rate": 2e-05,
"loss": 0.8341,
"step": 5500
},
{
"epoch": 0.09166127883852065,
"grad_norm": 0.9787241816520691,
"learning_rate": 1.9999984387425675e-05,
"loss": 0.8431,
"step": 5600
},
{
"epoch": 0.09329808738920851,
"grad_norm": 0.8093689680099487,
"learning_rate": 1.999993754975144e-05,
"loss": 0.8325,
"step": 5700
},
{
"epoch": 0.09493489593989639,
"grad_norm": 0.9042837023735046,
"learning_rate": 1.999985948712355e-05,
"loss": 0.828,
"step": 5800
},
{
"epoch": 0.09657170449058426,
"grad_norm": 0.9188331961631775,
"learning_rate": 1.999975019978576e-05,
"loss": 0.8291,
"step": 5900
},
{
"epoch": 0.09820851304127212,
"grad_norm": 0.8699648380279541,
"learning_rate": 1.9999609688079316e-05,
"loss": 0.8277,
"step": 6000
},
{
"epoch": 0.09984532159196,
"grad_norm": 0.9138243794441223,
"learning_rate": 1.999943795244297e-05,
"loss": 0.8367,
"step": 6100
},
{
"epoch": 0.10148213014264787,
"grad_norm": 0.9293233156204224,
"learning_rate": 1.9999234993412973e-05,
"loss": 0.8281,
"step": 6200
},
{
"epoch": 0.10311893869333573,
"grad_norm": 0.9346773624420166,
"learning_rate": 1.999900081162306e-05,
"loss": 0.8323,
"step": 6300
},
{
"epoch": 0.1047557472440236,
"grad_norm": 0.9332927465438843,
"learning_rate": 1.999873540780447e-05,
"loss": 0.8259,
"step": 6400
},
{
"epoch": 0.10639255579471148,
"grad_norm": 0.8887437582015991,
"learning_rate": 1.9998438782785937e-05,
"loss": 0.8305,
"step": 6500
},
{
"epoch": 0.10802936434539934,
"grad_norm": 0.9184074401855469,
"learning_rate": 1.999811093749367e-05,
"loss": 0.829,
"step": 6600
},
{
"epoch": 0.10966617289608721,
"grad_norm": 0.8532683849334717,
"learning_rate": 1.999775187295137e-05,
"loss": 0.8275,
"step": 6700
},
{
"epoch": 0.11130298144677508,
"grad_norm": 0.9298515915870667,
"learning_rate": 1.9997361590280225e-05,
"loss": 0.8192,
"step": 6800
},
{
"epoch": 0.11293978999746294,
"grad_norm": 0.9617123603820801,
"learning_rate": 1.9996940090698896e-05,
"loss": 0.8198,
"step": 6900
},
{
"epoch": 0.11457659854815082,
"grad_norm": 1.0112113952636719,
"learning_rate": 1.9996487375523524e-05,
"loss": 0.8239,
"step": 7000
},
{
"epoch": 0.11621340709883868,
"grad_norm": 0.9226319193840027,
"learning_rate": 1.9996003446167718e-05,
"loss": 0.8281,
"step": 7100
},
{
"epoch": 0.11785021564952655,
"grad_norm": 1.0199968814849854,
"learning_rate": 1.999548830414255e-05,
"loss": 0.82,
"step": 7200
},
{
"epoch": 0.11948702420021443,
"grad_norm": 0.9594390988349915,
"learning_rate": 1.999494195105657e-05,
"loss": 0.8139,
"step": 7300
},
{
"epoch": 0.12112383275090229,
"grad_norm": 0.9685386419296265,
"learning_rate": 1.9994364388615763e-05,
"loss": 0.8193,
"step": 7400
},
{
"epoch": 0.12276064130159016,
"grad_norm": 0.9797342419624329,
"learning_rate": 1.999375561862358e-05,
"loss": 0.815,
"step": 7500
},
{
"epoch": 0.12439744985227803,
"grad_norm": 1.0541061162948608,
"learning_rate": 1.9993115642980912e-05,
"loss": 0.8239,
"step": 7600
},
{
"epoch": 0.1260342584029659,
"grad_norm": 0.9543519616127014,
"learning_rate": 1.99924444636861e-05,
"loss": 0.8145,
"step": 7700
},
{
"epoch": 0.12767106695365377,
"grad_norm": 0.9379186630249023,
"learning_rate": 1.99917420828349e-05,
"loss": 0.817,
"step": 7800
},
{
"epoch": 0.12930787550434164,
"grad_norm": 0.9919012188911438,
"learning_rate": 1.9991008502620515e-05,
"loss": 0.8208,
"step": 7900
},
{
"epoch": 0.13094468405502951,
"grad_norm": 0.9344952702522278,
"learning_rate": 1.999024372533356e-05,
"loss": 0.8167,
"step": 8000
},
{
"epoch": 0.13258149260571736,
"grad_norm": 0.9583950638771057,
"learning_rate": 1.9989447753362058e-05,
"loss": 0.8125,
"step": 8100
},
{
"epoch": 0.13421830115640523,
"grad_norm": 0.9945580363273621,
"learning_rate": 1.998862058919145e-05,
"loss": 0.8225,
"step": 8200
},
{
"epoch": 0.1358551097070931,
"grad_norm": 0.9583763480186462,
"learning_rate": 1.9987762235404566e-05,
"loss": 0.8105,
"step": 8300
},
{
"epoch": 0.13749191825778098,
"grad_norm": 1.025468349456787,
"learning_rate": 1.998687269468162e-05,
"loss": 0.8107,
"step": 8400
},
{
"epoch": 0.13912872680846886,
"grad_norm": 1.0057779550552368,
"learning_rate": 1.998595196980023e-05,
"loss": 0.8138,
"step": 8500
},
{
"epoch": 0.14076553535915673,
"grad_norm": 0.9300206899642944,
"learning_rate": 1.9985000063635365e-05,
"loss": 0.8207,
"step": 8600
},
{
"epoch": 0.14240234390984458,
"grad_norm": 1.0241742134094238,
"learning_rate": 1.9984016979159368e-05,
"loss": 0.8046,
"step": 8700
},
{
"epoch": 0.14403915246053245,
"grad_norm": 0.9688097238540649,
"learning_rate": 1.9983002719441935e-05,
"loss": 0.8193,
"step": 8800
},
{
"epoch": 0.14567596101122032,
"grad_norm": 0.9877735376358032,
"learning_rate": 1.9981957287650107e-05,
"loss": 0.8003,
"step": 8900
},
{
"epoch": 0.1473127695619082,
"grad_norm": 0.9533541202545166,
"learning_rate": 1.9980880687048257e-05,
"loss": 0.8089,
"step": 9000
},
{
"epoch": 0.14894957811259607,
"grad_norm": 1.0934607982635498,
"learning_rate": 1.997977292099809e-05,
"loss": 0.7971,
"step": 9100
},
{
"epoch": 0.15058638666328392,
"grad_norm": 0.9715205430984497,
"learning_rate": 1.9978633992958624e-05,
"loss": 0.8194,
"step": 9200
},
{
"epoch": 0.1522231952139718,
"grad_norm": 0.9527362585067749,
"learning_rate": 1.9977463906486175e-05,
"loss": 0.8095,
"step": 9300
},
{
"epoch": 0.15386000376465966,
"grad_norm": 1.0439358949661255,
"learning_rate": 1.9976262665234357e-05,
"loss": 0.7997,
"step": 9400
},
{
"epoch": 0.15549681231534754,
"grad_norm": 1.1087926626205444,
"learning_rate": 1.9975030272954066e-05,
"loss": 0.8012,
"step": 9500
},
{
"epoch": 0.1571336208660354,
"grad_norm": 1.0532102584838867,
"learning_rate": 1.9973766733493458e-05,
"loss": 0.8006,
"step": 9600
},
{
"epoch": 0.15877042941672329,
"grad_norm": 0.9958882331848145,
"learning_rate": 1.997247205079796e-05,
"loss": 0.8138,
"step": 9700
},
{
"epoch": 0.16040723796741113,
"grad_norm": 1.0133436918258667,
"learning_rate": 1.9971146228910236e-05,
"loss": 0.7942,
"step": 9800
},
{
"epoch": 0.162044046518099,
"grad_norm": 0.9266718029975891,
"learning_rate": 1.9969789271970187e-05,
"loss": 0.7917,
"step": 9900
},
{
"epoch": 0.16368085506878688,
"grad_norm": 1.0468189716339111,
"learning_rate": 1.9968401184214924e-05,
"loss": 0.8012,
"step": 10000
},
{
"epoch": 0.16531766361947475,
"grad_norm": 1.0444200038909912,
"learning_rate": 1.9966981969978782e-05,
"loss": 0.7979,
"step": 10100
},
{
"epoch": 0.16695447217016263,
"grad_norm": 1.0317082405090332,
"learning_rate": 1.9965531633693268e-05,
"loss": 0.8209,
"step": 10200
},
{
"epoch": 0.16859128072085047,
"grad_norm": 1.0699563026428223,
"learning_rate": 1.9964050179887088e-05,
"loss": 0.8035,
"step": 10300
},
{
"epoch": 0.17022808927153835,
"grad_norm": 0.9806187748908997,
"learning_rate": 1.9962537613186096e-05,
"loss": 0.7957,
"step": 10400
},
{
"epoch": 0.17186489782222622,
"grad_norm": 1.0728228092193604,
"learning_rate": 1.996099393831331e-05,
"loss": 0.791,
"step": 10500
},
{
"epoch": 0.1735017063729141,
"grad_norm": 1.028189778327942,
"learning_rate": 1.9959419160088874e-05,
"loss": 0.7964,
"step": 10600
},
{
"epoch": 0.17513851492360197,
"grad_norm": 1.0126999616622925,
"learning_rate": 1.9957813283430054e-05,
"loss": 0.799,
"step": 10700
},
{
"epoch": 0.17677532347428984,
"grad_norm": 0.96955406665802,
"learning_rate": 1.995617631335123e-05,
"loss": 0.8118,
"step": 10800
},
{
"epoch": 0.1784121320249777,
"grad_norm": 1.0654776096343994,
"learning_rate": 1.9954508254963865e-05,
"loss": 0.8084,
"step": 10900
},
{
"epoch": 0.18004894057566556,
"grad_norm": 0.9537600874900818,
"learning_rate": 1.9952809113476493e-05,
"loss": 0.8011,
"step": 11000
},
{
"epoch": 0.18168574912635344,
"grad_norm": 0.9695281982421875,
"learning_rate": 1.9951078894194708e-05,
"loss": 0.8054,
"step": 11100
},
{
"epoch": 0.1833225576770413,
"grad_norm": 1.0722426176071167,
"learning_rate": 1.9949317602521144e-05,
"loss": 0.7917,
"step": 11200
},
{
"epoch": 0.18495936622772918,
"grad_norm": 0.9706518054008484,
"learning_rate": 1.9947525243955467e-05,
"loss": 0.8055,
"step": 11300
},
{
"epoch": 0.18659617477841703,
"grad_norm": 0.9769388437271118,
"learning_rate": 1.994570182409434e-05,
"loss": 0.7981,
"step": 11400
},
{
"epoch": 0.1882329833291049,
"grad_norm": 0.9185972809791565,
"learning_rate": 1.9943847348631415e-05,
"loss": 0.7907,
"step": 11500
},
{
"epoch": 0.18986979187979278,
"grad_norm": 1.0683258771896362,
"learning_rate": 1.9941961823357322e-05,
"loss": 0.8021,
"step": 11600
},
{
"epoch": 0.19150660043048065,
"grad_norm": 0.9599470496177673,
"learning_rate": 1.9940045254159644e-05,
"loss": 0.7923,
"step": 11700
},
{
"epoch": 0.19314340898116852,
"grad_norm": 0.9822320938110352,
"learning_rate": 1.9938097647022895e-05,
"loss": 0.7864,
"step": 11800
},
{
"epoch": 0.1947802175318564,
"grad_norm": 1.180939793586731,
"learning_rate": 1.9936119008028503e-05,
"loss": 0.7841,
"step": 11900
},
{
"epoch": 0.19641702608254424,
"grad_norm": 1.1611251831054688,
"learning_rate": 1.9934109343354808e-05,
"loss": 0.7855,
"step": 12000
},
{
"epoch": 0.19805383463323212,
"grad_norm": 1.0176281929016113,
"learning_rate": 1.9932068659277006e-05,
"loss": 0.7936,
"step": 12100
},
{
"epoch": 0.19969064318392,
"grad_norm": 1.05084228515625,
"learning_rate": 1.992999696216717e-05,
"loss": 0.7856,
"step": 12200
},
{
"epoch": 0.20132745173460787,
"grad_norm": 1.1582859754562378,
"learning_rate": 1.9927894258494204e-05,
"loss": 0.8064,
"step": 12300
},
{
"epoch": 0.20296426028529574,
"grad_norm": 0.9974379539489746,
"learning_rate": 1.992576055482383e-05,
"loss": 0.7923,
"step": 12400
},
{
"epoch": 0.2046010688359836,
"grad_norm": 1.0076924562454224,
"learning_rate": 1.9923595857818573e-05,
"loss": 0.801,
"step": 12500
},
{
"epoch": 0.20623787738667146,
"grad_norm": 1.104923129081726,
"learning_rate": 1.9921400174237732e-05,
"loss": 0.8053,
"step": 12600
},
{
"epoch": 0.20787468593735933,
"grad_norm": 1.0884004831314087,
"learning_rate": 1.9919173510937355e-05,
"loss": 0.7948,
"step": 12700
},
{
"epoch": 0.2095114944880472,
"grad_norm": 0.9803980588912964,
"learning_rate": 1.9916915874870234e-05,
"loss": 0.791,
"step": 12800
},
{
"epoch": 0.21114830303873508,
"grad_norm": 1.0630168914794922,
"learning_rate": 1.9914627273085876e-05,
"loss": 0.7813,
"step": 12900
},
{
"epoch": 0.21278511158942295,
"grad_norm": 1.0575711727142334,
"learning_rate": 1.9912307712730468e-05,
"loss": 0.7862,
"step": 13000
},
{
"epoch": 0.2144219201401108,
"grad_norm": 1.0258235931396484,
"learning_rate": 1.9909957201046875e-05,
"loss": 0.7855,
"step": 13100
},
{
"epoch": 0.21605872869079867,
"grad_norm": 0.970610499382019,
"learning_rate": 1.9907575745374605e-05,
"loss": 0.7845,
"step": 13200
},
{
"epoch": 0.21769553724148655,
"grad_norm": 1.0707366466522217,
"learning_rate": 1.9905163353149787e-05,
"loss": 0.7986,
"step": 13300
},
{
"epoch": 0.21933234579217442,
"grad_norm": 0.9396125674247742,
"learning_rate": 1.9902720031905153e-05,
"loss": 0.7798,
"step": 13400
},
{
"epoch": 0.2209691543428623,
"grad_norm": 1.0123385190963745,
"learning_rate": 1.9900245789270006e-05,
"loss": 0.7866,
"step": 13500
},
{
"epoch": 0.22260596289355017,
"grad_norm": 0.9208526015281677,
"learning_rate": 1.989774063297021e-05,
"loss": 0.79,
"step": 13600
},
{
"epoch": 0.22424277144423801,
"grad_norm": 1.0145132541656494,
"learning_rate": 1.989520457082815e-05,
"loss": 0.7826,
"step": 13700
},
{
"epoch": 0.2258795799949259,
"grad_norm": 0.9474859237670898,
"learning_rate": 1.9892637610762723e-05,
"loss": 0.7904,
"step": 13800
},
{
"epoch": 0.22751638854561376,
"grad_norm": 0.997414767742157,
"learning_rate": 1.9890039760789294e-05,
"loss": 0.7863,
"step": 13900
},
{
"epoch": 0.22915319709630164,
"grad_norm": 1.0312907695770264,
"learning_rate": 1.9887411029019686e-05,
"loss": 0.7825,
"step": 14000
},
{
"epoch": 0.2307900056469895,
"grad_norm": 1.019665002822876,
"learning_rate": 1.9884751423662162e-05,
"loss": 0.7746,
"step": 14100
},
{
"epoch": 0.23242681419767736,
"grad_norm": 0.9788889288902283,
"learning_rate": 1.9882060953021375e-05,
"loss": 0.7805,
"step": 14200
},
{
"epoch": 0.23406362274836523,
"grad_norm": 1.1468379497528076,
"learning_rate": 1.9879339625498356e-05,
"loss": 0.7783,
"step": 14300
},
{
"epoch": 0.2357004312990531,
"grad_norm": 0.9630206823348999,
"learning_rate": 1.9876587449590496e-05,
"loss": 0.7785,
"step": 14400
},
{
"epoch": 0.23733723984974098,
"grad_norm": 1.0484507083892822,
"learning_rate": 1.98738044338915e-05,
"loss": 0.7577,
"step": 14500
},
{
"epoch": 0.23897404840042885,
"grad_norm": 0.9262145161628723,
"learning_rate": 1.987099058709138e-05,
"loss": 0.7847,
"step": 14600
},
{
"epoch": 0.24061085695111672,
"grad_norm": 1.0156426429748535,
"learning_rate": 1.9868145917976412e-05,
"loss": 0.7754,
"step": 14700
},
{
"epoch": 0.24224766550180457,
"grad_norm": 1.0557153224945068,
"learning_rate": 1.986527043542912e-05,
"loss": 0.783,
"step": 14800
},
{
"epoch": 0.24388447405249244,
"grad_norm": 0.9480391144752502,
"learning_rate": 1.9862364148428243e-05,
"loss": 0.7795,
"step": 14900
},
{
"epoch": 0.24552128260318032,
"grad_norm": 1.1189950704574585,
"learning_rate": 1.9859427066048694e-05,
"loss": 0.773,
"step": 15000
},
{
"epoch": 0.2471580911538682,
"grad_norm": 1.0406650304794312,
"learning_rate": 1.985645919746157e-05,
"loss": 0.7815,
"step": 15100
},
{
"epoch": 0.24879489970455607,
"grad_norm": 1.0539467334747314,
"learning_rate": 1.985346055193408e-05,
"loss": 0.7832,
"step": 15200
},
{
"epoch": 0.2504317082552439,
"grad_norm": 1.0707350969314575,
"learning_rate": 1.9850431138829537e-05,
"loss": 0.7775,
"step": 15300
},
{
"epoch": 0.2520685168059318,
"grad_norm": 1.0518571138381958,
"learning_rate": 1.9847370967607332e-05,
"loss": 0.7692,
"step": 15400
},
{
"epoch": 0.25370532535661966,
"grad_norm": 1.038328766822815,
"learning_rate": 1.9844280047822892e-05,
"loss": 0.7812,
"step": 15500
},
{
"epoch": 0.25534213390730753,
"grad_norm": 1.0571229457855225,
"learning_rate": 1.984115838912766e-05,
"loss": 0.7773,
"step": 15600
},
{
"epoch": 0.2569789424579954,
"grad_norm": 1.0450866222381592,
"learning_rate": 1.9838006001269064e-05,
"loss": 0.7789,
"step": 15700
},
{
"epoch": 0.2586157510086833,
"grad_norm": 1.107710838317871,
"learning_rate": 1.9834822894090478e-05,
"loss": 0.7628,
"step": 15800
},
{
"epoch": 0.26025255955937115,
"grad_norm": 1.0595227479934692,
"learning_rate": 1.9831609077531205e-05,
"loss": 0.7805,
"step": 15900
},
{
"epoch": 0.26188936811005903,
"grad_norm": 1.0978327989578247,
"learning_rate": 1.982836456162644e-05,
"loss": 0.7779,
"step": 16000
},
{
"epoch": 0.2635261766607469,
"grad_norm": 1.0871798992156982,
"learning_rate": 1.982508935650722e-05,
"loss": 0.7696,
"step": 16100
},
{
"epoch": 0.2651629852114347,
"grad_norm": 1.0791369676589966,
"learning_rate": 1.982178347240043e-05,
"loss": 0.7701,
"step": 16200
},
{
"epoch": 0.2667997937621226,
"grad_norm": 1.095301866531372,
"learning_rate": 1.981844691962874e-05,
"loss": 0.783,
"step": 16300
},
{
"epoch": 0.26843660231281047,
"grad_norm": 1.1223257780075073,
"learning_rate": 1.9815079708610588e-05,
"loss": 0.7785,
"step": 16400
},
{
"epoch": 0.27007341086349834,
"grad_norm": 1.0025781393051147,
"learning_rate": 1.9811681849860137e-05,
"loss": 0.7787,
"step": 16500
},
{
"epoch": 0.2717102194141862,
"grad_norm": 1.1232304573059082,
"learning_rate": 1.9808253353987252e-05,
"loss": 0.7655,
"step": 16600
},
{
"epoch": 0.2733470279648741,
"grad_norm": 0.9625865817070007,
"learning_rate": 1.9804794231697464e-05,
"loss": 0.785,
"step": 16700
},
{
"epoch": 0.27498383651556196,
"grad_norm": 1.1022255420684814,
"learning_rate": 1.980130449379193e-05,
"loss": 0.7681,
"step": 16800
},
{
"epoch": 0.27662064506624984,
"grad_norm": 1.0605260133743286,
"learning_rate": 1.9797784151167417e-05,
"loss": 0.7686,
"step": 16900
},
{
"epoch": 0.2782574536169377,
"grad_norm": 1.0693503618240356,
"learning_rate": 1.9794233214816237e-05,
"loss": 0.7653,
"step": 17000
},
{
"epoch": 0.2798942621676256,
"grad_norm": 1.0027199983596802,
"learning_rate": 1.979065169582625e-05,
"loss": 0.7802,
"step": 17100
},
{
"epoch": 0.28153107071831346,
"grad_norm": 1.002388834953308,
"learning_rate": 1.9787039605380792e-05,
"loss": 0.7668,
"step": 17200
},
{
"epoch": 0.2831678792690013,
"grad_norm": 1.0847641229629517,
"learning_rate": 1.9783396954758682e-05,
"loss": 0.7685,
"step": 17300
},
{
"epoch": 0.28480468781968915,
"grad_norm": 1.1153062582015991,
"learning_rate": 1.9779723755334142e-05,
"loss": 0.7761,
"step": 17400
},
{
"epoch": 0.286441496370377,
"grad_norm": 1.0675033330917358,
"learning_rate": 1.9776020018576794e-05,
"loss": 0.7637,
"step": 17500
},
{
"epoch": 0.2880783049210649,
"grad_norm": 1.0875293016433716,
"learning_rate": 1.9772285756051613e-05,
"loss": 0.7689,
"step": 17600
},
{
"epoch": 0.28971511347175277,
"grad_norm": 1.135380744934082,
"learning_rate": 1.9768520979418885e-05,
"loss": 0.7763,
"step": 17700
},
{
"epoch": 0.29135192202244065,
"grad_norm": 1.0305795669555664,
"learning_rate": 1.9764725700434183e-05,
"loss": 0.7688,
"step": 17800
},
{
"epoch": 0.2929887305731285,
"grad_norm": 1.0471090078353882,
"learning_rate": 1.976089993094832e-05,
"loss": 0.7573,
"step": 17900
},
{
"epoch": 0.2946255391238164,
"grad_norm": 1.0096269845962524,
"learning_rate": 1.9757043682907325e-05,
"loss": 0.7622,
"step": 18000
},
{
"epoch": 0.29626234767450427,
"grad_norm": 1.103242039680481,
"learning_rate": 1.9753156968352388e-05,
"loss": 0.7573,
"step": 18100
},
{
"epoch": 0.29789915622519214,
"grad_norm": 1.1128453016281128,
"learning_rate": 1.9749239799419827e-05,
"loss": 0.7692,
"step": 18200
},
{
"epoch": 0.29953596477588,
"grad_norm": 1.0762085914611816,
"learning_rate": 1.974529218834106e-05,
"loss": 0.7838,
"step": 18300
},
{
"epoch": 0.30117277332656783,
"grad_norm": 1.0150110721588135,
"learning_rate": 1.9741314147442573e-05,
"loss": 0.773,
"step": 18400
},
{
"epoch": 0.3028095818772557,
"grad_norm": 1.0824315547943115,
"learning_rate": 1.9737305689145842e-05,
"loss": 0.7636,
"step": 18500
},
{
"epoch": 0.3044463904279436,
"grad_norm": 1.2597285509109497,
"learning_rate": 1.973326682596735e-05,
"loss": 0.7688,
"step": 18600
},
{
"epoch": 0.30608319897863145,
"grad_norm": 1.112971544265747,
"learning_rate": 1.97291975705185e-05,
"loss": 0.762,
"step": 18700
},
{
"epoch": 0.30772000752931933,
"grad_norm": 1.11709725856781,
"learning_rate": 1.9725097935505607e-05,
"loss": 0.7674,
"step": 18800
},
{
"epoch": 0.3093568160800072,
"grad_norm": 1.0609350204467773,
"learning_rate": 1.972096793372984e-05,
"loss": 0.7603,
"step": 18900
},
{
"epoch": 0.3109936246306951,
"grad_norm": 1.111243486404419,
"learning_rate": 1.9716807578087193e-05,
"loss": 0.7572,
"step": 19000
},
{
"epoch": 0.31263043318138295,
"grad_norm": 0.9914565086364746,
"learning_rate": 1.971261688156843e-05,
"loss": 0.7558,
"step": 19100
},
{
"epoch": 0.3142672417320708,
"grad_norm": 1.030030369758606,
"learning_rate": 1.9708395857259077e-05,
"loss": 0.7558,
"step": 19200
},
{
"epoch": 0.3159040502827587,
"grad_norm": 1.1039714813232422,
"learning_rate": 1.9704144518339336e-05,
"loss": 0.7507,
"step": 19300
},
{
"epoch": 0.31754085883344657,
"grad_norm": 1.0048165321350098,
"learning_rate": 1.969986287808408e-05,
"loss": 0.7806,
"step": 19400
},
{
"epoch": 0.3191776673841344,
"grad_norm": 1.2964001893997192,
"learning_rate": 1.969555094986279e-05,
"loss": 0.7504,
"step": 19500
},
{
"epoch": 0.32081447593482226,
"grad_norm": 1.198273777961731,
"learning_rate": 1.9691208747139527e-05,
"loss": 0.7597,
"step": 19600
},
{
"epoch": 0.32245128448551014,
"grad_norm": 1.0260130167007446,
"learning_rate": 1.968683628347289e-05,
"loss": 0.7571,
"step": 19700
},
{
"epoch": 0.324088093036198,
"grad_norm": 1.1643099784851074,
"learning_rate": 1.9682433572515952e-05,
"loss": 0.7712,
"step": 19800
},
{
"epoch": 0.3257249015868859,
"grad_norm": 1.1653162240982056,
"learning_rate": 1.9678000628016248e-05,
"loss": 0.7599,
"step": 19900
},
{
"epoch": 0.32736171013757376,
"grad_norm": 1.5513461828231812,
"learning_rate": 1.9673537463815718e-05,
"loss": 0.7673,
"step": 20000
},
{
"epoch": 0.32899851868826163,
"grad_norm": 1.138498306274414,
"learning_rate": 1.9669044093850652e-05,
"loss": 0.7521,
"step": 20100
},
{
"epoch": 0.3306353272389495,
"grad_norm": 1.0548768043518066,
"learning_rate": 1.9664520532151664e-05,
"loss": 0.7596,
"step": 20200
},
{
"epoch": 0.3322721357896374,
"grad_norm": 1.0597394704818726,
"learning_rate": 1.965996679284365e-05,
"loss": 0.7586,
"step": 20300
},
{
"epoch": 0.33390894434032525,
"grad_norm": 1.1359139680862427,
"learning_rate": 1.965538289014572e-05,
"loss": 0.7618,
"step": 20400
},
{
"epoch": 0.3355457528910131,
"grad_norm": 1.1026830673217773,
"learning_rate": 1.9650768838371182e-05,
"loss": 0.7613,
"step": 20500
},
{
"epoch": 0.33718256144170095,
"grad_norm": 1.0065330266952515,
"learning_rate": 1.9646124651927484e-05,
"loss": 0.7394,
"step": 20600
},
{
"epoch": 0.3388193699923888,
"grad_norm": 0.9368694424629211,
"learning_rate": 1.964145034531616e-05,
"loss": 0.761,
"step": 20700
},
{
"epoch": 0.3404561785430767,
"grad_norm": 0.9686558246612549,
"learning_rate": 1.9636745933132807e-05,
"loss": 0.7597,
"step": 20800
},
{
"epoch": 0.34209298709376457,
"grad_norm": 1.114066243171692,
"learning_rate": 1.9632011430067024e-05,
"loss": 0.7675,
"step": 20900
},
{
"epoch": 0.34372979564445244,
"grad_norm": 1.1572498083114624,
"learning_rate": 1.9627246850902363e-05,
"loss": 0.7576,
"step": 21000
},
{
"epoch": 0.3453666041951403,
"grad_norm": 1.0342215299606323,
"learning_rate": 1.9622452210516296e-05,
"loss": 0.7629,
"step": 21100
},
{
"epoch": 0.3470034127458282,
"grad_norm": 1.0652525424957275,
"learning_rate": 1.9617627523880158e-05,
"loss": 0.7636,
"step": 21200
},
{
"epoch": 0.34864022129651606,
"grad_norm": 1.048869013786316,
"learning_rate": 1.9612772806059104e-05,
"loss": 0.7625,
"step": 21300
},
{
"epoch": 0.35027702984720394,
"grad_norm": 1.1751947402954102,
"learning_rate": 1.9607888072212062e-05,
"loss": 0.7475,
"step": 21400
},
{
"epoch": 0.3519138383978918,
"grad_norm": 1.2830709218978882,
"learning_rate": 1.9602973337591688e-05,
"loss": 0.7558,
"step": 21500
},
{
"epoch": 0.3535506469485797,
"grad_norm": 1.1591740846633911,
"learning_rate": 1.9598028617544313e-05,
"loss": 0.7435,
"step": 21600
},
{
"epoch": 0.3551874554992675,
"grad_norm": 0.9801552295684814,
"learning_rate": 1.95930539275099e-05,
"loss": 0.7621,
"step": 21700
},
{
"epoch": 0.3568242640499554,
"grad_norm": 1.126760721206665,
"learning_rate": 1.958804928302199e-05,
"loss": 0.7672,
"step": 21800
},
{
"epoch": 0.35846107260064325,
"grad_norm": 1.0655152797698975,
"learning_rate": 1.958301469970766e-05,
"loss": 0.7491,
"step": 21900
},
{
"epoch": 0.3600978811513311,
"grad_norm": 1.1613372564315796,
"learning_rate": 1.9577950193287475e-05,
"loss": 0.7733,
"step": 22000
},
{
"epoch": 0.361734689702019,
"grad_norm": 0.9363147020339966,
"learning_rate": 1.9572855779575427e-05,
"loss": 0.7522,
"step": 22100
},
{
"epoch": 0.36337149825270687,
"grad_norm": 1.1021246910095215,
"learning_rate": 1.9567731474478903e-05,
"loss": 0.7539,
"step": 22200
},
{
"epoch": 0.36500830680339474,
"grad_norm": 1.084695816040039,
"learning_rate": 1.9562577293998616e-05,
"loss": 0.7514,
"step": 22300
},
{
"epoch": 0.3666451153540826,
"grad_norm": 1.1221933364868164,
"learning_rate": 1.9557393254228575e-05,
"loss": 0.7608,
"step": 22400
},
{
"epoch": 0.3682819239047705,
"grad_norm": 1.073371410369873,
"learning_rate": 1.9552179371356024e-05,
"loss": 0.7509,
"step": 22500
},
{
"epoch": 0.36991873245545837,
"grad_norm": 1.124243140220642,
"learning_rate": 1.9546935661661382e-05,
"loss": 0.7552,
"step": 22600
},
{
"epoch": 0.37155554100614624,
"grad_norm": 1.0397138595581055,
"learning_rate": 1.9541662141518222e-05,
"loss": 0.7451,
"step": 22700
},
{
"epoch": 0.37319234955683406,
"grad_norm": 1.0600475072860718,
"learning_rate": 1.9536358827393177e-05,
"loss": 0.7358,
"step": 22800
},
{
"epoch": 0.37482915810752193,
"grad_norm": 1.1461478471755981,
"learning_rate": 1.953102573584593e-05,
"loss": 0.7513,
"step": 22900
},
{
"epoch": 0.3764659666582098,
"grad_norm": 1.093103051185608,
"learning_rate": 1.952566288352914e-05,
"loss": 0.7369,
"step": 23000
},
{
"epoch": 0.3781027752088977,
"grad_norm": 1.2357380390167236,
"learning_rate": 1.952027028718839e-05,
"loss": 0.7628,
"step": 23100
},
{
"epoch": 0.37973958375958555,
"grad_norm": 0.9737277030944824,
"learning_rate": 1.9514847963662144e-05,
"loss": 0.7358,
"step": 23200
},
{
"epoch": 0.3813763923102734,
"grad_norm": 1.0810784101486206,
"learning_rate": 1.9509395929881683e-05,
"loss": 0.7431,
"step": 23300
},
{
"epoch": 0.3830132008609613,
"grad_norm": 1.0600659847259521,
"learning_rate": 1.9503914202871072e-05,
"loss": 0.7465,
"step": 23400
},
{
"epoch": 0.3846500094116492,
"grad_norm": 1.129676342010498,
"learning_rate": 1.9498402799747077e-05,
"loss": 0.746,
"step": 23500
},
{
"epoch": 0.38628681796233705,
"grad_norm": 1.0627739429473877,
"learning_rate": 1.9492861737719145e-05,
"loss": 0.7517,
"step": 23600
},
{
"epoch": 0.3879236265130249,
"grad_norm": 1.0382601022720337,
"learning_rate": 1.9487291034089316e-05,
"loss": 0.7466,
"step": 23700
},
{
"epoch": 0.3895604350637128,
"grad_norm": 1.0782064199447632,
"learning_rate": 1.9481690706252198e-05,
"loss": 0.7436,
"step": 23800
},
{
"epoch": 0.39119724361440067,
"grad_norm": 1.052713394165039,
"learning_rate": 1.94760607716949e-05,
"loss": 0.7363,
"step": 23900
},
{
"epoch": 0.3928340521650885,
"grad_norm": 1.0485634803771973,
"learning_rate": 1.947040124799697e-05,
"loss": 0.7491,
"step": 24000
},
{
"epoch": 0.39447086071577636,
"grad_norm": 1.1206567287445068,
"learning_rate": 1.9464712152830368e-05,
"loss": 0.7372,
"step": 24100
},
{
"epoch": 0.39610766926646424,
"grad_norm": 1.0319308042526245,
"learning_rate": 1.9458993503959368e-05,
"loss": 0.7493,
"step": 24200
},
{
"epoch": 0.3977444778171521,
"grad_norm": 1.1401089429855347,
"learning_rate": 1.9453245319240533e-05,
"loss": 0.7693,
"step": 24300
},
{
"epoch": 0.39938128636784,
"grad_norm": 1.2440853118896484,
"learning_rate": 1.944746761662266e-05,
"loss": 0.7477,
"step": 24400
},
{
"epoch": 0.40101809491852786,
"grad_norm": 1.1666104793548584,
"learning_rate": 1.9441660414146715e-05,
"loss": 0.7364,
"step": 24500
},
{
"epoch": 0.40265490346921573,
"grad_norm": 1.0812019109725952,
"learning_rate": 1.9435823729945768e-05,
"loss": 0.7278,
"step": 24600
},
{
"epoch": 0.4042917120199036,
"grad_norm": 1.1338680982589722,
"learning_rate": 1.9429957582244957e-05,
"loss": 0.7396,
"step": 24700
},
{
"epoch": 0.4059285205705915,
"grad_norm": 1.0170310735702515,
"learning_rate": 1.942406198936141e-05,
"loss": 0.7373,
"step": 24800
},
{
"epoch": 0.40756532912127935,
"grad_norm": 1.0910414457321167,
"learning_rate": 1.941813696970421e-05,
"loss": 0.743,
"step": 24900
},
{
"epoch": 0.4092021376719672,
"grad_norm": 0.9840279221534729,
"learning_rate": 1.9412182541774312e-05,
"loss": 0.7432,
"step": 25000
},
{
"epoch": 0.41083894622265504,
"grad_norm": 1.1482113599777222,
"learning_rate": 1.9406198724164515e-05,
"loss": 0.7457,
"step": 25100
},
{
"epoch": 0.4124757547733429,
"grad_norm": 0.9647344946861267,
"learning_rate": 1.9400185535559366e-05,
"loss": 0.7494,
"step": 25200
},
{
"epoch": 0.4141125633240308,
"grad_norm": 1.1271613836288452,
"learning_rate": 1.9394142994735147e-05,
"loss": 0.7358,
"step": 25300
},
{
"epoch": 0.41574937187471867,
"grad_norm": 1.1209514141082764,
"learning_rate": 1.9388071120559774e-05,
"loss": 0.7477,
"step": 25400
},
{
"epoch": 0.41738618042540654,
"grad_norm": 1.1221638917922974,
"learning_rate": 1.9381969931992768e-05,
"loss": 0.7401,
"step": 25500
},
{
"epoch": 0.4190229889760944,
"grad_norm": 1.1341800689697266,
"learning_rate": 1.937583944808518e-05,
"loss": 0.7341,
"step": 25600
},
{
"epoch": 0.4206597975267823,
"grad_norm": 1.0561330318450928,
"learning_rate": 1.9369679687979538e-05,
"loss": 0.7427,
"step": 25700
},
{
"epoch": 0.42229660607747016,
"grad_norm": 1.0445774793624878,
"learning_rate": 1.9363490670909788e-05,
"loss": 0.7485,
"step": 25800
},
{
"epoch": 0.42393341462815803,
"grad_norm": 1.1463161706924438,
"learning_rate": 1.9357272416201214e-05,
"loss": 0.7345,
"step": 25900
},
{
"epoch": 0.4255702231788459,
"grad_norm": 1.1426818370819092,
"learning_rate": 1.9351024943270426e-05,
"loss": 0.7369,
"step": 26000
},
{
"epoch": 0.4272070317295338,
"grad_norm": 1.0911140441894531,
"learning_rate": 1.934474827162524e-05,
"loss": 0.7472,
"step": 26100
},
{
"epoch": 0.4288438402802216,
"grad_norm": 1.0775692462921143,
"learning_rate": 1.9338442420864663e-05,
"loss": 0.7401,
"step": 26200
},
{
"epoch": 0.4304806488309095,
"grad_norm": 1.136518955230713,
"learning_rate": 1.9332107410678805e-05,
"loss": 0.7355,
"step": 26300
},
{
"epoch": 0.43211745738159735,
"grad_norm": 1.085319995880127,
"learning_rate": 1.932574326084883e-05,
"loss": 0.7485,
"step": 26400
},
{
"epoch": 0.4337542659322852,
"grad_norm": 1.034986972808838,
"learning_rate": 1.9319349991246887e-05,
"loss": 0.7422,
"step": 26500
},
{
"epoch": 0.4353910744829731,
"grad_norm": 1.1199235916137695,
"learning_rate": 1.9312927621836058e-05,
"loss": 0.7362,
"step": 26600
},
{
"epoch": 0.43702788303366097,
"grad_norm": 1.1646606922149658,
"learning_rate": 1.930647617267029e-05,
"loss": 0.7274,
"step": 26700
},
{
"epoch": 0.43866469158434884,
"grad_norm": 1.1620571613311768,
"learning_rate": 1.9299995663894325e-05,
"loss": 0.7351,
"step": 26800
},
{
"epoch": 0.4403015001350367,
"grad_norm": 1.1194571256637573,
"learning_rate": 1.9293486115743646e-05,
"loss": 0.7309,
"step": 26900
},
{
"epoch": 0.4419383086857246,
"grad_norm": 1.1805561780929565,
"learning_rate": 1.928694754854442e-05,
"loss": 0.7378,
"step": 27000
},
{
"epoch": 0.44357511723641246,
"grad_norm": 1.1845600605010986,
"learning_rate": 1.9280379982713417e-05,
"loss": 0.7319,
"step": 27100
},
{
"epoch": 0.44521192578710034,
"grad_norm": 1.2962830066680908,
"learning_rate": 1.927378343875796e-05,
"loss": 0.7305,
"step": 27200
},
{
"epoch": 0.44684873433778816,
"grad_norm": 1.0655794143676758,
"learning_rate": 1.9267157937275854e-05,
"loss": 0.7236,
"step": 27300
},
{
"epoch": 0.44848554288847603,
"grad_norm": 1.0807515382766724,
"learning_rate": 1.9260503498955326e-05,
"loss": 0.7326,
"step": 27400
},
{
"epoch": 0.4501223514391639,
"grad_norm": 1.0515137910842896,
"learning_rate": 1.9253820144574958e-05,
"loss": 0.7293,
"step": 27500
},
{
"epoch": 0.4517591599898518,
"grad_norm": 1.103508710861206,
"learning_rate": 1.9247107895003628e-05,
"loss": 0.7473,
"step": 27600
},
{
"epoch": 0.45339596854053965,
"grad_norm": 1.1016185283660889,
"learning_rate": 1.924036677120043e-05,
"loss": 0.7264,
"step": 27700
},
{
"epoch": 0.4550327770912275,
"grad_norm": 1.0213091373443604,
"learning_rate": 1.9233596794214623e-05,
"loss": 0.7325,
"step": 27800
},
{
"epoch": 0.4566695856419154,
"grad_norm": 1.1028705835342407,
"learning_rate": 1.9226797985185565e-05,
"loss": 0.7381,
"step": 27900
},
{
"epoch": 0.4583063941926033,
"grad_norm": 1.0844396352767944,
"learning_rate": 1.9219970365342634e-05,
"loss": 0.7279,
"step": 28000
},
{
"epoch": 0.45994320274329115,
"grad_norm": 1.037714958190918,
"learning_rate": 1.9213113956005176e-05,
"loss": 0.7433,
"step": 28100
},
{
"epoch": 0.461580011293979,
"grad_norm": 1.2123370170593262,
"learning_rate": 1.9206228778582435e-05,
"loss": 0.7341,
"step": 28200
},
{
"epoch": 0.4632168198446669,
"grad_norm": 1.013845682144165,
"learning_rate": 1.9199314854573474e-05,
"loss": 0.7369,
"step": 28300
},
{
"epoch": 0.4648536283953547,
"grad_norm": 1.0552864074707031,
"learning_rate": 1.9192372205567123e-05,
"loss": 0.7202,
"step": 28400
},
{
"epoch": 0.4664904369460426,
"grad_norm": 1.049025058746338,
"learning_rate": 1.9185400853241917e-05,
"loss": 0.7246,
"step": 28500
},
{
"epoch": 0.46812724549673046,
"grad_norm": 1.0877737998962402,
"learning_rate": 1.9178400819365994e-05,
"loss": 0.7261,
"step": 28600
},
{
"epoch": 0.46976405404741833,
"grad_norm": 1.099348783493042,
"learning_rate": 1.9171372125797072e-05,
"loss": 0.7327,
"step": 28700
},
{
"epoch": 0.4714008625981062,
"grad_norm": 1.1000944375991821,
"learning_rate": 1.916431479448235e-05,
"loss": 0.7305,
"step": 28800
},
{
"epoch": 0.4730376711487941,
"grad_norm": 1.0979351997375488,
"learning_rate": 1.9157228847458446e-05,
"loss": 0.7279,
"step": 28900
},
{
"epoch": 0.47467447969948195,
"grad_norm": 1.0918766260147095,
"learning_rate": 1.9150114306851336e-05,
"loss": 0.7215,
"step": 29000
},
{
"epoch": 0.47631128825016983,
"grad_norm": 1.109971046447754,
"learning_rate": 1.9142971194876284e-05,
"loss": 0.7322,
"step": 29100
},
{
"epoch": 0.4779480968008577,
"grad_norm": 1.1282057762145996,
"learning_rate": 1.913579953383776e-05,
"loss": 0.7257,
"step": 29200
},
{
"epoch": 0.4795849053515456,
"grad_norm": 1.1076371669769287,
"learning_rate": 1.912859934612938e-05,
"loss": 0.7516,
"step": 29300
},
{
"epoch": 0.48122171390223345,
"grad_norm": 1.1480896472930908,
"learning_rate": 1.9121370654233843e-05,
"loss": 0.728,
"step": 29400
},
{
"epoch": 0.48285852245292127,
"grad_norm": 1.1083163022994995,
"learning_rate": 1.911411348072284e-05,
"loss": 0.7235,
"step": 29500
},
{
"epoch": 0.48449533100360914,
"grad_norm": 1.2141623497009277,
"learning_rate": 1.9106827848257007e-05,
"loss": 0.7237,
"step": 29600
},
{
"epoch": 0.486132139554297,
"grad_norm": 1.0334457159042358,
"learning_rate": 1.9099513779585836e-05,
"loss": 0.7306,
"step": 29700
},
{
"epoch": 0.4877689481049849,
"grad_norm": 1.1086657047271729,
"learning_rate": 1.909217129754762e-05,
"loss": 0.7295,
"step": 29800
},
{
"epoch": 0.48940575665567276,
"grad_norm": 1.0128360986709595,
"learning_rate": 1.908480042506937e-05,
"loss": 0.733,
"step": 29900
},
{
"epoch": 0.49104256520636064,
"grad_norm": 1.1484946012496948,
"learning_rate": 1.907740118516674e-05,
"loss": 0.7396,
"step": 30000
},
{
"epoch": 0.4926793737570485,
"grad_norm": 1.031750202178955,
"learning_rate": 1.9069973600943962e-05,
"loss": 0.7204,
"step": 30100
},
{
"epoch": 0.4943161823077364,
"grad_norm": 1.1274133920669556,
"learning_rate": 1.9062517695593792e-05,
"loss": 0.7235,
"step": 30200
},
{
"epoch": 0.49595299085842426,
"grad_norm": 1.1863317489624023,
"learning_rate": 1.9055033492397396e-05,
"loss": 0.7329,
"step": 30300
},
{
"epoch": 0.49758979940911213,
"grad_norm": 1.0985053777694702,
"learning_rate": 1.9047521014724303e-05,
"loss": 0.7341,
"step": 30400
},
{
"epoch": 0.4992266079598,
"grad_norm": 1.136760950088501,
"learning_rate": 1.9039980286032353e-05,
"loss": 0.7189,
"step": 30500
},
{
"epoch": 0.5008634165104878,
"grad_norm": 1.0787100791931152,
"learning_rate": 1.9032411329867573e-05,
"loss": 0.7298,
"step": 30600
},
{
"epoch": 0.5025002250611758,
"grad_norm": 1.3436377048492432,
"learning_rate": 1.902481416986414e-05,
"loss": 0.719,
"step": 30700
},
{
"epoch": 0.5041370336118636,
"grad_norm": 1.1863504648208618,
"learning_rate": 1.9017188829744305e-05,
"loss": 0.7125,
"step": 30800
},
{
"epoch": 0.5057738421625515,
"grad_norm": 1.0385360717773438,
"learning_rate": 1.90095353333183e-05,
"loss": 0.7297,
"step": 30900
},
{
"epoch": 0.5074106507132393,
"grad_norm": 1.1736425161361694,
"learning_rate": 1.9001853704484285e-05,
"loss": 0.7205,
"step": 31000
},
{
"epoch": 0.5090474592639272,
"grad_norm": 1.0939114093780518,
"learning_rate": 1.899414396722826e-05,
"loss": 0.741,
"step": 31100
},
{
"epoch": 0.5106842678146151,
"grad_norm": 1.3368091583251953,
"learning_rate": 1.8986406145623996e-05,
"loss": 0.7277,
"step": 31200
},
{
"epoch": 0.5123210763653029,
"grad_norm": 1.1556004285812378,
"learning_rate": 1.897864026383295e-05,
"loss": 0.7383,
"step": 31300
},
{
"epoch": 0.5139578849159908,
"grad_norm": 1.2308059930801392,
"learning_rate": 1.897084634610421e-05,
"loss": 0.7188,
"step": 31400
},
{
"epoch": 0.5155946934666786,
"grad_norm": 1.1211739778518677,
"learning_rate": 1.8963024416774393e-05,
"loss": 0.7241,
"step": 31500
},
{
"epoch": 0.5172315020173666,
"grad_norm": 1.1302770376205444,
"learning_rate": 1.8955174500267596e-05,
"loss": 0.7207,
"step": 31600
},
{
"epoch": 0.5188683105680544,
"grad_norm": 1.1893266439437866,
"learning_rate": 1.8947296621095297e-05,
"loss": 0.7088,
"step": 31700
},
{
"epoch": 0.5205051191187423,
"grad_norm": 1.2034817934036255,
"learning_rate": 1.893939080385629e-05,
"loss": 0.7225,
"step": 31800
},
{
"epoch": 0.5221419276694301,
"grad_norm": 1.0935208797454834,
"learning_rate": 1.8931457073236612e-05,
"loss": 0.7219,
"step": 31900
},
{
"epoch": 0.5237787362201181,
"grad_norm": 1.2129491567611694,
"learning_rate": 1.892349545400945e-05,
"loss": 0.7323,
"step": 32000
},
{
"epoch": 0.5254155447708059,
"grad_norm": 1.0750499963760376,
"learning_rate": 1.8915505971035077e-05,
"loss": 0.7213,
"step": 32100
},
{
"epoch": 0.5270523533214938,
"grad_norm": 1.1311250925064087,
"learning_rate": 1.8907488649260775e-05,
"loss": 0.7265,
"step": 32200
},
{
"epoch": 0.5286891618721816,
"grad_norm": 1.1503121852874756,
"learning_rate": 1.889944351372075e-05,
"loss": 0.7177,
"step": 32300
},
{
"epoch": 0.5303259704228694,
"grad_norm": 1.3034614324569702,
"learning_rate": 1.8891370589536058e-05,
"loss": 0.7118,
"step": 32400
},
{
"epoch": 0.5319627789735574,
"grad_norm": 1.0626057386398315,
"learning_rate": 1.8883269901914524e-05,
"loss": 0.7205,
"step": 32500
},
{
"epoch": 0.5335995875242452,
"grad_norm": 1.2290301322937012,
"learning_rate": 1.8875141476150664e-05,
"loss": 0.73,
"step": 32600
},
{
"epoch": 0.5352363960749331,
"grad_norm": 1.2172757387161255,
"learning_rate": 1.8866985337625615e-05,
"loss": 0.7234,
"step": 32700
},
{
"epoch": 0.5368732046256209,
"grad_norm": 1.0496524572372437,
"learning_rate": 1.885880151180703e-05,
"loss": 0.7127,
"step": 32800
},
{
"epoch": 0.5385100131763089,
"grad_norm": 0.9903925061225891,
"learning_rate": 1.8850590024249037e-05,
"loss": 0.728,
"step": 32900
},
{
"epoch": 0.5401468217269967,
"grad_norm": 1.2562659978866577,
"learning_rate": 1.8842350900592122e-05,
"loss": 0.7188,
"step": 33000
},
{
"epoch": 0.5417836302776846,
"grad_norm": 1.2212430238723755,
"learning_rate": 1.8834084166563072e-05,
"loss": 0.7086,
"step": 33100
},
{
"epoch": 0.5434204388283724,
"grad_norm": 1.1504745483398438,
"learning_rate": 1.882578984797489e-05,
"loss": 0.7198,
"step": 33200
},
{
"epoch": 0.5450572473790604,
"grad_norm": 1.1029900312423706,
"learning_rate": 1.8817467970726704e-05,
"loss": 0.729,
"step": 33300
},
{
"epoch": 0.5466940559297482,
"grad_norm": 1.1274054050445557,
"learning_rate": 1.8809118560803704e-05,
"loss": 0.7249,
"step": 33400
},
{
"epoch": 0.548330864480436,
"grad_norm": 1.093854546546936,
"learning_rate": 1.880074164427704e-05,
"loss": 0.704,
"step": 33500
},
{
"epoch": 0.5499676730311239,
"grad_norm": 1.0846567153930664,
"learning_rate": 1.879233724730377e-05,
"loss": 0.7194,
"step": 33600
},
{
"epoch": 0.5516044815818117,
"grad_norm": 1.35237455368042,
"learning_rate": 1.8783905396126737e-05,
"loss": 0.7205,
"step": 33700
},
{
"epoch": 0.5532412901324997,
"grad_norm": 0.9714828133583069,
"learning_rate": 1.8775446117074528e-05,
"loss": 0.7334,
"step": 33800
},
{
"epoch": 0.5548780986831875,
"grad_norm": 1.2619616985321045,
"learning_rate": 1.8766959436561363e-05,
"loss": 0.718,
"step": 33900
},
{
"epoch": 0.5565149072338754,
"grad_norm": 1.036129355430603,
"learning_rate": 1.8758445381087034e-05,
"loss": 0.7191,
"step": 34000
},
{
"epoch": 0.5581517157845632,
"grad_norm": 1.097095012664795,
"learning_rate": 1.8749903977236802e-05,
"loss": 0.7171,
"step": 34100
},
{
"epoch": 0.5597885243352512,
"grad_norm": 1.1133558750152588,
"learning_rate": 1.8741335251681328e-05,
"loss": 0.7179,
"step": 34200
},
{
"epoch": 0.561425332885939,
"grad_norm": 1.0562981367111206,
"learning_rate": 1.8732739231176587e-05,
"loss": 0.7201,
"step": 34300
},
{
"epoch": 0.5630621414366269,
"grad_norm": 1.20978581905365,
"learning_rate": 1.8724115942563773e-05,
"loss": 0.7129,
"step": 34400
},
{
"epoch": 0.5646989499873147,
"grad_norm": 1.0966860055923462,
"learning_rate": 1.8715465412769243e-05,
"loss": 0.715,
"step": 34500
},
{
"epoch": 0.5663357585380026,
"grad_norm": 1.2173317670822144,
"learning_rate": 1.87067876688044e-05,
"loss": 0.7052,
"step": 34600
},
{
"epoch": 0.5679725670886905,
"grad_norm": 1.126670241355896,
"learning_rate": 1.869808273776563e-05,
"loss": 0.7172,
"step": 34700
},
{
"epoch": 0.5696093756393783,
"grad_norm": 1.0486496686935425,
"learning_rate": 1.8689350646834207e-05,
"loss": 0.7269,
"step": 34800
},
{
"epoch": 0.5712461841900662,
"grad_norm": 1.1730561256408691,
"learning_rate": 1.868059142327622e-05,
"loss": 0.7191,
"step": 34900
},
{
"epoch": 0.572882992740754,
"grad_norm": 1.1153805255889893,
"learning_rate": 1.867180509444247e-05,
"loss": 0.7124,
"step": 35000
},
{
"epoch": 0.574519801291442,
"grad_norm": 1.200767159461975,
"learning_rate": 1.8662991687768394e-05,
"loss": 0.7342,
"step": 35100
},
{
"epoch": 0.5761566098421298,
"grad_norm": 1.093985676765442,
"learning_rate": 1.8654151230774e-05,
"loss": 0.7073,
"step": 35200
},
{
"epoch": 0.5777934183928177,
"grad_norm": 1.1902211904525757,
"learning_rate": 1.8645283751063734e-05,
"loss": 0.7147,
"step": 35300
},
{
"epoch": 0.5794302269435055,
"grad_norm": 1.1363279819488525,
"learning_rate": 1.863638927632644e-05,
"loss": 0.7162,
"step": 35400
},
{
"epoch": 0.5810670354941935,
"grad_norm": 1.2271382808685303,
"learning_rate": 1.8627467834335243e-05,
"loss": 0.7042,
"step": 35500
},
{
"epoch": 0.5827038440448813,
"grad_norm": 1.1823738813400269,
"learning_rate": 1.8618519452947484e-05,
"loss": 0.7197,
"step": 35600
},
{
"epoch": 0.5843406525955691,
"grad_norm": 1.042771577835083,
"learning_rate": 1.8609544160104608e-05,
"loss": 0.7103,
"step": 35700
},
{
"epoch": 0.585977461146257,
"grad_norm": 1.2053323984146118,
"learning_rate": 1.8600541983832114e-05,
"loss": 0.7206,
"step": 35800
},
{
"epoch": 0.5876142696969449,
"grad_norm": 1.2077679634094238,
"learning_rate": 1.8591512952239416e-05,
"loss": 0.7003,
"step": 35900
},
{
"epoch": 0.5892510782476328,
"grad_norm": 1.2675883769989014,
"learning_rate": 1.8582457093519806e-05,
"loss": 0.7119,
"step": 36000
},
{
"epoch": 0.5908878867983206,
"grad_norm": 1.102798342704773,
"learning_rate": 1.857337443595034e-05,
"loss": 0.7097,
"step": 36100
},
{
"epoch": 0.5925246953490085,
"grad_norm": 1.0432052612304688,
"learning_rate": 1.8564265007891747e-05,
"loss": 0.7197,
"step": 36200
},
{
"epoch": 0.5941615038996964,
"grad_norm": 1.1461999416351318,
"learning_rate": 1.8555128837788356e-05,
"loss": 0.7128,
"step": 36300
},
{
"epoch": 0.5957983124503843,
"grad_norm": 1.1425740718841553,
"learning_rate": 1.854596595416799e-05,
"loss": 0.7221,
"step": 36400
},
{
"epoch": 0.5974351210010721,
"grad_norm": 1.1499603986740112,
"learning_rate": 1.8536776385641896e-05,
"loss": 0.7118,
"step": 36500
},
{
"epoch": 0.59907192955176,
"grad_norm": 1.1369038820266724,
"learning_rate": 1.8527560160904628e-05,
"loss": 0.7101,
"step": 36600
},
{
"epoch": 0.6007087381024478,
"grad_norm": 1.3000248670578003,
"learning_rate": 1.8518317308733987e-05,
"loss": 0.7042,
"step": 36700
},
{
"epoch": 0.6023455466531357,
"grad_norm": 1.193550944328308,
"learning_rate": 1.8509047857990925e-05,
"loss": 0.7143,
"step": 36800
},
{
"epoch": 0.6039823552038236,
"grad_norm": 1.1038364171981812,
"learning_rate": 1.849975183761943e-05,
"loss": 0.6953,
"step": 36900
},
{
"epoch": 0.6056191637545114,
"grad_norm": 1.2535215616226196,
"learning_rate": 1.849042927664647e-05,
"loss": 0.7021,
"step": 37000
},
{
"epoch": 0.6072559723051993,
"grad_norm": 1.1770461797714233,
"learning_rate": 1.848108020418188e-05,
"loss": 0.6971,
"step": 37100
},
{
"epoch": 0.6088927808558872,
"grad_norm": 1.3245750665664673,
"learning_rate": 1.8471704649418272e-05,
"loss": 0.7062,
"step": 37200
},
{
"epoch": 0.6105295894065751,
"grad_norm": 1.064820408821106,
"learning_rate": 1.8462302641630957e-05,
"loss": 0.7247,
"step": 37300
},
{
"epoch": 0.6121663979572629,
"grad_norm": 1.2426869869232178,
"learning_rate": 1.8452874210177853e-05,
"loss": 0.697,
"step": 37400
},
{
"epoch": 0.6138032065079508,
"grad_norm": 1.0495688915252686,
"learning_rate": 1.8443419384499367e-05,
"loss": 0.7066,
"step": 37500
},
{
"epoch": 0.6154400150586387,
"grad_norm": 1.0227185487747192,
"learning_rate": 1.8433938194118332e-05,
"loss": 0.6975,
"step": 37600
},
{
"epoch": 0.6170768236093266,
"grad_norm": 1.1213784217834473,
"learning_rate": 1.8424430668639916e-05,
"loss": 0.7101,
"step": 37700
},
{
"epoch": 0.6187136321600144,
"grad_norm": 1.3823000192642212,
"learning_rate": 1.8414896837751497e-05,
"loss": 0.7143,
"step": 37800
},
{
"epoch": 0.6203504407107022,
"grad_norm": 1.280870795249939,
"learning_rate": 1.8405336731222615e-05,
"loss": 0.7137,
"step": 37900
},
{
"epoch": 0.6219872492613902,
"grad_norm": 1.1578929424285889,
"learning_rate": 1.839575037890483e-05,
"loss": 0.7035,
"step": 38000
},
{
"epoch": 0.623624057812078,
"grad_norm": 1.1784029006958008,
"learning_rate": 1.838613781073169e-05,
"loss": 0.7003,
"step": 38100
},
{
"epoch": 0.6252608663627659,
"grad_norm": 1.5140550136566162,
"learning_rate": 1.8376499056718563e-05,
"loss": 0.7182,
"step": 38200
},
{
"epoch": 0.6268976749134537,
"grad_norm": 1.1795947551727295,
"learning_rate": 1.8366834146962613e-05,
"loss": 0.707,
"step": 38300
},
{
"epoch": 0.6285344834641416,
"grad_norm": 1.2156872749328613,
"learning_rate": 1.8357143111642658e-05,
"loss": 0.7041,
"step": 38400
},
{
"epoch": 0.6301712920148295,
"grad_norm": 1.120609164237976,
"learning_rate": 1.8347425981019104e-05,
"loss": 0.7087,
"step": 38500
},
{
"epoch": 0.6318081005655174,
"grad_norm": 1.0960373878479004,
"learning_rate": 1.8337682785433838e-05,
"loss": 0.7136,
"step": 38600
},
{
"epoch": 0.6334449091162052,
"grad_norm": 1.2065433263778687,
"learning_rate": 1.8327913555310125e-05,
"loss": 0.7077,
"step": 38700
},
{
"epoch": 0.6350817176668931,
"grad_norm": 1.158570647239685,
"learning_rate": 1.8318118321152534e-05,
"loss": 0.7199,
"step": 38800
},
{
"epoch": 0.636718526217581,
"grad_norm": 1.1315112113952637,
"learning_rate": 1.8308297113546834e-05,
"loss": 0.7157,
"step": 38900
},
{
"epoch": 0.6383553347682688,
"grad_norm": 1.567763328552246,
"learning_rate": 1.829844996315989e-05,
"loss": 0.7024,
"step": 39000
},
{
"epoch": 0.6399921433189567,
"grad_norm": 1.3154592514038086,
"learning_rate": 1.8288576900739573e-05,
"loss": 0.7093,
"step": 39100
},
{
"epoch": 0.6416289518696445,
"grad_norm": 1.2426626682281494,
"learning_rate": 1.8278677957114666e-05,
"loss": 0.7108,
"step": 39200
},
{
"epoch": 0.6432657604203325,
"grad_norm": 1.2186305522918701,
"learning_rate": 1.8268753163194773e-05,
"loss": 0.704,
"step": 39300
},
{
"epoch": 0.6449025689710203,
"grad_norm": 1.049307942390442,
"learning_rate": 1.8258802549970206e-05,
"loss": 0.7057,
"step": 39400
},
{
"epoch": 0.6465393775217082,
"grad_norm": 1.3523504734039307,
"learning_rate": 1.8248826148511908e-05,
"loss": 0.6965,
"step": 39500
},
{
"epoch": 0.648176186072396,
"grad_norm": 1.2402653694152832,
"learning_rate": 1.823882398997133e-05,
"loss": 0.704,
"step": 39600
},
{
"epoch": 0.649812994623084,
"grad_norm": 1.3009974956512451,
"learning_rate": 1.8228796105580373e-05,
"loss": 0.6892,
"step": 39700
},
{
"epoch": 0.6514498031737718,
"grad_norm": 1.161328673362732,
"learning_rate": 1.821874252665125e-05,
"loss": 0.7099,
"step": 39800
},
{
"epoch": 0.6530866117244597,
"grad_norm": 1.5753206014633179,
"learning_rate": 1.820866328457641e-05,
"loss": 0.6958,
"step": 39900
},
{
"epoch": 0.6547234202751475,
"grad_norm": 1.1261160373687744,
"learning_rate": 1.8198558410828436e-05,
"loss": 0.7048,
"step": 40000
},
{
"epoch": 0.6563602288258353,
"grad_norm": 1.2303427457809448,
"learning_rate": 1.818842793695995e-05,
"loss": 0.7024,
"step": 40100
},
{
"epoch": 0.6579970373765233,
"grad_norm": 1.2187303304672241,
"learning_rate": 1.8178271894603502e-05,
"loss": 0.696,
"step": 40200
},
{
"epoch": 0.6596338459272111,
"grad_norm": 1.1081221103668213,
"learning_rate": 1.8168090315471488e-05,
"loss": 0.7082,
"step": 40300
},
{
"epoch": 0.661270654477899,
"grad_norm": 1.1961265802383423,
"learning_rate": 1.8157883231356036e-05,
"loss": 0.6875,
"step": 40400
},
{
"epoch": 0.6629074630285868,
"grad_norm": 1.1577361822128296,
"learning_rate": 1.8147650674128927e-05,
"loss": 0.7004,
"step": 40500
},
{
"epoch": 0.6645442715792748,
"grad_norm": 1.1837248802185059,
"learning_rate": 1.813739267574147e-05,
"loss": 0.7084,
"step": 40600
},
{
"epoch": 0.6661810801299626,
"grad_norm": 1.140136957168579,
"learning_rate": 1.8127109268224414e-05,
"loss": 0.6897,
"step": 40700
},
{
"epoch": 0.6678178886806505,
"grad_norm": 1.132994532585144,
"learning_rate": 1.811680048368785e-05,
"loss": 0.6999,
"step": 40800
},
{
"epoch": 0.6694546972313383,
"grad_norm": 1.184187889099121,
"learning_rate": 1.8106466354321113e-05,
"loss": 0.6994,
"step": 40900
},
{
"epoch": 0.6710915057820263,
"grad_norm": 1.1196414232254028,
"learning_rate": 1.809610691239268e-05,
"loss": 0.7008,
"step": 41000
},
{
"epoch": 0.6727283143327141,
"grad_norm": 1.1688846349716187,
"learning_rate": 1.808572219025006e-05,
"loss": 0.6954,
"step": 41100
},
{
"epoch": 0.6743651228834019,
"grad_norm": 1.222205638885498,
"learning_rate": 1.80753122203197e-05,
"loss": 0.6918,
"step": 41200
},
{
"epoch": 0.6760019314340898,
"grad_norm": 1.1374167203903198,
"learning_rate": 1.8064877035106887e-05,
"loss": 0.6906,
"step": 41300
},
{
"epoch": 0.6776387399847776,
"grad_norm": 1.0707694292068481,
"learning_rate": 1.8054416667195643e-05,
"loss": 0.6943,
"step": 41400
},
{
"epoch": 0.6792755485354656,
"grad_norm": 1.1394332647323608,
"learning_rate": 1.8043931149248625e-05,
"loss": 0.7073,
"step": 41500
},
{
"epoch": 0.6809123570861534,
"grad_norm": 1.118058443069458,
"learning_rate": 1.803342051400701e-05,
"loss": 0.6983,
"step": 41600
},
{
"epoch": 0.6825491656368413,
"grad_norm": 1.3730331659317017,
"learning_rate": 1.8022884794290417e-05,
"loss": 0.6924,
"step": 41700
},
{
"epoch": 0.6841859741875291,
"grad_norm": 1.1573492288589478,
"learning_rate": 1.801232402299679e-05,
"loss": 0.6964,
"step": 41800
},
{
"epoch": 0.6858227827382171,
"grad_norm": 1.1315394639968872,
"learning_rate": 1.80017382331023e-05,
"loss": 0.693,
"step": 41900
},
{
"epoch": 0.6874595912889049,
"grad_norm": 1.1479718685150146,
"learning_rate": 1.799112745766122e-05,
"loss": 0.6985,
"step": 42000
},
{
"epoch": 0.6890963998395928,
"grad_norm": 1.1869304180145264,
"learning_rate": 1.7980491729805858e-05,
"loss": 0.7132,
"step": 42100
},
{
"epoch": 0.6907332083902806,
"grad_norm": 1.322792887687683,
"learning_rate": 1.796983108274644e-05,
"loss": 0.7085,
"step": 42200
},
{
"epoch": 0.6923700169409684,
"grad_norm": 1.1635984182357788,
"learning_rate": 1.7959145549770985e-05,
"loss": 0.7117,
"step": 42300
},
{
"epoch": 0.6940068254916564,
"grad_norm": 1.1490191221237183,
"learning_rate": 1.7948435164245236e-05,
"loss": 0.697,
"step": 42400
},
{
"epoch": 0.6956436340423442,
"grad_norm": 1.2376859188079834,
"learning_rate": 1.7937699959612523e-05,
"loss": 0.7079,
"step": 42500
},
{
"epoch": 0.6972804425930321,
"grad_norm": 1.2555029392242432,
"learning_rate": 1.7926939969393693e-05,
"loss": 0.6895,
"step": 42600
},
{
"epoch": 0.6989172511437199,
"grad_norm": 1.1793533563613892,
"learning_rate": 1.7916155227186966e-05,
"loss": 0.6784,
"step": 42700
},
{
"epoch": 0.7005540596944079,
"grad_norm": 1.0882368087768555,
"learning_rate": 1.7905345766667867e-05,
"loss": 0.6875,
"step": 42800
},
{
"epoch": 0.7021908682450957,
"grad_norm": 1.2925825119018555,
"learning_rate": 1.789451162158909e-05,
"loss": 0.7072,
"step": 42900
},
{
"epoch": 0.7038276767957836,
"grad_norm": 1.2188570499420166,
"learning_rate": 1.7883652825780418e-05,
"loss": 0.7084,
"step": 43000
},
{
"epoch": 0.7054644853464714,
"grad_norm": 1.2425892353057861,
"learning_rate": 1.7872769413148602e-05,
"loss": 0.7059,
"step": 43100
},
{
"epoch": 0.7071012938971594,
"grad_norm": 1.3490030765533447,
"learning_rate": 1.786186141767726e-05,
"loss": 0.6861,
"step": 43200
},
{
"epoch": 0.7087381024478472,
"grad_norm": 1.2493983507156372,
"learning_rate": 1.785092887342677e-05,
"loss": 0.6862,
"step": 43300
},
{
"epoch": 0.710374910998535,
"grad_norm": 1.1606495380401611,
"learning_rate": 1.7839971814534163e-05,
"loss": 0.6959,
"step": 43400
},
{
"epoch": 0.7120117195492229,
"grad_norm": 1.0867750644683838,
"learning_rate": 1.7828990275213023e-05,
"loss": 0.6838,
"step": 43500
},
{
"epoch": 0.7136485280999108,
"grad_norm": 1.4481595754623413,
"learning_rate": 1.781798428975336e-05,
"loss": 0.6877,
"step": 43600
},
{
"epoch": 0.7152853366505987,
"grad_norm": 1.0603893995285034,
"learning_rate": 1.7806953892521536e-05,
"loss": 0.6922,
"step": 43700
},
{
"epoch": 0.7169221452012865,
"grad_norm": 1.1686676740646362,
"learning_rate": 1.7795899117960126e-05,
"loss": 0.6933,
"step": 43800
},
{
"epoch": 0.7185589537519744,
"grad_norm": 1.423593282699585,
"learning_rate": 1.7784820000587828e-05,
"loss": 0.6947,
"step": 43900
},
{
"epoch": 0.7201957623026622,
"grad_norm": 1.2158969640731812,
"learning_rate": 1.7773716574999354e-05,
"loss": 0.6832,
"step": 44000
},
{
"epoch": 0.7218325708533502,
"grad_norm": 1.3259363174438477,
"learning_rate": 1.776258887586531e-05,
"loss": 0.6836,
"step": 44100
},
{
"epoch": 0.723469379404038,
"grad_norm": 1.2114306688308716,
"learning_rate": 1.775143693793211e-05,
"loss": 0.6934,
"step": 44200
},
{
"epoch": 0.7251061879547259,
"grad_norm": 1.0769015550613403,
"learning_rate": 1.774026079602184e-05,
"loss": 0.692,
"step": 44300
},
{
"epoch": 0.7267429965054137,
"grad_norm": 1.098381519317627,
"learning_rate": 1.7729060485032167e-05,
"loss": 0.6929,
"step": 44400
},
{
"epoch": 0.7283798050561016,
"grad_norm": 1.1960115432739258,
"learning_rate": 1.7717836039936235e-05,
"loss": 0.6895,
"step": 44500
},
{
"epoch": 0.7300166136067895,
"grad_norm": 1.2899237871170044,
"learning_rate": 1.7706587495782538e-05,
"loss": 0.6891,
"step": 44600
},
{
"epoch": 0.7316534221574773,
"grad_norm": 1.1849106550216675,
"learning_rate": 1.769531488769482e-05,
"loss": 0.6994,
"step": 44700
},
{
"epoch": 0.7332902307081652,
"grad_norm": 1.0840647220611572,
"learning_rate": 1.7684018250871967e-05,
"loss": 0.6902,
"step": 44800
},
{
"epoch": 0.734927039258853,
"grad_norm": 1.1262308359146118,
"learning_rate": 1.7672697620587904e-05,
"loss": 0.686,
"step": 44900
},
{
"epoch": 0.736563847809541,
"grad_norm": 1.2281126976013184,
"learning_rate": 1.7661353032191458e-05,
"loss": 0.6971,
"step": 45000
},
{
"epoch": 0.7382006563602288,
"grad_norm": 1.0803622007369995,
"learning_rate": 1.7649984521106282e-05,
"loss": 0.694,
"step": 45100
},
{
"epoch": 0.7398374649109167,
"grad_norm": 1.4072610139846802,
"learning_rate": 1.763859212283071e-05,
"loss": 0.704,
"step": 45200
},
{
"epoch": 0.7414742734616045,
"grad_norm": 1.2351950407028198,
"learning_rate": 1.7627175872937686e-05,
"loss": 0.6991,
"step": 45300
},
{
"epoch": 0.7431110820122925,
"grad_norm": 1.1985889673233032,
"learning_rate": 1.7615735807074616e-05,
"loss": 0.6947,
"step": 45400
},
{
"epoch": 0.7447478905629803,
"grad_norm": 1.1948813199996948,
"learning_rate": 1.7604271960963274e-05,
"loss": 0.6986,
"step": 45500
},
{
"epoch": 0.7463846991136681,
"grad_norm": 1.2745295763015747,
"learning_rate": 1.759278437039969e-05,
"loss": 0.6989,
"step": 45600
},
{
"epoch": 0.748021507664356,
"grad_norm": 1.1414821147918701,
"learning_rate": 1.7581273071254038e-05,
"loss": 0.6883,
"step": 45700
},
{
"epoch": 0.7496583162150439,
"grad_norm": 1.1246697902679443,
"learning_rate": 1.7569738099470524e-05,
"loss": 0.6818,
"step": 45800
},
{
"epoch": 0.7512951247657318,
"grad_norm": 1.1820296049118042,
"learning_rate": 1.7558179491067263e-05,
"loss": 0.7079,
"step": 45900
},
{
"epoch": 0.7529319333164196,
"grad_norm": 1.1293789148330688,
"learning_rate": 1.7546597282136186e-05,
"loss": 0.696,
"step": 46000
},
{
"epoch": 0.7545687418671075,
"grad_norm": 1.2405450344085693,
"learning_rate": 1.753499150884291e-05,
"loss": 0.6912,
"step": 46100
},
{
"epoch": 0.7562055504177954,
"grad_norm": 1.2177417278289795,
"learning_rate": 1.7523362207426634e-05,
"loss": 0.6824,
"step": 46200
},
{
"epoch": 0.7578423589684833,
"grad_norm": 1.124414086341858,
"learning_rate": 1.7511709414200024e-05,
"loss": 0.6868,
"step": 46300
},
{
"epoch": 0.7594791675191711,
"grad_norm": 1.1439573764801025,
"learning_rate": 1.7500033165549105e-05,
"loss": 0.6882,
"step": 46400
},
{
"epoch": 0.761115976069859,
"grad_norm": 1.1549428701400757,
"learning_rate": 1.7488333497933133e-05,
"loss": 0.681,
"step": 46500
},
{
"epoch": 0.7627527846205469,
"grad_norm": 1.3092726469039917,
"learning_rate": 1.7476610447884492e-05,
"loss": 0.6973,
"step": 46600
},
{
"epoch": 0.7643895931712347,
"grad_norm": 1.5812910795211792,
"learning_rate": 1.7464864052008586e-05,
"loss": 0.6855,
"step": 46700
},
{
"epoch": 0.7660264017219226,
"grad_norm": 1.189775824546814,
"learning_rate": 1.7453094346983707e-05,
"loss": 0.6983,
"step": 46800
},
{
"epoch": 0.7676632102726104,
"grad_norm": 1.3100470304489136,
"learning_rate": 1.7441301369560934e-05,
"loss": 0.6938,
"step": 46900
},
{
"epoch": 0.7693000188232983,
"grad_norm": 1.227925419807434,
"learning_rate": 1.7429485156564014e-05,
"loss": 0.6762,
"step": 47000
},
{
"epoch": 0.7709368273739862,
"grad_norm": 1.3295223712921143,
"learning_rate": 1.7417645744889248e-05,
"loss": 0.6823,
"step": 47100
},
{
"epoch": 0.7725736359246741,
"grad_norm": 1.1091123819351196,
"learning_rate": 1.740578317150538e-05,
"loss": 0.6978,
"step": 47200
},
{
"epoch": 0.7742104444753619,
"grad_norm": 1.2926867008209229,
"learning_rate": 1.7393897473453462e-05,
"loss": 0.6853,
"step": 47300
},
{
"epoch": 0.7758472530260498,
"grad_norm": 1.279630422592163,
"learning_rate": 1.738198868784677e-05,
"loss": 0.6911,
"step": 47400
},
{
"epoch": 0.7774840615767377,
"grad_norm": 1.1175949573516846,
"learning_rate": 1.7370056851870665e-05,
"loss": 0.687,
"step": 47500
},
{
"epoch": 0.7791208701274256,
"grad_norm": 1.0889476537704468,
"learning_rate": 1.7358102002782477e-05,
"loss": 0.689,
"step": 47600
},
{
"epoch": 0.7807576786781134,
"grad_norm": 1.1944537162780762,
"learning_rate": 1.7346124177911402e-05,
"loss": 0.6841,
"step": 47700
},
{
"epoch": 0.7823944872288013,
"grad_norm": 1.208275556564331,
"learning_rate": 1.7334123414658376e-05,
"loss": 0.6777,
"step": 47800
},
{
"epoch": 0.7840312957794892,
"grad_norm": 1.1608806848526,
"learning_rate": 1.7322099750495964e-05,
"loss": 0.6841,
"step": 47900
},
{
"epoch": 0.785668104330177,
"grad_norm": 1.0674712657928467,
"learning_rate": 1.731005322296823e-05,
"loss": 0.6765,
"step": 48000
},
{
"epoch": 0.7873049128808649,
"grad_norm": 1.1852935552597046,
"learning_rate": 1.729798386969064e-05,
"loss": 0.6968,
"step": 48100
},
{
"epoch": 0.7889417214315527,
"grad_norm": 1.1918047666549683,
"learning_rate": 1.728589172834993e-05,
"loss": 0.6815,
"step": 48200
},
{
"epoch": 0.7905785299822407,
"grad_norm": 1.3117504119873047,
"learning_rate": 1.7273776836703985e-05,
"loss": 0.6799,
"step": 48300
},
{
"epoch": 0.7922153385329285,
"grad_norm": 1.2398260831832886,
"learning_rate": 1.726163923258174e-05,
"loss": 0.6869,
"step": 48400
},
{
"epoch": 0.7938521470836164,
"grad_norm": 1.2091760635375977,
"learning_rate": 1.724947895388304e-05,
"loss": 0.6679,
"step": 48500
},
{
"epoch": 0.7954889556343042,
"grad_norm": 1.1533339023590088,
"learning_rate": 1.723729603857854e-05,
"loss": 0.6877,
"step": 48600
},
{
"epoch": 0.7971257641849921,
"grad_norm": 1.2629398107528687,
"learning_rate": 1.7225090524709577e-05,
"loss": 0.6878,
"step": 48700
},
{
"epoch": 0.79876257273568,
"grad_norm": 1.202531099319458,
"learning_rate": 1.7212862450388037e-05,
"loss": 0.6911,
"step": 48800
},
{
"epoch": 0.8003993812863679,
"grad_norm": 1.189326286315918,
"learning_rate": 1.7200611853796278e-05,
"loss": 0.6966,
"step": 48900
},
{
"epoch": 0.8020361898370557,
"grad_norm": 1.2614778280258179,
"learning_rate": 1.718833877318696e-05,
"loss": 0.6952,
"step": 49000
},
{
"epoch": 0.8036729983877435,
"grad_norm": 1.1864616870880127,
"learning_rate": 1.7176043246882966e-05,
"loss": 0.6756,
"step": 49100
},
{
"epoch": 0.8053098069384315,
"grad_norm": 1.205569863319397,
"learning_rate": 1.7163725313277255e-05,
"loss": 0.6748,
"step": 49200
},
{
"epoch": 0.8069466154891193,
"grad_norm": 1.2782241106033325,
"learning_rate": 1.715138501083276e-05,
"loss": 0.6903,
"step": 49300
},
{
"epoch": 0.8085834240398072,
"grad_norm": 1.0571094751358032,
"learning_rate": 1.7139022378082256e-05,
"loss": 0.6871,
"step": 49400
},
{
"epoch": 0.810220232590495,
"grad_norm": 1.3369005918502808,
"learning_rate": 1.712663745362826e-05,
"loss": 0.6746,
"step": 49500
},
{
"epoch": 0.811857041141183,
"grad_norm": 1.2506871223449707,
"learning_rate": 1.7114230276142866e-05,
"loss": 0.6935,
"step": 49600
},
{
"epoch": 0.8134938496918708,
"grad_norm": 1.3436931371688843,
"learning_rate": 1.7101800884367676e-05,
"loss": 0.6859,
"step": 49700
},
{
"epoch": 0.8151306582425587,
"grad_norm": 1.3217076063156128,
"learning_rate": 1.708934931711365e-05,
"loss": 0.6766,
"step": 49800
},
{
"epoch": 0.8167674667932465,
"grad_norm": 1.3521711826324463,
"learning_rate": 1.7076875613261e-05,
"loss": 0.6828,
"step": 49900
},
{
"epoch": 0.8184042753439345,
"grad_norm": 1.1544018983840942,
"learning_rate": 1.706437981175904e-05,
"loss": 0.6866,
"step": 50000
},
{
"epoch": 0.8200410838946223,
"grad_norm": 1.3795074224472046,
"learning_rate": 1.7051861951626105e-05,
"loss": 0.6893,
"step": 50100
},
{
"epoch": 0.8216778924453101,
"grad_norm": 1.2545524835586548,
"learning_rate": 1.7039322071949396e-05,
"loss": 0.6865,
"step": 50200
},
{
"epoch": 0.823314700995998,
"grad_norm": 1.3663312196731567,
"learning_rate": 1.702676021188487e-05,
"loss": 0.6858,
"step": 50300
},
{
"epoch": 0.8249515095466858,
"grad_norm": 1.4371784925460815,
"learning_rate": 1.701417641065713e-05,
"loss": 0.6827,
"step": 50400
},
{
"epoch": 0.8265883180973738,
"grad_norm": 1.465648889541626,
"learning_rate": 1.7001570707559274e-05,
"loss": 0.6813,
"step": 50500
},
{
"epoch": 0.8282251266480616,
"grad_norm": 1.1045328378677368,
"learning_rate": 1.69889431419528e-05,
"loss": 0.6858,
"step": 50600
},
{
"epoch": 0.8298619351987495,
"grad_norm": 1.1676952838897705,
"learning_rate": 1.6976293753267467e-05,
"loss": 0.662,
"step": 50700
},
{
"epoch": 0.8314987437494373,
"grad_norm": 1.2377560138702393,
"learning_rate": 1.6963622581001188e-05,
"loss": 0.6853,
"step": 50800
},
{
"epoch": 0.8331355523001253,
"grad_norm": 1.2052476406097412,
"learning_rate": 1.6950929664719883e-05,
"loss": 0.6898,
"step": 50900
},
{
"epoch": 0.8347723608508131,
"grad_norm": 1.400944709777832,
"learning_rate": 1.6938215044057363e-05,
"loss": 0.6905,
"step": 51000
},
{
"epoch": 0.836409169401501,
"grad_norm": 1.2622673511505127,
"learning_rate": 1.6925478758715226e-05,
"loss": 0.6651,
"step": 51100
},
{
"epoch": 0.8380459779521888,
"grad_norm": 1.1664501428604126,
"learning_rate": 1.691272084846272e-05,
"loss": 0.6851,
"step": 51200
},
{
"epoch": 0.8396827865028766,
"grad_norm": 1.2591482400894165,
"learning_rate": 1.68999413531366e-05,
"loss": 0.6936,
"step": 51300
},
{
"epoch": 0.8413195950535646,
"grad_norm": 1.163874864578247,
"learning_rate": 1.6887140312641036e-05,
"loss": 0.6886,
"step": 51400
},
{
"epoch": 0.8429564036042524,
"grad_norm": 1.2441082000732422,
"learning_rate": 1.6874317766947458e-05,
"loss": 0.6761,
"step": 51500
},
{
"epoch": 0.8445932121549403,
"grad_norm": 1.1966642141342163,
"learning_rate": 1.6861473756094464e-05,
"loss": 0.6758,
"step": 51600
},
{
"epoch": 0.8462300207056281,
"grad_norm": 1.1858773231506348,
"learning_rate": 1.6848608320187668e-05,
"loss": 0.6806,
"step": 51700
},
{
"epoch": 0.8478668292563161,
"grad_norm": 1.1656018495559692,
"learning_rate": 1.6835721499399583e-05,
"loss": 0.6768,
"step": 51800
},
{
"epoch": 0.8495036378070039,
"grad_norm": 1.2097491025924683,
"learning_rate": 1.6822813333969495e-05,
"loss": 0.6936,
"step": 51900
},
{
"epoch": 0.8511404463576918,
"grad_norm": 1.4976009130477905,
"learning_rate": 1.6809883864203352e-05,
"loss": 0.6721,
"step": 52000
},
{
"epoch": 0.8527772549083796,
"grad_norm": 1.3640004396438599,
"learning_rate": 1.6796933130473606e-05,
"loss": 0.6738,
"step": 52100
},
{
"epoch": 0.8544140634590676,
"grad_norm": 1.2159740924835205,
"learning_rate": 1.6783961173219116e-05,
"loss": 0.6755,
"step": 52200
},
{
"epoch": 0.8560508720097554,
"grad_norm": 1.23357355594635,
"learning_rate": 1.677096803294502e-05,
"loss": 0.6789,
"step": 52300
},
{
"epoch": 0.8576876805604432,
"grad_norm": 1.2574186325073242,
"learning_rate": 1.6757953750222586e-05,
"loss": 0.6892,
"step": 52400
},
{
"epoch": 0.8593244891111311,
"grad_norm": 1.2394073009490967,
"learning_rate": 1.6744918365689106e-05,
"loss": 0.6726,
"step": 52500
},
{
"epoch": 0.860961297661819,
"grad_norm": 1.2098554372787476,
"learning_rate": 1.6731861920047758e-05,
"loss": 0.6714,
"step": 52600
},
{
"epoch": 0.8625981062125069,
"grad_norm": 1.3548126220703125,
"learning_rate": 1.6718784454067495e-05,
"loss": 0.6849,
"step": 52700
},
{
"epoch": 0.8642349147631947,
"grad_norm": 1.5218019485473633,
"learning_rate": 1.670568600858289e-05,
"loss": 0.6744,
"step": 52800
},
{
"epoch": 0.8658717233138826,
"grad_norm": 1.3826264142990112,
"learning_rate": 1.669256662449404e-05,
"loss": 0.6762,
"step": 52900
},
{
"epoch": 0.8675085318645704,
"grad_norm": 1.2154985666275024,
"learning_rate": 1.667942634276642e-05,
"loss": 0.6711,
"step": 53000
},
{
"epoch": 0.8691453404152584,
"grad_norm": 1.3120452165603638,
"learning_rate": 1.666626520443075e-05,
"loss": 0.6788,
"step": 53100
},
{
"epoch": 0.8707821489659462,
"grad_norm": 1.2221883535385132,
"learning_rate": 1.665308325058288e-05,
"loss": 0.6661,
"step": 53200
},
{
"epoch": 0.8724189575166341,
"grad_norm": 1.385396957397461,
"learning_rate": 1.6639880522383655e-05,
"loss": 0.6714,
"step": 53300
},
{
"epoch": 0.8740557660673219,
"grad_norm": 1.2685418128967285,
"learning_rate": 1.6626657061058797e-05,
"loss": 0.668,
"step": 53400
},
{
"epoch": 0.8756925746180098,
"grad_norm": 1.513152837753296,
"learning_rate": 1.661341290789875e-05,
"loss": 0.6706,
"step": 53500
},
{
"epoch": 0.8773293831686977,
"grad_norm": 1.2810958623886108,
"learning_rate": 1.6600148104258594e-05,
"loss": 0.6904,
"step": 53600
},
{
"epoch": 0.8789661917193855,
"grad_norm": 1.2695286273956299,
"learning_rate": 1.6586862691557863e-05,
"loss": 0.6733,
"step": 53700
},
{
"epoch": 0.8806030002700734,
"grad_norm": 1.0760889053344727,
"learning_rate": 1.6573556711280457e-05,
"loss": 0.6743,
"step": 53800
},
{
"epoch": 0.8822398088207613,
"grad_norm": 1.3402081727981567,
"learning_rate": 1.6560230204974502e-05,
"loss": 0.6706,
"step": 53900
},
{
"epoch": 0.8838766173714492,
"grad_norm": 1.191873550415039,
"learning_rate": 1.654688321425221e-05,
"loss": 0.6764,
"step": 54000
},
{
"epoch": 0.885513425922137,
"grad_norm": 1.1215344667434692,
"learning_rate": 1.6533515780789758e-05,
"loss": 0.6857,
"step": 54100
},
{
"epoch": 0.8871502344728249,
"grad_norm": 1.1322293281555176,
"learning_rate": 1.6520127946327155e-05,
"loss": 0.6723,
"step": 54200
},
{
"epoch": 0.8887870430235127,
"grad_norm": 1.7162648439407349,
"learning_rate": 1.6506719752668115e-05,
"loss": 0.679,
"step": 54300
},
{
"epoch": 0.8904238515742007,
"grad_norm": 1.5632336139678955,
"learning_rate": 1.6493291241679922e-05,
"loss": 0.6807,
"step": 54400
},
{
"epoch": 0.8920606601248885,
"grad_norm": 1.0530614852905273,
"learning_rate": 1.6479842455293297e-05,
"loss": 0.6681,
"step": 54500
},
{
"epoch": 0.8936974686755763,
"grad_norm": 1.2179269790649414,
"learning_rate": 1.6466373435502276e-05,
"loss": 0.6614,
"step": 54600
},
{
"epoch": 0.8953342772262642,
"grad_norm": 1.3225027322769165,
"learning_rate": 1.6452884224364082e-05,
"loss": 0.671,
"step": 54700
},
{
"epoch": 0.8969710857769521,
"grad_norm": 1.3610303401947021,
"learning_rate": 1.6439374863998966e-05,
"loss": 0.6801,
"step": 54800
},
{
"epoch": 0.89860789432764,
"grad_norm": 1.3277727365493774,
"learning_rate": 1.6425845396590114e-05,
"loss": 0.6746,
"step": 54900
},
{
"epoch": 0.9002447028783278,
"grad_norm": 1.2963169813156128,
"learning_rate": 1.6412295864383487e-05,
"loss": 0.6817,
"step": 55000
},
{
"epoch": 0.9018815114290157,
"grad_norm": 1.475885033607483,
"learning_rate": 1.6398726309687704e-05,
"loss": 0.6891,
"step": 55100
},
{
"epoch": 0.9035183199797036,
"grad_norm": 1.2722758054733276,
"learning_rate": 1.638513677487389e-05,
"loss": 0.6709,
"step": 55200
},
{
"epoch": 0.9051551285303915,
"grad_norm": 1.3521857261657715,
"learning_rate": 1.637152730237558e-05,
"loss": 0.6812,
"step": 55300
},
{
"epoch": 0.9067919370810793,
"grad_norm": 1.2276744842529297,
"learning_rate": 1.6357897934688555e-05,
"loss": 0.6644,
"step": 55400
},
{
"epoch": 0.9084287456317672,
"grad_norm": 1.5432332754135132,
"learning_rate": 1.634424871437071e-05,
"loss": 0.6817,
"step": 55500
},
{
"epoch": 0.910065554182455,
"grad_norm": 1.2314627170562744,
"learning_rate": 1.6330579684041946e-05,
"loss": 0.6761,
"step": 55600
},
{
"epoch": 0.9117023627331429,
"grad_norm": 1.473347544670105,
"learning_rate": 1.631689088638401e-05,
"loss": 0.6587,
"step": 55700
},
{
"epoch": 0.9133391712838308,
"grad_norm": 1.4029542207717896,
"learning_rate": 1.6303182364140376e-05,
"loss": 0.6863,
"step": 55800
},
{
"epoch": 0.9149759798345186,
"grad_norm": 1.1235482692718506,
"learning_rate": 1.628945416011611e-05,
"loss": 0.6717,
"step": 55900
},
{
"epoch": 0.9166127883852065,
"grad_norm": 1.1514254808425903,
"learning_rate": 1.6275706317177732e-05,
"loss": 0.6815,
"step": 56000
},
{
"epoch": 0.9182495969358944,
"grad_norm": 1.388074517250061,
"learning_rate": 1.6261938878253086e-05,
"loss": 0.6849,
"step": 56100
},
{
"epoch": 0.9198864054865823,
"grad_norm": 1.1814851760864258,
"learning_rate": 1.6248151886331208e-05,
"loss": 0.6641,
"step": 56200
},
{
"epoch": 0.9215232140372701,
"grad_norm": 1.4052802324295044,
"learning_rate": 1.6234345384462174e-05,
"loss": 0.6787,
"step": 56300
},
{
"epoch": 0.923160022587958,
"grad_norm": 1.5508378744125366,
"learning_rate": 1.6220519415757005e-05,
"loss": 0.6808,
"step": 56400
},
{
"epoch": 0.9247968311386459,
"grad_norm": 1.3127562999725342,
"learning_rate": 1.620667402338749e-05,
"loss": 0.6663,
"step": 56500
},
{
"epoch": 0.9264336396893338,
"grad_norm": 1.2677356004714966,
"learning_rate": 1.619280925058607e-05,
"loss": 0.6723,
"step": 56600
},
{
"epoch": 0.9280704482400216,
"grad_norm": 1.2480475902557373,
"learning_rate": 1.61789251406457e-05,
"loss": 0.6583,
"step": 56700
},
{
"epoch": 0.9297072567907094,
"grad_norm": 1.1523864269256592,
"learning_rate": 1.616502173691973e-05,
"loss": 0.6858,
"step": 56800
},
{
"epoch": 0.9313440653413974,
"grad_norm": 1.2443100214004517,
"learning_rate": 1.615109908282174e-05,
"loss": 0.6842,
"step": 56900
},
{
"epoch": 0.9329808738920852,
"grad_norm": 1.172663927078247,
"learning_rate": 1.6137157221825418e-05,
"loss": 0.6708,
"step": 57000
},
{
"epoch": 0.9346176824427731,
"grad_norm": 1.2049202919006348,
"learning_rate": 1.6123196197464445e-05,
"loss": 0.6665,
"step": 57100
},
{
"epoch": 0.9362544909934609,
"grad_norm": 1.3395051956176758,
"learning_rate": 1.6109216053332313e-05,
"loss": 0.6593,
"step": 57200
},
{
"epoch": 0.9378912995441488,
"grad_norm": 1.4670510292053223,
"learning_rate": 1.6095216833082242e-05,
"loss": 0.6715,
"step": 57300
},
{
"epoch": 0.9395281080948367,
"grad_norm": 1.349523663520813,
"learning_rate": 1.6081198580427e-05,
"loss": 0.6724,
"step": 57400
},
{
"epoch": 0.9411649166455246,
"grad_norm": 1.5846613645553589,
"learning_rate": 1.606716133913879e-05,
"loss": 0.6716,
"step": 57500
},
{
"epoch": 0.9428017251962124,
"grad_norm": 1.1905144453048706,
"learning_rate": 1.6053105153049103e-05,
"loss": 0.6702,
"step": 57600
},
{
"epoch": 0.9444385337469003,
"grad_norm": 1.4006574153900146,
"learning_rate": 1.6039030066048592e-05,
"loss": 0.6665,
"step": 57700
},
{
"epoch": 0.9460753422975882,
"grad_norm": 1.3038159608840942,
"learning_rate": 1.602493612208693e-05,
"loss": 0.665,
"step": 57800
},
{
"epoch": 0.947712150848276,
"grad_norm": 1.336591124534607,
"learning_rate": 1.601082336517266e-05,
"loss": 0.6572,
"step": 57900
},
{
"epoch": 0.9493489593989639,
"grad_norm": 1.3096286058425903,
"learning_rate": 1.5996691839373077e-05,
"loss": 0.6651,
"step": 58000
},
{
"epoch": 0.9509857679496517,
"grad_norm": 1.3385711908340454,
"learning_rate": 1.5982541588814083e-05,
"loss": 0.6708,
"step": 58100
},
{
"epoch": 0.9526225765003397,
"grad_norm": 1.2425600290298462,
"learning_rate": 1.596837265768004e-05,
"loss": 0.6629,
"step": 58200
},
{
"epoch": 0.9542593850510275,
"grad_norm": 1.1755977869033813,
"learning_rate": 1.5954185090213653e-05,
"loss": 0.6618,
"step": 58300
},
{
"epoch": 0.9558961936017154,
"grad_norm": 1.5241588354110718,
"learning_rate": 1.5939978930715808e-05,
"loss": 0.6747,
"step": 58400
},
{
"epoch": 0.9575330021524032,
"grad_norm": 1.113451361656189,
"learning_rate": 1.5925754223545452e-05,
"loss": 0.6779,
"step": 58500
},
{
"epoch": 0.9591698107030912,
"grad_norm": 1.2721067667007446,
"learning_rate": 1.5911511013119438e-05,
"loss": 0.6586,
"step": 58600
},
{
"epoch": 0.960806619253779,
"grad_norm": 1.5037124156951904,
"learning_rate": 1.589724934391241e-05,
"loss": 0.6646,
"step": 58700
},
{
"epoch": 0.9624434278044669,
"grad_norm": 1.2813490629196167,
"learning_rate": 1.588296926045664e-05,
"loss": 0.6644,
"step": 58800
},
{
"epoch": 0.9640802363551547,
"grad_norm": 1.2610142230987549,
"learning_rate": 1.58686708073419e-05,
"loss": 0.6717,
"step": 58900
},
{
"epoch": 0.9657170449058425,
"grad_norm": 1.2408130168914795,
"learning_rate": 1.585435402921532e-05,
"loss": 0.6695,
"step": 59000
},
{
"epoch": 0.9673538534565305,
"grad_norm": 1.4657983779907227,
"learning_rate": 1.584001897078126e-05,
"loss": 0.6777,
"step": 59100
},
{
"epoch": 0.9689906620072183,
"grad_norm": 1.370548129081726,
"learning_rate": 1.5825665676801145e-05,
"loss": 0.6881,
"step": 59200
},
{
"epoch": 0.9706274705579062,
"grad_norm": 1.3695186376571655,
"learning_rate": 1.5811294192093353e-05,
"loss": 0.6594,
"step": 59300
},
{
"epoch": 0.972264279108594,
"grad_norm": 1.2767751216888428,
"learning_rate": 1.5796904561533054e-05,
"loss": 0.6661,
"step": 59400
},
{
"epoch": 0.973901087659282,
"grad_norm": 1.293419361114502,
"learning_rate": 1.578249683005209e-05,
"loss": 0.6781,
"step": 59500
},
{
"epoch": 0.9755378962099698,
"grad_norm": 1.5075045824050903,
"learning_rate": 1.576807104263881e-05,
"loss": 0.6706,
"step": 59600
},
{
"epoch": 0.9771747047606577,
"grad_norm": 1.1597870588302612,
"learning_rate": 1.5753627244337958e-05,
"loss": 0.6709,
"step": 59700
},
{
"epoch": 0.9788115133113455,
"grad_norm": 1.5488371849060059,
"learning_rate": 1.5739165480250504e-05,
"loss": 0.6611,
"step": 59800
},
{
"epoch": 0.9804483218620335,
"grad_norm": 1.3339688777923584,
"learning_rate": 1.5724685795533518e-05,
"loss": 0.679,
"step": 59900
},
{
"epoch": 0.9820851304127213,
"grad_norm": 1.3151462078094482,
"learning_rate": 1.571018823540004e-05,
"loss": 0.6636,
"step": 60000
},
{
"epoch": 0.9837219389634091,
"grad_norm": 1.3205444812774658,
"learning_rate": 1.5695672845118903e-05,
"loss": 0.6623,
"step": 60100
},
{
"epoch": 0.985358747514097,
"grad_norm": 1.294420599937439,
"learning_rate": 1.5681139670014643e-05,
"loss": 0.6666,
"step": 60200
},
{
"epoch": 0.9869955560647848,
"grad_norm": 1.3142366409301758,
"learning_rate": 1.566658875546731e-05,
"loss": 0.6629,
"step": 60300
},
{
"epoch": 0.9886323646154728,
"grad_norm": 1.3516416549682617,
"learning_rate": 1.565202014691235e-05,
"loss": 0.6664,
"step": 60400
},
{
"epoch": 0.9902691731661606,
"grad_norm": 1.2360502481460571,
"learning_rate": 1.5637433889840455e-05,
"loss": 0.6608,
"step": 60500
},
{
"epoch": 0.9919059817168485,
"grad_norm": 1.155104398727417,
"learning_rate": 1.562283002979744e-05,
"loss": 0.6676,
"step": 60600
},
{
"epoch": 0.9935427902675363,
"grad_norm": 1.2880823612213135,
"learning_rate": 1.560820861238407e-05,
"loss": 0.6632,
"step": 60700
},
{
"epoch": 0.9951795988182243,
"grad_norm": 1.2748744487762451,
"learning_rate": 1.5593569683255936e-05,
"loss": 0.6723,
"step": 60800
},
{
"epoch": 0.9968164073689121,
"grad_norm": 1.2065379619598389,
"learning_rate": 1.557891328812332e-05,
"loss": 0.6831,
"step": 60900
},
{
"epoch": 0.9984532159196,
"grad_norm": 1.143071174621582,
"learning_rate": 1.5564239472751022e-05,
"loss": 0.6656,
"step": 61000
},
{
"epoch": 1.0000818404275345,
"grad_norm": 1.1476441621780396,
"learning_rate": 1.5549548282958253e-05,
"loss": 0.6591,
"step": 61100
},
{
"epoch": 1.0017186489782222,
"grad_norm": 1.210295557975769,
"learning_rate": 1.5534839764618477e-05,
"loss": 0.6559,
"step": 61200
},
{
"epoch": 1.00335545752891,
"grad_norm": 1.5003302097320557,
"learning_rate": 1.5520113963659257e-05,
"loss": 0.6615,
"step": 61300
},
{
"epoch": 1.004992266079598,
"grad_norm": 1.235449194908142,
"learning_rate": 1.550537092606212e-05,
"loss": 0.6709,
"step": 61400
},
{
"epoch": 1.006629074630286,
"grad_norm": 1.1739157438278198,
"learning_rate": 1.549061069786243e-05,
"loss": 0.668,
"step": 61500
},
{
"epoch": 1.0082658831809737,
"grad_norm": 1.2646570205688477,
"learning_rate": 1.5475833325149215e-05,
"loss": 0.6553,
"step": 61600
},
{
"epoch": 1.0099026917316616,
"grad_norm": 1.2951397895812988,
"learning_rate": 1.546103885406504e-05,
"loss": 0.6584,
"step": 61700
},
{
"epoch": 1.0115395002823495,
"grad_norm": 1.2838189601898193,
"learning_rate": 1.544622733080586e-05,
"loss": 0.6518,
"step": 61800
},
{
"epoch": 1.0131763088330374,
"grad_norm": 1.3708552122116089,
"learning_rate": 1.543139880162088e-05,
"loss": 0.6628,
"step": 61900
},
{
"epoch": 1.0148131173837251,
"grad_norm": 1.301353931427002,
"learning_rate": 1.54165533128124e-05,
"loss": 0.6478,
"step": 62000
},
{
"epoch": 1.016449925934413,
"grad_norm": 1.3044975996017456,
"learning_rate": 1.5401690910735677e-05,
"loss": 0.6439,
"step": 62100
},
{
"epoch": 1.018086734485101,
"grad_norm": 1.4568370580673218,
"learning_rate": 1.5386811641798785e-05,
"loss": 0.6482,
"step": 62200
},
{
"epoch": 1.0197235430357887,
"grad_norm": 1.3758224248886108,
"learning_rate": 1.5371915552462466e-05,
"loss": 0.663,
"step": 62300
},
{
"epoch": 1.0213603515864766,
"grad_norm": 1.6428395509719849,
"learning_rate": 1.535700268923998e-05,
"loss": 0.6533,
"step": 62400
},
{
"epoch": 1.0229971601371646,
"grad_norm": 1.3830885887145996,
"learning_rate": 1.5342073098696956e-05,
"loss": 0.6632,
"step": 62500
},
{
"epoch": 1.0246339686878525,
"grad_norm": 1.426006555557251,
"learning_rate": 1.5327126827451272e-05,
"loss": 0.6491,
"step": 62600
},
{
"epoch": 1.0262707772385402,
"grad_norm": 1.4166696071624756,
"learning_rate": 1.531216392217288e-05,
"loss": 0.6465,
"step": 62700
},
{
"epoch": 1.0279075857892281,
"grad_norm": 1.224443793296814,
"learning_rate": 1.529718442958367e-05,
"loss": 0.6642,
"step": 62800
},
{
"epoch": 1.029544394339916,
"grad_norm": 1.250406265258789,
"learning_rate": 1.528218839645733e-05,
"loss": 0.6516,
"step": 62900
},
{
"epoch": 1.031181202890604,
"grad_norm": 1.2630037069320679,
"learning_rate": 1.52671758696192e-05,
"loss": 0.6649,
"step": 63000
}
],
"logging_steps": 100,
"max_steps": 183285,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.34907099427588e+19,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}