EncodeRec / last-checkpoint /trainer_state.json
guyhadad01's picture
Training in progress, step 22800, checkpoint
00a697d verified
raw
history blame
80.6 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.40289091905073243,
"eval_steps": 500,
"global_step": 22800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008835327172165185,
"grad_norm": 5.665971279144287,
"learning_rate": 4.3286219081272084e-07,
"loss": 1.3738,
"step": 50
},
{
"epoch": 0.001767065434433037,
"grad_norm": 5.6161651611328125,
"learning_rate": 8.745583038869259e-07,
"loss": 1.1661,
"step": 100
},
{
"epoch": 0.0026505981516495554,
"grad_norm": 7.866199970245361,
"learning_rate": 1.3162544169611309e-06,
"loss": 1.2107,
"step": 150
},
{
"epoch": 0.003534130868866074,
"grad_norm": 5.07379674911499,
"learning_rate": 1.7579505300353357e-06,
"loss": 0.9855,
"step": 200
},
{
"epoch": 0.004417663586082593,
"grad_norm": 3.2607851028442383,
"learning_rate": 2.199646643109541e-06,
"loss": 0.9431,
"step": 250
},
{
"epoch": 0.005301196303299111,
"grad_norm": 6.517599105834961,
"learning_rate": 2.6413427561837457e-06,
"loss": 0.8566,
"step": 300
},
{
"epoch": 0.00618472902051563,
"grad_norm": 2.8523333072662354,
"learning_rate": 3.0830388692579506e-06,
"loss": 0.8697,
"step": 350
},
{
"epoch": 0.007068261737732148,
"grad_norm": 3.460226058959961,
"learning_rate": 3.5247349823321555e-06,
"loss": 0.8099,
"step": 400
},
{
"epoch": 0.007951794454948667,
"grad_norm": 3.2528891563415527,
"learning_rate": 3.966431095406361e-06,
"loss": 0.766,
"step": 450
},
{
"epoch": 0.008835327172165185,
"grad_norm": 4.1086039543151855,
"learning_rate": 4.408127208480566e-06,
"loss": 0.7402,
"step": 500
},
{
"epoch": 0.009718859889381704,
"grad_norm": 3.8160510063171387,
"learning_rate": 4.849823321554771e-06,
"loss": 0.8769,
"step": 550
},
{
"epoch": 0.010602392606598222,
"grad_norm": 2.901653289794922,
"learning_rate": 5.291519434628975e-06,
"loss": 0.6827,
"step": 600
},
{
"epoch": 0.011485925323814742,
"grad_norm": 2.5824739933013916,
"learning_rate": 5.73321554770318e-06,
"loss": 0.7252,
"step": 650
},
{
"epoch": 0.01236945804103126,
"grad_norm": 2.586138963699341,
"learning_rate": 6.174911660777385e-06,
"loss": 0.7701,
"step": 700
},
{
"epoch": 0.013252990758247778,
"grad_norm": 2.3450210094451904,
"learning_rate": 6.6166077738515904e-06,
"loss": 0.7525,
"step": 750
},
{
"epoch": 0.014136523475464296,
"grad_norm": 2.7902042865753174,
"learning_rate": 7.058303886925795e-06,
"loss": 0.7097,
"step": 800
},
{
"epoch": 0.015020056192680814,
"grad_norm": 3.297929286956787,
"learning_rate": 7.5e-06,
"loss": 0.7575,
"step": 850
},
{
"epoch": 0.015903588909897334,
"grad_norm": 4.028406143188477,
"learning_rate": 7.941696113074205e-06,
"loss": 0.6899,
"step": 900
},
{
"epoch": 0.016787121627113853,
"grad_norm": 2.2513041496276855,
"learning_rate": 8.38339222614841e-06,
"loss": 0.6655,
"step": 950
},
{
"epoch": 0.01767065434433037,
"grad_norm": 2.402355670928955,
"learning_rate": 8.825088339222614e-06,
"loss": 0.6601,
"step": 1000
},
{
"epoch": 0.01855418706154689,
"grad_norm": 4.492621898651123,
"learning_rate": 9.26678445229682e-06,
"loss": 0.6925,
"step": 1050
},
{
"epoch": 0.019437719778763407,
"grad_norm": 3.8099517822265625,
"learning_rate": 9.708480565371025e-06,
"loss": 0.6169,
"step": 1100
},
{
"epoch": 0.020321252495979925,
"grad_norm": 4.58193826675415,
"learning_rate": 1.0150176678445231e-05,
"loss": 0.6367,
"step": 1150
},
{
"epoch": 0.021204785213196443,
"grad_norm": 4.745123863220215,
"learning_rate": 1.0591872791519434e-05,
"loss": 0.615,
"step": 1200
},
{
"epoch": 0.02208831793041296,
"grad_norm": 3.260239601135254,
"learning_rate": 1.103356890459364e-05,
"loss": 0.6869,
"step": 1250
},
{
"epoch": 0.022971850647629483,
"grad_norm": 2.485383987426758,
"learning_rate": 1.1475265017667845e-05,
"loss": 0.7527,
"step": 1300
},
{
"epoch": 0.023855383364846,
"grad_norm": 2.26680326461792,
"learning_rate": 1.191696113074205e-05,
"loss": 0.6124,
"step": 1350
},
{
"epoch": 0.02473891608206252,
"grad_norm": 2.348688840866089,
"learning_rate": 1.2358657243816255e-05,
"loss": 0.6511,
"step": 1400
},
{
"epoch": 0.025622448799279038,
"grad_norm": 2.770859956741333,
"learning_rate": 1.280035335689046e-05,
"loss": 0.7047,
"step": 1450
},
{
"epoch": 0.026505981516495556,
"grad_norm": 3.188656806945801,
"learning_rate": 1.3242049469964666e-05,
"loss": 0.6639,
"step": 1500
},
{
"epoch": 0.027389514233712074,
"grad_norm": 2.7158899307250977,
"learning_rate": 1.368374558303887e-05,
"loss": 0.6795,
"step": 1550
},
{
"epoch": 0.028273046950928592,
"grad_norm": 2.7986080646514893,
"learning_rate": 1.4125441696113076e-05,
"loss": 0.6341,
"step": 1600
},
{
"epoch": 0.02915657966814511,
"grad_norm": 1.9698214530944824,
"learning_rate": 1.456713780918728e-05,
"loss": 0.6031,
"step": 1650
},
{
"epoch": 0.03004011238536163,
"grad_norm": 2.495985507965088,
"learning_rate": 1.5008833922261484e-05,
"loss": 0.5959,
"step": 1700
},
{
"epoch": 0.030923645102578147,
"grad_norm": 2.990360975265503,
"learning_rate": 1.545053003533569e-05,
"loss": 0.6412,
"step": 1750
},
{
"epoch": 0.03180717781979467,
"grad_norm": 3.658212184906006,
"learning_rate": 1.5892226148409894e-05,
"loss": 0.5065,
"step": 1800
},
{
"epoch": 0.03269071053701118,
"grad_norm": 2.010875940322876,
"learning_rate": 1.63339222614841e-05,
"loss": 0.5611,
"step": 1850
},
{
"epoch": 0.033574243254227705,
"grad_norm": 2.408937692642212,
"learning_rate": 1.6775618374558306e-05,
"loss": 0.5298,
"step": 1900
},
{
"epoch": 0.03445777597144422,
"grad_norm": 2.3144407272338867,
"learning_rate": 1.721731448763251e-05,
"loss": 0.5759,
"step": 1950
},
{
"epoch": 0.03534130868866074,
"grad_norm": 2.944115400314331,
"learning_rate": 1.7659010600706715e-05,
"loss": 0.5782,
"step": 2000
},
{
"epoch": 0.03622484140587726,
"grad_norm": 2.3239428997039795,
"learning_rate": 1.810070671378092e-05,
"loss": 0.5221,
"step": 2050
},
{
"epoch": 0.03710837412309378,
"grad_norm": 4.565939426422119,
"learning_rate": 1.8542402826855124e-05,
"loss": 0.5966,
"step": 2100
},
{
"epoch": 0.0379919068403103,
"grad_norm": 2.6089091300964355,
"learning_rate": 1.898409893992933e-05,
"loss": 0.5989,
"step": 2150
},
{
"epoch": 0.038875439557526814,
"grad_norm": 2.4395945072174072,
"learning_rate": 1.9425795053003533e-05,
"loss": 0.5097,
"step": 2200
},
{
"epoch": 0.039758972274743336,
"grad_norm": 2.274600028991699,
"learning_rate": 1.986749116607774e-05,
"loss": 0.4934,
"step": 2250
},
{
"epoch": 0.04064250499195985,
"grad_norm": 2.393251895904541,
"learning_rate": 2.0309187279151945e-05,
"loss": 0.5354,
"step": 2300
},
{
"epoch": 0.04152603770917637,
"grad_norm": 2.613900899887085,
"learning_rate": 2.075088339222615e-05,
"loss": 0.5236,
"step": 2350
},
{
"epoch": 0.04240957042639289,
"grad_norm": 2.233302116394043,
"learning_rate": 2.1192579505300354e-05,
"loss": 0.5057,
"step": 2400
},
{
"epoch": 0.04329310314360941,
"grad_norm": 2.2634503841400146,
"learning_rate": 2.163427561837456e-05,
"loss": 0.5448,
"step": 2450
},
{
"epoch": 0.04417663586082592,
"grad_norm": 1.6744658946990967,
"learning_rate": 2.2075971731448763e-05,
"loss": 0.5418,
"step": 2500
},
{
"epoch": 0.045060168578042445,
"grad_norm": 2.9320178031921387,
"learning_rate": 2.2517667844522968e-05,
"loss": 0.5944,
"step": 2550
},
{
"epoch": 0.04594370129525897,
"grad_norm": 2.2643797397613525,
"learning_rate": 2.2959363957597176e-05,
"loss": 0.4945,
"step": 2600
},
{
"epoch": 0.04682723401247548,
"grad_norm": 2.389902114868164,
"learning_rate": 2.340106007067138e-05,
"loss": 0.5225,
"step": 2650
},
{
"epoch": 0.047710766729692,
"grad_norm": 2.2676665782928467,
"learning_rate": 2.3842756183745584e-05,
"loss": 0.5661,
"step": 2700
},
{
"epoch": 0.04859429944690852,
"grad_norm": 2.340926170349121,
"learning_rate": 2.428445229681979e-05,
"loss": 0.6125,
"step": 2750
},
{
"epoch": 0.04947783216412504,
"grad_norm": 1.925943374633789,
"learning_rate": 2.4726148409893997e-05,
"loss": 0.5105,
"step": 2800
},
{
"epoch": 0.050361364881341554,
"grad_norm": 3.1281192302703857,
"learning_rate": 2.5167844522968198e-05,
"loss": 0.5893,
"step": 2850
},
{
"epoch": 0.051244897598558076,
"grad_norm": 2.345649242401123,
"learning_rate": 2.5609540636042406e-05,
"loss": 0.545,
"step": 2900
},
{
"epoch": 0.05212843031577459,
"grad_norm": 2.9023561477661133,
"learning_rate": 2.605123674911661e-05,
"loss": 0.5299,
"step": 2950
},
{
"epoch": 0.05301196303299111,
"grad_norm": 2.491269588470459,
"learning_rate": 2.649293286219081e-05,
"loss": 0.5186,
"step": 3000
},
{
"epoch": 0.05389549575020763,
"grad_norm": 1.842517375946045,
"learning_rate": 2.693462897526502e-05,
"loss": 0.5259,
"step": 3050
},
{
"epoch": 0.05477902846742415,
"grad_norm": 3.319514274597168,
"learning_rate": 2.7376325088339223e-05,
"loss": 0.6663,
"step": 3100
},
{
"epoch": 0.05566256118464067,
"grad_norm": 2.7143654823303223,
"learning_rate": 2.781802120141343e-05,
"loss": 0.5152,
"step": 3150
},
{
"epoch": 0.056546093901857185,
"grad_norm": 2.8187732696533203,
"learning_rate": 2.8259717314487632e-05,
"loss": 0.5417,
"step": 3200
},
{
"epoch": 0.057429626619073706,
"grad_norm": 2.8348097801208496,
"learning_rate": 2.870141342756184e-05,
"loss": 0.5039,
"step": 3250
},
{
"epoch": 0.05831315933629022,
"grad_norm": 3.6297833919525146,
"learning_rate": 2.9143109540636045e-05,
"loss": 0.4647,
"step": 3300
},
{
"epoch": 0.05919669205350674,
"grad_norm": 2.6729063987731934,
"learning_rate": 2.9584805653710253e-05,
"loss": 0.4652,
"step": 3350
},
{
"epoch": 0.06008022477072326,
"grad_norm": 3.030548572540283,
"learning_rate": 3.0026501766784454e-05,
"loss": 0.4914,
"step": 3400
},
{
"epoch": 0.06096375748793978,
"grad_norm": 1.844643235206604,
"learning_rate": 3.0468197879858658e-05,
"loss": 0.5449,
"step": 3450
},
{
"epoch": 0.061847290205156294,
"grad_norm": 1.6973118782043457,
"learning_rate": 3.090989399293286e-05,
"loss": 0.5072,
"step": 3500
},
{
"epoch": 0.06273082292237281,
"grad_norm": 2.626692295074463,
"learning_rate": 3.135159010600707e-05,
"loss": 0.5639,
"step": 3550
},
{
"epoch": 0.06361435563958934,
"grad_norm": 2.971773624420166,
"learning_rate": 3.179328621908128e-05,
"loss": 0.4729,
"step": 3600
},
{
"epoch": 0.06449788835680585,
"grad_norm": 2.134610414505005,
"learning_rate": 3.2234982332155476e-05,
"loss": 0.6047,
"step": 3650
},
{
"epoch": 0.06538142107402237,
"grad_norm": 1.8596552610397339,
"learning_rate": 3.267667844522969e-05,
"loss": 0.5369,
"step": 3700
},
{
"epoch": 0.0662649537912389,
"grad_norm": 2.5137698650360107,
"learning_rate": 3.311837455830389e-05,
"loss": 0.5014,
"step": 3750
},
{
"epoch": 0.06714848650845541,
"grad_norm": 2.8211522102355957,
"learning_rate": 3.356007067137809e-05,
"loss": 0.5128,
"step": 3800
},
{
"epoch": 0.06803201922567192,
"grad_norm": 2.095426559448242,
"learning_rate": 3.40017667844523e-05,
"loss": 0.5345,
"step": 3850
},
{
"epoch": 0.06891555194288844,
"grad_norm": 2.1965081691741943,
"learning_rate": 3.4443462897526505e-05,
"loss": 0.479,
"step": 3900
},
{
"epoch": 0.06979908466010497,
"grad_norm": 2.1722958087921143,
"learning_rate": 3.488515901060071e-05,
"loss": 0.5652,
"step": 3950
},
{
"epoch": 0.07068261737732148,
"grad_norm": 2.7183449268341064,
"learning_rate": 3.5326855123674914e-05,
"loss": 0.5272,
"step": 4000
},
{
"epoch": 0.071566150094538,
"grad_norm": 2.356076717376709,
"learning_rate": 3.576855123674912e-05,
"loss": 0.4904,
"step": 4050
},
{
"epoch": 0.07244968281175453,
"grad_norm": 1.7549006938934326,
"learning_rate": 3.621024734982332e-05,
"loss": 0.4755,
"step": 4100
},
{
"epoch": 0.07333321552897104,
"grad_norm": 2.0377912521362305,
"learning_rate": 3.665194346289753e-05,
"loss": 0.4897,
"step": 4150
},
{
"epoch": 0.07421674824618756,
"grad_norm": 2.4711716175079346,
"learning_rate": 3.709363957597173e-05,
"loss": 0.4679,
"step": 4200
},
{
"epoch": 0.07510028096340407,
"grad_norm": 2.700162649154663,
"learning_rate": 3.7535335689045936e-05,
"loss": 0.4712,
"step": 4250
},
{
"epoch": 0.0759838136806206,
"grad_norm": 1.9648590087890625,
"learning_rate": 3.797703180212015e-05,
"loss": 0.4779,
"step": 4300
},
{
"epoch": 0.07686734639783711,
"grad_norm": 2.4238970279693604,
"learning_rate": 3.8418727915194345e-05,
"loss": 0.4463,
"step": 4350
},
{
"epoch": 0.07775087911505363,
"grad_norm": 1.745356798171997,
"learning_rate": 3.8860424028268556e-05,
"loss": 0.4917,
"step": 4400
},
{
"epoch": 0.07863441183227014,
"grad_norm": 5.889612197875977,
"learning_rate": 3.930212014134276e-05,
"loss": 0.5572,
"step": 4450
},
{
"epoch": 0.07951794454948667,
"grad_norm": 2.7529609203338623,
"learning_rate": 3.9743816254416965e-05,
"loss": 0.4553,
"step": 4500
},
{
"epoch": 0.08040147726670319,
"grad_norm": 2.4175944328308105,
"learning_rate": 4.018551236749117e-05,
"loss": 0.4598,
"step": 4550
},
{
"epoch": 0.0812850099839197,
"grad_norm": 2.2330217361450195,
"learning_rate": 4.0627208480565374e-05,
"loss": 0.5445,
"step": 4600
},
{
"epoch": 0.08216854270113623,
"grad_norm": 2.4177329540252686,
"learning_rate": 4.106890459363958e-05,
"loss": 0.4537,
"step": 4650
},
{
"epoch": 0.08305207541835274,
"grad_norm": 2.6188764572143555,
"learning_rate": 4.151060070671378e-05,
"loss": 0.5158,
"step": 4700
},
{
"epoch": 0.08393560813556926,
"grad_norm": 3.5044455528259277,
"learning_rate": 4.195229681978799e-05,
"loss": 0.4598,
"step": 4750
},
{
"epoch": 0.08481914085278577,
"grad_norm": 2.2751505374908447,
"learning_rate": 4.239399293286219e-05,
"loss": 0.4662,
"step": 4800
},
{
"epoch": 0.0857026735700023,
"grad_norm": 2.0289080142974854,
"learning_rate": 4.28356890459364e-05,
"loss": 0.459,
"step": 4850
},
{
"epoch": 0.08658620628721882,
"grad_norm": 2.6102516651153564,
"learning_rate": 4.32773851590106e-05,
"loss": 0.4275,
"step": 4900
},
{
"epoch": 0.08746973900443533,
"grad_norm": 2.5842251777648926,
"learning_rate": 4.3719081272084805e-05,
"loss": 0.5575,
"step": 4950
},
{
"epoch": 0.08835327172165185,
"grad_norm": 3.6427652835845947,
"learning_rate": 4.4160777385159016e-05,
"loss": 0.4197,
"step": 5000
},
{
"epoch": 0.08923680443886838,
"grad_norm": 1.8962676525115967,
"learning_rate": 4.4602473498233214e-05,
"loss": 0.4525,
"step": 5050
},
{
"epoch": 0.09012033715608489,
"grad_norm": 2.1373822689056396,
"learning_rate": 4.5044169611307425e-05,
"loss": 0.4469,
"step": 5100
},
{
"epoch": 0.0910038698733014,
"grad_norm": 5.542126178741455,
"learning_rate": 4.548586572438163e-05,
"loss": 0.5283,
"step": 5150
},
{
"epoch": 0.09188740259051793,
"grad_norm": 2.4414310455322266,
"learning_rate": 4.5927561837455834e-05,
"loss": 0.4826,
"step": 5200
},
{
"epoch": 0.09277093530773445,
"grad_norm": 3.52422833442688,
"learning_rate": 4.636925795053004e-05,
"loss": 0.3895,
"step": 5250
},
{
"epoch": 0.09365446802495096,
"grad_norm": 2.1975631713867188,
"learning_rate": 4.681095406360424e-05,
"loss": 0.4873,
"step": 5300
},
{
"epoch": 0.09453800074216748,
"grad_norm": 3.4910616874694824,
"learning_rate": 4.725265017667845e-05,
"loss": 0.4895,
"step": 5350
},
{
"epoch": 0.095421533459384,
"grad_norm": 2.1225690841674805,
"learning_rate": 4.769434628975265e-05,
"loss": 0.4686,
"step": 5400
},
{
"epoch": 0.09630506617660052,
"grad_norm": 2.2319257259368896,
"learning_rate": 4.8136042402826856e-05,
"loss": 0.4723,
"step": 5450
},
{
"epoch": 0.09718859889381704,
"grad_norm": 2.2340879440307617,
"learning_rate": 4.857773851590106e-05,
"loss": 0.5258,
"step": 5500
},
{
"epoch": 0.09807213161103355,
"grad_norm": 3.2808139324188232,
"learning_rate": 4.901943462897527e-05,
"loss": 0.4851,
"step": 5550
},
{
"epoch": 0.09895566432825008,
"grad_norm": 2.4828484058380127,
"learning_rate": 4.946113074204947e-05,
"loss": 0.5311,
"step": 5600
},
{
"epoch": 0.0998391970454666,
"grad_norm": 1.7307246923446655,
"learning_rate": 4.990282685512368e-05,
"loss": 0.411,
"step": 5650
},
{
"epoch": 0.10072272976268311,
"grad_norm": 1.9073278903961182,
"learning_rate": 4.996171290569595e-05,
"loss": 0.4184,
"step": 5700
},
{
"epoch": 0.10160626247989964,
"grad_norm": 1.8571208715438843,
"learning_rate": 4.9912626887357406e-05,
"loss": 0.4071,
"step": 5750
},
{
"epoch": 0.10248979519711615,
"grad_norm": 1.7524621486663818,
"learning_rate": 4.986354086901887e-05,
"loss": 0.4712,
"step": 5800
},
{
"epoch": 0.10337332791433267,
"grad_norm": 4.2943434715271,
"learning_rate": 4.9814454850680335e-05,
"loss": 0.4912,
"step": 5850
},
{
"epoch": 0.10425686063154918,
"grad_norm": 2.398043632507324,
"learning_rate": 4.97653688323418e-05,
"loss": 0.5589,
"step": 5900
},
{
"epoch": 0.10514039334876571,
"grad_norm": 1.9587973356246948,
"learning_rate": 4.9716282814003265e-05,
"loss": 0.4507,
"step": 5950
},
{
"epoch": 0.10602392606598222,
"grad_norm": 2.0629475116729736,
"learning_rate": 4.966719679566473e-05,
"loss": 0.5429,
"step": 6000
},
{
"epoch": 0.10690745878319874,
"grad_norm": 1.6127039194107056,
"learning_rate": 4.961811077732619e-05,
"loss": 0.3789,
"step": 6050
},
{
"epoch": 0.10779099150041525,
"grad_norm": 2.230015993118286,
"learning_rate": 4.956902475898765e-05,
"loss": 0.3949,
"step": 6100
},
{
"epoch": 0.10867452421763178,
"grad_norm": 1.9963310956954956,
"learning_rate": 4.9519938740649116e-05,
"loss": 0.4491,
"step": 6150
},
{
"epoch": 0.1095580569348483,
"grad_norm": 2.2731542587280273,
"learning_rate": 4.947085272231058e-05,
"loss": 0.435,
"step": 6200
},
{
"epoch": 0.11044158965206481,
"grad_norm": 2.447551727294922,
"learning_rate": 4.9421766703972046e-05,
"loss": 0.3865,
"step": 6250
},
{
"epoch": 0.11132512236928134,
"grad_norm": 2.126950740814209,
"learning_rate": 4.9372680685633504e-05,
"loss": 0.4175,
"step": 6300
},
{
"epoch": 0.11220865508649785,
"grad_norm": 2.22995924949646,
"learning_rate": 4.932359466729497e-05,
"loss": 0.4387,
"step": 6350
},
{
"epoch": 0.11309218780371437,
"grad_norm": 1.5801736116409302,
"learning_rate": 4.927450864895643e-05,
"loss": 0.4554,
"step": 6400
},
{
"epoch": 0.11397572052093088,
"grad_norm": 4.113645553588867,
"learning_rate": 4.92254226306179e-05,
"loss": 0.581,
"step": 6450
},
{
"epoch": 0.11485925323814741,
"grad_norm": 1.6027569770812988,
"learning_rate": 4.917633661227936e-05,
"loss": 0.4746,
"step": 6500
},
{
"epoch": 0.11574278595536393,
"grad_norm": 2.0555272102355957,
"learning_rate": 4.912725059394083e-05,
"loss": 0.4511,
"step": 6550
},
{
"epoch": 0.11662631867258044,
"grad_norm": 2.6827495098114014,
"learning_rate": 4.9078164575602285e-05,
"loss": 0.3871,
"step": 6600
},
{
"epoch": 0.11750985138979697,
"grad_norm": 1.969202995300293,
"learning_rate": 4.902907855726375e-05,
"loss": 0.449,
"step": 6650
},
{
"epoch": 0.11839338410701349,
"grad_norm": 1.9535086154937744,
"learning_rate": 4.8979992538925214e-05,
"loss": 0.3458,
"step": 6700
},
{
"epoch": 0.11927691682423,
"grad_norm": 1.7251821756362915,
"learning_rate": 4.893090652058668e-05,
"loss": 0.4791,
"step": 6750
},
{
"epoch": 0.12016044954144652,
"grad_norm": 1.7175688743591309,
"learning_rate": 4.8881820502248144e-05,
"loss": 0.4445,
"step": 6800
},
{
"epoch": 0.12104398225866304,
"grad_norm": 3.1055896282196045,
"learning_rate": 4.88327344839096e-05,
"loss": 0.4907,
"step": 6850
},
{
"epoch": 0.12192751497587956,
"grad_norm": 3.251380681991577,
"learning_rate": 4.8783648465571066e-05,
"loss": 0.5377,
"step": 6900
},
{
"epoch": 0.12281104769309607,
"grad_norm": 2.909510850906372,
"learning_rate": 4.873456244723254e-05,
"loss": 0.5275,
"step": 6950
},
{
"epoch": 0.12369458041031259,
"grad_norm": 2.0700035095214844,
"learning_rate": 4.8685476428893995e-05,
"loss": 0.5489,
"step": 7000
},
{
"epoch": 0.12457811312752912,
"grad_norm": 1.9759315252304077,
"learning_rate": 4.863639041055546e-05,
"loss": 0.3931,
"step": 7050
},
{
"epoch": 0.12546164584474562,
"grad_norm": 1.9036837816238403,
"learning_rate": 4.8587304392216925e-05,
"loss": 0.5155,
"step": 7100
},
{
"epoch": 0.12634517856196215,
"grad_norm": 3.4224536418914795,
"learning_rate": 4.853821837387838e-05,
"loss": 0.4282,
"step": 7150
},
{
"epoch": 0.12722871127917867,
"grad_norm": 3.1725916862487793,
"learning_rate": 4.8489132355539854e-05,
"loss": 0.4639,
"step": 7200
},
{
"epoch": 0.12811224399639518,
"grad_norm": 1.7154817581176758,
"learning_rate": 4.844004633720131e-05,
"loss": 0.5294,
"step": 7250
},
{
"epoch": 0.1289957767136117,
"grad_norm": 2.130659580230713,
"learning_rate": 4.839096031886278e-05,
"loss": 0.4121,
"step": 7300
},
{
"epoch": 0.12987930943082823,
"grad_norm": 1.8878060579299927,
"learning_rate": 4.834187430052424e-05,
"loss": 0.4139,
"step": 7350
},
{
"epoch": 0.13076284214804473,
"grad_norm": 1.9885565042495728,
"learning_rate": 4.82927882821857e-05,
"loss": 0.4311,
"step": 7400
},
{
"epoch": 0.13164637486526126,
"grad_norm": 2.3639650344848633,
"learning_rate": 4.824370226384717e-05,
"loss": 0.4025,
"step": 7450
},
{
"epoch": 0.1325299075824778,
"grad_norm": 3.4997270107269287,
"learning_rate": 4.8194616245508635e-05,
"loss": 0.4791,
"step": 7500
},
{
"epoch": 0.1334134402996943,
"grad_norm": 1.644084095954895,
"learning_rate": 4.814553022717009e-05,
"loss": 0.4498,
"step": 7550
},
{
"epoch": 0.13429697301691082,
"grad_norm": 1.8292336463928223,
"learning_rate": 4.809644420883156e-05,
"loss": 0.4538,
"step": 7600
},
{
"epoch": 0.13518050573412735,
"grad_norm": 3.380443572998047,
"learning_rate": 4.804735819049302e-05,
"loss": 0.4596,
"step": 7650
},
{
"epoch": 0.13606403845134385,
"grad_norm": 1.6248747110366821,
"learning_rate": 4.799827217215449e-05,
"loss": 0.3508,
"step": 7700
},
{
"epoch": 0.13694757116856038,
"grad_norm": 1.6644774675369263,
"learning_rate": 4.794918615381595e-05,
"loss": 0.5145,
"step": 7750
},
{
"epoch": 0.13783110388577688,
"grad_norm": 1.8441638946533203,
"learning_rate": 4.790010013547741e-05,
"loss": 0.3505,
"step": 7800
},
{
"epoch": 0.1387146366029934,
"grad_norm": 1.761982798576355,
"learning_rate": 4.7851014117138874e-05,
"loss": 0.3354,
"step": 7850
},
{
"epoch": 0.13959816932020994,
"grad_norm": 3.417602777481079,
"learning_rate": 4.780192809880034e-05,
"loss": 0.4474,
"step": 7900
},
{
"epoch": 0.14048170203742644,
"grad_norm": 1.7687017917633057,
"learning_rate": 4.7752842080461804e-05,
"loss": 0.3524,
"step": 7950
},
{
"epoch": 0.14136523475464297,
"grad_norm": 3.2442593574523926,
"learning_rate": 4.770375606212327e-05,
"loss": 0.4957,
"step": 8000
},
{
"epoch": 0.1422487674718595,
"grad_norm": 1.813818335533142,
"learning_rate": 4.765467004378473e-05,
"loss": 0.4461,
"step": 8050
},
{
"epoch": 0.143132300189076,
"grad_norm": 1.936123013496399,
"learning_rate": 4.760558402544619e-05,
"loss": 0.4983,
"step": 8100
},
{
"epoch": 0.14401583290629252,
"grad_norm": 2.0068929195404053,
"learning_rate": 4.7556498007107656e-05,
"loss": 0.4535,
"step": 8150
},
{
"epoch": 0.14489936562350905,
"grad_norm": 1.6743545532226562,
"learning_rate": 4.750741198876913e-05,
"loss": 0.3668,
"step": 8200
},
{
"epoch": 0.14578289834072555,
"grad_norm": 1.9963476657867432,
"learning_rate": 4.7458325970430585e-05,
"loss": 0.4688,
"step": 8250
},
{
"epoch": 0.14666643105794208,
"grad_norm": 1.7402074337005615,
"learning_rate": 4.740923995209205e-05,
"loss": 0.3967,
"step": 8300
},
{
"epoch": 0.14754996377515858,
"grad_norm": 2.0074145793914795,
"learning_rate": 4.736015393375351e-05,
"loss": 0.4911,
"step": 8350
},
{
"epoch": 0.1484334964923751,
"grad_norm": 1.7804876565933228,
"learning_rate": 4.731106791541497e-05,
"loss": 0.4076,
"step": 8400
},
{
"epoch": 0.14931702920959164,
"grad_norm": 2.1234054565429688,
"learning_rate": 4.7261981897076444e-05,
"loss": 0.398,
"step": 8450
},
{
"epoch": 0.15020056192680814,
"grad_norm": 2.1532113552093506,
"learning_rate": 4.72128958787379e-05,
"loss": 0.4203,
"step": 8500
},
{
"epoch": 0.15108409464402467,
"grad_norm": 1.8909550905227661,
"learning_rate": 4.7163809860399366e-05,
"loss": 0.414,
"step": 8550
},
{
"epoch": 0.1519676273612412,
"grad_norm": 1.9415462017059326,
"learning_rate": 4.711472384206083e-05,
"loss": 0.3436,
"step": 8600
},
{
"epoch": 0.1528511600784577,
"grad_norm": 2.2018544673919678,
"learning_rate": 4.706563782372229e-05,
"loss": 0.436,
"step": 8650
},
{
"epoch": 0.15373469279567423,
"grad_norm": 1.5418767929077148,
"learning_rate": 4.701655180538376e-05,
"loss": 0.3761,
"step": 8700
},
{
"epoch": 0.15461822551289076,
"grad_norm": 4.974616050720215,
"learning_rate": 4.6967465787045225e-05,
"loss": 0.5579,
"step": 8750
},
{
"epoch": 0.15550175823010726,
"grad_norm": 1.8653486967086792,
"learning_rate": 4.691837976870668e-05,
"loss": 0.441,
"step": 8800
},
{
"epoch": 0.15638529094732379,
"grad_norm": 2.2241523265838623,
"learning_rate": 4.686929375036815e-05,
"loss": 0.5877,
"step": 8850
},
{
"epoch": 0.15726882366454029,
"grad_norm": 1.8084393739700317,
"learning_rate": 4.6820207732029605e-05,
"loss": 0.4081,
"step": 8900
},
{
"epoch": 0.15815235638175681,
"grad_norm": 1.5464160442352295,
"learning_rate": 4.677112171369108e-05,
"loss": 0.4648,
"step": 8950
},
{
"epoch": 0.15903588909897334,
"grad_norm": 1.7731395959854126,
"learning_rate": 4.672203569535254e-05,
"loss": 0.4321,
"step": 9000
},
{
"epoch": 0.15991942181618984,
"grad_norm": 1.8130481243133545,
"learning_rate": 4.6672949677014e-05,
"loss": 0.4226,
"step": 9050
},
{
"epoch": 0.16080295453340637,
"grad_norm": 2.4127371311187744,
"learning_rate": 4.6623863658675464e-05,
"loss": 0.3634,
"step": 9100
},
{
"epoch": 0.1616864872506229,
"grad_norm": 2.362494707107544,
"learning_rate": 4.657477764033693e-05,
"loss": 0.4252,
"step": 9150
},
{
"epoch": 0.1625700199678394,
"grad_norm": 1.855000615119934,
"learning_rate": 4.6525691621998393e-05,
"loss": 0.3899,
"step": 9200
},
{
"epoch": 0.16345355268505593,
"grad_norm": 1.8728185892105103,
"learning_rate": 4.647660560365986e-05,
"loss": 0.4335,
"step": 9250
},
{
"epoch": 0.16433708540227246,
"grad_norm": 1.977250576019287,
"learning_rate": 4.642751958532132e-05,
"loss": 0.4204,
"step": 9300
},
{
"epoch": 0.16522061811948896,
"grad_norm": 4.992434978485107,
"learning_rate": 4.637843356698278e-05,
"loss": 0.5576,
"step": 9350
},
{
"epoch": 0.1661041508367055,
"grad_norm": 1.673086166381836,
"learning_rate": 4.6329347548644245e-05,
"loss": 0.4712,
"step": 9400
},
{
"epoch": 0.166987683553922,
"grad_norm": 1.8109374046325684,
"learning_rate": 4.628026153030571e-05,
"loss": 0.366,
"step": 9450
},
{
"epoch": 0.16787121627113852,
"grad_norm": 1.9352269172668457,
"learning_rate": 4.6231175511967175e-05,
"loss": 0.3932,
"step": 9500
},
{
"epoch": 0.16875474898835505,
"grad_norm": 1.7740451097488403,
"learning_rate": 4.618208949362864e-05,
"loss": 0.4836,
"step": 9550
},
{
"epoch": 0.16963828170557155,
"grad_norm": 2.0106916427612305,
"learning_rate": 4.61330034752901e-05,
"loss": 0.3989,
"step": 9600
},
{
"epoch": 0.17052181442278808,
"grad_norm": 1.5831292867660522,
"learning_rate": 4.608391745695156e-05,
"loss": 0.4025,
"step": 9650
},
{
"epoch": 0.1714053471400046,
"grad_norm": 5.1861371994018555,
"learning_rate": 4.6034831438613027e-05,
"loss": 0.467,
"step": 9700
},
{
"epoch": 0.1722888798572211,
"grad_norm": 3.7466721534729004,
"learning_rate": 4.598574542027449e-05,
"loss": 0.3558,
"step": 9750
},
{
"epoch": 0.17317241257443763,
"grad_norm": 2.143721342086792,
"learning_rate": 4.5936659401935956e-05,
"loss": 0.3623,
"step": 9800
},
{
"epoch": 0.17405594529165416,
"grad_norm": 2.1482434272766113,
"learning_rate": 4.588757338359742e-05,
"loss": 0.3438,
"step": 9850
},
{
"epoch": 0.17493947800887066,
"grad_norm": 1.458309531211853,
"learning_rate": 4.583848736525888e-05,
"loss": 0.4193,
"step": 9900
},
{
"epoch": 0.1758230107260872,
"grad_norm": 1.8698090314865112,
"learning_rate": 4.578940134692034e-05,
"loss": 0.3173,
"step": 9950
},
{
"epoch": 0.1767065434433037,
"grad_norm": 2.087970018386841,
"learning_rate": 4.574031532858181e-05,
"loss": 0.4569,
"step": 10000
},
{
"epoch": 0.17759007616052022,
"grad_norm": 1.6226812601089478,
"learning_rate": 4.569122931024327e-05,
"loss": 0.4538,
"step": 10050
},
{
"epoch": 0.17847360887773675,
"grad_norm": 1.9845385551452637,
"learning_rate": 4.564214329190474e-05,
"loss": 0.4422,
"step": 10100
},
{
"epoch": 0.17935714159495325,
"grad_norm": 1.7016047239303589,
"learning_rate": 4.5593057273566195e-05,
"loss": 0.3747,
"step": 10150
},
{
"epoch": 0.18024067431216978,
"grad_norm": 2.2167670726776123,
"learning_rate": 4.5543971255227666e-05,
"loss": 0.3989,
"step": 10200
},
{
"epoch": 0.1811242070293863,
"grad_norm": 1.464385747909546,
"learning_rate": 4.549488523688913e-05,
"loss": 0.5315,
"step": 10250
},
{
"epoch": 0.1820077397466028,
"grad_norm": 1.2073971033096313,
"learning_rate": 4.544579921855059e-05,
"loss": 0.3565,
"step": 10300
},
{
"epoch": 0.18289127246381934,
"grad_norm": 1.1773017644882202,
"learning_rate": 4.5396713200212054e-05,
"loss": 0.4409,
"step": 10350
},
{
"epoch": 0.18377480518103587,
"grad_norm": 2.4389290809631348,
"learning_rate": 4.534762718187352e-05,
"loss": 0.3762,
"step": 10400
},
{
"epoch": 0.18465833789825237,
"grad_norm": 3.560997247695923,
"learning_rate": 4.529854116353498e-05,
"loss": 0.4571,
"step": 10450
},
{
"epoch": 0.1855418706154689,
"grad_norm": 2.0075438022613525,
"learning_rate": 4.524945514519645e-05,
"loss": 0.3561,
"step": 10500
},
{
"epoch": 0.1864254033326854,
"grad_norm": 2.405439853668213,
"learning_rate": 4.5200369126857906e-05,
"loss": 0.4595,
"step": 10550
},
{
"epoch": 0.18730893604990193,
"grad_norm": 1.6211732625961304,
"learning_rate": 4.515128310851937e-05,
"loss": 0.4576,
"step": 10600
},
{
"epoch": 0.18819246876711845,
"grad_norm": 1.7272285223007202,
"learning_rate": 4.5102197090180835e-05,
"loss": 0.4957,
"step": 10650
},
{
"epoch": 0.18907600148433495,
"grad_norm": 1.529583215713501,
"learning_rate": 4.50531110718423e-05,
"loss": 0.3533,
"step": 10700
},
{
"epoch": 0.18995953420155148,
"grad_norm": 1.3267425298690796,
"learning_rate": 4.5004025053503764e-05,
"loss": 0.5213,
"step": 10750
},
{
"epoch": 0.190843066918768,
"grad_norm": 2.40889573097229,
"learning_rate": 4.495493903516523e-05,
"loss": 0.4372,
"step": 10800
},
{
"epoch": 0.1917265996359845,
"grad_norm": 2.532017230987549,
"learning_rate": 4.4906834737193457e-05,
"loss": 0.3286,
"step": 10850
},
{
"epoch": 0.19261013235320104,
"grad_norm": 3.721505641937256,
"learning_rate": 4.485774871885493e-05,
"loss": 0.4082,
"step": 10900
},
{
"epoch": 0.19349366507041757,
"grad_norm": 2.2368271350860596,
"learning_rate": 4.4808662700516386e-05,
"loss": 0.4056,
"step": 10950
},
{
"epoch": 0.19437719778763407,
"grad_norm": 2.2011897563934326,
"learning_rate": 4.475957668217785e-05,
"loss": 0.4435,
"step": 11000
},
{
"epoch": 0.1952607305048506,
"grad_norm": 2.1512463092803955,
"learning_rate": 4.4710490663839315e-05,
"loss": 0.4272,
"step": 11050
},
{
"epoch": 0.1961442632220671,
"grad_norm": 1.5526123046875,
"learning_rate": 4.466140464550077e-05,
"loss": 0.4334,
"step": 11100
},
{
"epoch": 0.19702779593928363,
"grad_norm": 1.4258567094802856,
"learning_rate": 4.4612318627162245e-05,
"loss": 0.4479,
"step": 11150
},
{
"epoch": 0.19791132865650016,
"grad_norm": 3.2408463954925537,
"learning_rate": 4.456323260882371e-05,
"loss": 0.3545,
"step": 11200
},
{
"epoch": 0.19879486137371666,
"grad_norm": 2.1903252601623535,
"learning_rate": 4.451414659048517e-05,
"loss": 0.3192,
"step": 11250
},
{
"epoch": 0.1996783940909332,
"grad_norm": 1.9699974060058594,
"learning_rate": 4.446506057214663e-05,
"loss": 0.3883,
"step": 11300
},
{
"epoch": 0.20056192680814972,
"grad_norm": 1.7133831977844238,
"learning_rate": 4.441597455380809e-05,
"loss": 0.3312,
"step": 11350
},
{
"epoch": 0.20144545952536622,
"grad_norm": 3.0174543857574463,
"learning_rate": 4.436688853546956e-05,
"loss": 0.4888,
"step": 11400
},
{
"epoch": 0.20232899224258274,
"grad_norm": 2.010566473007202,
"learning_rate": 4.4317802517131026e-05,
"loss": 0.5102,
"step": 11450
},
{
"epoch": 0.20321252495979927,
"grad_norm": 2.093271493911743,
"learning_rate": 4.4268716498792484e-05,
"loss": 0.4133,
"step": 11500
},
{
"epoch": 0.20409605767701577,
"grad_norm": 1.9231561422348022,
"learning_rate": 4.421963048045395e-05,
"loss": 0.4255,
"step": 11550
},
{
"epoch": 0.2049795903942323,
"grad_norm": 1.561781644821167,
"learning_rate": 4.417054446211541e-05,
"loss": 0.3766,
"step": 11600
},
{
"epoch": 0.2058631231114488,
"grad_norm": 2.006748676300049,
"learning_rate": 4.412145844377688e-05,
"loss": 0.3651,
"step": 11650
},
{
"epoch": 0.20674665582866533,
"grad_norm": 1.5192091464996338,
"learning_rate": 4.407237242543834e-05,
"loss": 0.4562,
"step": 11700
},
{
"epoch": 0.20763018854588186,
"grad_norm": 1.820331335067749,
"learning_rate": 4.402328640709981e-05,
"loss": 0.3946,
"step": 11750
},
{
"epoch": 0.20851372126309836,
"grad_norm": 3.302582025527954,
"learning_rate": 4.3974200388761265e-05,
"loss": 0.4075,
"step": 11800
},
{
"epoch": 0.2093972539803149,
"grad_norm": 2.601897716522217,
"learning_rate": 4.392511437042273e-05,
"loss": 0.4304,
"step": 11850
},
{
"epoch": 0.21028078669753142,
"grad_norm": 1.58085036277771,
"learning_rate": 4.3876028352084194e-05,
"loss": 0.3404,
"step": 11900
},
{
"epoch": 0.21116431941474792,
"grad_norm": 1.7569571733474731,
"learning_rate": 4.382694233374566e-05,
"loss": 0.4013,
"step": 11950
},
{
"epoch": 0.21204785213196445,
"grad_norm": 1.9872467517852783,
"learning_rate": 4.3777856315407124e-05,
"loss": 0.4278,
"step": 12000
},
{
"epoch": 0.21293138484918098,
"grad_norm": 1.4981114864349365,
"learning_rate": 4.372877029706858e-05,
"loss": 0.3905,
"step": 12050
},
{
"epoch": 0.21381491756639748,
"grad_norm": 1.6444882154464722,
"learning_rate": 4.3679684278730046e-05,
"loss": 0.4082,
"step": 12100
},
{
"epoch": 0.214698450283614,
"grad_norm": 1.9731707572937012,
"learning_rate": 4.363059826039151e-05,
"loss": 0.3855,
"step": 12150
},
{
"epoch": 0.2155819830008305,
"grad_norm": 2.66648268699646,
"learning_rate": 4.3581512242052976e-05,
"loss": 0.4567,
"step": 12200
},
{
"epoch": 0.21646551571804704,
"grad_norm": 2.0770373344421387,
"learning_rate": 4.353242622371444e-05,
"loss": 0.4368,
"step": 12250
},
{
"epoch": 0.21734904843526356,
"grad_norm": 1.4739536046981812,
"learning_rate": 4.3483340205375905e-05,
"loss": 0.3686,
"step": 12300
},
{
"epoch": 0.21823258115248007,
"grad_norm": 1.8857239484786987,
"learning_rate": 4.343425418703736e-05,
"loss": 0.4163,
"step": 12350
},
{
"epoch": 0.2191161138696966,
"grad_norm": 1.722424030303955,
"learning_rate": 4.3385168168698834e-05,
"loss": 0.3595,
"step": 12400
},
{
"epoch": 0.21999964658691312,
"grad_norm": 1.5602166652679443,
"learning_rate": 4.333608215036029e-05,
"loss": 0.3326,
"step": 12450
},
{
"epoch": 0.22088317930412962,
"grad_norm": 1.7230535745620728,
"learning_rate": 4.328699613202176e-05,
"loss": 0.3775,
"step": 12500
},
{
"epoch": 0.22176671202134615,
"grad_norm": 1.8666094541549683,
"learning_rate": 4.323791011368322e-05,
"loss": 0.3695,
"step": 12550
},
{
"epoch": 0.22265024473856268,
"grad_norm": 3.1689233779907227,
"learning_rate": 4.318882409534468e-05,
"loss": 0.3545,
"step": 12600
},
{
"epoch": 0.22353377745577918,
"grad_norm": 1.8885284662246704,
"learning_rate": 4.313973807700615e-05,
"loss": 0.3548,
"step": 12650
},
{
"epoch": 0.2244173101729957,
"grad_norm": 1.8508330583572388,
"learning_rate": 4.3090652058667615e-05,
"loss": 0.4847,
"step": 12700
},
{
"epoch": 0.22530084289021224,
"grad_norm": 2.1445882320404053,
"learning_rate": 4.304156604032907e-05,
"loss": 0.4,
"step": 12750
},
{
"epoch": 0.22618437560742874,
"grad_norm": 1.721024990081787,
"learning_rate": 4.299248002199054e-05,
"loss": 0.4755,
"step": 12800
},
{
"epoch": 0.22706790832464527,
"grad_norm": 1.7713844776153564,
"learning_rate": 4.2943394003652e-05,
"loss": 0.3399,
"step": 12850
},
{
"epoch": 0.22795144104186177,
"grad_norm": 1.2936394214630127,
"learning_rate": 4.289528970568024e-05,
"loss": 0.3297,
"step": 12900
},
{
"epoch": 0.2288349737590783,
"grad_norm": 1.6622658967971802,
"learning_rate": 4.28462036873417e-05,
"loss": 0.4071,
"step": 12950
},
{
"epoch": 0.22971850647629483,
"grad_norm": 1.3949196338653564,
"learning_rate": 4.279711766900316e-05,
"loss": 0.4069,
"step": 13000
},
{
"epoch": 0.23060203919351133,
"grad_norm": 1.8681453466415405,
"learning_rate": 4.2748031650664624e-05,
"loss": 0.5156,
"step": 13050
},
{
"epoch": 0.23148557191072786,
"grad_norm": 1.6242793798446655,
"learning_rate": 4.2698945632326096e-05,
"loss": 0.4359,
"step": 13100
},
{
"epoch": 0.23236910462794438,
"grad_norm": 2.897428035736084,
"learning_rate": 4.2649859613987554e-05,
"loss": 0.3702,
"step": 13150
},
{
"epoch": 0.23325263734516088,
"grad_norm": 1.855938196182251,
"learning_rate": 4.260077359564902e-05,
"loss": 0.5026,
"step": 13200
},
{
"epoch": 0.2341361700623774,
"grad_norm": 1.818076252937317,
"learning_rate": 4.2551687577310476e-05,
"loss": 0.5201,
"step": 13250
},
{
"epoch": 0.23501970277959394,
"grad_norm": 1.9688682556152344,
"learning_rate": 4.250260155897194e-05,
"loss": 0.3857,
"step": 13300
},
{
"epoch": 0.23590323549681044,
"grad_norm": 2.4908297061920166,
"learning_rate": 4.245351554063341e-05,
"loss": 0.3555,
"step": 13350
},
{
"epoch": 0.23678676821402697,
"grad_norm": 1.9015276432037354,
"learning_rate": 4.240442952229487e-05,
"loss": 0.381,
"step": 13400
},
{
"epoch": 0.23767030093124347,
"grad_norm": 3.011683225631714,
"learning_rate": 4.2355343503956335e-05,
"loss": 0.3804,
"step": 13450
},
{
"epoch": 0.23855383364846,
"grad_norm": 3.5077691078186035,
"learning_rate": 4.23062574856178e-05,
"loss": 0.3666,
"step": 13500
},
{
"epoch": 0.23943736636567653,
"grad_norm": 2.875953197479248,
"learning_rate": 4.225717146727926e-05,
"loss": 0.3792,
"step": 13550
},
{
"epoch": 0.24032089908289303,
"grad_norm": 2.3432717323303223,
"learning_rate": 4.220808544894073e-05,
"loss": 0.3341,
"step": 13600
},
{
"epoch": 0.24120443180010956,
"grad_norm": 1.6648529767990112,
"learning_rate": 4.2158999430602194e-05,
"loss": 0.4906,
"step": 13650
},
{
"epoch": 0.2420879645173261,
"grad_norm": 2.034646987915039,
"learning_rate": 4.210991341226365e-05,
"loss": 0.541,
"step": 13700
},
{
"epoch": 0.2429714972345426,
"grad_norm": 1.2273883819580078,
"learning_rate": 4.2060827393925116e-05,
"loss": 0.3936,
"step": 13750
},
{
"epoch": 0.24385502995175912,
"grad_norm": 1.6031947135925293,
"learning_rate": 4.201174137558658e-05,
"loss": 0.3871,
"step": 13800
},
{
"epoch": 0.24473856266897565,
"grad_norm": 1.7289350032806396,
"learning_rate": 4.1962655357248045e-05,
"loss": 0.2983,
"step": 13850
},
{
"epoch": 0.24562209538619215,
"grad_norm": 1.792413592338562,
"learning_rate": 4.191356933890951e-05,
"loss": 0.4071,
"step": 13900
},
{
"epoch": 0.24650562810340867,
"grad_norm": 1.5456571578979492,
"learning_rate": 4.186448332057097e-05,
"loss": 0.3434,
"step": 13950
},
{
"epoch": 0.24738916082062518,
"grad_norm": 1.9666177034378052,
"learning_rate": 4.181539730223243e-05,
"loss": 0.3885,
"step": 14000
},
{
"epoch": 0.2482726935378417,
"grad_norm": 2.5290989875793457,
"learning_rate": 4.17663112838939e-05,
"loss": 0.4296,
"step": 14050
},
{
"epoch": 0.24915622625505823,
"grad_norm": 1.9654839038848877,
"learning_rate": 4.171722526555536e-05,
"loss": 0.3853,
"step": 14100
},
{
"epoch": 0.25003975897227476,
"grad_norm": 1.68603515625,
"learning_rate": 4.166813924721683e-05,
"loss": 0.4068,
"step": 14150
},
{
"epoch": 0.25092329168949123,
"grad_norm": 1.9062405824661255,
"learning_rate": 4.161905322887829e-05,
"loss": 0.4071,
"step": 14200
},
{
"epoch": 0.25180682440670776,
"grad_norm": 1.7028473615646362,
"learning_rate": 4.156996721053975e-05,
"loss": 0.3588,
"step": 14250
},
{
"epoch": 0.2526903571239243,
"grad_norm": 1.6032434701919556,
"learning_rate": 4.1520881192201214e-05,
"loss": 0.4161,
"step": 14300
},
{
"epoch": 0.2535738898411408,
"grad_norm": 1.6103026866912842,
"learning_rate": 4.147179517386268e-05,
"loss": 0.3431,
"step": 14350
},
{
"epoch": 0.25445742255835735,
"grad_norm": 3.727078914642334,
"learning_rate": 4.142270915552414e-05,
"loss": 0.3576,
"step": 14400
},
{
"epoch": 0.2553409552755739,
"grad_norm": 1.3540493249893188,
"learning_rate": 4.137362313718561e-05,
"loss": 0.3563,
"step": 14450
},
{
"epoch": 0.25622448799279035,
"grad_norm": 1.7373064756393433,
"learning_rate": 4.1324537118847066e-05,
"loss": 0.3406,
"step": 14500
},
{
"epoch": 0.2571080207100069,
"grad_norm": 2.6311392784118652,
"learning_rate": 4.127545110050853e-05,
"loss": 0.4397,
"step": 14550
},
{
"epoch": 0.2579915534272234,
"grad_norm": 1.845186471939087,
"learning_rate": 4.122636508217e-05,
"loss": 0.411,
"step": 14600
},
{
"epoch": 0.25887508614443994,
"grad_norm": 1.5897334814071655,
"learning_rate": 4.117727906383146e-05,
"loss": 0.3742,
"step": 14650
},
{
"epoch": 0.25975861886165647,
"grad_norm": 3.667428970336914,
"learning_rate": 4.1128193045492924e-05,
"loss": 0.3622,
"step": 14700
},
{
"epoch": 0.26064215157887294,
"grad_norm": 1.7393996715545654,
"learning_rate": 4.107910702715439e-05,
"loss": 0.2782,
"step": 14750
},
{
"epoch": 0.26152568429608947,
"grad_norm": 1.6495802402496338,
"learning_rate": 4.103002100881585e-05,
"loss": 0.36,
"step": 14800
},
{
"epoch": 0.262409217013306,
"grad_norm": 1.5133942365646362,
"learning_rate": 4.098093499047732e-05,
"loss": 0.486,
"step": 14850
},
{
"epoch": 0.2632927497305225,
"grad_norm": 1.848177194595337,
"learning_rate": 4.0932830692505546e-05,
"loss": 0.406,
"step": 14900
},
{
"epoch": 0.26417628244773905,
"grad_norm": 3.320469379425049,
"learning_rate": 4.088374467416701e-05,
"loss": 0.357,
"step": 14950
},
{
"epoch": 0.2650598151649556,
"grad_norm": 1.417015790939331,
"learning_rate": 4.0834658655828475e-05,
"loss": 0.2855,
"step": 15000
},
{
"epoch": 0.26594334788217205,
"grad_norm": 1.8597488403320312,
"learning_rate": 4.078557263748994e-05,
"loss": 0.4424,
"step": 15050
},
{
"epoch": 0.2668268805993886,
"grad_norm": 1.651663899421692,
"learning_rate": 4.0736486619151405e-05,
"loss": 0.352,
"step": 15100
},
{
"epoch": 0.2677104133166051,
"grad_norm": 1.452006459236145,
"learning_rate": 4.068740060081286e-05,
"loss": 0.3638,
"step": 15150
},
{
"epoch": 0.26859394603382164,
"grad_norm": 2.7887187004089355,
"learning_rate": 4.063831458247433e-05,
"loss": 0.3727,
"step": 15200
},
{
"epoch": 0.26947747875103817,
"grad_norm": 1.9209206104278564,
"learning_rate": 4.058922856413579e-05,
"loss": 0.3842,
"step": 15250
},
{
"epoch": 0.2703610114682547,
"grad_norm": 1.946022868156433,
"learning_rate": 4.054014254579726e-05,
"loss": 0.3625,
"step": 15300
},
{
"epoch": 0.27124454418547117,
"grad_norm": 1.4893426895141602,
"learning_rate": 4.049105652745872e-05,
"loss": 0.4088,
"step": 15350
},
{
"epoch": 0.2721280769026877,
"grad_norm": 1.7391968965530396,
"learning_rate": 4.0441970509120186e-05,
"loss": 0.4126,
"step": 15400
},
{
"epoch": 0.2730116096199042,
"grad_norm": 1.7254865169525146,
"learning_rate": 4.0392884490781644e-05,
"loss": 0.4662,
"step": 15450
},
{
"epoch": 0.27389514233712076,
"grad_norm": 4.502954483032227,
"learning_rate": 4.034379847244311e-05,
"loss": 0.3889,
"step": 15500
},
{
"epoch": 0.2747786750543373,
"grad_norm": 2.4406206607818604,
"learning_rate": 4.029471245410458e-05,
"loss": 0.3618,
"step": 15550
},
{
"epoch": 0.27566220777155376,
"grad_norm": 1.6272777318954468,
"learning_rate": 4.024562643576604e-05,
"loss": 0.4126,
"step": 15600
},
{
"epoch": 0.2765457404887703,
"grad_norm": 1.5262032747268677,
"learning_rate": 4.01965404174275e-05,
"loss": 0.3771,
"step": 15650
},
{
"epoch": 0.2774292732059868,
"grad_norm": 1.8245854377746582,
"learning_rate": 4.014745439908896e-05,
"loss": 0.4377,
"step": 15700
},
{
"epoch": 0.27831280592320334,
"grad_norm": 2.8566267490386963,
"learning_rate": 4.0098368380750425e-05,
"loss": 0.4041,
"step": 15750
},
{
"epoch": 0.27919633864041987,
"grad_norm": 2.0167641639709473,
"learning_rate": 4.00492823624119e-05,
"loss": 0.375,
"step": 15800
},
{
"epoch": 0.2800798713576364,
"grad_norm": 1.9363830089569092,
"learning_rate": 4.0000196344073355e-05,
"loss": 0.3339,
"step": 15850
},
{
"epoch": 0.2809634040748529,
"grad_norm": 2.208641767501831,
"learning_rate": 3.995111032573482e-05,
"loss": 0.348,
"step": 15900
},
{
"epoch": 0.2818469367920694,
"grad_norm": 1.5789657831192017,
"learning_rate": 3.9902024307396284e-05,
"loss": 0.367,
"step": 15950
},
{
"epoch": 0.28273046950928593,
"grad_norm": 1.6666336059570312,
"learning_rate": 3.985293828905775e-05,
"loss": 0.3427,
"step": 16000
},
{
"epoch": 0.28361400222650246,
"grad_norm": 3.725020170211792,
"learning_rate": 3.980385227071921e-05,
"loss": 0.3637,
"step": 16050
},
{
"epoch": 0.284497534943719,
"grad_norm": 1.5958735942840576,
"learning_rate": 3.975476625238068e-05,
"loss": 0.3489,
"step": 16100
},
{
"epoch": 0.28538106766093546,
"grad_norm": 1.3779951333999634,
"learning_rate": 3.9705680234042136e-05,
"loss": 0.4209,
"step": 16150
},
{
"epoch": 0.286264600378152,
"grad_norm": 1.6636724472045898,
"learning_rate": 3.96565942157036e-05,
"loss": 0.2984,
"step": 16200
},
{
"epoch": 0.2871481330953685,
"grad_norm": 1.705592155456543,
"learning_rate": 3.9607508197365065e-05,
"loss": 0.3877,
"step": 16250
},
{
"epoch": 0.28803166581258505,
"grad_norm": 1.5367944240570068,
"learning_rate": 3.955842217902653e-05,
"loss": 0.3508,
"step": 16300
},
{
"epoch": 0.2889151985298016,
"grad_norm": 3.140960693359375,
"learning_rate": 3.9509336160687994e-05,
"loss": 0.3443,
"step": 16350
},
{
"epoch": 0.2897987312470181,
"grad_norm": 1.2341272830963135,
"learning_rate": 3.946025014234945e-05,
"loss": 0.4346,
"step": 16400
},
{
"epoch": 0.2906822639642346,
"grad_norm": 1.9500783681869507,
"learning_rate": 3.941116412401092e-05,
"loss": 0.4262,
"step": 16450
},
{
"epoch": 0.2915657966814511,
"grad_norm": 1.344519853591919,
"learning_rate": 3.936207810567238e-05,
"loss": 0.3065,
"step": 16500
},
{
"epoch": 0.29244932939866763,
"grad_norm": 1.4747456312179565,
"learning_rate": 3.9312992087333846e-05,
"loss": 0.4003,
"step": 16550
},
{
"epoch": 0.29333286211588416,
"grad_norm": 1.5639158487319946,
"learning_rate": 3.926390606899531e-05,
"loss": 0.5295,
"step": 16600
},
{
"epoch": 0.2942163948331007,
"grad_norm": 1.9425716400146484,
"learning_rate": 3.9214820050656776e-05,
"loss": 0.3582,
"step": 16650
},
{
"epoch": 0.29509992755031716,
"grad_norm": 3.003871440887451,
"learning_rate": 3.9165734032318234e-05,
"loss": 0.3299,
"step": 16700
},
{
"epoch": 0.2959834602675337,
"grad_norm": 3.689194679260254,
"learning_rate": 3.91166480139797e-05,
"loss": 0.3493,
"step": 16750
},
{
"epoch": 0.2968669929847502,
"grad_norm": 1.9439842700958252,
"learning_rate": 3.906756199564116e-05,
"loss": 0.2752,
"step": 16800
},
{
"epoch": 0.29775052570196675,
"grad_norm": 1.8846018314361572,
"learning_rate": 3.901847597730263e-05,
"loss": 0.3254,
"step": 16850
},
{
"epoch": 0.2986340584191833,
"grad_norm": 2.9167964458465576,
"learning_rate": 3.896938995896409e-05,
"loss": 0.3352,
"step": 16900
},
{
"epoch": 0.2995175911363998,
"grad_norm": 2.6470940113067627,
"learning_rate": 3.892128566099233e-05,
"loss": 0.3812,
"step": 16950
},
{
"epoch": 0.3004011238536163,
"grad_norm": 2.1021623611450195,
"learning_rate": 3.887219964265379e-05,
"loss": 0.3332,
"step": 17000
},
{
"epoch": 0.3012846565708328,
"grad_norm": 1.9923433065414429,
"learning_rate": 3.882311362431525e-05,
"loss": 0.3472,
"step": 17050
},
{
"epoch": 0.30216818928804934,
"grad_norm": 1.5736125707626343,
"learning_rate": 3.8774027605976714e-05,
"loss": 0.4207,
"step": 17100
},
{
"epoch": 0.30305172200526587,
"grad_norm": 2.2181496620178223,
"learning_rate": 3.872494158763818e-05,
"loss": 0.3849,
"step": 17150
},
{
"epoch": 0.3039352547224824,
"grad_norm": 1.5112169981002808,
"learning_rate": 3.867585556929964e-05,
"loss": 0.3272,
"step": 17200
},
{
"epoch": 0.30481878743969887,
"grad_norm": 1.5218919515609741,
"learning_rate": 3.862676955096111e-05,
"loss": 0.3037,
"step": 17250
},
{
"epoch": 0.3057023201569154,
"grad_norm": 1.5864076614379883,
"learning_rate": 3.857768353262257e-05,
"loss": 0.2924,
"step": 17300
},
{
"epoch": 0.3065858528741319,
"grad_norm": 1.8895894289016724,
"learning_rate": 3.852859751428403e-05,
"loss": 0.4029,
"step": 17350
},
{
"epoch": 0.30746938559134845,
"grad_norm": 1.4156498908996582,
"learning_rate": 3.8479511495945495e-05,
"loss": 0.5016,
"step": 17400
},
{
"epoch": 0.308352918308565,
"grad_norm": 1.4788236618041992,
"learning_rate": 3.843042547760696e-05,
"loss": 0.3648,
"step": 17450
},
{
"epoch": 0.3092364510257815,
"grad_norm": 1.7631937265396118,
"learning_rate": 3.8381339459268424e-05,
"loss": 0.3045,
"step": 17500
},
{
"epoch": 0.310119983742998,
"grad_norm": 1.9122941493988037,
"learning_rate": 3.833225344092989e-05,
"loss": 0.3271,
"step": 17550
},
{
"epoch": 0.3110035164602145,
"grad_norm": 1.6838266849517822,
"learning_rate": 3.828316742259135e-05,
"loss": 0.519,
"step": 17600
},
{
"epoch": 0.31188704917743104,
"grad_norm": 4.507582187652588,
"learning_rate": 3.823408140425281e-05,
"loss": 0.341,
"step": 17650
},
{
"epoch": 0.31277058189464757,
"grad_norm": 1.3272327184677124,
"learning_rate": 3.8184995385914276e-05,
"loss": 0.3352,
"step": 17700
},
{
"epoch": 0.3136541146118641,
"grad_norm": 2.516676664352417,
"learning_rate": 3.813590936757574e-05,
"loss": 0.4406,
"step": 17750
},
{
"epoch": 0.31453764732908057,
"grad_norm": 1.8230887651443481,
"learning_rate": 3.8086823349237206e-05,
"loss": 0.3822,
"step": 17800
},
{
"epoch": 0.3154211800462971,
"grad_norm": 1.5267698764801025,
"learning_rate": 3.803773733089867e-05,
"loss": 0.287,
"step": 17850
},
{
"epoch": 0.31630471276351363,
"grad_norm": 2.647895574569702,
"learning_rate": 3.798865131256013e-05,
"loss": 0.4349,
"step": 17900
},
{
"epoch": 0.31718824548073016,
"grad_norm": 1.5159648656845093,
"learning_rate": 3.793956529422159e-05,
"loss": 0.3633,
"step": 17950
},
{
"epoch": 0.3180717781979467,
"grad_norm": 1.9135470390319824,
"learning_rate": 3.7890479275883064e-05,
"loss": 0.3431,
"step": 18000
},
{
"epoch": 0.3189553109151632,
"grad_norm": 1.6438477039337158,
"learning_rate": 3.784139325754452e-05,
"loss": 0.3986,
"step": 18050
},
{
"epoch": 0.3198388436323797,
"grad_norm": 1.6794339418411255,
"learning_rate": 3.779230723920599e-05,
"loss": 0.3279,
"step": 18100
},
{
"epoch": 0.3207223763495962,
"grad_norm": 1.5067431926727295,
"learning_rate": 3.7743221220867445e-05,
"loss": 0.3062,
"step": 18150
},
{
"epoch": 0.32160590906681275,
"grad_norm": 1.6953719854354858,
"learning_rate": 3.7694135202528916e-05,
"loss": 0.2973,
"step": 18200
},
{
"epoch": 0.3224894417840293,
"grad_norm": 2.819748640060425,
"learning_rate": 3.764504918419038e-05,
"loss": 0.4078,
"step": 18250
},
{
"epoch": 0.3233729745012458,
"grad_norm": 1.5743447542190552,
"learning_rate": 3.759596316585184e-05,
"loss": 0.31,
"step": 18300
},
{
"epoch": 0.3242565072184623,
"grad_norm": 1.8966853618621826,
"learning_rate": 3.7546877147513303e-05,
"loss": 0.306,
"step": 18350
},
{
"epoch": 0.3251400399356788,
"grad_norm": 2.7652056217193604,
"learning_rate": 3.749779112917477e-05,
"loss": 0.3426,
"step": 18400
},
{
"epoch": 0.32602357265289533,
"grad_norm": 3.006504535675049,
"learning_rate": 3.744870511083623e-05,
"loss": 0.2807,
"step": 18450
},
{
"epoch": 0.32690710537011186,
"grad_norm": 1.5666753053665161,
"learning_rate": 3.73996190924977e-05,
"loss": 0.3856,
"step": 18500
},
{
"epoch": 0.3277906380873284,
"grad_norm": 1.9692752361297607,
"learning_rate": 3.735053307415916e-05,
"loss": 0.3575,
"step": 18550
},
{
"epoch": 0.3286741708045449,
"grad_norm": 3.517622232437134,
"learning_rate": 3.730144705582062e-05,
"loss": 0.347,
"step": 18600
},
{
"epoch": 0.3295577035217614,
"grad_norm": 1.8076531887054443,
"learning_rate": 3.7252361037482085e-05,
"loss": 0.3195,
"step": 18650
},
{
"epoch": 0.3304412362389779,
"grad_norm": 1.8082791566848755,
"learning_rate": 3.720327501914355e-05,
"loss": 0.3543,
"step": 18700
},
{
"epoch": 0.33132476895619445,
"grad_norm": 1.3712306022644043,
"learning_rate": 3.7154189000805014e-05,
"loss": 0.3642,
"step": 18750
},
{
"epoch": 0.332208301673411,
"grad_norm": 1.5654476881027222,
"learning_rate": 3.710510298246648e-05,
"loss": 0.3415,
"step": 18800
},
{
"epoch": 0.3330918343906275,
"grad_norm": 1.4388914108276367,
"learning_rate": 3.7056016964127937e-05,
"loss": 0.3069,
"step": 18850
},
{
"epoch": 0.333975367107844,
"grad_norm": 1.5527664422988892,
"learning_rate": 3.70069309457894e-05,
"loss": 0.2962,
"step": 18900
},
{
"epoch": 0.3348588998250605,
"grad_norm": 1.6680736541748047,
"learning_rate": 3.6957844927450866e-05,
"loss": 0.3156,
"step": 18950
},
{
"epoch": 0.33574243254227704,
"grad_norm": 2.266108274459839,
"learning_rate": 3.69097406294791e-05,
"loss": 0.3791,
"step": 19000
},
{
"epoch": 0.33662596525949356,
"grad_norm": 1.4146838188171387,
"learning_rate": 3.6860654611140565e-05,
"loss": 0.3287,
"step": 19050
},
{
"epoch": 0.3375094979767101,
"grad_norm": 1.640153169631958,
"learning_rate": 3.681156859280202e-05,
"loss": 0.4034,
"step": 19100
},
{
"epoch": 0.3383930306939266,
"grad_norm": 1.670589804649353,
"learning_rate": 3.6762482574463494e-05,
"loss": 0.3476,
"step": 19150
},
{
"epoch": 0.3392765634111431,
"grad_norm": 3.375941753387451,
"learning_rate": 3.671339655612496e-05,
"loss": 0.363,
"step": 19200
},
{
"epoch": 0.3401600961283596,
"grad_norm": 1.965834379196167,
"learning_rate": 3.666431053778642e-05,
"loss": 0.3182,
"step": 19250
},
{
"epoch": 0.34104362884557615,
"grad_norm": 1.607900857925415,
"learning_rate": 3.661522451944788e-05,
"loss": 0.3238,
"step": 19300
},
{
"epoch": 0.3419271615627927,
"grad_norm": 1.4051165580749512,
"learning_rate": 3.6566138501109346e-05,
"loss": 0.3043,
"step": 19350
},
{
"epoch": 0.3428106942800092,
"grad_norm": 1.4679523706436157,
"learning_rate": 3.651705248277081e-05,
"loss": 0.3902,
"step": 19400
},
{
"epoch": 0.3436942269972257,
"grad_norm": 1.5135536193847656,
"learning_rate": 3.6467966464432276e-05,
"loss": 0.3085,
"step": 19450
},
{
"epoch": 0.3445777597144422,
"grad_norm": 2.2533581256866455,
"learning_rate": 3.6418880446093734e-05,
"loss": 0.3162,
"step": 19500
},
{
"epoch": 0.34546129243165874,
"grad_norm": 1.625067949295044,
"learning_rate": 3.63697944277552e-05,
"loss": 0.345,
"step": 19550
},
{
"epoch": 0.34634482514887527,
"grad_norm": 1.1573612689971924,
"learning_rate": 3.632070840941666e-05,
"loss": 0.3017,
"step": 19600
},
{
"epoch": 0.3472283578660918,
"grad_norm": 3.46663498878479,
"learning_rate": 3.627162239107813e-05,
"loss": 0.4232,
"step": 19650
},
{
"epoch": 0.3481118905833083,
"grad_norm": 1.5614382028579712,
"learning_rate": 3.622253637273959e-05,
"loss": 0.3363,
"step": 19700
},
{
"epoch": 0.3489954233005248,
"grad_norm": 1.3841484785079956,
"learning_rate": 3.617345035440106e-05,
"loss": 0.3484,
"step": 19750
},
{
"epoch": 0.3498789560177413,
"grad_norm": 1.941517949104309,
"learning_rate": 3.6124364336062515e-05,
"loss": 0.3719,
"step": 19800
},
{
"epoch": 0.35076248873495786,
"grad_norm": 4.908963680267334,
"learning_rate": 3.607527831772398e-05,
"loss": 0.3226,
"step": 19850
},
{
"epoch": 0.3516460214521744,
"grad_norm": 1.5221627950668335,
"learning_rate": 3.6026192299385444e-05,
"loss": 0.3636,
"step": 19900
},
{
"epoch": 0.3525295541693909,
"grad_norm": 1.8089814186096191,
"learning_rate": 3.597710628104691e-05,
"loss": 0.3704,
"step": 19950
},
{
"epoch": 0.3534130868866074,
"grad_norm": 2.786560535430908,
"learning_rate": 3.5928020262708373e-05,
"loss": 0.3459,
"step": 20000
},
{
"epoch": 0.3542966196038239,
"grad_norm": 2.97851824760437,
"learning_rate": 3.587893424436983e-05,
"loss": 0.3226,
"step": 20050
},
{
"epoch": 0.35518015232104044,
"grad_norm": 2.1979775428771973,
"learning_rate": 3.5829848226031296e-05,
"loss": 0.3256,
"step": 20100
},
{
"epoch": 0.35606368503825697,
"grad_norm": 1.762453556060791,
"learning_rate": 3.578076220769276e-05,
"loss": 0.3179,
"step": 20150
},
{
"epoch": 0.3569472177554735,
"grad_norm": 1.4908533096313477,
"learning_rate": 3.5731676189354225e-05,
"loss": 0.4226,
"step": 20200
},
{
"epoch": 0.35783075047269003,
"grad_norm": 1.3192092180252075,
"learning_rate": 3.568259017101569e-05,
"loss": 0.4196,
"step": 20250
},
{
"epoch": 0.3587142831899065,
"grad_norm": 1.421736717224121,
"learning_rate": 3.5633504152677155e-05,
"loss": 0.3618,
"step": 20300
},
{
"epoch": 0.35959781590712303,
"grad_norm": 2.0631330013275146,
"learning_rate": 3.558441813433861e-05,
"loss": 0.4093,
"step": 20350
},
{
"epoch": 0.36048134862433956,
"grad_norm": 1.6250920295715332,
"learning_rate": 3.5535332116000084e-05,
"loss": 0.3051,
"step": 20400
},
{
"epoch": 0.3613648813415561,
"grad_norm": 1.4659417867660522,
"learning_rate": 3.548624609766155e-05,
"loss": 0.3379,
"step": 20450
},
{
"epoch": 0.3622484140587726,
"grad_norm": 1.520573616027832,
"learning_rate": 3.5437160079323007e-05,
"loss": 0.3582,
"step": 20500
},
{
"epoch": 0.3631319467759891,
"grad_norm": 2.158830165863037,
"learning_rate": 3.538807406098447e-05,
"loss": 0.4004,
"step": 20550
},
{
"epoch": 0.3640154794932056,
"grad_norm": 1.7503968477249146,
"learning_rate": 3.533898804264593e-05,
"loss": 0.33,
"step": 20600
},
{
"epoch": 0.36489901221042215,
"grad_norm": 1.5064153671264648,
"learning_rate": 3.52899020243074e-05,
"loss": 0.3072,
"step": 20650
},
{
"epoch": 0.3657825449276387,
"grad_norm": 3.5023598670959473,
"learning_rate": 3.5240816005968865e-05,
"loss": 0.35,
"step": 20700
},
{
"epoch": 0.3666660776448552,
"grad_norm": 1.7911083698272705,
"learning_rate": 3.519172998763032e-05,
"loss": 0.3241,
"step": 20750
},
{
"epoch": 0.36754961036207173,
"grad_norm": 1.50026273727417,
"learning_rate": 3.514264396929179e-05,
"loss": 0.37,
"step": 20800
},
{
"epoch": 0.3684331430792882,
"grad_norm": 1.5556259155273438,
"learning_rate": 3.509355795095325e-05,
"loss": 0.2689,
"step": 20850
},
{
"epoch": 0.36931667579650473,
"grad_norm": 1.6530933380126953,
"learning_rate": 3.504447193261472e-05,
"loss": 0.4061,
"step": 20900
},
{
"epoch": 0.37020020851372126,
"grad_norm": 1.250317931175232,
"learning_rate": 3.499538591427618e-05,
"loss": 0.3412,
"step": 20950
},
{
"epoch": 0.3710837412309378,
"grad_norm": 1.9599151611328125,
"learning_rate": 3.494728161630441e-05,
"loss": 0.3619,
"step": 21000
},
{
"epoch": 0.3719672739481543,
"grad_norm": 1.3728086948394775,
"learning_rate": 3.4898195597965874e-05,
"loss": 0.314,
"step": 21050
},
{
"epoch": 0.3728508066653708,
"grad_norm": 1.6389710903167725,
"learning_rate": 3.4849109579627346e-05,
"loss": 0.2912,
"step": 21100
},
{
"epoch": 0.3737343393825873,
"grad_norm": 3.552582025527954,
"learning_rate": 3.4800023561288803e-05,
"loss": 0.3402,
"step": 21150
},
{
"epoch": 0.37461787209980385,
"grad_norm": 1.6479156017303467,
"learning_rate": 3.475093754295027e-05,
"loss": 0.3462,
"step": 21200
},
{
"epoch": 0.3755014048170204,
"grad_norm": 1.593705415725708,
"learning_rate": 3.470185152461173e-05,
"loss": 0.2775,
"step": 21250
},
{
"epoch": 0.3763849375342369,
"grad_norm": 2.1807069778442383,
"learning_rate": 3.465276550627319e-05,
"loss": 0.3825,
"step": 21300
},
{
"epoch": 0.37726847025145344,
"grad_norm": 1.6359409093856812,
"learning_rate": 3.460367948793466e-05,
"loss": 0.3931,
"step": 21350
},
{
"epoch": 0.3781520029686699,
"grad_norm": 1.5960018634796143,
"learning_rate": 3.455459346959612e-05,
"loss": 0.4059,
"step": 21400
},
{
"epoch": 0.37903553568588644,
"grad_norm": 3.367835283279419,
"learning_rate": 3.4505507451257585e-05,
"loss": 0.3264,
"step": 21450
},
{
"epoch": 0.37991906840310297,
"grad_norm": 1.5965161323547363,
"learning_rate": 3.445642143291905e-05,
"loss": 0.2605,
"step": 21500
},
{
"epoch": 0.3808026011203195,
"grad_norm": 1.5011396408081055,
"learning_rate": 3.440733541458051e-05,
"loss": 0.3658,
"step": 21550
},
{
"epoch": 0.381686133837536,
"grad_norm": 1.5021259784698486,
"learning_rate": 3.435824939624198e-05,
"loss": 0.3274,
"step": 21600
},
{
"epoch": 0.3825696665547525,
"grad_norm": 1.5224860906600952,
"learning_rate": 3.430916337790344e-05,
"loss": 0.3094,
"step": 21650
},
{
"epoch": 0.383453199271969,
"grad_norm": 3.36433482170105,
"learning_rate": 3.42600773595649e-05,
"loss": 0.3556,
"step": 21700
},
{
"epoch": 0.38433673198918555,
"grad_norm": 1.9824773073196411,
"learning_rate": 3.4210991341226366e-05,
"loss": 0.2877,
"step": 21750
},
{
"epoch": 0.3852202647064021,
"grad_norm": 1.5103614330291748,
"learning_rate": 3.416190532288783e-05,
"loss": 0.3203,
"step": 21800
},
{
"epoch": 0.3861037974236186,
"grad_norm": 1.1625959873199463,
"learning_rate": 3.4112819304549295e-05,
"loss": 0.2553,
"step": 21850
},
{
"epoch": 0.38698733014083514,
"grad_norm": 1.5695985555648804,
"learning_rate": 3.406373328621076e-05,
"loss": 0.4425,
"step": 21900
},
{
"epoch": 0.3878708628580516,
"grad_norm": 1.6758594512939453,
"learning_rate": 3.401464726787222e-05,
"loss": 0.3249,
"step": 21950
},
{
"epoch": 0.38875439557526814,
"grad_norm": 3.6129748821258545,
"learning_rate": 3.396556124953368e-05,
"loss": 0.3649,
"step": 22000
},
{
"epoch": 0.38963792829248467,
"grad_norm": 1.6155461072921753,
"learning_rate": 3.391647523119515e-05,
"loss": 0.3621,
"step": 22050
},
{
"epoch": 0.3905214610097012,
"grad_norm": 1.7477047443389893,
"learning_rate": 3.386738921285661e-05,
"loss": 0.4232,
"step": 22100
},
{
"epoch": 0.3914049937269177,
"grad_norm": 3.0512797832489014,
"learning_rate": 3.3818303194518076e-05,
"loss": 0.266,
"step": 22150
},
{
"epoch": 0.3922885264441342,
"grad_norm": 1.4074236154556274,
"learning_rate": 3.376921717617954e-05,
"loss": 0.3767,
"step": 22200
},
{
"epoch": 0.39317205916135073,
"grad_norm": 1.7168455123901367,
"learning_rate": 3.3720131157841e-05,
"loss": 0.366,
"step": 22250
},
{
"epoch": 0.39405559187856726,
"grad_norm": 3.360104560852051,
"learning_rate": 3.3671045139502464e-05,
"loss": 0.3211,
"step": 22300
},
{
"epoch": 0.3949391245957838,
"grad_norm": 1.527031660079956,
"learning_rate": 3.3621959121163935e-05,
"loss": 0.2505,
"step": 22350
},
{
"epoch": 0.3958226573130003,
"grad_norm": 1.7586029767990112,
"learning_rate": 3.357287310282539e-05,
"loss": 0.3824,
"step": 22400
},
{
"epoch": 0.39670619003021684,
"grad_norm": 2.3490004539489746,
"learning_rate": 3.352378708448686e-05,
"loss": 0.331,
"step": 22450
},
{
"epoch": 0.3975897227474333,
"grad_norm": 1.5686146020889282,
"learning_rate": 3.3474701066148316e-05,
"loss": 0.3136,
"step": 22500
},
{
"epoch": 0.39847325546464984,
"grad_norm": 1.5068285465240479,
"learning_rate": 3.342561504780978e-05,
"loss": 0.297,
"step": 22550
},
{
"epoch": 0.3993567881818664,
"grad_norm": 1.81602942943573,
"learning_rate": 3.337652902947125e-05,
"loss": 0.2933,
"step": 22600
},
{
"epoch": 0.4002403208990829,
"grad_norm": 3.4516189098358154,
"learning_rate": 3.332744301113271e-05,
"loss": 0.4026,
"step": 22650
},
{
"epoch": 0.40112385361629943,
"grad_norm": 1.5759230852127075,
"learning_rate": 3.3278356992794174e-05,
"loss": 0.3567,
"step": 22700
},
{
"epoch": 0.4020073863335159,
"grad_norm": 1.9385254383087158,
"learning_rate": 3.322927097445564e-05,
"loss": 0.3711,
"step": 22750
},
{
"epoch": 0.40289091905073243,
"grad_norm": 1.6334116458892822,
"learning_rate": 3.31801849561171e-05,
"loss": 0.378,
"step": 22800
}
],
"logging_steps": 50,
"max_steps": 56591,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 256,
"trial_name": null,
"trial_params": null
}