{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.999946314489719,
"eval_steps": 500,
"global_step": 27939,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005368551028077522,
"grad_norm": 173.3736114501953,
"learning_rate": 5.959475566150179e-07,
"loss": 8.043,
"step": 50
},
{
"epoch": 0.010737102056155044,
"grad_norm": 34.7974739074707,
"learning_rate": 1.1918951132300359e-06,
"loss": 6.5425,
"step": 100
},
{
"epoch": 0.016105653084232566,
"grad_norm": 31.309803009033203,
"learning_rate": 1.7878426698450538e-06,
"loss": 5.5084,
"step": 150
},
{
"epoch": 0.02147420411231009,
"grad_norm": 30.26763153076172,
"learning_rate": 2.3837902264600717e-06,
"loss": 4.6124,
"step": 200
},
{
"epoch": 0.02684275514038761,
"grad_norm": 24.171663284301758,
"learning_rate": 2.9797377830750894e-06,
"loss": 4.0755,
"step": 250
},
{
"epoch": 0.03221130616846513,
"grad_norm": 19.654661178588867,
"learning_rate": 3.5756853396901076e-06,
"loss": 3.4494,
"step": 300
},
{
"epoch": 0.03757985719654265,
"grad_norm": 19.887666702270508,
"learning_rate": 4.171632896305126e-06,
"loss": 3.401,
"step": 350
},
{
"epoch": 0.04294840822462018,
"grad_norm": 27.6500301361084,
"learning_rate": 4.7675804529201435e-06,
"loss": 3.0908,
"step": 400
},
{
"epoch": 0.048316959252697696,
"grad_norm": 20.856115341186523,
"learning_rate": 5.363528009535162e-06,
"loss": 3.3005,
"step": 450
},
{
"epoch": 0.05368551028077522,
"grad_norm": 22.7480525970459,
"learning_rate": 5.959475566150179e-06,
"loss": 2.8446,
"step": 500
},
{
"epoch": 0.05905406130885274,
"grad_norm": 20.75531578063965,
"learning_rate": 6.5554231227651975e-06,
"loss": 2.7879,
"step": 550
},
{
"epoch": 0.06442261233693027,
"grad_norm": 25.86386489868164,
"learning_rate": 7.151370679380215e-06,
"loss": 2.7259,
"step": 600
},
{
"epoch": 0.06979116336500779,
"grad_norm": 18.4228458404541,
"learning_rate": 7.747318235995233e-06,
"loss": 2.7685,
"step": 650
},
{
"epoch": 0.0751597143930853,
"grad_norm": 25.71515655517578,
"learning_rate": 8.343265792610251e-06,
"loss": 2.8611,
"step": 700
},
{
"epoch": 0.08052826542116283,
"grad_norm": 14.991035461425781,
"learning_rate": 8.939213349225268e-06,
"loss": 2.7934,
"step": 750
},
{
"epoch": 0.08589681644924035,
"grad_norm": 11.806473731994629,
"learning_rate": 9.535160905840287e-06,
"loss": 2.6654,
"step": 800
},
{
"epoch": 0.09126536747731787,
"grad_norm": 17.34215545654297,
"learning_rate": 9.999995934757252e-06,
"loss": 2.552,
"step": 850
},
{
"epoch": 0.09663391850539539,
"grad_norm": 16.925975799560547,
"learning_rate": 9.999874985890243e-06,
"loss": 2.425,
"step": 900
},
{
"epoch": 0.10200246953347292,
"grad_norm": 26.792736053466797,
"learning_rate": 9.99958605643335e-06,
"loss": 2.4877,
"step": 950
},
{
"epoch": 0.10737102056155044,
"grad_norm": 12.235079765319824,
"learning_rate": 9.999129156093722e-06,
"loss": 2.3378,
"step": 1000
},
{
"epoch": 0.11273957158962795,
"grad_norm": 15.029677391052246,
"learning_rate": 9.998504300221821e-06,
"loss": 2.5849,
"step": 1050
},
{
"epoch": 0.11810812261770548,
"grad_norm": 17.80719757080078,
"learning_rate": 9.997711509810904e-06,
"loss": 2.3923,
"step": 1100
},
{
"epoch": 0.123476673645783,
"grad_norm": 13.069009780883789,
"learning_rate": 9.996750811496319e-06,
"loss": 2.3446,
"step": 1150
},
{
"epoch": 0.12884522467386053,
"grad_norm": 12.205385208129883,
"learning_rate": 9.995622237554603e-06,
"loss": 2.627,
"step": 1200
},
{
"epoch": 0.13421377570193804,
"grad_norm": 17.56980323791504,
"learning_rate": 9.994325825902411e-06,
"loss": 2.3043,
"step": 1250
},
{
"epoch": 0.13958232673001558,
"grad_norm": 11.382326126098633,
"learning_rate": 9.992861620095228e-06,
"loss": 2.2816,
"step": 1300
},
{
"epoch": 0.1449508777580931,
"grad_norm": 10.679113388061523,
"learning_rate": 9.991229669325917e-06,
"loss": 2.6668,
"step": 1350
},
{
"epoch": 0.1503194287861706,
"grad_norm": 15.005497932434082,
"learning_rate": 9.98943002842306e-06,
"loss": 2.5911,
"step": 1400
},
{
"epoch": 0.15568797981424815,
"grad_norm": 10.959671020507812,
"learning_rate": 9.987462757849114e-06,
"loss": 2.4963,
"step": 1450
},
{
"epoch": 0.16105653084232566,
"grad_norm": 13.526322364807129,
"learning_rate": 9.985327923698387e-06,
"loss": 2.4527,
"step": 1500
},
{
"epoch": 0.16642508187040317,
"grad_norm": 11.613245964050293,
"learning_rate": 9.983025597694814e-06,
"loss": 2.3502,
"step": 1550
},
{
"epoch": 0.1717936328984807,
"grad_norm": 19.931095123291016,
"learning_rate": 9.980555857189542e-06,
"loss": 2.2207,
"step": 1600
},
{
"epoch": 0.17716218392655822,
"grad_norm": 13.534950256347656,
"learning_rate": 9.977918785158341e-06,
"loss": 2.2449,
"step": 1650
},
{
"epoch": 0.18253073495463573,
"grad_norm": 21.363548278808594,
"learning_rate": 9.97511447019881e-06,
"loss": 2.6087,
"step": 1700
},
{
"epoch": 0.18789928598271327,
"grad_norm": 22.931884765625,
"learning_rate": 9.9721430065274e-06,
"loss": 2.4396,
"step": 1750
},
{
"epoch": 0.19326783701079078,
"grad_norm": 12.380692481994629,
"learning_rate": 9.96900449397625e-06,
"loss": 2.4232,
"step": 1800
},
{
"epoch": 0.19863638803886832,
"grad_norm": 13.600720405578613,
"learning_rate": 9.965699037989835e-06,
"loss": 2.6398,
"step": 1850
},
{
"epoch": 0.20400493906694583,
"grad_norm": 23.817028045654297,
"learning_rate": 9.962226749621423e-06,
"loss": 1.9173,
"step": 1900
},
{
"epoch": 0.20937349009502335,
"grad_norm": 22.198123931884766,
"learning_rate": 9.958587745529338e-06,
"loss": 2.02,
"step": 1950
},
{
"epoch": 0.21474204112310089,
"grad_norm": 19.710922241210938,
"learning_rate": 9.954782147973048e-06,
"loss": 1.9257,
"step": 2000
},
{
"epoch": 0.2201105921511784,
"grad_norm": 11.56153678894043,
"learning_rate": 9.950810084809056e-06,
"loss": 2.4561,
"step": 2050
},
{
"epoch": 0.2254791431792559,
"grad_norm": 11.210532188415527,
"learning_rate": 9.946671689486598e-06,
"loss": 2.2825,
"step": 2100
},
{
"epoch": 0.23084769420733345,
"grad_norm": 22.60122299194336,
"learning_rate": 9.94236710104317e-06,
"loss": 2.3433,
"step": 2150
},
{
"epoch": 0.23621624523541096,
"grad_norm": 21.053020477294922,
"learning_rate": 9.937896464099849e-06,
"loss": 2.3371,
"step": 2200
},
{
"epoch": 0.24158479626348847,
"grad_norm": 11.02442455291748,
"learning_rate": 9.933259928856438e-06,
"loss": 1.9893,
"step": 2250
},
{
"epoch": 0.246953347291566,
"grad_norm": 23.212339401245117,
"learning_rate": 9.928457651086414e-06,
"loss": 2.2982,
"step": 2300
},
{
"epoch": 0.2523218983196435,
"grad_norm": 12.233504295349121,
"learning_rate": 9.923489792131701e-06,
"loss": 2.3542,
"step": 2350
},
{
"epoch": 0.25769044934772106,
"grad_norm": 22.132793426513672,
"learning_rate": 9.918356518897252e-06,
"loss": 2.217,
"step": 2400
},
{
"epoch": 0.26305900037579855,
"grad_norm": 9.651283264160156,
"learning_rate": 9.91305800384543e-06,
"loss": 2.2106,
"step": 2450
},
{
"epoch": 0.2684275514038761,
"grad_norm": 11.630431175231934,
"learning_rate": 9.907594424990221e-06,
"loss": 2.4414,
"step": 2500
},
{
"epoch": 0.2737961024319536,
"grad_norm": 11.314878463745117,
"learning_rate": 9.901965965891255e-06,
"loss": 2.1554,
"step": 2550
},
{
"epoch": 0.27916465346003116,
"grad_norm": 8.078882217407227,
"learning_rate": 9.896172815647638e-06,
"loss": 2.17,
"step": 2600
},
{
"epoch": 0.28453320448810865,
"grad_norm": 11.758085250854492,
"learning_rate": 9.890215168891593e-06,
"loss": 2.5758,
"step": 2650
},
{
"epoch": 0.2899017555161862,
"grad_norm": 7.850560188293457,
"learning_rate": 9.88409322578193e-06,
"loss": 2.2779,
"step": 2700
},
{
"epoch": 0.2952703065442637,
"grad_norm": 9.131479263305664,
"learning_rate": 9.877807191997314e-06,
"loss": 2.1076,
"step": 2750
},
{
"epoch": 0.3006388575723412,
"grad_norm": 10.000419616699219,
"learning_rate": 9.871357278729355e-06,
"loss": 2.2511,
"step": 2800
},
{
"epoch": 0.30600740860041875,
"grad_norm": 8.93471622467041,
"learning_rate": 9.86474370267552e-06,
"loss": 2.764,
"step": 2850
},
{
"epoch": 0.3113759596284963,
"grad_norm": 12.075305938720703,
"learning_rate": 9.857966686031848e-06,
"loss": 2.3072,
"step": 2900
},
{
"epoch": 0.3167445106565738,
"grad_norm": 20.350278854370117,
"learning_rate": 9.85102645648548e-06,
"loss": 1.9735,
"step": 2950
},
{
"epoch": 0.3221130616846513,
"grad_norm": 19.958290100097656,
"learning_rate": 9.843923247207018e-06,
"loss": 2.3562,
"step": 3000
},
{
"epoch": 0.32748161271272885,
"grad_norm": 19.133190155029297,
"learning_rate": 9.836657296842684e-06,
"loss": 2.2974,
"step": 3050
},
{
"epoch": 0.33285016374080634,
"grad_norm": 8.62128734588623,
"learning_rate": 9.82922884950631e-06,
"loss": 2.0612,
"step": 3100
},
{
"epoch": 0.3382187147688839,
"grad_norm": 17.61878204345703,
"learning_rate": 9.821638154771125e-06,
"loss": 2.164,
"step": 3150
},
{
"epoch": 0.3435872657969614,
"grad_norm": 12.352161407470703,
"learning_rate": 9.813885467661386e-06,
"loss": 2.3342,
"step": 3200
},
{
"epoch": 0.3489558168250389,
"grad_norm": 17.905899047851562,
"learning_rate": 9.805971048643792e-06,
"loss": 2.5445,
"step": 3250
},
{
"epoch": 0.35432436785311644,
"grad_norm": 12.0066499710083,
"learning_rate": 9.79789516361875e-06,
"loss": 1.9144,
"step": 3300
},
{
"epoch": 0.359692918881194,
"grad_norm": 8.261287689208984,
"learning_rate": 9.789658083911428e-06,
"loss": 2.2585,
"step": 3350
},
{
"epoch": 0.36506146990927146,
"grad_norm": 9.940215110778809,
"learning_rate": 9.781260086262645e-06,
"loss": 2.3342,
"step": 3400
},
{
"epoch": 0.370430020937349,
"grad_norm": 14.417253494262695,
"learning_rate": 9.77270145281958e-06,
"loss": 2.121,
"step": 3450
},
{
"epoch": 0.37579857196542654,
"grad_norm": 8.75595474243164,
"learning_rate": 9.763982471126277e-06,
"loss": 2.5303,
"step": 3500
},
{
"epoch": 0.381167122993504,
"grad_norm": 10.554227828979492,
"learning_rate": 9.755103434113998e-06,
"loss": 2.3582,
"step": 3550
},
{
"epoch": 0.38653567402158157,
"grad_norm": 7.218326091766357,
"learning_rate": 9.74606464009138e-06,
"loss": 2.1692,
"step": 3600
},
{
"epoch": 0.3919042250496591,
"grad_norm": 9.922089576721191,
"learning_rate": 9.736866392734402e-06,
"loss": 2.3558,
"step": 3650
},
{
"epoch": 0.39727277607773664,
"grad_norm": 7.799227714538574,
"learning_rate": 9.727509001076197e-06,
"loss": 2.3932,
"step": 3700
},
{
"epoch": 0.40264132710581413,
"grad_norm": 20.772369384765625,
"learning_rate": 9.71799277949666e-06,
"loss": 1.7941,
"step": 3750
},
{
"epoch": 0.40800987813389167,
"grad_norm": 7.913593292236328,
"learning_rate": 9.708318047711883e-06,
"loss": 2.0766,
"step": 3800
},
{
"epoch": 0.4133784291619692,
"grad_norm": 11.947430610656738,
"learning_rate": 9.698485130763428e-06,
"loss": 1.9943,
"step": 3850
},
{
"epoch": 0.4187469801900467,
"grad_norm": 6.923953533172607,
"learning_rate": 9.688494359007392e-06,
"loss": 2.3402,
"step": 3900
},
{
"epoch": 0.42411553121812423,
"grad_norm": 10.6170072555542,
"learning_rate": 9.678346068103312e-06,
"loss": 2.2038,
"step": 3950
},
{
"epoch": 0.42948408224620177,
"grad_norm": 8.604026794433594,
"learning_rate": 9.668040599002893e-06,
"loss": 2.2163,
"step": 4000
},
{
"epoch": 0.43485263327427925,
"grad_norm": 6.9104323387146,
"learning_rate": 9.657578297938547e-06,
"loss": 2.4159,
"step": 4050
},
{
"epoch": 0.4402211843023568,
"grad_norm": 7.583080768585205,
"learning_rate": 9.646959516411765e-06,
"loss": 2.4815,
"step": 4100
},
{
"epoch": 0.44558973533043433,
"grad_norm": 10.75753402709961,
"learning_rate": 9.636184611181301e-06,
"loss": 2.5791,
"step": 4150
},
{
"epoch": 0.4509582863585118,
"grad_norm": 8.691010475158691,
"learning_rate": 9.625253944251193e-06,
"loss": 2.1982,
"step": 4200
},
{
"epoch": 0.45632683738658936,
"grad_norm": 7.544954299926758,
"learning_rate": 9.614167882858602e-06,
"loss": 2.4019,
"step": 4250
},
{
"epoch": 0.4616953884146669,
"grad_norm": 8.283909797668457,
"learning_rate": 9.602926799461466e-06,
"loss": 2.2008,
"step": 4300
},
{
"epoch": 0.4670639394427444,
"grad_norm": 8.905241966247559,
"learning_rate": 9.591531071725992e-06,
"loss": 2.4518,
"step": 4350
},
{
"epoch": 0.4724324904708219,
"grad_norm": 6.864408016204834,
"learning_rate": 9.579981082513963e-06,
"loss": 2.4392,
"step": 4400
},
{
"epoch": 0.47780104149889946,
"grad_norm": 9.704352378845215,
"learning_rate": 9.568277219869887e-06,
"loss": 2.5323,
"step": 4450
},
{
"epoch": 0.48316959252697694,
"grad_norm": 10.778401374816895,
"learning_rate": 9.556419877007938e-06,
"loss": 1.8399,
"step": 4500
},
{
"epoch": 0.4885381435550545,
"grad_norm": 7.707949161529541,
"learning_rate": 9.544409452298773e-06,
"loss": 2.2951,
"step": 4550
},
{
"epoch": 0.493906694583132,
"grad_norm": 10.046951293945312,
"learning_rate": 9.532246349256122e-06,
"loss": 2.2967,
"step": 4600
},
{
"epoch": 0.49927524561120956,
"grad_norm": 6.738362789154053,
"learning_rate": 9.51993097652325e-06,
"loss": 2.5509,
"step": 4650
},
{
"epoch": 0.504643796639287,
"grad_norm": 8.04515552520752,
"learning_rate": 9.507463747859217e-06,
"loss": 1.8399,
"step": 4700
},
{
"epoch": 0.5100123476673646,
"grad_norm": 7.664340019226074,
"learning_rate": 9.49484508212498e-06,
"loss": 1.8151,
"step": 4750
},
{
"epoch": 0.5153808986954421,
"grad_norm": 7.901899814605713,
"learning_rate": 9.48207540326932e-06,
"loss": 2.1866,
"step": 4800
},
{
"epoch": 0.5207494497235197,
"grad_norm": 13.268074989318848,
"learning_rate": 9.469155140314604e-06,
"loss": 1.752,
"step": 4850
},
{
"epoch": 0.5261180007515971,
"grad_norm": 7.320032596588135,
"learning_rate": 9.45608472734236e-06,
"loss": 2.0824,
"step": 4900
},
{
"epoch": 0.5314865517796746,
"grad_norm": 11.113451957702637,
"learning_rate": 9.442864603478709e-06,
"loss": 2.2699,
"step": 4950
},
{
"epoch": 0.5368551028077522,
"grad_norm": 9.071028709411621,
"learning_rate": 9.429495212879593e-06,
"loss": 2.4293,
"step": 5000
},
{
"epoch": 0.5422236538358297,
"grad_norm": 7.577270984649658,
"learning_rate": 9.415977004715868e-06,
"loss": 2.095,
"step": 5050
},
{
"epoch": 0.5475922048639073,
"grad_norm": 10.385214805603027,
"learning_rate": 9.402310433158206e-06,
"loss": 2.2864,
"step": 5100
},
{
"epoch": 0.5529607558919848,
"grad_norm": 8.965862274169922,
"learning_rate": 9.388495957361836e-06,
"loss": 2.8198,
"step": 5150
},
{
"epoch": 0.5583293069200623,
"grad_norm": 10.5585355758667,
"learning_rate": 9.374534041451124e-06,
"loss": 1.8603,
"step": 5200
},
{
"epoch": 0.5636978579481398,
"grad_norm": 9.451448440551758,
"learning_rate": 9.360425154503969e-06,
"loss": 2.0864,
"step": 5250
},
{
"epoch": 0.5690664089762173,
"grad_norm": 13.569538116455078,
"learning_rate": 9.346169770536056e-06,
"loss": 1.8889,
"step": 5300
},
{
"epoch": 0.5744349600042948,
"grad_norm": 16.18905258178711,
"learning_rate": 9.331768368484918e-06,
"loss": 2.2455,
"step": 5350
},
{
"epoch": 0.5798035110323724,
"grad_norm": 8.802836418151855,
"learning_rate": 9.317221432193859e-06,
"loss": 2.1094,
"step": 5400
},
{
"epoch": 0.5851720620604499,
"grad_norm": 18.47078514099121,
"learning_rate": 9.302529450395682e-06,
"loss": 2.9082,
"step": 5450
},
{
"epoch": 0.5905406130885275,
"grad_norm": 8.37303352355957,
"learning_rate": 9.287692916696287e-06,
"loss": 2.1908,
"step": 5500
},
{
"epoch": 0.5959091641166049,
"grad_norm": 6.674489498138428,
"learning_rate": 9.27271232955807e-06,
"loss": 1.985,
"step": 5550
},
{
"epoch": 0.6012777151446824,
"grad_norm": 6.357884883880615,
"learning_rate": 9.257588192283189e-06,
"loss": 2.4121,
"step": 5600
},
{
"epoch": 0.60664626617276,
"grad_norm": 20.018545150756836,
"learning_rate": 9.242321012996649e-06,
"loss": 2.3254,
"step": 5650
},
{
"epoch": 0.6120148172008375,
"grad_norm": 6.632571220397949,
"learning_rate": 9.226911304629231e-06,
"loss": 2.0863,
"step": 5700
},
{
"epoch": 0.617383368228915,
"grad_norm": 6.925139427185059,
"learning_rate": 9.211359584900261e-06,
"loss": 2.2034,
"step": 5750
},
{
"epoch": 0.6227519192569926,
"grad_norm": 17.126575469970703,
"learning_rate": 9.195666376300212e-06,
"loss": 2.4107,
"step": 5800
},
{
"epoch": 0.62812047028507,
"grad_norm": 8.107414245605469,
"learning_rate": 9.179832206073152e-06,
"loss": 2.436,
"step": 5850
},
{
"epoch": 0.6334890213131475,
"grad_norm": 7.757444381713867,
"learning_rate": 9.163857606199039e-06,
"loss": 2.3477,
"step": 5900
},
{
"epoch": 0.6388575723412251,
"grad_norm": 18.574289321899414,
"learning_rate": 9.147743113375827e-06,
"loss": 2.0063,
"step": 5950
},
{
"epoch": 0.6442261233693026,
"grad_norm": 8.169580459594727,
"learning_rate": 9.13148926900146e-06,
"loss": 2.4438,
"step": 6000
},
{
"epoch": 0.6495946743973802,
"grad_norm": 6.801086902618408,
"learning_rate": 9.115096619155663e-06,
"loss": 2.0501,
"step": 6050
},
{
"epoch": 0.6549632254254577,
"grad_norm": 10.179924964904785,
"learning_rate": 9.098565714581601e-06,
"loss": 2.3302,
"step": 6100
},
{
"epoch": 0.6603317764535352,
"grad_norm": 9.375894546508789,
"learning_rate": 9.081897110667388e-06,
"loss": 2.4207,
"step": 6150
},
{
"epoch": 0.6657003274816127,
"grad_norm": 18.672060012817383,
"learning_rate": 9.065091367427401e-06,
"loss": 2.4269,
"step": 6200
},
{
"epoch": 0.6710688785096902,
"grad_norm": 9.124485969543457,
"learning_rate": 9.048149049483497e-06,
"loss": 2.6344,
"step": 6250
},
{
"epoch": 0.6764374295377678,
"grad_norm": 17.736295700073242,
"learning_rate": 9.031070726046014e-06,
"loss": 2.0867,
"step": 6300
},
{
"epoch": 0.6818059805658453,
"grad_norm": 10.588594436645508,
"learning_rate": 9.013856970894672e-06,
"loss": 2.3375,
"step": 6350
},
{
"epoch": 0.6871745315939228,
"grad_norm": 19.190420150756836,
"learning_rate": 8.996508362359278e-06,
"loss": 2.3877,
"step": 6400
},
{
"epoch": 0.6925430826220004,
"grad_norm": 17.127389907836914,
"learning_rate": 8.979025483300305e-06,
"loss": 2.4791,
"step": 6450
},
{
"epoch": 0.6979116336500778,
"grad_norm": 17.82516860961914,
"learning_rate": 8.961408921089304e-06,
"loss": 2.13,
"step": 6500
},
{
"epoch": 0.7032801846781553,
"grad_norm": 7.146808624267578,
"learning_rate": 8.943659267589177e-06,
"loss": 2.2693,
"step": 6550
},
{
"epoch": 0.7086487357062329,
"grad_norm": 16.019311904907227,
"learning_rate": 8.925777119134288e-06,
"loss": 2.0913,
"step": 6600
},
{
"epoch": 0.7140172867343104,
"grad_norm": 18.300996780395508,
"learning_rate": 8.90776307651043e-06,
"loss": 2.5383,
"step": 6650
},
{
"epoch": 0.719385837762388,
"grad_norm": 17.373537063598633,
"learning_rate": 8.889617744934632e-06,
"loss": 2.2624,
"step": 6700
},
{
"epoch": 0.7247543887904655,
"grad_norm": 8.753660202026367,
"learning_rate": 8.871341734034835e-06,
"loss": 2.5488,
"step": 6750
},
{
"epoch": 0.7301229398185429,
"grad_norm": 10.378632545471191,
"learning_rate": 8.852935657829414e-06,
"loss": 1.8973,
"step": 6800
},
{
"epoch": 0.7354914908466205,
"grad_norm": 6.974676132202148,
"learning_rate": 8.834400134706538e-06,
"loss": 1.7114,
"step": 6850
},
{
"epoch": 0.740860041874698,
"grad_norm": 17.7410888671875,
"learning_rate": 8.815735787403397e-06,
"loss": 2.063,
"step": 6900
},
{
"epoch": 0.7462285929027755,
"grad_norm": 18.06396484375,
"learning_rate": 8.796943242985283e-06,
"loss": 2.3759,
"step": 6950
},
{
"epoch": 0.7515971439308531,
"grad_norm": 7.956383228302002,
"learning_rate": 8.778023132824523e-06,
"loss": 1.8869,
"step": 7000
},
{
"epoch": 0.7569656949589306,
"grad_norm": 7.235781669616699,
"learning_rate": 8.758976092579263e-06,
"loss": 2.0486,
"step": 7050
},
{
"epoch": 0.762334245987008,
"grad_norm": 8.756217956542969,
"learning_rate": 8.739802762172112e-06,
"loss": 2.3013,
"step": 7100
},
{
"epoch": 0.7677027970150856,
"grad_norm": 10.677332878112793,
"learning_rate": 8.72050378576865e-06,
"loss": 2.2763,
"step": 7150
},
{
"epoch": 0.7730713480431631,
"grad_norm": 7.494720935821533,
"learning_rate": 8.701079811755775e-06,
"loss": 2.4137,
"step": 7200
},
{
"epoch": 0.7784398990712407,
"grad_norm": 8.669584274291992,
"learning_rate": 8.681531492719924e-06,
"loss": 2.0786,
"step": 7250
},
{
"epoch": 0.7838084500993182,
"grad_norm": 6.614152431488037,
"learning_rate": 8.661859485425153e-06,
"loss": 2.0609,
"step": 7300
},
{
"epoch": 0.7891770011273957,
"grad_norm": 6.329990863800049,
"learning_rate": 8.642064450791063e-06,
"loss": 2.1517,
"step": 7350
},
{
"epoch": 0.7945455521554733,
"grad_norm": 8.025002479553223,
"learning_rate": 8.622147053870603e-06,
"loss": 1.9102,
"step": 7400
},
{
"epoch": 0.7999141031835507,
"grad_norm": 6.091482639312744,
"learning_rate": 8.60210796382772e-06,
"loss": 1.824,
"step": 7450
},
{
"epoch": 0.8052826542116283,
"grad_norm": 9.765973091125488,
"learning_rate": 8.58194785391488e-06,
"loss": 2.4761,
"step": 7500
},
{
"epoch": 0.8106512052397058,
"grad_norm": 8.054783821105957,
"learning_rate": 8.561667401450448e-06,
"loss": 2.152,
"step": 7550
},
{
"epoch": 0.8160197562677833,
"grad_norm": 8.503512382507324,
"learning_rate": 8.541267287795936e-06,
"loss": 2.3627,
"step": 7600
},
{
"epoch": 0.8213883072958609,
"grad_norm": 6.793158054351807,
"learning_rate": 8.520748198333104e-06,
"loss": 2.0025,
"step": 7650
},
{
"epoch": 0.8267568583239384,
"grad_norm": 8.554807662963867,
"learning_rate": 8.50011082244094e-06,
"loss": 2.8647,
"step": 7700
},
{
"epoch": 0.8321254093520158,
"grad_norm": 8.009889602661133,
"learning_rate": 8.479355853472492e-06,
"loss": 2.1245,
"step": 7750
},
{
"epoch": 0.8374939603800934,
"grad_norm": 5.670645713806152,
"learning_rate": 8.458483988731585e-06,
"loss": 2.0752,
"step": 7800
},
{
"epoch": 0.8428625114081709,
"grad_norm": 7.713712692260742,
"learning_rate": 8.43749592944938e-06,
"loss": 2.413,
"step": 7850
},
{
"epoch": 0.8482310624362485,
"grad_norm": 8.770386695861816,
"learning_rate": 8.41639238076082e-06,
"loss": 1.9887,
"step": 7900
},
{
"epoch": 0.853599613464326,
"grad_norm": 7.535435199737549,
"learning_rate": 8.39517405168095e-06,
"loss": 1.9605,
"step": 7950
},
{
"epoch": 0.8589681644924035,
"grad_norm": 8.992109298706055,
"learning_rate": 8.373841655081077e-06,
"loss": 1.9232,
"step": 8000
},
{
"epoch": 0.864336715520481,
"grad_norm": 5.412756443023682,
"learning_rate": 8.352395907664832e-06,
"loss": 2.3468,
"step": 8050
},
{
"epoch": 0.8697052665485585,
"grad_norm": 6.559614181518555,
"learning_rate": 8.330837529944093e-06,
"loss": 2.0389,
"step": 8100
},
{
"epoch": 0.875073817576636,
"grad_norm": 8.850929260253906,
"learning_rate": 8.309167246214771e-06,
"loss": 2.3683,
"step": 8150
},
{
"epoch": 0.8804423686047136,
"grad_norm": 17.323158264160156,
"learning_rate": 8.287385784532475e-06,
"loss": 2.2053,
"step": 8200
},
{
"epoch": 0.8858109196327911,
"grad_norm": 6.870123863220215,
"learning_rate": 8.265493876688062e-06,
"loss": 2.4002,
"step": 8250
},
{
"epoch": 0.8911794706608687,
"grad_norm": 8.322813034057617,
"learning_rate": 8.243492258183038e-06,
"loss": 2.2789,
"step": 8300
},
{
"epoch": 0.8965480216889462,
"grad_norm": 6.7904839515686035,
"learning_rate": 8.221381668204858e-06,
"loss": 2.5743,
"step": 8350
},
{
"epoch": 0.9019165727170236,
"grad_norm": 8.629620552062988,
"learning_rate": 8.199162849602083e-06,
"loss": 2.1342,
"step": 8400
},
{
"epoch": 0.9072851237451012,
"grad_norm": 6.57612943649292,
"learning_rate": 8.176836548859426e-06,
"loss": 2.3242,
"step": 8450
},
{
"epoch": 0.9126536747731787,
"grad_norm": 18.26816177368164,
"learning_rate": 8.15440351607268e-06,
"loss": 2.2392,
"step": 8500
},
{
"epoch": 0.9180222258012563,
"grad_norm": 7.219480037689209,
"learning_rate": 8.131864504923501e-06,
"loss": 1.9592,
"step": 8550
},
{
"epoch": 0.9233907768293338,
"grad_norm": 17.576231002807617,
"learning_rate": 8.109220272654103e-06,
"loss": 2.2499,
"step": 8600
},
{
"epoch": 0.9287593278574113,
"grad_norm": 8.521632194519043,
"learning_rate": 8.086471580041806e-06,
"loss": 2.2565,
"step": 8650
},
{
"epoch": 0.9341278788854888,
"grad_norm": 7.680962562561035,
"learning_rate": 8.063619191373478e-06,
"loss": 2.114,
"step": 8700
},
{
"epoch": 0.9394964299135663,
"grad_norm": 6.641688346862793,
"learning_rate": 8.040663874419863e-06,
"loss": 2.3469,
"step": 8750
},
{
"epoch": 0.9448649809416438,
"grad_norm": 7.556726932525635,
"learning_rate": 8.017606400409781e-06,
"loss": 2.233,
"step": 8800
},
{
"epoch": 0.9502335319697214,
"grad_norm": 7.783173561096191,
"learning_rate": 7.994447544004215e-06,
"loss": 2.0393,
"step": 8850
},
{
"epoch": 0.9556020829977989,
"grad_norm": 17.22361946105957,
"learning_rate": 7.971188083270294e-06,
"loss": 2.0588,
"step": 8900
},
{
"epoch": 0.9609706340258765,
"grad_norm": 8.3529052734375,
"learning_rate": 7.947828799655142e-06,
"loss": 1.9373,
"step": 8950
},
{
"epoch": 0.9663391850539539,
"grad_norm": 15.621068954467773,
"learning_rate": 7.92437047795963e-06,
"loss": 1.8505,
"step": 9000
},
{
"epoch": 0.9717077360820314,
"grad_norm": 7.267556190490723,
"learning_rate": 7.900813906312004e-06,
"loss": 1.8633,
"step": 9050
},
{
"epoch": 0.977076287110109,
"grad_norm": 6.835626602172852,
"learning_rate": 7.877159876141415e-06,
"loss": 2.0578,
"step": 9100
},
{
"epoch": 0.9824448381381865,
"grad_norm": 6.728379726409912,
"learning_rate": 7.853409182151321e-06,
"loss": 2.4776,
"step": 9150
},
{
"epoch": 0.987813389166264,
"grad_norm": 7.951884746551514,
"learning_rate": 7.829562622292788e-06,
"loss": 2.405,
"step": 9200
},
{
"epoch": 0.9931819401943416,
"grad_norm": 7.328428268432617,
"learning_rate": 7.805620997737691e-06,
"loss": 2.5417,
"step": 9250
},
{
"epoch": 0.9985504912224191,
"grad_norm": 6.334090709686279,
"learning_rate": 7.781585112851778e-06,
"loss": 1.9777,
"step": 9300
},
{
"epoch": 1.0039727277607773,
"grad_norm": 15.359502792358398,
"learning_rate": 7.757455775167669e-06,
"loss": 1.7902,
"step": 9350
},
{
"epoch": 1.009341278788855,
"grad_norm": 7.517678737640381,
"learning_rate": 7.733233795357706e-06,
"loss": 1.7571,
"step": 9400
},
{
"epoch": 1.0147098298169324,
"grad_norm": 9.919081687927246,
"learning_rate": 7.708919987206727e-06,
"loss": 1.6278,
"step": 9450
},
{
"epoch": 1.02007838084501,
"grad_norm": 16.150758743286133,
"learning_rate": 7.684515167584725e-06,
"loss": 1.9163,
"step": 9500
},
{
"epoch": 1.0254469318730874,
"grad_norm": 5.594913482666016,
"learning_rate": 7.660020156419398e-06,
"loss": 2.0141,
"step": 9550
},
{
"epoch": 1.0308154829011649,
"grad_norm": 6.763994216918945,
"learning_rate": 7.63543577666861e-06,
"loss": 1.7438,
"step": 9600
},
{
"epoch": 1.0361840339292425,
"grad_norm": 7.240082740783691,
"learning_rate": 7.6107628542927305e-06,
"loss": 1.859,
"step": 9650
},
{
"epoch": 1.04155258495732,
"grad_norm": 8.067387580871582,
"learning_rate": 7.5860022182269e-06,
"loss": 1.7783,
"step": 9700
},
{
"epoch": 1.0469211359853976,
"grad_norm": 6.464083194732666,
"learning_rate": 7.561154700353166e-06,
"loss": 1.4824,
"step": 9750
},
{
"epoch": 1.052289687013475,
"grad_norm": 6.964838027954102,
"learning_rate": 7.536221135472545e-06,
"loss": 1.533,
"step": 9800
},
{
"epoch": 1.0576582380415527,
"grad_norm": 6.4511823654174805,
"learning_rate": 7.511202361276966e-06,
"loss": 1.5956,
"step": 9850
},
{
"epoch": 1.06302678906963,
"grad_norm": 17.920740127563477,
"learning_rate": 7.486099218321138e-06,
"loss": 1.7306,
"step": 9900
},
{
"epoch": 1.0683953400977075,
"grad_norm": 8.541760444641113,
"learning_rate": 7.4609125499942995e-06,
"loss": 1.7691,
"step": 9950
},
{
"epoch": 1.0737638911257852,
"grad_norm": 6.779469966888428,
"learning_rate": 7.435643202491884e-06,
"loss": 1.7331,
"step": 10000
},
{
"epoch": 1.0791324421538626,
"grad_norm": 8.191193580627441,
"learning_rate": 7.410292024787106e-06,
"loss": 1.7757,
"step": 10050
},
{
"epoch": 1.0845009931819403,
"grad_norm": 7.109296798706055,
"learning_rate": 7.384859868602411e-06,
"loss": 1.5935,
"step": 10100
},
{
"epoch": 1.0898695442100177,
"grad_norm": 6.142228126525879,
"learning_rate": 7.359347588380886e-06,
"loss": 1.5366,
"step": 10150
},
{
"epoch": 1.0952380952380953,
"grad_norm": 6.949032306671143,
"learning_rate": 7.333756041257537e-06,
"loss": 1.5345,
"step": 10200
},
{
"epoch": 1.1006066462661728,
"grad_norm": 8.204275131225586,
"learning_rate": 7.308086087030498e-06,
"loss": 1.6411,
"step": 10250
},
{
"epoch": 1.1059751972942502,
"grad_norm": 9.171077728271484,
"learning_rate": 7.282338588132143e-06,
"loss": 1.5583,
"step": 10300
},
{
"epoch": 1.1113437483223279,
"grad_norm": 7.4853105545043945,
"learning_rate": 7.256514409600108e-06,
"loss": 1.6944,
"step": 10350
},
{
"epoch": 1.1167122993504053,
"grad_norm": 5.683228492736816,
"learning_rate": 7.23061441904824e-06,
"loss": 1.7684,
"step": 10400
},
{
"epoch": 1.122080850378483,
"grad_norm": 7.478188514709473,
"learning_rate": 7.2046394866374295e-06,
"loss": 1.8792,
"step": 10450
},
{
"epoch": 1.1274494014065604,
"grad_norm": 6.323553085327148,
"learning_rate": 7.17859048504639e-06,
"loss": 1.6032,
"step": 10500
},
{
"epoch": 1.132817952434638,
"grad_norm": 17.744308471679688,
"learning_rate": 7.152468289442334e-06,
"loss": 1.3883,
"step": 10550
},
{
"epoch": 1.1381865034627154,
"grad_norm": 18.162912368774414,
"learning_rate": 7.126273777451572e-06,
"loss": 2.0817,
"step": 10600
},
{
"epoch": 1.1435550544907929,
"grad_norm": 7.005634307861328,
"learning_rate": 7.100007829130021e-06,
"loss": 2.039,
"step": 10650
},
{
"epoch": 1.1489236055188705,
"grad_norm": 7.600114345550537,
"learning_rate": 7.073671326933645e-06,
"loss": 1.7712,
"step": 10700
},
{
"epoch": 1.154292156546948,
"grad_norm": 5.979006290435791,
"learning_rate": 7.047265155688798e-06,
"loss": 1.6261,
"step": 10750
},
{
"epoch": 1.1596607075750256,
"grad_norm": 7.593403339385986,
"learning_rate": 7.020790202562513e-06,
"loss": 1.6303,
"step": 10800
},
{
"epoch": 1.165029258603103,
"grad_norm": 6.739507675170898,
"learning_rate": 6.994247357032672e-06,
"loss": 1.7067,
"step": 10850
},
{
"epoch": 1.1703978096311805,
"grad_norm": 7.7881598472595215,
"learning_rate": 6.967637510858145e-06,
"loss": 1.7556,
"step": 10900
},
{
"epoch": 1.175766360659258,
"grad_norm": 8.353170394897461,
"learning_rate": 6.940961558048814e-06,
"loss": 1.7988,
"step": 10950
},
{
"epoch": 1.1811349116873355,
"grad_norm": 17.190288543701172,
"learning_rate": 6.914220394835547e-06,
"loss": 1.8766,
"step": 11000
},
{
"epoch": 1.1865034627154132,
"grad_norm": 7.401528835296631,
"learning_rate": 6.88741491964008e-06,
"loss": 1.5798,
"step": 11050
},
{
"epoch": 1.1918720137434906,
"grad_norm": 9.883387565612793,
"learning_rate": 6.860546033044836e-06,
"loss": 1.6789,
"step": 11100
},
{
"epoch": 1.197240564771568,
"grad_norm": 6.592789173126221,
"learning_rate": 6.833614637762671e-06,
"loss": 1.6847,
"step": 11150
},
{
"epoch": 1.2026091157996457,
"grad_norm": 8.399685859680176,
"learning_rate": 6.806621638606541e-06,
"loss": 1.9617,
"step": 11200
},
{
"epoch": 1.2079776668277231,
"grad_norm": 9.318310737609863,
"learning_rate": 6.779567942459106e-06,
"loss": 1.6214,
"step": 11250
},
{
"epoch": 1.2133462178558008,
"grad_norm": 9.2510404586792,
"learning_rate": 6.7524544582422556e-06,
"loss": 1.7572,
"step": 11300
},
{
"epoch": 1.2187147688838782,
"grad_norm": 7.251592636108398,
"learning_rate": 6.725282096886584e-06,
"loss": 1.6631,
"step": 11350
},
{
"epoch": 1.2240833199119558,
"grad_norm": 7.429468631744385,
"learning_rate": 6.698051771300772e-06,
"loss": 1.8303,
"step": 11400
},
{
"epoch": 1.2294518709400333,
"grad_norm": 8.812094688415527,
"learning_rate": 6.670764396340924e-06,
"loss": 1.7698,
"step": 11450
},
{
"epoch": 1.2348204219681107,
"grad_norm": 6.430805206298828,
"learning_rate": 6.643420888779832e-06,
"loss": 1.6816,
"step": 11500
},
{
"epoch": 1.2401889729961884,
"grad_norm": 5.964927673339844,
"learning_rate": 6.61602216727617e-06,
"loss": 1.9627,
"step": 11550
},
{
"epoch": 1.2455575240242658,
"grad_norm": 8.360040664672852,
"learning_rate": 6.588569152343636e-06,
"loss": 1.6678,
"step": 11600
},
{
"epoch": 1.2509260750523434,
"grad_norm": 8.492232322692871,
"learning_rate": 6.561062766320015e-06,
"loss": 1.5811,
"step": 11650
},
{
"epoch": 1.2562946260804209,
"grad_norm": 6.507018566131592,
"learning_rate": 6.533503933336207e-06,
"loss": 1.8282,
"step": 11700
},
{
"epoch": 1.2616631771084985,
"grad_norm": 6.434554100036621,
"learning_rate": 6.505893579285164e-06,
"loss": 1.6284,
"step": 11750
},
{
"epoch": 1.267031728136576,
"grad_norm": 5.991467475891113,
"learning_rate": 6.478232631790792e-06,
"loss": 1.6377,
"step": 11800
},
{
"epoch": 1.2724002791646534,
"grad_norm": 17.064252853393555,
"learning_rate": 6.45052202017678e-06,
"loss": 1.5149,
"step": 11850
},
{
"epoch": 1.277768830192731,
"grad_norm": 18.899658203125,
"learning_rate": 6.422762675435387e-06,
"loss": 1.9017,
"step": 11900
},
{
"epoch": 1.2831373812208084,
"grad_norm": 6.97517728805542,
"learning_rate": 6.3949555301961474e-06,
"loss": 1.5649,
"step": 11950
},
{
"epoch": 1.288505932248886,
"grad_norm": 18.426116943359375,
"learning_rate": 6.367101518694554e-06,
"loss": 1.8782,
"step": 12000
},
{
"epoch": 1.2938744832769635,
"grad_norm": 18.379648208618164,
"learning_rate": 6.3392015767406626e-06,
"loss": 1.8358,
"step": 12050
},
{
"epoch": 1.2992430343050412,
"grad_norm": 10.581155776977539,
"learning_rate": 6.311256641687648e-06,
"loss": 1.8926,
"step": 12100
},
{
"epoch": 1.3046115853331186,
"grad_norm": 6.98642635345459,
"learning_rate": 6.283267652400323e-06,
"loss": 1.6466,
"step": 12150
},
{
"epoch": 1.309980136361196,
"grad_norm": 6.588789939880371,
"learning_rate": 6.25523554922358e-06,
"loss": 1.825,
"step": 12200
},
{
"epoch": 1.3153486873892737,
"grad_norm": 19.694665908813477,
"learning_rate": 6.227161273950818e-06,
"loss": 1.7454,
"step": 12250
},
{
"epoch": 1.3207172384173511,
"grad_norm": 8.332229614257812,
"learning_rate": 6.199045769792279e-06,
"loss": 1.7778,
"step": 12300
},
{
"epoch": 1.3260857894454285,
"grad_norm": 7.123226642608643,
"learning_rate": 6.170889981343378e-06,
"loss": 1.8883,
"step": 12350
},
{
"epoch": 1.3314543404735062,
"grad_norm": 11.233098030090332,
"learning_rate": 6.142694854552957e-06,
"loss": 1.7369,
"step": 12400
},
{
"epoch": 1.3368228915015838,
"grad_norm": 6.440243244171143,
"learning_rate": 6.114461336691505e-06,
"loss": 1.5687,
"step": 12450
},
{
"epoch": 1.3421914425296613,
"grad_norm": 19.100027084350586,
"learning_rate": 6.0861903763193374e-06,
"loss": 1.8765,
"step": 12500
},
{
"epoch": 1.3475599935577387,
"grad_norm": 17.5304012298584,
"learning_rate": 6.05788292325472e-06,
"loss": 1.4992,
"step": 12550
},
{
"epoch": 1.3529285445858164,
"grad_norm": 17.435745239257812,
"learning_rate": 6.029539928541965e-06,
"loss": 1.7109,
"step": 12600
},
{
"epoch": 1.3582970956138938,
"grad_norm": 19.75895118713379,
"learning_rate": 6.001162344419477e-06,
"loss": 1.7342,
"step": 12650
},
{
"epoch": 1.3636656466419712,
"grad_norm": 6.659576892852783,
"learning_rate": 5.9727511242877565e-06,
"loss": 1.4889,
"step": 12700
},
{
"epoch": 1.3690341976700489,
"grad_norm": 7.136165142059326,
"learning_rate": 5.944307222677372e-06,
"loss": 1.6115,
"step": 12750
},
{
"epoch": 1.3744027486981265,
"grad_norm": 7.5163092613220215,
"learning_rate": 5.915831595216894e-06,
"loss": 1.5552,
"step": 12800
},
{
"epoch": 1.379771299726204,
"grad_norm": 6.865508079528809,
"learning_rate": 5.88732519860078e-06,
"loss": 1.8203,
"step": 12850
},
{
"epoch": 1.3851398507542814,
"grad_norm": 6.863709449768066,
"learning_rate": 5.858788990557239e-06,
"loss": 1.7349,
"step": 12900
},
{
"epoch": 1.390508401782359,
"grad_norm": 7.734783172607422,
"learning_rate": 5.8302239298160565e-06,
"loss": 1.5744,
"step": 12950
},
{
"epoch": 1.3958769528104364,
"grad_norm": 19.227140426635742,
"learning_rate": 5.8016309760763755e-06,
"loss": 1.7058,
"step": 13000
},
{
"epoch": 1.4012455038385139,
"grad_norm": 6.083110332489014,
"learning_rate": 5.773011089974464e-06,
"loss": 1.7991,
"step": 13050
},
{
"epoch": 1.4066140548665915,
"grad_norm": 7.670594215393066,
"learning_rate": 5.7443652330514335e-06,
"loss": 1.6516,
"step": 13100
},
{
"epoch": 1.411982605894669,
"grad_norm": 6.880539894104004,
"learning_rate": 5.715694367720932e-06,
"loss": 1.7352,
"step": 13150
},
{
"epoch": 1.4173511569227466,
"grad_norm": 7.04241418838501,
"learning_rate": 5.686999457236823e-06,
"loss": 1.6688,
"step": 13200
},
{
"epoch": 1.422719707950824,
"grad_norm": 6.067574977874756,
"learning_rate": 5.658281465660804e-06,
"loss": 1.5615,
"step": 13250
},
{
"epoch": 1.4280882589789017,
"grad_norm": 6.4984331130981445,
"learning_rate": 5.629541357830035e-06,
"loss": 1.7143,
"step": 13300
},
{
"epoch": 1.433456810006979,
"grad_norm": 8.660819053649902,
"learning_rate": 5.600780099324711e-06,
"loss": 1.7287,
"step": 13350
},
{
"epoch": 1.4388253610350565,
"grad_norm": 7.762180328369141,
"learning_rate": 5.571998656435624e-06,
"loss": 1.5272,
"step": 13400
},
{
"epoch": 1.4441939120631342,
"grad_norm": 5.679063320159912,
"learning_rate": 5.543197996131704e-06,
"loss": 1.9106,
"step": 13450
},
{
"epoch": 1.4495624630912116,
"grad_norm": 18.31028175354004,
"learning_rate": 5.514379086027525e-06,
"loss": 1.5766,
"step": 13500
},
{
"epoch": 1.4549310141192893,
"grad_norm": 7.93739128112793,
"learning_rate": 5.485542894350797e-06,
"loss": 1.6056,
"step": 13550
},
{
"epoch": 1.4602995651473667,
"grad_norm": 19.02863883972168,
"learning_rate": 5.456690389909844e-06,
"loss": 1.8463,
"step": 13600
},
{
"epoch": 1.4656681161754443,
"grad_norm": 20.583894729614258,
"learning_rate": 5.427822542061043e-06,
"loss": 1.8393,
"step": 13650
},
{
"epoch": 1.4710366672035218,
"grad_norm": 7.795589447021484,
"learning_rate": 5.398940320676268e-06,
"loss": 1.5547,
"step": 13700
},
{
"epoch": 1.4764052182315992,
"grad_norm": 17.66240882873535,
"learning_rate": 5.3700446961102945e-06,
"loss": 1.8426,
"step": 13750
},
{
"epoch": 1.4817737692596769,
"grad_norm": 7.43621826171875,
"learning_rate": 5.3411366391682114e-06,
"loss": 1.6478,
"step": 13800
},
{
"epoch": 1.4871423202877543,
"grad_norm": 8.103897094726562,
"learning_rate": 5.31221712107279e-06,
"loss": 1.5381,
"step": 13850
},
{
"epoch": 1.492510871315832,
"grad_norm": 20.500654220581055,
"learning_rate": 5.283287113431867e-06,
"loss": 1.69,
"step": 13900
},
{
"epoch": 1.4978794223439094,
"grad_norm": 6.227882385253906,
"learning_rate": 5.2543475882056936e-06,
"loss": 1.7197,
"step": 13950
},
{
"epoch": 1.503247973371987,
"grad_norm": 6.347196578979492,
"learning_rate": 5.225399517674282e-06,
"loss": 1.6418,
"step": 14000
},
{
"epoch": 1.5086165244000644,
"grad_norm": 6.504974365234375,
"learning_rate": 5.196443874404744e-06,
"loss": 1.6719,
"step": 14050
},
{
"epoch": 1.5139850754281419,
"grad_norm": 6.531764507293701,
"learning_rate": 5.167481631218608e-06,
"loss": 1.7098,
"step": 14100
},
{
"epoch": 1.5193536264562195,
"grad_norm": 10.471376419067383,
"learning_rate": 5.138513761159144e-06,
"loss": 1.5199,
"step": 14150
},
{
"epoch": 1.524722177484297,
"grad_norm": 9.339461326599121,
"learning_rate": 5.109541237458664e-06,
"loss": 1.5637,
"step": 14200
},
{
"epoch": 1.5300907285123744,
"grad_norm": 6.214099884033203,
"learning_rate": 5.08056503350583e-06,
"loss": 1.6646,
"step": 14250
},
{
"epoch": 1.535459279540452,
"grad_norm": 7.6688055992126465,
"learning_rate": 5.0515861228129495e-06,
"loss": 1.7032,
"step": 14300
},
{
"epoch": 1.5408278305685297,
"grad_norm": 6.576687335968018,
"learning_rate": 5.022605478983268e-06,
"loss": 1.6774,
"step": 14350
},
{
"epoch": 1.546196381596607,
"grad_norm": 7.902665138244629,
"learning_rate": 4.993624075678259e-06,
"loss": 1.5937,
"step": 14400
},
{
"epoch": 1.5515649326246845,
"grad_norm": 19.9635009765625,
"learning_rate": 4.964642886584911e-06,
"loss": 1.6069,
"step": 14450
},
{
"epoch": 1.5569334836527622,
"grad_norm": 6.8427958488464355,
"learning_rate": 4.935662885383017e-06,
"loss": 1.5762,
"step": 14500
},
{
"epoch": 1.5623020346808396,
"grad_norm": 8.082759857177734,
"learning_rate": 4.906685045712461e-06,
"loss": 1.5261,
"step": 14550
},
{
"epoch": 1.567670585708917,
"grad_norm": 5.131589412689209,
"learning_rate": 4.877710341140504e-06,
"loss": 1.4827,
"step": 14600
},
{
"epoch": 1.5730391367369947,
"grad_norm": 17.903608322143555,
"learning_rate": 4.84873974512908e-06,
"loss": 1.689,
"step": 14650
},
{
"epoch": 1.5784076877650723,
"grad_norm": 5.094648838043213,
"learning_rate": 4.819774231002085e-06,
"loss": 1.8171,
"step": 14700
},
{
"epoch": 1.5837762387931498,
"grad_norm": 7.117594242095947,
"learning_rate": 4.790814771912681e-06,
"loss": 1.6111,
"step": 14750
},
{
"epoch": 1.5891447898212272,
"grad_norm": 6.486269950866699,
"learning_rate": 4.7618623408105956e-06,
"loss": 1.4813,
"step": 14800
},
{
"epoch": 1.5945133408493048,
"grad_norm": 7.995445728302002,
"learning_rate": 4.7329179104094456e-06,
"loss": 1.6475,
"step": 14850
},
{
"epoch": 1.5998818918773823,
"grad_norm": 7.533879280090332,
"learning_rate": 4.703982453154041e-06,
"loss": 1.6606,
"step": 14900
},
{
"epoch": 1.6052504429054597,
"grad_norm": 17.664257049560547,
"learning_rate": 4.6750569411877244e-06,
"loss": 1.6459,
"step": 14950
},
{
"epoch": 1.6106189939335374,
"grad_norm": 20.12204933166504,
"learning_rate": 4.646142346319705e-06,
"loss": 1.5996,
"step": 15000
},
{
"epoch": 1.615987544961615,
"grad_norm": 6.255960464477539,
"learning_rate": 4.617239639992411e-06,
"loss": 1.7002,
"step": 15050
},
{
"epoch": 1.6213560959896924,
"grad_norm": 20.118432998657227,
"learning_rate": 4.588349793248856e-06,
"loss": 1.6454,
"step": 15100
},
{
"epoch": 1.6267246470177699,
"grad_norm": 7.044247627258301,
"learning_rate": 4.559473776700007e-06,
"loss": 1.7084,
"step": 15150
},
{
"epoch": 1.6320931980458475,
"grad_norm": 7.60048246383667,
"learning_rate": 4.530612560492178e-06,
"loss": 1.9412,
"step": 15200
},
{
"epoch": 1.637461749073925,
"grad_norm": 18.018789291381836,
"learning_rate": 4.501767114274436e-06,
"loss": 1.7135,
"step": 15250
},
{
"epoch": 1.6428303001020024,
"grad_norm": 7.586131572723389,
"learning_rate": 4.4729384071660295e-06,
"loss": 1.5691,
"step": 15300
},
{
"epoch": 1.64819885113008,
"grad_norm": 13.612800598144531,
"learning_rate": 4.444127407723819e-06,
"loss": 1.9744,
"step": 15350
},
{
"epoch": 1.6535674021581577,
"grad_norm": 7.3399577140808105,
"learning_rate": 4.4153350839097415e-06,
"loss": 1.4499,
"step": 15400
},
{
"epoch": 1.6589359531862349,
"grad_norm": 6.747891426086426,
"learning_rate": 4.386562403058292e-06,
"loss": 1.8181,
"step": 15450
},
{
"epoch": 1.6643045042143125,
"grad_norm": 7.361255645751953,
"learning_rate": 4.357810331844017e-06,
"loss": 1.6833,
"step": 15500
},
{
"epoch": 1.6696730552423902,
"grad_norm": 5.927125453948975,
"learning_rate": 4.329079836249051e-06,
"loss": 1.7711,
"step": 15550
},
{
"epoch": 1.6750416062704676,
"grad_norm": 7.615528106689453,
"learning_rate": 4.300371881530645e-06,
"loss": 1.5959,
"step": 15600
},
{
"epoch": 1.680410157298545,
"grad_norm": 20.07931900024414,
"learning_rate": 4.271687432188749e-06,
"loss": 1.7049,
"step": 15650
},
{
"epoch": 1.6857787083266227,
"grad_norm": 7.868457794189453,
"learning_rate": 4.243027451933599e-06,
"loss": 1.6376,
"step": 15700
},
{
"epoch": 1.6911472593547001,
"grad_norm": 8.05305290222168,
"learning_rate": 4.214392903653351e-06,
"loss": 1.6639,
"step": 15750
},
{
"epoch": 1.6965158103827775,
"grad_norm": 9.143363952636719,
"learning_rate": 4.185784749381721e-06,
"loss": 1.6835,
"step": 15800
},
{
"epoch": 1.7018843614108552,
"grad_norm": 7.359554767608643,
"learning_rate": 4.157203950265665e-06,
"loss": 1.414,
"step": 15850
},
{
"epoch": 1.7072529124389328,
"grad_norm": 7.3185834884643555,
"learning_rate": 4.12865146653309e-06,
"loss": 1.8536,
"step": 15900
},
{
"epoch": 1.7126214634670103,
"grad_norm": 7.609386920928955,
"learning_rate": 4.100128257460595e-06,
"loss": 1.8839,
"step": 15950
},
{
"epoch": 1.7179900144950877,
"grad_norm": 17.042022705078125,
"learning_rate": 4.071635281341235e-06,
"loss": 1.7974,
"step": 16000
},
{
"epoch": 1.7233585655231654,
"grad_norm": 9.913634300231934,
"learning_rate": 4.043173495452332e-06,
"loss": 1.6566,
"step": 16050
},
{
"epoch": 1.7287271165512428,
"grad_norm": 6.3825907707214355,
"learning_rate": 4.0147438560233134e-06,
"loss": 1.9744,
"step": 16100
},
{
"epoch": 1.7340956675793202,
"grad_norm": 9.182840347290039,
"learning_rate": 3.986347318203575e-06,
"loss": 1.7298,
"step": 16150
},
{
"epoch": 1.7394642186073979,
"grad_norm": 5.4667582511901855,
"learning_rate": 3.957984836030413e-06,
"loss": 1.6783,
"step": 16200
},
{
"epoch": 1.7448327696354755,
"grad_norm": 7.24221134185791,
"learning_rate": 3.929657362396945e-06,
"loss": 1.9194,
"step": 16250
},
{
"epoch": 1.750201320663553,
"grad_norm": 8.579157829284668,
"learning_rate": 3.9013658490201125e-06,
"loss": 1.717,
"step": 16300
},
{
"epoch": 1.7555698716916304,
"grad_norm": 6.769927024841309,
"learning_rate": 3.8731112464087025e-06,
"loss": 1.7442,
"step": 16350
},
{
"epoch": 1.760938422719708,
"grad_norm": 6.856928825378418,
"learning_rate": 3.844894503831414e-06,
"loss": 1.8871,
"step": 16400
},
{
"epoch": 1.7663069737477854,
"grad_norm": 6.8800859451293945,
"learning_rate": 3.816716569284961e-06,
"loss": 1.642,
"step": 16450
},
{
"epoch": 1.7716755247758629,
"grad_norm": 20.131942749023438,
"learning_rate": 3.7885783894622275e-06,
"loss": 1.6477,
"step": 16500
},
{
"epoch": 1.7770440758039405,
"grad_norm": 6.804838180541992,
"learning_rate": 3.7604809097204573e-06,
"loss": 1.6398,
"step": 16550
},
{
"epoch": 1.7824126268320182,
"grad_norm": 8.101078033447266,
"learning_rate": 3.7324250740494965e-06,
"loss": 1.6002,
"step": 16600
},
{
"epoch": 1.7877811778600956,
"grad_norm": 18.09836769104004,
"learning_rate": 3.7044118250400817e-06,
"loss": 1.788,
"step": 16650
},
{
"epoch": 1.793149728888173,
"grad_norm": 7.4799346923828125,
"learning_rate": 3.6764421038521605e-06,
"loss": 1.4205,
"step": 16700
},
{
"epoch": 1.7985182799162507,
"grad_norm": 7.4728498458862305,
"learning_rate": 3.648516850183281e-06,
"loss": 1.7957,
"step": 16750
},
{
"epoch": 1.8038868309443281,
"grad_norm": 6.709610462188721,
"learning_rate": 3.6206370022370154e-06,
"loss": 1.5291,
"step": 16800
},
{
"epoch": 1.8092553819724055,
"grad_norm": 9.4188814163208,
"learning_rate": 3.5928034966914488e-06,
"loss": 1.7005,
"step": 16850
},
{
"epoch": 1.8146239330004832,
"grad_norm": 8.755097389221191,
"learning_rate": 3.5650172686676955e-06,
"loss": 1.7735,
"step": 16900
},
{
"epoch": 1.8199924840285608,
"grad_norm": 7.697582721710205,
"learning_rate": 3.5372792516984915e-06,
"loss": 1.7826,
"step": 16950
},
{
"epoch": 1.825361035056638,
"grad_norm": 19.516481399536133,
"learning_rate": 3.5095903776968277e-06,
"loss": 1.536,
"step": 17000
},
{
"epoch": 1.8307295860847157,
"grad_norm": 17.64841651916504,
"learning_rate": 3.4819515769246398e-06,
"loss": 1.8811,
"step": 17050
},
{
"epoch": 1.8360981371127933,
"grad_norm": 7.726692199707031,
"learning_rate": 3.4543637779615574e-06,
"loss": 1.3036,
"step": 17100
},
{
"epoch": 1.8414666881408708,
"grad_norm": 7.466884136199951,
"learning_rate": 3.4268279076737042e-06,
"loss": 1.8029,
"step": 17150
},
{
"epoch": 1.8468352391689482,
"grad_norm": 18.704017639160156,
"learning_rate": 3.3993448911825577e-06,
"loss": 1.5885,
"step": 17200
},
{
"epoch": 1.8522037901970259,
"grad_norm": 9.161140441894531,
"learning_rate": 3.371915651833866e-06,
"loss": 1.7349,
"step": 17250
},
{
"epoch": 1.8575723412251035,
"grad_norm": 17.896249771118164,
"learning_rate": 3.3445411111666343e-06,
"loss": 2.0384,
"step": 17300
},
{
"epoch": 1.8629408922531807,
"grad_norm": 7.49798583984375,
"learning_rate": 3.317222188882154e-06,
"loss": 1.6774,
"step": 17350
},
{
"epoch": 1.8683094432812584,
"grad_norm": 10.30838680267334,
"learning_rate": 3.289959802813111e-06,
"loss": 1.7086,
"step": 17400
},
{
"epoch": 1.873677994309336,
"grad_norm": 9.715036392211914,
"learning_rate": 3.262754868892742e-06,
"loss": 1.9072,
"step": 17450
},
{
"epoch": 1.8790465453374134,
"grad_norm": 8.555960655212402,
"learning_rate": 3.235608301124071e-06,
"loss": 1.8842,
"step": 17500
},
{
"epoch": 1.8844150963654909,
"grad_norm": 8.199530601501465,
"learning_rate": 3.2085210115491966e-06,
"loss": 1.5834,
"step": 17550
},
{
"epoch": 1.8897836473935685,
"grad_norm": 10.413174629211426,
"learning_rate": 3.1814939102186472e-06,
"loss": 1.8812,
"step": 17600
},
{
"epoch": 1.895152198421646,
"grad_norm": 10.915915489196777,
"learning_rate": 3.1545279051608113e-06,
"loss": 1.7046,
"step": 17650
},
{
"epoch": 1.9005207494497234,
"grad_norm": 6.244101047515869,
"learning_rate": 3.1276239023514255e-06,
"loss": 1.5147,
"step": 17700
},
{
"epoch": 1.905889300477801,
"grad_norm": 18.936601638793945,
"learning_rate": 3.1007828056831467e-06,
"loss": 1.6445,
"step": 17750
},
{
"epoch": 1.9112578515058787,
"grad_norm": 17.59870147705078,
"learning_rate": 3.07400551693517e-06,
"loss": 1.8792,
"step": 17800
},
{
"epoch": 1.916626402533956,
"grad_norm": 8.35571002960205,
"learning_rate": 3.0472929357429414e-06,
"loss": 1.7538,
"step": 17850
},
{
"epoch": 1.9219949535620335,
"grad_norm": 19.334714889526367,
"learning_rate": 3.020645959567926e-06,
"loss": 1.7389,
"step": 17900
},
{
"epoch": 1.9273635045901112,
"grad_norm": 8.158848762512207,
"learning_rate": 2.994065483667468e-06,
"loss": 1.6112,
"step": 17950
},
{
"epoch": 1.9327320556181886,
"grad_norm": 9.61613941192627,
"learning_rate": 2.9675524010646974e-06,
"loss": 1.9104,
"step": 18000
},
{
"epoch": 1.938100606646266,
"grad_norm": 9.808588027954102,
"learning_rate": 2.9411076025185366e-06,
"loss": 1.4322,
"step": 18050
},
{
"epoch": 1.9434691576743437,
"grad_norm": 7.1503729820251465,
"learning_rate": 2.9147319764937725e-06,
"loss": 1.6654,
"step": 18100
},
{
"epoch": 1.9488377087024213,
"grad_norm": 7.38853120803833,
"learning_rate": 2.888426409131201e-06,
"loss": 1.5595,
"step": 18150
},
{
"epoch": 1.9542062597304988,
"grad_norm": 6.872980117797852,
"learning_rate": 2.8621917842178693e-06,
"loss": 1.5195,
"step": 18200
},
{
"epoch": 1.9595748107585762,
"grad_norm": 6.323190689086914,
"learning_rate": 2.836028983157365e-06,
"loss": 1.5121,
"step": 18250
},
{
"epoch": 1.9649433617866539,
"grad_norm": 5.4187469482421875,
"learning_rate": 2.809938884940219e-06,
"loss": 1.4725,
"step": 18300
},
{
"epoch": 1.9703119128147313,
"grad_norm": 5.585220813751221,
"learning_rate": 2.7839223661143606e-06,
"loss": 1.7173,
"step": 18350
},
{
"epoch": 1.9756804638428087,
"grad_norm": 5.7172017097473145,
"learning_rate": 2.757980300755685e-06,
"loss": 1.6042,
"step": 18400
},
{
"epoch": 1.9810490148708864,
"grad_norm": 8.703761100769043,
"learning_rate": 2.7321135604386713e-06,
"loss": 1.9222,
"step": 18450
},
{
"epoch": 1.986417565898964,
"grad_norm": 6.097997665405273,
"learning_rate": 2.706323014207106e-06,
"loss": 1.918,
"step": 18500
},
{
"epoch": 1.9917861169270414,
"grad_norm": 6.828339576721191,
"learning_rate": 2.6806095285448887e-06,
"loss": 1.664,
"step": 18550
},
{
"epoch": 1.9971546679551189,
"grad_norm": 7.003544330596924,
"learning_rate": 2.654973967346914e-06,
"loss": 1.6381,
"step": 18600
},
{
"epoch": 2.0025769044934774,
"grad_norm": 6.588607311248779,
"learning_rate": 2.6294171918900592e-06,
"loss": 1.4981,
"step": 18650
},
{
"epoch": 2.0079454555215546,
"grad_norm": 7.204668998718262,
"learning_rate": 2.603940060804234e-06,
"loss": 1.2035,
"step": 18700
},
{
"epoch": 2.013314006549632,
"grad_norm": 8.035957336425781,
"learning_rate": 2.5785434300435406e-06,
"loss": 1.0785,
"step": 18750
},
{
"epoch": 2.01868255757771,
"grad_norm": 9.361004829406738,
"learning_rate": 2.5532281528575154e-06,
"loss": 1.2245,
"step": 18800
},
{
"epoch": 2.0240511086057875,
"grad_norm": 6.5703253746032715,
"learning_rate": 2.5279950797624654e-06,
"loss": 1.1247,
"step": 18850
},
{
"epoch": 2.0294196596338647,
"grad_norm": 6.012766361236572,
"learning_rate": 2.5028450585128854e-06,
"loss": 1.1375,
"step": 18900
},
{
"epoch": 2.0347882106619424,
"grad_norm": 5.649380683898926,
"learning_rate": 2.4777789340729836e-06,
"loss": 1.1421,
"step": 18950
},
{
"epoch": 2.04015676169002,
"grad_norm": 16.015594482421875,
"learning_rate": 2.45279754858829e-06,
"loss": 1.1345,
"step": 19000
},
{
"epoch": 2.0455253127180972,
"grad_norm": 16.384191513061523,
"learning_rate": 2.4279017413573606e-06,
"loss": 1.075,
"step": 19050
},
{
"epoch": 2.050893863746175,
"grad_norm": 8.830488204956055,
"learning_rate": 2.4030923488035896e-06,
"loss": 1.1915,
"step": 19100
},
{
"epoch": 2.0562624147742525,
"grad_norm": 6.353893756866455,
"learning_rate": 2.3783702044470948e-06,
"loss": 1.1907,
"step": 19150
},
{
"epoch": 2.0616309658023297,
"grad_norm": 8.547567367553711,
"learning_rate": 2.3537361388767215e-06,
"loss": 1.1703,
"step": 19200
},
{
"epoch": 2.0669995168304074,
"grad_norm": 7.108630180358887,
"learning_rate": 2.329190979722134e-06,
"loss": 1.1327,
"step": 19250
},
{
"epoch": 2.072368067858485,
"grad_norm": 8.99742603302002,
"learning_rate": 2.304735551626017e-06,
"loss": 1.1121,
"step": 19300
},
{
"epoch": 2.0777366188865627,
"grad_norm": 6.972029209136963,
"learning_rate": 2.2803706762163603e-06,
"loss": 1.116,
"step": 19350
},
{
"epoch": 2.08310516991464,
"grad_norm": 6.052910804748535,
"learning_rate": 2.2560971720788577e-06,
"loss": 1.106,
"step": 19400
},
{
"epoch": 2.0884737209427175,
"grad_norm": 14.27530574798584,
"learning_rate": 2.2319158547294096e-06,
"loss": 1.2028,
"step": 19450
},
{
"epoch": 2.093842271970795,
"grad_norm": 9.214370727539062,
"learning_rate": 2.2078275365867162e-06,
"loss": 1.1224,
"step": 19500
},
{
"epoch": 2.0992108229988724,
"grad_norm": 17.302961349487305,
"learning_rate": 2.183833026944995e-06,
"loss": 1.1746,
"step": 19550
},
{
"epoch": 2.10457937402695,
"grad_norm": 6.132236480712891,
"learning_rate": 2.159933131946777e-06,
"loss": 1.1734,
"step": 19600
},
{
"epoch": 2.1099479250550277,
"grad_norm": 8.298233032226562,
"learning_rate": 2.1361286545558295e-06,
"loss": 1.1839,
"step": 19650
},
{
"epoch": 2.1153164760831054,
"grad_norm": 8.142345428466797,
"learning_rate": 2.1124203945301786e-06,
"loss": 1.1485,
"step": 19700
},
{
"epoch": 2.1206850271111826,
"grad_norm": 7.334796905517578,
"learning_rate": 2.0888091483952433e-06,
"loss": 1.253,
"step": 19750
},
{
"epoch": 2.12605357813926,
"grad_norm": 7.935271739959717,
"learning_rate": 2.065295709417067e-06,
"loss": 1.2222,
"step": 19800
},
{
"epoch": 2.131422129167338,
"grad_norm": 7.456075191497803,
"learning_rate": 2.041880867575671e-06,
"loss": 1.1955,
"step": 19850
},
{
"epoch": 2.136790680195415,
"grad_norm": 6.429117679595947,
"learning_rate": 2.0185654095385124e-06,
"loss": 1.1424,
"step": 19900
},
{
"epoch": 2.1421592312234927,
"grad_norm": 10.556108474731445,
"learning_rate": 1.995350118634058e-06,
"loss": 1.1228,
"step": 19950
},
{
"epoch": 2.1475277822515704,
"grad_norm": 8.033760070800781,
"learning_rate": 1.9722357748254593e-06,
"loss": 1.1683,
"step": 20000
},
{
"epoch": 2.152896333279648,
"grad_norm": 6.466451168060303,
"learning_rate": 1.949223154684355e-06,
"loss": 1.1262,
"step": 20050
},
{
"epoch": 2.1582648843077252,
"grad_norm": 9.730595588684082,
"learning_rate": 1.9263130313647765e-06,
"loss": 1.1169,
"step": 20100
},
{
"epoch": 2.163633435335803,
"grad_norm": 8.086485862731934,
"learning_rate": 1.9035061745771744e-06,
"loss": 1.1748,
"step": 20150
},
{
"epoch": 2.1690019863638805,
"grad_norm": 14.71091365814209,
"learning_rate": 1.88080335056256e-06,
"loss": 1.1721,
"step": 20200
},
{
"epoch": 2.1743705373919577,
"grad_norm": 18.664920806884766,
"learning_rate": 1.8582053220667573e-06,
"loss": 1.1807,
"step": 20250
},
{
"epoch": 2.1797390884200354,
"grad_norm": 6.661670684814453,
"learning_rate": 1.8357128483147806e-06,
"loss": 1.1184,
"step": 20300
},
{
"epoch": 2.185107639448113,
"grad_norm": 16.419658660888672,
"learning_rate": 1.8133266849853247e-06,
"loss": 1.1751,
"step": 20350
},
{
"epoch": 2.1904761904761907,
"grad_norm": 16.346141815185547,
"learning_rate": 1.7910475841853786e-06,
"loss": 1.1732,
"step": 20400
},
{
"epoch": 2.195844741504268,
"grad_norm": 6.543334484100342,
"learning_rate": 1.7688762944249582e-06,
"loss": 1.1495,
"step": 20450
},
{
"epoch": 2.2012132925323455,
"grad_norm": 7.164591312408447,
"learning_rate": 1.7468135605919528e-06,
"loss": 1.1306,
"step": 20500
},
{
"epoch": 2.206581843560423,
"grad_norm": 9.77757453918457,
"learning_rate": 1.7248601239271045e-06,
"loss": 1.1395,
"step": 20550
},
{
"epoch": 2.2119503945885004,
"grad_norm": 18.20372200012207,
"learning_rate": 1.703016721999103e-06,
"loss": 1.1361,
"step": 20600
},
{
"epoch": 2.217318945616578,
"grad_norm": 18.00674819946289,
"learning_rate": 1.6812840886798043e-06,
"loss": 1.1528,
"step": 20650
},
{
"epoch": 2.2226874966446557,
"grad_norm": 8.286600112915039,
"learning_rate": 1.6596629541195787e-06,
"loss": 1.111,
"step": 20700
},
{
"epoch": 2.228056047672733,
"grad_norm": 11.050477027893066,
"learning_rate": 1.6381540447227728e-06,
"loss": 1.095,
"step": 20750
},
{
"epoch": 2.2334245987008106,
"grad_norm": 8.50864315032959,
"learning_rate": 1.6167580831233166e-06,
"loss": 1.1602,
"step": 20800
},
{
"epoch": 2.238793149728888,
"grad_norm": 7.250463962554932,
"learning_rate": 1.595475788160431e-06,
"loss": 1.1188,
"step": 20850
},
{
"epoch": 2.244161700756966,
"grad_norm": 9.344785690307617,
"learning_rate": 1.5743078748544854e-06,
"loss": 1.1872,
"step": 20900
},
{
"epoch": 2.249530251785043,
"grad_norm": 10.801837921142578,
"learning_rate": 1.553255054382975e-06,
"loss": 1.1003,
"step": 20950
},
{
"epoch": 2.2548988028131207,
"grad_norm": 9.372284889221191,
"learning_rate": 1.5323180340566247e-06,
"loss": 1.1206,
"step": 21000
},
{
"epoch": 2.2602673538411984,
"grad_norm": 12.740575790405273,
"learning_rate": 1.5114975172956247e-06,
"loss": 1.1476,
"step": 21050
},
{
"epoch": 2.265635904869276,
"grad_norm": 16.75154685974121,
"learning_rate": 1.4907942036060057e-06,
"loss": 1.1752,
"step": 21100
},
{
"epoch": 2.2710044558973532,
"grad_norm": 17.161603927612305,
"learning_rate": 1.470208788556126e-06,
"loss": 1.1481,
"step": 21150
},
{
"epoch": 2.276373006925431,
"grad_norm": 9.21768569946289,
"learning_rate": 1.4497419637533116e-06,
"loss": 1.1411,
"step": 21200
},
{
"epoch": 2.2817415579535085,
"grad_norm": 10.822429656982422,
"learning_rate": 1.429394416820613e-06,
"loss": 1.147,
"step": 21250
},
{
"epoch": 2.2871101089815857,
"grad_norm": 9.155590057373047,
"learning_rate": 1.4091668313737133e-06,
"loss": 1.1169,
"step": 21300
},
{
"epoch": 2.2924786600096634,
"grad_norm": 16.39679527282715,
"learning_rate": 1.3890598869979494e-06,
"loss": 1.1333,
"step": 21350
},
{
"epoch": 2.297847211037741,
"grad_norm": 7.832981109619141,
"learning_rate": 1.3690742592254874e-06,
"loss": 1.1509,
"step": 21400
},
{
"epoch": 2.3032157620658182,
"grad_norm": 18.698701858520508,
"learning_rate": 1.3492106195126237e-06,
"loss": 1.1706,
"step": 21450
},
{
"epoch": 2.308584313093896,
"grad_norm": 9.106189727783203,
"learning_rate": 1.3294696352172258e-06,
"loss": 1.0814,
"step": 21500
},
{
"epoch": 2.3139528641219735,
"grad_norm": 8.098555564880371,
"learning_rate": 1.3098519695763169e-06,
"loss": 1.2489,
"step": 21550
},
{
"epoch": 2.319321415150051,
"grad_norm": 8.390243530273438,
"learning_rate": 1.2903582816837844e-06,
"loss": 1.1502,
"step": 21600
},
{
"epoch": 2.3246899661781284,
"grad_norm": 8.757095336914062,
"learning_rate": 1.2709892264682412e-06,
"loss": 1.1508,
"step": 21650
},
{
"epoch": 2.330058517206206,
"grad_norm": 7.823190689086914,
"learning_rate": 1.25174545467102e-06,
"loss": 1.2113,
"step": 21700
},
{
"epoch": 2.3354270682342837,
"grad_norm": 9.674703598022461,
"learning_rate": 1.2326276128243175e-06,
"loss": 1.1866,
"step": 21750
},
{
"epoch": 2.340795619262361,
"grad_norm": 8.03213882446289,
"learning_rate": 1.2136363432294607e-06,
"loss": 1.1158,
"step": 21800
},
{
"epoch": 2.3461641702904386,
"grad_norm": 9.820610046386719,
"learning_rate": 1.1947722839353375e-06,
"loss": 1.1573,
"step": 21850
},
{
"epoch": 2.351532721318516,
"grad_norm": 6.820501804351807,
"learning_rate": 1.176036068716953e-06,
"loss": 1.1848,
"step": 21900
},
{
"epoch": 2.356901272346594,
"grad_norm": 10.612143516540527,
"learning_rate": 1.157428327054147e-06,
"loss": 1.1719,
"step": 21950
},
{
"epoch": 2.362269823374671,
"grad_norm": 6.224195957183838,
"learning_rate": 1.138949684110432e-06,
"loss": 1.1361,
"step": 22000
},
{
"epoch": 2.3676383744027487,
"grad_norm": 8.215645790100098,
"learning_rate": 1.1206007607119989e-06,
"loss": 1.0933,
"step": 22050
},
{
"epoch": 2.3730069254308264,
"grad_norm": 16.23103141784668,
"learning_rate": 1.1023821733268576e-06,
"loss": 1.1502,
"step": 22100
},
{
"epoch": 2.3783754764589036,
"grad_norm": 7.224851131439209,
"learning_rate": 1.0842945340441207e-06,
"loss": 1.1564,
"step": 22150
},
{
"epoch": 2.383744027486981,
"grad_norm": 6.062022686004639,
"learning_rate": 1.0663384505534486e-06,
"loss": 1.1801,
"step": 22200
},
{
"epoch": 2.389112578515059,
"grad_norm": 6.085788726806641,
"learning_rate": 1.0485145261246222e-06,
"loss": 1.1397,
"step": 22250
},
{
"epoch": 2.394481129543136,
"grad_norm": 21.13031578063965,
"learning_rate": 1.0308233595872823e-06,
"loss": 1.1892,
"step": 22300
},
{
"epoch": 2.3998496805712137,
"grad_norm": 8.92026138305664,
"learning_rate": 1.013265545310807e-06,
"loss": 1.102,
"step": 22350
},
{
"epoch": 2.4052182315992914,
"grad_norm": 32.13631820678711,
"learning_rate": 9.958416731843467e-07,
"loss": 1.142,
"step": 22400
},
{
"epoch": 2.410586782627369,
"grad_norm": 8.617265701293945,
"learning_rate": 9.78552328597001e-07,
"loss": 1.0888,
"step": 22450
},
{
"epoch": 2.4159553336554462,
"grad_norm": 6.967668056488037,
"learning_rate": 9.613980924181531e-07,
"loss": 1.1972,
"step": 22500
},
{
"epoch": 2.421323884683524,
"grad_norm": 14.281668663024902,
"learning_rate": 9.44379540977956e-07,
"loss": 1.2271,
"step": 22550
},
{
"epoch": 2.4266924357116015,
"grad_norm": 9.496726036071777,
"learning_rate": 9.274972460479659e-07,
"loss": 1.1356,
"step": 22600
},
{
"epoch": 2.432060986739679,
"grad_norm": 7.084106922149658,
"learning_rate": 9.107517748219391e-07,
"loss": 1.1693,
"step": 22650
},
{
"epoch": 2.4374295377677564,
"grad_norm": 10.713268280029297,
"learning_rate": 8.941436898967676e-07,
"loss": 1.1308,
"step": 22700
},
{
"epoch": 2.442798088795834,
"grad_norm": 6.2390031814575195,
"learning_rate": 8.776735492535827e-07,
"loss": 1.1522,
"step": 22750
},
{
"epoch": 2.4481666398239117,
"grad_norm": 17.40299415588379,
"learning_rate": 8.613419062390072e-07,
"loss": 1.179,
"step": 22800
},
{
"epoch": 2.453535190851989,
"grad_norm": 7.331134796142578,
"learning_rate": 8.451493095465674e-07,
"loss": 1.0968,
"step": 22850
},
{
"epoch": 2.4589037418800666,
"grad_norm": 6.012197017669678,
"learning_rate": 8.290963031982535e-07,
"loss": 1.1,
"step": 22900
},
{
"epoch": 2.464272292908144,
"grad_norm": 18.07862663269043,
"learning_rate": 8.131834265262451e-07,
"loss": 1.1539,
"step": 22950
},
{
"epoch": 2.4696408439362214,
"grad_norm": 10.868839263916016,
"learning_rate": 7.974112141547912e-07,
"loss": 1.1659,
"step": 23000
},
{
"epoch": 2.475009394964299,
"grad_norm": 18.099262237548828,
"learning_rate": 7.81780195982248e-07,
"loss": 1.1514,
"step": 23050
},
{
"epoch": 2.4803779459923767,
"grad_norm": 6.631985187530518,
"learning_rate": 7.662908971632777e-07,
"loss": 1.1449,
"step": 23100
},
{
"epoch": 2.4857464970204544,
"grad_norm": 10.29295539855957,
"learning_rate": 7.509438380912021e-07,
"loss": 1.1482,
"step": 23150
},
{
"epoch": 2.4911150480485316,
"grad_norm": 7.769371509552002,
"learning_rate": 7.35739534380519e-07,
"loss": 1.124,
"step": 23200
},
{
"epoch": 2.496483599076609,
"grad_norm": 8.667695045471191,
"learning_rate": 7.206784968495823e-07,
"loss": 1.1227,
"step": 23250
},
{
"epoch": 2.501852150104687,
"grad_norm": 16.578887939453125,
"learning_rate": 7.057612315034367e-07,
"loss": 1.1566,
"step": 23300
},
{
"epoch": 2.5072207011327645,
"grad_norm": 7.591117858886719,
"learning_rate": 6.909882395168205e-07,
"loss": 1.1766,
"step": 23350
},
{
"epoch": 2.5125892521608417,
"grad_norm": 6.038825035095215,
"learning_rate": 6.763600172173229e-07,
"loss": 1.1973,
"step": 23400
},
{
"epoch": 2.5179578031889194,
"grad_norm": 9.154902458190918,
"learning_rate": 6.61877056068716e-07,
"loss": 1.1941,
"step": 23450
},
{
"epoch": 2.523326354216997,
"grad_norm": 16.231149673461914,
"learning_rate": 6.475398426544372e-07,
"loss": 1.1128,
"step": 23500
},
{
"epoch": 2.5286949052450742,
"grad_norm": 8.24044418334961,
"learning_rate": 6.33348858661243e-07,
"loss": 1.122,
"step": 23550
},
{
"epoch": 2.534063456273152,
"grad_norm": 6.951202392578125,
"learning_rate": 6.193045808630255e-07,
"loss": 1.1067,
"step": 23600
},
{
"epoch": 2.5394320073012295,
"grad_norm": 48.8431510925293,
"learning_rate": 6.054074811047972e-07,
"loss": 1.1233,
"step": 23650
},
{
"epoch": 2.5448005583293067,
"grad_norm": 20.162029266357422,
"learning_rate": 5.916580262868338e-07,
"loss": 1.2323,
"step": 23700
},
{
"epoch": 2.5501691093573844,
"grad_norm": 7.568118095397949,
"learning_rate": 5.780566783489927e-07,
"loss": 1.184,
"step": 23750
},
{
"epoch": 2.555537660385462,
"grad_norm": 18.249082565307617,
"learning_rate": 5.646038942551885e-07,
"loss": 1.1772,
"step": 23800
},
{
"epoch": 2.5609062114135392,
"grad_norm": 8.595870018005371,
"learning_rate": 5.513001259780432e-07,
"loss": 1.1604,
"step": 23850
},
{
"epoch": 2.566274762441617,
"grad_norm": 6.2978196144104,
"learning_rate": 5.381458204836998e-07,
"loss": 1.1265,
"step": 23900
},
{
"epoch": 2.5716433134696945,
"grad_norm": 8.599082946777344,
"learning_rate": 5.251414197168097e-07,
"loss": 1.1018,
"step": 23950
},
{
"epoch": 2.577011864497772,
"grad_norm": 9.089366912841797,
"learning_rate": 5.122873605856788e-07,
"loss": 1.0831,
"step": 24000
},
{
"epoch": 2.58238041552585,
"grad_norm": 17.699949264526367,
"learning_rate": 4.995840749475906e-07,
"loss": 1.1397,
"step": 24050
},
{
"epoch": 2.587748966553927,
"grad_norm": 9.30536937713623,
"learning_rate": 4.870319895942993e-07,
"loss": 1.1431,
"step": 24100
},
{
"epoch": 2.5931175175820047,
"grad_norm": 7.9017229080200195,
"learning_rate": 4.746315262376894e-07,
"loss": 1.1171,
"step": 24150
},
{
"epoch": 2.5984860686100824,
"grad_norm": 9.600709915161133,
"learning_rate": 4.6238310149560815e-07,
"loss": 1.1862,
"step": 24200
},
{
"epoch": 2.6038546196381596,
"grad_norm": 15.521000862121582,
"learning_rate": 4.5028712687786637e-07,
"loss": 1.0834,
"step": 24250
},
{
"epoch": 2.609223170666237,
"grad_norm": 17.337730407714844,
"learning_rate": 4.3834400877241557e-07,
"loss": 1.1675,
"step": 24300
},
{
"epoch": 2.614591721694315,
"grad_norm": 9.940308570861816,
"learning_rate": 4.2655414843169207e-07,
"loss": 1.1595,
"step": 24350
},
{
"epoch": 2.619960272722392,
"grad_norm": 11.373148918151855,
"learning_rate": 4.1491794195914036e-07,
"loss": 1.1601,
"step": 24400
},
{
"epoch": 2.6253288237504697,
"grad_norm": 9.516183853149414,
"learning_rate": 4.034357802958999e-07,
"loss": 1.0979,
"step": 24450
},
{
"epoch": 2.6306973747785474,
"grad_norm": 7.607344627380371,
"learning_rate": 3.921080492076729e-07,
"loss": 1.0952,
"step": 24500
},
{
"epoch": 2.6360659258066246,
"grad_norm": 8.217754364013672,
"learning_rate": 3.809351292717656e-07,
"loss": 1.1319,
"step": 24550
},
{
"epoch": 2.6414344768347022,
"grad_norm": 7.739314079284668,
"learning_rate": 3.6991739586429875e-07,
"loss": 1.1889,
"step": 24600
},
{
"epoch": 2.64680302786278,
"grad_norm": 7.91011381149292,
"learning_rate": 3.590552191476004e-07,
"loss": 1.1452,
"step": 24650
},
{
"epoch": 2.652171578890857,
"grad_norm": 18.33525848388672,
"learning_rate": 3.483489640577653e-07,
"loss": 1.1669,
"step": 24700
},
{
"epoch": 2.6575401299189347,
"grad_norm": 10.001716613769531,
"learning_rate": 3.3779899029239504e-07,
"loss": 1.1424,
"step": 24750
},
{
"epoch": 2.6629086809470124,
"grad_norm": 16.244098663330078,
"learning_rate": 3.2740565229851473e-07,
"loss": 1.1258,
"step": 24800
},
{
"epoch": 2.66827723197509,
"grad_norm": 8.882587432861328,
"learning_rate": 3.1716929926066563e-07,
"loss": 1.0455,
"step": 24850
},
{
"epoch": 2.6736457830031677,
"grad_norm": 18.185453414916992,
"learning_rate": 3.070902750891708e-07,
"loss": 1.1575,
"step": 24900
},
{
"epoch": 2.679014334031245,
"grad_norm": 16.914825439453125,
"learning_rate": 2.971689184085813e-07,
"loss": 1.176,
"step": 24950
},
{
"epoch": 2.6843828850593225,
"grad_norm": 7.199150562286377,
"learning_rate": 2.8740556254630126e-07,
"loss": 1.1321,
"step": 25000
},
{
"epoch": 2.6897514360874,
"grad_norm": 7.281803607940674,
"learning_rate": 2.778005355213859e-07,
"loss": 1.1242,
"step": 25050
},
{
"epoch": 2.6951199871154774,
"grad_norm": 7.644674301147461,
"learning_rate": 2.683541600335271e-07,
"loss": 1.1219,
"step": 25100
},
{
"epoch": 2.700488538143555,
"grad_norm": 7.411130428314209,
"learning_rate": 2.59066753452204e-07,
"loss": 1.1559,
"step": 25150
},
{
"epoch": 2.7058570891716327,
"grad_norm": 7.964588642120361,
"learning_rate": 2.499386278060262e-07,
"loss": 1.1,
"step": 25200
},
{
"epoch": 2.71122564019971,
"grad_norm": 7.902915000915527,
"learning_rate": 2.409700897722456e-07,
"loss": 1.1239,
"step": 25250
},
{
"epoch": 2.7165941912277876,
"grad_norm": 6.715937614440918,
"learning_rate": 2.3216144066646073e-07,
"loss": 1.0599,
"step": 25300
},
{
"epoch": 2.721962742255865,
"grad_norm": 7.483630180358887,
"learning_rate": 2.2351297643248337e-07,
"loss": 1.1114,
"step": 25350
},
{
"epoch": 2.7273312932839424,
"grad_norm": 7.820849418640137,
"learning_rate": 2.1502498763240453e-07,
"loss": 1.1012,
"step": 25400
},
{
"epoch": 2.73269984431202,
"grad_norm": 17.91743278503418,
"learning_rate": 2.0669775943682634e-07,
"loss": 1.1322,
"step": 25450
},
{
"epoch": 2.7380683953400977,
"grad_norm": 8.192693710327148,
"learning_rate": 1.9853157161528468e-07,
"loss": 1.0981,
"step": 25500
},
{
"epoch": 2.7434369463681754,
"grad_norm": 17.21625328063965,
"learning_rate": 1.9052669852684945e-07,
"loss": 1.1364,
"step": 25550
},
{
"epoch": 2.748805497396253,
"grad_norm": 18.796228408813477,
"learning_rate": 1.8268340911090533e-07,
"loss": 1.1526,
"step": 25600
},
{
"epoch": 2.7541740484243302,
"grad_norm": 19.57398796081543,
"learning_rate": 1.7500196687811776e-07,
"loss": 1.1529,
"step": 25650
},
{
"epoch": 2.759542599452408,
"grad_norm": 7.654101371765137,
"learning_rate": 1.674826299015775e-07,
"loss": 1.1793,
"step": 25700
},
{
"epoch": 2.7649111504804855,
"grad_norm": 9.904139518737793,
"learning_rate": 1.60125650808135e-07,
"loss": 1.1499,
"step": 25750
},
{
"epoch": 2.7702797015085627,
"grad_norm": 12.009369850158691,
"learning_rate": 1.529312767699065e-07,
"loss": 1.0964,
"step": 25800
},
{
"epoch": 2.7756482525366404,
"grad_norm": 10.86778450012207,
"learning_rate": 1.4589974949597463e-07,
"loss": 1.1427,
"step": 25850
},
{
"epoch": 2.781016803564718,
"grad_norm": 8.0640869140625,
"learning_rate": 1.3903130522426589e-07,
"loss": 1.1492,
"step": 25900
},
{
"epoch": 2.7863853545927952,
"grad_norm": 9.057573318481445,
"learning_rate": 1.3232617471361452e-07,
"loss": 1.1511,
"step": 25950
},
{
"epoch": 2.791753905620873,
"grad_norm": 8.094735145568848,
"learning_rate": 1.2578458323600774e-07,
"loss": 1.1299,
"step": 26000
},
{
"epoch": 2.7971224566489505,
"grad_norm": 6.8636155128479,
"learning_rate": 1.194067505690194e-07,
"loss": 1.085,
"step": 26050
},
{
"epoch": 2.8024910076770277,
"grad_norm": 14.724271774291992,
"learning_rate": 1.1319289098842667e-07,
"loss": 1.0734,
"step": 26100
},
{
"epoch": 2.8078595587051054,
"grad_norm": 8.863556861877441,
"learning_rate": 1.0714321326100895e-07,
"loss": 1.1229,
"step": 26150
},
{
"epoch": 2.813228109733183,
"grad_norm": 18.311233520507812,
"learning_rate": 1.0125792063753415e-07,
"loss": 1.1797,
"step": 26200
},
{
"epoch": 2.8185966607612607,
"grad_norm": 10.026941299438477,
"learning_rate": 9.553721084593182e-08,
"loss": 1.1378,
"step": 26250
},
{
"epoch": 2.823965211789338,
"grad_norm": 12.081165313720703,
"learning_rate": 8.998127608464801e-08,
"loss": 1.1052,
"step": 26300
},
{
"epoch": 2.8293337628174156,
"grad_norm": 8.546296119689941,
"learning_rate": 8.459030301618931e-08,
"loss": 1.1473,
"step": 26350
},
{
"epoch": 2.834702313845493,
"grad_norm": 8.367006301879883,
"learning_rate": 7.936447276085224e-08,
"loss": 1.1076,
"step": 26400
},
{
"epoch": 2.840070864873571,
"grad_norm": 8.272223472595215,
"learning_rate": 7.430396089063597e-08,
"loss": 1.1833,
"step": 26450
},
{
"epoch": 2.845439415901648,
"grad_norm": 16.885454177856445,
"learning_rate": 6.940893742334587e-08,
"loss": 1.0555,
"step": 26500
},
{
"epoch": 2.8508079669297257,
"grad_norm": 10.456121444702148,
"learning_rate": 6.46795668168787e-08,
"loss": 1.1435,
"step": 26550
},
{
"epoch": 2.8561765179578034,
"grad_norm": 8.240148544311523,
"learning_rate": 6.011600796370032e-08,
"loss": 1.0777,
"step": 26600
},
{
"epoch": 2.8615450689858806,
"grad_norm": 15.448931694030762,
"learning_rate": 5.5718414185506055e-08,
"loss": 1.1292,
"step": 26650
},
{
"epoch": 2.866913620013958,
"grad_norm": 12.672825813293457,
"learning_rate": 5.148693322806986e-08,
"loss": 1.1192,
"step": 26700
},
{
"epoch": 2.872282171042036,
"grad_norm": 7.322881698608398,
"learning_rate": 4.742170725627881e-08,
"loss": 1.0856,
"step": 26750
},
{
"epoch": 2.877650722070113,
"grad_norm": 6.888855934143066,
"learning_rate": 4.3522872849359744e-08,
"loss": 1.138,
"step": 26800
},
{
"epoch": 2.8830192730981907,
"grad_norm": 9.260125160217285,
"learning_rate": 3.979056099628842e-08,
"loss": 1.2059,
"step": 26850
},
{
"epoch": 2.8883878241262684,
"grad_norm": 10.245002746582031,
"learning_rate": 3.622489709138921e-08,
"loss": 1.1057,
"step": 26900
},
{
"epoch": 2.8937563751543456,
"grad_norm": 8.351652145385742,
"learning_rate": 3.282600093012234e-08,
"loss": 1.1575,
"step": 26950
},
{
"epoch": 2.8991249261824232,
"grad_norm": 6.738492012023926,
"learning_rate": 2.959398670505986e-08,
"loss": 1.1577,
"step": 27000
},
{
"epoch": 2.904493477210501,
"grad_norm": 9.16823959350586,
"learning_rate": 2.652896300204766e-08,
"loss": 1.1307,
"step": 27050
},
{
"epoch": 2.9098620282385785,
"grad_norm": 17.78186798095703,
"learning_rate": 2.363103279655832e-08,
"loss": 1.1314,
"step": 27100
},
{
"epoch": 2.915230579266656,
"grad_norm": 10.113680839538574,
"learning_rate": 2.0900293450231148e-08,
"loss": 1.1145,
"step": 27150
},
{
"epoch": 2.9205991302947334,
"grad_norm": 8.009809494018555,
"learning_rate": 1.8336836707601446e-08,
"loss": 1.1278,
"step": 27200
},
{
"epoch": 2.925967681322811,
"grad_norm": 9.789654731750488,
"learning_rate": 1.5940748693017426e-08,
"loss": 1.0919,
"step": 27250
},
{
"epoch": 2.9313362323508887,
"grad_norm": 16.743701934814453,
"learning_rate": 1.3712109907748073e-08,
"loss": 1.1995,
"step": 27300
},
{
"epoch": 2.936704783378966,
"grad_norm": 6.287803649902344,
"learning_rate": 1.1650995227276974e-08,
"loss": 1.0879,
"step": 27350
},
{
"epoch": 2.9420733344070435,
"grad_norm": 8.777688980102539,
"learning_rate": 9.757473898786562e-09,
"loss": 1.235,
"step": 27400
},
{
"epoch": 2.947441885435121,
"grad_norm": 9.769450187683105,
"learning_rate": 8.031609538834417e-09,
"loss": 1.1746,
"step": 27450
},
{
"epoch": 2.9528104364631984,
"grad_norm": 16.548913955688477,
"learning_rate": 6.473460131212194e-09,
"loss": 1.0666,
"step": 27500
},
{
"epoch": 2.958178987491276,
"grad_norm": 8.09269905090332,
"learning_rate": 5.083078025000521e-09,
"loss": 1.0938,
"step": 27550
},
{
"epoch": 2.9635475385193537,
"grad_norm": 10.418055534362793,
"learning_rate": 3.860509932808732e-09,
"loss": 1.1509,
"step": 27600
},
{
"epoch": 2.968916089547431,
"grad_norm": 8.594873428344727,
"learning_rate": 2.805796929205573e-09,
"loss": 1.1935,
"step": 27650
},
{
"epoch": 2.9742846405755086,
"grad_norm": 7.283506870269775,
"learning_rate": 1.918974449339195e-09,
"loss": 1.1609,
"step": 27700
},
{
"epoch": 2.979653191603586,
"grad_norm": 6.869657039642334,
"learning_rate": 1.2000722877469894e-09,
"loss": 1.12,
"step": 27750
},
{
"epoch": 2.985021742631664,
"grad_norm": 17.424175262451172,
"learning_rate": 6.491145973558377e-10,
"loss": 1.1747,
"step": 27800
},
{
"epoch": 2.990390293659741,
"grad_norm": 6.791327476501465,
"learning_rate": 2.661198886666494e-10,
"loss": 1.1605,
"step": 27850
},
{
"epoch": 2.9957588446878187,
"grad_norm": 7.1723785400390625,
"learning_rate": 5.11010291376346e-11,
"loss": 1.1334,
"step": 27900
},
{
"epoch": 2.999946314489719,
"step": 27939,
"total_flos": 6.674170939930627e+19,
"train_loss": 1.7448459406648975,
"train_runtime": 100944.9227,
"train_samples_per_second": 8.857,
"train_steps_per_second": 0.277
}
],
"logging_steps": 50,
"max_steps": 27939,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.674170939930627e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}