{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 124205,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.020128014170121975,
"grad_norm": 4.729732990264893,
"learning_rate": 2.994e-05,
"loss": 4.6998,
"step": 500
},
{
"epoch": 0.04025602834024395,
"grad_norm": 5.319427967071533,
"learning_rate": 2.9878986298047775e-05,
"loss": 3.2594,
"step": 1000
},
{
"epoch": 0.06038404251036593,
"grad_norm": 6.513505458831787,
"learning_rate": 2.975773008366679e-05,
"loss": 3.0934,
"step": 1500
},
{
"epoch": 0.0805120566804879,
"grad_norm": 7.240413665771484,
"learning_rate": 2.9636473869285803e-05,
"loss": 3.0216,
"step": 2000
},
{
"epoch": 0.10064007085060987,
"grad_norm": 7.327127933502197,
"learning_rate": 2.9515217654904812e-05,
"loss": 2.8993,
"step": 2500
},
{
"epoch": 0.12076808502073186,
"grad_norm": 7.550909519195557,
"learning_rate": 2.9393961440523828e-05,
"loss": 2.7834,
"step": 3000
},
{
"epoch": 0.14089609919085383,
"grad_norm": 9.492281913757324,
"learning_rate": 2.927270522614284e-05,
"loss": 2.6516,
"step": 3500
},
{
"epoch": 0.1610241133609758,
"grad_norm": 13.708161354064941,
"learning_rate": 2.9151449011761853e-05,
"loss": 2.5647,
"step": 4000
},
{
"epoch": 0.18115212753109777,
"grad_norm": 8.566939353942871,
"learning_rate": 2.903019279738087e-05,
"loss": 2.4627,
"step": 4500
},
{
"epoch": 0.20128014170121974,
"grad_norm": 13.398711204528809,
"learning_rate": 2.890893658299988e-05,
"loss": 2.427,
"step": 5000
},
{
"epoch": 0.22140815587134174,
"grad_norm": 9.740044593811035,
"learning_rate": 2.878768036861889e-05,
"loss": 2.3102,
"step": 5500
},
{
"epoch": 0.24153617004146372,
"grad_norm": 13.687227249145508,
"learning_rate": 2.8666424154237906e-05,
"loss": 2.2774,
"step": 6000
},
{
"epoch": 0.26166418421158566,
"grad_norm": 10.618597984313965,
"learning_rate": 2.854516793985692e-05,
"loss": 2.2404,
"step": 6500
},
{
"epoch": 0.28179219838170766,
"grad_norm": 9.622278213500977,
"learning_rate": 2.842391172547593e-05,
"loss": 2.1845,
"step": 7000
},
{
"epoch": 0.30192021255182966,
"grad_norm": 12.272369384765625,
"learning_rate": 2.8302655511094947e-05,
"loss": 2.1741,
"step": 7500
},
{
"epoch": 0.3220482267219516,
"grad_norm": 8.921002388000488,
"learning_rate": 2.818139929671396e-05,
"loss": 2.1317,
"step": 8000
},
{
"epoch": 0.3421762408920736,
"grad_norm": 11.55328369140625,
"learning_rate": 2.8060143082332968e-05,
"loss": 2.1168,
"step": 8500
},
{
"epoch": 0.36230425506219555,
"grad_norm": 9.84124755859375,
"learning_rate": 2.7938886867951984e-05,
"loss": 2.1129,
"step": 9000
},
{
"epoch": 0.38243226923231755,
"grad_norm": 9.949904441833496,
"learning_rate": 2.7817630653570996e-05,
"loss": 2.0368,
"step": 9500
},
{
"epoch": 0.4025602834024395,
"grad_norm": 15.386507034301758,
"learning_rate": 2.769637443919001e-05,
"loss": 2.0455,
"step": 10000
},
{
"epoch": 0.4226882975725615,
"grad_norm": 10.55031967163086,
"learning_rate": 2.757511822480902e-05,
"loss": 2.0155,
"step": 10500
},
{
"epoch": 0.4428163117426835,
"grad_norm": 13.076041221618652,
"learning_rate": 2.7453862010428037e-05,
"loss": 1.9655,
"step": 11000
},
{
"epoch": 0.46294432591280543,
"grad_norm": 9.13262939453125,
"learning_rate": 2.7332605796047046e-05,
"loss": 1.9698,
"step": 11500
},
{
"epoch": 0.48307234008292743,
"grad_norm": 14.231966972351074,
"learning_rate": 2.721134958166606e-05,
"loss": 1.9137,
"step": 12000
},
{
"epoch": 0.5032003542530494,
"grad_norm": 13.586121559143066,
"learning_rate": 2.7090093367285074e-05,
"loss": 1.8677,
"step": 12500
},
{
"epoch": 0.5233283684231713,
"grad_norm": 16.752092361450195,
"learning_rate": 2.6968837152904087e-05,
"loss": 1.8927,
"step": 13000
},
{
"epoch": 0.5434563825932933,
"grad_norm": 11.639701843261719,
"learning_rate": 2.68475809385231e-05,
"loss": 1.8398,
"step": 13500
},
{
"epoch": 0.5635843967634153,
"grad_norm": 20.035966873168945,
"learning_rate": 2.6726324724142115e-05,
"loss": 1.8741,
"step": 14000
},
{
"epoch": 0.5837124109335373,
"grad_norm": 12.169594764709473,
"learning_rate": 2.6605068509761127e-05,
"loss": 1.8125,
"step": 14500
},
{
"epoch": 0.6038404251036593,
"grad_norm": 14.39586067199707,
"learning_rate": 2.6483812295380136e-05,
"loss": 1.7955,
"step": 15000
},
{
"epoch": 0.6239684392737812,
"grad_norm": 9.263022422790527,
"learning_rate": 2.6362556080999152e-05,
"loss": 1.8026,
"step": 15500
},
{
"epoch": 0.6440964534439032,
"grad_norm": 10.964536666870117,
"learning_rate": 2.6241299866618165e-05,
"loss": 1.7693,
"step": 16000
},
{
"epoch": 0.6642244676140252,
"grad_norm": 17.373477935791016,
"learning_rate": 2.6120043652237177e-05,
"loss": 1.7218,
"step": 16500
},
{
"epoch": 0.6843524817841472,
"grad_norm": 13.868760108947754,
"learning_rate": 2.5998787437856193e-05,
"loss": 1.7308,
"step": 17000
},
{
"epoch": 0.7044804959542692,
"grad_norm": 16.622940063476562,
"learning_rate": 2.5877531223475205e-05,
"loss": 1.7091,
"step": 17500
},
{
"epoch": 0.7246085101243911,
"grad_norm": 9.113160133361816,
"learning_rate": 2.5756275009094214e-05,
"loss": 1.7317,
"step": 18000
},
{
"epoch": 0.7447365242945131,
"grad_norm": 24.89649200439453,
"learning_rate": 2.563501879471323e-05,
"loss": 1.6873,
"step": 18500
},
{
"epoch": 0.7648645384646351,
"grad_norm": 11.15603256225586,
"learning_rate": 2.5513762580332243e-05,
"loss": 1.6772,
"step": 19000
},
{
"epoch": 0.7849925526347571,
"grad_norm": 19.64437484741211,
"learning_rate": 2.5392506365951255e-05,
"loss": 1.6586,
"step": 19500
},
{
"epoch": 0.805120566804879,
"grad_norm": 14.52999496459961,
"learning_rate": 2.527125015157027e-05,
"loss": 1.6505,
"step": 20000
},
{
"epoch": 0.825248580975001,
"grad_norm": 13.304444313049316,
"learning_rate": 2.5149993937189283e-05,
"loss": 1.615,
"step": 20500
},
{
"epoch": 0.845376595145123,
"grad_norm": 14.634563446044922,
"learning_rate": 2.5028737722808292e-05,
"loss": 1.6244,
"step": 21000
},
{
"epoch": 0.865504609315245,
"grad_norm": 12.946802139282227,
"learning_rate": 2.4907481508427308e-05,
"loss": 1.6158,
"step": 21500
},
{
"epoch": 0.885632623485367,
"grad_norm": 11.765786170959473,
"learning_rate": 2.478622529404632e-05,
"loss": 1.5787,
"step": 22000
},
{
"epoch": 0.9057606376554889,
"grad_norm": 11.961956024169922,
"learning_rate": 2.4664969079665333e-05,
"loss": 1.5621,
"step": 22500
},
{
"epoch": 0.9258886518256109,
"grad_norm": 13.635610580444336,
"learning_rate": 2.454371286528435e-05,
"loss": 1.5692,
"step": 23000
},
{
"epoch": 0.9460166659957329,
"grad_norm": 13.10095500946045,
"learning_rate": 2.442245665090336e-05,
"loss": 1.5458,
"step": 23500
},
{
"epoch": 0.9661446801658549,
"grad_norm": 11.790682792663574,
"learning_rate": 2.430120043652237e-05,
"loss": 1.5394,
"step": 24000
},
{
"epoch": 0.9862726943359769,
"grad_norm": 11.249995231628418,
"learning_rate": 2.4179944222141386e-05,
"loss": 1.5485,
"step": 24500
},
{
"epoch": 1.0064007085060989,
"grad_norm": 13.755157470703125,
"learning_rate": 2.40586880077604e-05,
"loss": 1.5134,
"step": 25000
},
{
"epoch": 1.0265287226762208,
"grad_norm": 15.988091468811035,
"learning_rate": 2.393743179337941e-05,
"loss": 1.4833,
"step": 25500
},
{
"epoch": 1.0466567368463426,
"grad_norm": 11.56142807006836,
"learning_rate": 2.3816175578998427e-05,
"loss": 1.4523,
"step": 26000
},
{
"epoch": 1.0667847510164647,
"grad_norm": 10.849580764770508,
"learning_rate": 2.369491936461744e-05,
"loss": 1.4571,
"step": 26500
},
{
"epoch": 1.0869127651865866,
"grad_norm": 17.24896240234375,
"learning_rate": 2.3573663150236448e-05,
"loss": 1.4886,
"step": 27000
},
{
"epoch": 1.1070407793567087,
"grad_norm": 12.933219909667969,
"learning_rate": 2.345240693585546e-05,
"loss": 1.4485,
"step": 27500
},
{
"epoch": 1.1271687935268306,
"grad_norm": 12.675749778747559,
"learning_rate": 2.3331150721474476e-05,
"loss": 1.4161,
"step": 28000
},
{
"epoch": 1.1472968076969525,
"grad_norm": 21.270776748657227,
"learning_rate": 2.320989450709349e-05,
"loss": 1.4371,
"step": 28500
},
{
"epoch": 1.1674248218670746,
"grad_norm": 17.078645706176758,
"learning_rate": 2.30886382927125e-05,
"loss": 1.3918,
"step": 29000
},
{
"epoch": 1.1875528360371965,
"grad_norm": 23.501638412475586,
"learning_rate": 2.2967382078331517e-05,
"loss": 1.4278,
"step": 29500
},
{
"epoch": 1.2076808502073186,
"grad_norm": 12.903084754943848,
"learning_rate": 2.284612586395053e-05,
"loss": 1.3752,
"step": 30000
},
{
"epoch": 1.2278088643774405,
"grad_norm": 15.732855796813965,
"learning_rate": 2.272486964956954e-05,
"loss": 1.3931,
"step": 30500
},
{
"epoch": 1.2479368785475624,
"grad_norm": 11.898987770080566,
"learning_rate": 2.2603613435188554e-05,
"loss": 1.3587,
"step": 31000
},
{
"epoch": 1.2680648927176845,
"grad_norm": 18.970348358154297,
"learning_rate": 2.2482357220807567e-05,
"loss": 1.3706,
"step": 31500
},
{
"epoch": 1.2881929068878064,
"grad_norm": 13.289978981018066,
"learning_rate": 2.236110100642658e-05,
"loss": 1.3378,
"step": 32000
},
{
"epoch": 1.3083209210579283,
"grad_norm": 25.023792266845703,
"learning_rate": 2.2239844792045595e-05,
"loss": 1.3174,
"step": 32500
},
{
"epoch": 1.3284489352280504,
"grad_norm": 12.036040306091309,
"learning_rate": 2.2118588577664607e-05,
"loss": 1.3293,
"step": 33000
},
{
"epoch": 1.3485769493981723,
"grad_norm": 12.723782539367676,
"learning_rate": 2.1997332363283616e-05,
"loss": 1.3287,
"step": 33500
},
{
"epoch": 1.3687049635682944,
"grad_norm": 11.143896102905273,
"learning_rate": 2.1876076148902632e-05,
"loss": 1.3125,
"step": 34000
},
{
"epoch": 1.3888329777384163,
"grad_norm": 12.347333908081055,
"learning_rate": 2.1754819934521645e-05,
"loss": 1.3472,
"step": 34500
},
{
"epoch": 1.4089609919085384,
"grad_norm": 20.10418701171875,
"learning_rate": 2.1633563720140657e-05,
"loss": 1.3052,
"step": 35000
},
{
"epoch": 1.4290890060786603,
"grad_norm": 17.1345157623291,
"learning_rate": 2.1512307505759673e-05,
"loss": 1.2828,
"step": 35500
},
{
"epoch": 1.4492170202487822,
"grad_norm": 17.451622009277344,
"learning_rate": 2.1391051291378685e-05,
"loss": 1.2787,
"step": 36000
},
{
"epoch": 1.4693450344189043,
"grad_norm": 19.05263900756836,
"learning_rate": 2.1269795076997694e-05,
"loss": 1.3195,
"step": 36500
},
{
"epoch": 1.4894730485890262,
"grad_norm": 12.999706268310547,
"learning_rate": 2.114853886261671e-05,
"loss": 1.2759,
"step": 37000
},
{
"epoch": 1.509601062759148,
"grad_norm": 12.11323356628418,
"learning_rate": 2.1027282648235722e-05,
"loss": 1.2738,
"step": 37500
},
{
"epoch": 1.5297290769292702,
"grad_norm": 10.93237018585205,
"learning_rate": 2.0906026433854735e-05,
"loss": 1.2742,
"step": 38000
},
{
"epoch": 1.5498570910993923,
"grad_norm": 26.265893936157227,
"learning_rate": 2.078477021947375e-05,
"loss": 1.2193,
"step": 38500
},
{
"epoch": 1.569985105269514,
"grad_norm": 17.12728500366211,
"learning_rate": 2.0663514005092763e-05,
"loss": 1.2355,
"step": 39000
},
{
"epoch": 1.590113119439636,
"grad_norm": 15.962538719177246,
"learning_rate": 2.0542257790711772e-05,
"loss": 1.219,
"step": 39500
},
{
"epoch": 1.6102411336097582,
"grad_norm": 19.7554931640625,
"learning_rate": 2.0421001576330788e-05,
"loss": 1.2393,
"step": 40000
},
{
"epoch": 1.63036914777988,
"grad_norm": 17.81658363342285,
"learning_rate": 2.02997453619498e-05,
"loss": 1.22,
"step": 40500
},
{
"epoch": 1.650497161950002,
"grad_norm": 16.0762882232666,
"learning_rate": 2.0178489147568813e-05,
"loss": 1.2359,
"step": 41000
},
{
"epoch": 1.670625176120124,
"grad_norm": 13.652649879455566,
"learning_rate": 2.005723293318783e-05,
"loss": 1.244,
"step": 41500
},
{
"epoch": 1.690753190290246,
"grad_norm": 8.598692893981934,
"learning_rate": 1.993597671880684e-05,
"loss": 1.2153,
"step": 42000
},
{
"epoch": 1.7108812044603678,
"grad_norm": 15.637930870056152,
"learning_rate": 1.981472050442585e-05,
"loss": 1.1855,
"step": 42500
},
{
"epoch": 1.73100921863049,
"grad_norm": 16.582963943481445,
"learning_rate": 1.9693464290044866e-05,
"loss": 1.1908,
"step": 43000
},
{
"epoch": 1.7511372328006118,
"grad_norm": 16.173324584960938,
"learning_rate": 1.957220807566388e-05,
"loss": 1.1659,
"step": 43500
},
{
"epoch": 1.7712652469707337,
"grad_norm": 15.524099349975586,
"learning_rate": 1.945095186128289e-05,
"loss": 1.1853,
"step": 44000
},
{
"epoch": 1.7913932611408558,
"grad_norm": 11.66182804107666,
"learning_rate": 1.9329695646901903e-05,
"loss": 1.1679,
"step": 44500
},
{
"epoch": 1.811521275310978,
"grad_norm": 10.504340171813965,
"learning_rate": 1.920843943252092e-05,
"loss": 1.155,
"step": 45000
},
{
"epoch": 1.8316492894810998,
"grad_norm": 16.5634708404541,
"learning_rate": 1.908718321813993e-05,
"loss": 1.1544,
"step": 45500
},
{
"epoch": 1.8517773036512217,
"grad_norm": 13.282904624938965,
"learning_rate": 1.896592700375894e-05,
"loss": 1.1892,
"step": 46000
},
{
"epoch": 1.8719053178213438,
"grad_norm": 13.532590866088867,
"learning_rate": 1.8844670789377956e-05,
"loss": 1.1728,
"step": 46500
},
{
"epoch": 1.8920333319914657,
"grad_norm": 15.26899242401123,
"learning_rate": 1.872341457499697e-05,
"loss": 1.1733,
"step": 47000
},
{
"epoch": 1.9121613461615876,
"grad_norm": 14.551050186157227,
"learning_rate": 1.860215836061598e-05,
"loss": 1.156,
"step": 47500
},
{
"epoch": 1.9322893603317097,
"grad_norm": 11.31080436706543,
"learning_rate": 1.8480902146234997e-05,
"loss": 1.1405,
"step": 48000
},
{
"epoch": 1.9524173745018316,
"grad_norm": 19.817716598510742,
"learning_rate": 1.835964593185401e-05,
"loss": 1.1488,
"step": 48500
},
{
"epoch": 1.9725453886719535,
"grad_norm": 13.350114822387695,
"learning_rate": 1.823838971747302e-05,
"loss": 1.1094,
"step": 49000
},
{
"epoch": 1.9926734028420756,
"grad_norm": 13.383456230163574,
"learning_rate": 1.8117133503092034e-05,
"loss": 1.1162,
"step": 49500
},
{
"epoch": 2.0128014170121977,
"grad_norm": 14.433093070983887,
"learning_rate": 1.7995877288711047e-05,
"loss": 1.0904,
"step": 50000
},
{
"epoch": 2.0329294311823194,
"grad_norm": 5.4649338722229,
"learning_rate": 1.787462107433006e-05,
"loss": 1.0922,
"step": 50500
},
{
"epoch": 2.0530574453524415,
"grad_norm": 14.124307632446289,
"learning_rate": 1.7753364859949075e-05,
"loss": 1.0895,
"step": 51000
},
{
"epoch": 2.0731854595225636,
"grad_norm": 9.346240043640137,
"learning_rate": 1.7632108645568087e-05,
"loss": 1.0611,
"step": 51500
},
{
"epoch": 2.0933134736926853,
"grad_norm": 16.641101837158203,
"learning_rate": 1.7510852431187096e-05,
"loss": 1.0945,
"step": 52000
},
{
"epoch": 2.1134414878628074,
"grad_norm": 12.283844947814941,
"learning_rate": 1.7389596216806112e-05,
"loss": 1.078,
"step": 52500
},
{
"epoch": 2.1335695020329295,
"grad_norm": 8.219395637512207,
"learning_rate": 1.7268340002425125e-05,
"loss": 1.0482,
"step": 53000
},
{
"epoch": 2.1536975162030516,
"grad_norm": 15.403627395629883,
"learning_rate": 1.7147083788044137e-05,
"loss": 1.0433,
"step": 53500
},
{
"epoch": 2.1738255303731733,
"grad_norm": 15.240696907043457,
"learning_rate": 1.7025827573663153e-05,
"loss": 1.0493,
"step": 54000
},
{
"epoch": 2.1939535445432954,
"grad_norm": 19.72351837158203,
"learning_rate": 1.6904571359282165e-05,
"loss": 1.042,
"step": 54500
},
{
"epoch": 2.2140815587134175,
"grad_norm": 21.067684173583984,
"learning_rate": 1.6783315144901174e-05,
"loss": 1.025,
"step": 55000
},
{
"epoch": 2.234209572883539,
"grad_norm": 14.798884391784668,
"learning_rate": 1.666205893052019e-05,
"loss": 1.0143,
"step": 55500
},
{
"epoch": 2.2543375870536613,
"grad_norm": 15.239629745483398,
"learning_rate": 1.6540802716139202e-05,
"loss": 1.0322,
"step": 56000
},
{
"epoch": 2.2744656012237834,
"grad_norm": 14.908315658569336,
"learning_rate": 1.6419546501758215e-05,
"loss": 1.0826,
"step": 56500
},
{
"epoch": 2.294593615393905,
"grad_norm": 13.52440071105957,
"learning_rate": 1.629829028737723e-05,
"loss": 1.0199,
"step": 57000
},
{
"epoch": 2.314721629564027,
"grad_norm": 20.474451065063477,
"learning_rate": 1.6177034072996243e-05,
"loss": 1.0061,
"step": 57500
},
{
"epoch": 2.3348496437341493,
"grad_norm": 15.805046081542969,
"learning_rate": 1.6055777858615252e-05,
"loss": 1.0141,
"step": 58000
},
{
"epoch": 2.3549776579042714,
"grad_norm": 9.82214641571045,
"learning_rate": 1.5934521644234268e-05,
"loss": 1.0099,
"step": 58500
},
{
"epoch": 2.375105672074393,
"grad_norm": 17.32090950012207,
"learning_rate": 1.581326542985328e-05,
"loss": 0.9851,
"step": 59000
},
{
"epoch": 2.395233686244515,
"grad_norm": 27.325069427490234,
"learning_rate": 1.5692009215472293e-05,
"loss": 0.9991,
"step": 59500
},
{
"epoch": 2.4153617004146373,
"grad_norm": 21.118209838867188,
"learning_rate": 1.557075300109131e-05,
"loss": 1.0082,
"step": 60000
},
{
"epoch": 2.435489714584759,
"grad_norm": 14.355386734008789,
"learning_rate": 1.544949678671032e-05,
"loss": 0.9549,
"step": 60500
},
{
"epoch": 2.455617728754881,
"grad_norm": 16.598129272460938,
"learning_rate": 1.532824057232933e-05,
"loss": 1.0101,
"step": 61000
},
{
"epoch": 2.475745742925003,
"grad_norm": 21.729766845703125,
"learning_rate": 1.5206984357948344e-05,
"loss": 0.9704,
"step": 61500
},
{
"epoch": 2.495873757095125,
"grad_norm": 16.548641204833984,
"learning_rate": 1.5085728143567358e-05,
"loss": 0.9893,
"step": 62000
},
{
"epoch": 2.516001771265247,
"grad_norm": 14.282777786254883,
"learning_rate": 1.496447192918637e-05,
"loss": 0.9721,
"step": 62500
},
{
"epoch": 2.536129785435369,
"grad_norm": 9.005020141601562,
"learning_rate": 1.4843215714805385e-05,
"loss": 0.98,
"step": 63000
},
{
"epoch": 2.5562577996054907,
"grad_norm": 8.10714340209961,
"learning_rate": 1.4721959500424397e-05,
"loss": 0.9887,
"step": 63500
},
{
"epoch": 2.576385813775613,
"grad_norm": 13.707820892333984,
"learning_rate": 1.460070328604341e-05,
"loss": 0.9805,
"step": 64000
},
{
"epoch": 2.596513827945735,
"grad_norm": 20.182363510131836,
"learning_rate": 1.4479447071662424e-05,
"loss": 0.9837,
"step": 64500
},
{
"epoch": 2.6166418421158566,
"grad_norm": 9.87313175201416,
"learning_rate": 1.4358190857281435e-05,
"loss": 0.9609,
"step": 65000
},
{
"epoch": 2.6367698562859787,
"grad_norm": 12.288646697998047,
"learning_rate": 1.4236934642900449e-05,
"loss": 1.0035,
"step": 65500
},
{
"epoch": 2.656897870456101,
"grad_norm": 18.152629852294922,
"learning_rate": 1.4115678428519463e-05,
"loss": 0.9494,
"step": 66000
},
{
"epoch": 2.677025884626223,
"grad_norm": 16.326662063598633,
"learning_rate": 1.3994422214138473e-05,
"loss": 0.946,
"step": 66500
},
{
"epoch": 2.6971538987963446,
"grad_norm": 18.14234733581543,
"learning_rate": 1.3873165999757488e-05,
"loss": 0.9504,
"step": 67000
},
{
"epoch": 2.7172819129664667,
"grad_norm": 20.3934326171875,
"learning_rate": 1.3751909785376502e-05,
"loss": 0.9676,
"step": 67500
},
{
"epoch": 2.737409927136589,
"grad_norm": 11.495948791503906,
"learning_rate": 1.3630653570995512e-05,
"loss": 0.9283,
"step": 68000
},
{
"epoch": 2.757537941306711,
"grad_norm": 20.127979278564453,
"learning_rate": 1.3509397356614527e-05,
"loss": 0.9467,
"step": 68500
},
{
"epoch": 2.7776659554768326,
"grad_norm": 13.345834732055664,
"learning_rate": 1.338814114223354e-05,
"loss": 0.9538,
"step": 69000
},
{
"epoch": 2.7977939696469547,
"grad_norm": 9.327335357666016,
"learning_rate": 1.3266884927852551e-05,
"loss": 0.9437,
"step": 69500
},
{
"epoch": 2.817921983817077,
"grad_norm": 12.741182327270508,
"learning_rate": 1.3145628713471566e-05,
"loss": 0.9291,
"step": 70000
},
{
"epoch": 2.8380499979871985,
"grad_norm": 16.994661331176758,
"learning_rate": 1.302437249909058e-05,
"loss": 0.9147,
"step": 70500
},
{
"epoch": 2.8581780121573206,
"grad_norm": 15.74470043182373,
"learning_rate": 1.2903116284709592e-05,
"loss": 0.9296,
"step": 71000
},
{
"epoch": 2.8783060263274427,
"grad_norm": 13.54488754272461,
"learning_rate": 1.2781860070328604e-05,
"loss": 0.9482,
"step": 71500
},
{
"epoch": 2.8984340404975644,
"grad_norm": 10.650059700012207,
"learning_rate": 1.2660603855947619e-05,
"loss": 0.9516,
"step": 72000
},
{
"epoch": 2.9185620546676865,
"grad_norm": 12.577211380004883,
"learning_rate": 1.2539347641566631e-05,
"loss": 0.9173,
"step": 72500
},
{
"epoch": 2.9386900688378086,
"grad_norm": 14.282366752624512,
"learning_rate": 1.2418091427185643e-05,
"loss": 0.9511,
"step": 73000
},
{
"epoch": 2.9588180830079303,
"grad_norm": 14.529337882995605,
"learning_rate": 1.2296835212804656e-05,
"loss": 0.9302,
"step": 73500
},
{
"epoch": 2.9789460971780524,
"grad_norm": 11.681228637695312,
"learning_rate": 1.217557899842367e-05,
"loss": 0.9097,
"step": 74000
},
{
"epoch": 2.9990741113481745,
"grad_norm": 11.70090389251709,
"learning_rate": 1.2054322784042682e-05,
"loss": 0.9233,
"step": 74500
},
{
"epoch": 3.019202125518296,
"grad_norm": 27.22252655029297,
"learning_rate": 1.1933066569661695e-05,
"loss": 0.8651,
"step": 75000
},
{
"epoch": 3.0393301396884183,
"grad_norm": 14.896398544311523,
"learning_rate": 1.1811810355280709e-05,
"loss": 0.8639,
"step": 75500
},
{
"epoch": 3.0594581538585404,
"grad_norm": 20.037960052490234,
"learning_rate": 1.1690554140899721e-05,
"loss": 0.8606,
"step": 76000
},
{
"epoch": 3.0795861680286625,
"grad_norm": 16.03421974182129,
"learning_rate": 1.1569297926518734e-05,
"loss": 0.8639,
"step": 76500
},
{
"epoch": 3.099714182198784,
"grad_norm": 14.802894592285156,
"learning_rate": 1.1448041712137748e-05,
"loss": 0.8875,
"step": 77000
},
{
"epoch": 3.1198421963689063,
"grad_norm": 9.06533145904541,
"learning_rate": 1.132678549775676e-05,
"loss": 0.8877,
"step": 77500
},
{
"epoch": 3.1399702105390284,
"grad_norm": 13.744263648986816,
"learning_rate": 1.1205529283375773e-05,
"loss": 0.8761,
"step": 78000
},
{
"epoch": 3.16009822470915,
"grad_norm": 12.16555404663086,
"learning_rate": 1.1084273068994787e-05,
"loss": 0.8782,
"step": 78500
},
{
"epoch": 3.180226238879272,
"grad_norm": 29.285688400268555,
"learning_rate": 1.09630168546138e-05,
"loss": 0.8579,
"step": 79000
},
{
"epoch": 3.2003542530493942,
"grad_norm": 14.758946418762207,
"learning_rate": 1.0841760640232812e-05,
"loss": 0.878,
"step": 79500
},
{
"epoch": 3.220482267219516,
"grad_norm": 12.481344223022461,
"learning_rate": 1.0720504425851826e-05,
"loss": 0.8383,
"step": 80000
},
{
"epoch": 3.240610281389638,
"grad_norm": 11.378300666809082,
"learning_rate": 1.0599248211470838e-05,
"loss": 0.866,
"step": 80500
},
{
"epoch": 3.26073829555976,
"grad_norm": 18.51228141784668,
"learning_rate": 1.047799199708985e-05,
"loss": 0.8727,
"step": 81000
},
{
"epoch": 3.2808663097298822,
"grad_norm": 13.013883590698242,
"learning_rate": 1.0356735782708865e-05,
"loss": 0.8497,
"step": 81500
},
{
"epoch": 3.300994323900004,
"grad_norm": 18.66629409790039,
"learning_rate": 1.0235479568327876e-05,
"loss": 0.8817,
"step": 82000
},
{
"epoch": 3.321122338070126,
"grad_norm": 22.02678108215332,
"learning_rate": 1.011422335394689e-05,
"loss": 0.8207,
"step": 82500
},
{
"epoch": 3.341250352240248,
"grad_norm": 21.1297550201416,
"learning_rate": 9.992967139565904e-06,
"loss": 0.834,
"step": 83000
},
{
"epoch": 3.36137836641037,
"grad_norm": 15.060477256774902,
"learning_rate": 9.871710925184914e-06,
"loss": 0.8313,
"step": 83500
},
{
"epoch": 3.381506380580492,
"grad_norm": 20.013944625854492,
"learning_rate": 9.750454710803929e-06,
"loss": 0.8628,
"step": 84000
},
{
"epoch": 3.401634394750614,
"grad_norm": 11.168913841247559,
"learning_rate": 9.629198496422943e-06,
"loss": 0.8261,
"step": 84500
},
{
"epoch": 3.4217624089207357,
"grad_norm": 15.372590065002441,
"learning_rate": 9.507942282041953e-06,
"loss": 0.8618,
"step": 85000
},
{
"epoch": 3.441890423090858,
"grad_norm": 11.604378700256348,
"learning_rate": 9.386686067660968e-06,
"loss": 0.8239,
"step": 85500
},
{
"epoch": 3.46201843726098,
"grad_norm": 9.609265327453613,
"learning_rate": 9.265429853279982e-06,
"loss": 0.8371,
"step": 86000
},
{
"epoch": 3.4821464514311016,
"grad_norm": 15.69279956817627,
"learning_rate": 9.144173638898994e-06,
"loss": 0.8218,
"step": 86500
},
{
"epoch": 3.5022744656012237,
"grad_norm": 14.74257755279541,
"learning_rate": 9.022917424518007e-06,
"loss": 0.8055,
"step": 87000
},
{
"epoch": 3.522402479771346,
"grad_norm": 10.193700790405273,
"learning_rate": 8.90166121013702e-06,
"loss": 0.8566,
"step": 87500
},
{
"epoch": 3.5425304939414675,
"grad_norm": 13.010785102844238,
"learning_rate": 8.780404995756033e-06,
"loss": 0.8443,
"step": 88000
},
{
"epoch": 3.5626585081115896,
"grad_norm": 11.916807174682617,
"learning_rate": 8.659148781375045e-06,
"loss": 0.8272,
"step": 88500
},
{
"epoch": 3.5827865222817117,
"grad_norm": 11.876017570495605,
"learning_rate": 8.53789256699406e-06,
"loss": 0.8518,
"step": 89000
},
{
"epoch": 3.602914536451834,
"grad_norm": 21.5701847076416,
"learning_rate": 8.416636352613072e-06,
"loss": 0.8087,
"step": 89500
},
{
"epoch": 3.623042550621956,
"grad_norm": 11.204216957092285,
"learning_rate": 8.295380138232084e-06,
"loss": 0.8279,
"step": 90000
},
{
"epoch": 3.6431705647920776,
"grad_norm": 11.78646469116211,
"learning_rate": 8.174123923851097e-06,
"loss": 0.8316,
"step": 90500
},
{
"epoch": 3.6632985789621997,
"grad_norm": 12.788416862487793,
"learning_rate": 8.052867709470111e-06,
"loss": 0.8332,
"step": 91000
},
{
"epoch": 3.683426593132322,
"grad_norm": 14.306061744689941,
"learning_rate": 7.931611495089123e-06,
"loss": 0.823,
"step": 91500
},
{
"epoch": 3.7035546073024435,
"grad_norm": 20.168163299560547,
"learning_rate": 7.810355280708136e-06,
"loss": 0.81,
"step": 92000
},
{
"epoch": 3.7236826214725656,
"grad_norm": 20.580291748046875,
"learning_rate": 7.68909906632715e-06,
"loss": 0.822,
"step": 92500
},
{
"epoch": 3.7438106356426877,
"grad_norm": 13.826583862304688,
"learning_rate": 7.567842851946163e-06,
"loss": 0.8378,
"step": 93000
},
{
"epoch": 3.7639386498128093,
"grad_norm": 30.890518188476562,
"learning_rate": 7.446586637565176e-06,
"loss": 0.8311,
"step": 93500
},
{
"epoch": 3.7840666639829315,
"grad_norm": 15.22163200378418,
"learning_rate": 7.325330423184188e-06,
"loss": 0.8138,
"step": 94000
},
{
"epoch": 3.8041946781530536,
"grad_norm": 8.326911926269531,
"learning_rate": 7.204074208803201e-06,
"loss": 0.784,
"step": 94500
},
{
"epoch": 3.8243226923231752,
"grad_norm": 31.577423095703125,
"learning_rate": 7.082817994422215e-06,
"loss": 0.8006,
"step": 95000
},
{
"epoch": 3.8444507064932973,
"grad_norm": 15.388664245605469,
"learning_rate": 6.961561780041227e-06,
"loss": 0.8418,
"step": 95500
},
{
"epoch": 3.8645787206634195,
"grad_norm": 21.28485107421875,
"learning_rate": 6.84030556566024e-06,
"loss": 0.7972,
"step": 96000
},
{
"epoch": 3.884706734833541,
"grad_norm": 11.151982307434082,
"learning_rate": 6.7190493512792536e-06,
"loss": 0.8133,
"step": 96500
},
{
"epoch": 3.9048347490036632,
"grad_norm": 11.545019149780273,
"learning_rate": 6.597793136898266e-06,
"loss": 0.8035,
"step": 97000
},
{
"epoch": 3.9249627631737853,
"grad_norm": 11.109121322631836,
"learning_rate": 6.476536922517279e-06,
"loss": 0.7959,
"step": 97500
},
{
"epoch": 3.945090777343907,
"grad_norm": 12.6671142578125,
"learning_rate": 6.355280708136292e-06,
"loss": 0.8132,
"step": 98000
},
{
"epoch": 3.965218791514029,
"grad_norm": 11.02685260772705,
"learning_rate": 6.234024493755305e-06,
"loss": 0.7959,
"step": 98500
},
{
"epoch": 3.9853468056841512,
"grad_norm": 11.704038619995117,
"learning_rate": 6.112768279374318e-06,
"loss": 0.7837,
"step": 99000
},
{
"epoch": 4.005474819854273,
"grad_norm": 16.34335708618164,
"learning_rate": 5.991512064993331e-06,
"loss": 0.7851,
"step": 99500
},
{
"epoch": 4.0256028340243954,
"grad_norm": 10.739608764648438,
"learning_rate": 5.870255850612345e-06,
"loss": 0.7684,
"step": 100000
},
{
"epoch": 4.045730848194517,
"grad_norm": 15.17026424407959,
"learning_rate": 5.748999636231357e-06,
"loss": 0.7679,
"step": 100500
},
{
"epoch": 4.065858862364639,
"grad_norm": 16.030241012573242,
"learning_rate": 5.62774342185037e-06,
"loss": 0.764,
"step": 101000
},
{
"epoch": 4.085986876534761,
"grad_norm": 15.900766372680664,
"learning_rate": 5.506487207469383e-06,
"loss": 0.7666,
"step": 101500
},
{
"epoch": 4.106114890704883,
"grad_norm": 13.20738410949707,
"learning_rate": 5.385230993088396e-06,
"loss": 0.7686,
"step": 102000
},
{
"epoch": 4.126242904875005,
"grad_norm": 9.8963623046875,
"learning_rate": 5.2639747787074086e-06,
"loss": 0.7589,
"step": 102500
},
{
"epoch": 4.146370919045127,
"grad_norm": 16.053571701049805,
"learning_rate": 5.142718564326422e-06,
"loss": 0.7676,
"step": 103000
},
{
"epoch": 4.166498933215249,
"grad_norm": 12.643793106079102,
"learning_rate": 5.021462349945435e-06,
"loss": 0.7462,
"step": 103500
},
{
"epoch": 4.186626947385371,
"grad_norm": 27.60247230529785,
"learning_rate": 4.9002061355644475e-06,
"loss": 0.7864,
"step": 104000
},
{
"epoch": 4.206754961555493,
"grad_norm": 13.564982414245605,
"learning_rate": 4.778949921183461e-06,
"loss": 0.7693,
"step": 104500
},
{
"epoch": 4.226882975725615,
"grad_norm": 20.11015510559082,
"learning_rate": 4.657693706802474e-06,
"loss": 0.7386,
"step": 105000
},
{
"epoch": 4.247010989895736,
"grad_norm": 15.393072128295898,
"learning_rate": 4.5364374924214865e-06,
"loss": 0.7793,
"step": 105500
},
{
"epoch": 4.267139004065859,
"grad_norm": 19.87403678894043,
"learning_rate": 4.4151812780405e-06,
"loss": 0.7779,
"step": 106000
},
{
"epoch": 4.287267018235981,
"grad_norm": 9.388250350952148,
"learning_rate": 4.293925063659512e-06,
"loss": 0.7681,
"step": 106500
},
{
"epoch": 4.307395032406103,
"grad_norm": 10.060807228088379,
"learning_rate": 4.1726688492785255e-06,
"loss": 0.7509,
"step": 107000
},
{
"epoch": 4.327523046576225,
"grad_norm": 23.562870025634766,
"learning_rate": 4.051412634897539e-06,
"loss": 0.7833,
"step": 107500
},
{
"epoch": 4.3476510607463466,
"grad_norm": 14.926592826843262,
"learning_rate": 3.930156420516551e-06,
"loss": 0.7446,
"step": 108000
},
{
"epoch": 4.367779074916469,
"grad_norm": 11.940516471862793,
"learning_rate": 3.808900206135565e-06,
"loss": 0.754,
"step": 108500
},
{
"epoch": 4.387907089086591,
"grad_norm": 14.217045783996582,
"learning_rate": 3.6876439917545777e-06,
"loss": 0.7731,
"step": 109000
},
{
"epoch": 4.408035103256712,
"grad_norm": 9.447354316711426,
"learning_rate": 3.5663877773735905e-06,
"loss": 0.7597,
"step": 109500
},
{
"epoch": 4.428163117426835,
"grad_norm": 19.97547149658203,
"learning_rate": 3.4451315629926034e-06,
"loss": 0.7657,
"step": 110000
},
{
"epoch": 4.448291131596957,
"grad_norm": 15.066329956054688,
"learning_rate": 3.3238753486116167e-06,
"loss": 0.7619,
"step": 110500
},
{
"epoch": 4.468419145767078,
"grad_norm": 12.446183204650879,
"learning_rate": 3.2026191342306295e-06,
"loss": 0.7656,
"step": 111000
},
{
"epoch": 4.488547159937201,
"grad_norm": 32.365234375,
"learning_rate": 3.0813629198496423e-06,
"loss": 0.7575,
"step": 111500
},
{
"epoch": 4.5086751741073225,
"grad_norm": 12.082524299621582,
"learning_rate": 2.960106705468655e-06,
"loss": 0.7502,
"step": 112000
},
{
"epoch": 4.528803188277444,
"grad_norm": 20.70221519470215,
"learning_rate": 2.8388504910876685e-06,
"loss": 0.7638,
"step": 112500
},
{
"epoch": 4.548931202447567,
"grad_norm": 22.083984375,
"learning_rate": 2.717594276706681e-06,
"loss": 0.7365,
"step": 113000
},
{
"epoch": 4.569059216617688,
"grad_norm": 14.066744804382324,
"learning_rate": 2.596338062325694e-06,
"loss": 0.766,
"step": 113500
},
{
"epoch": 4.58918723078781,
"grad_norm": 24.38865089416504,
"learning_rate": 2.4750818479447074e-06,
"loss": 0.7449,
"step": 114000
},
{
"epoch": 4.609315244957933,
"grad_norm": 11.597355842590332,
"learning_rate": 2.3538256335637203e-06,
"loss": 0.7556,
"step": 114500
},
{
"epoch": 4.629443259128054,
"grad_norm": 10.837632179260254,
"learning_rate": 2.232569419182733e-06,
"loss": 0.7501,
"step": 115000
},
{
"epoch": 4.649571273298177,
"grad_norm": 20.56001853942871,
"learning_rate": 2.111313204801746e-06,
"loss": 0.7234,
"step": 115500
},
{
"epoch": 4.6696992874682985,
"grad_norm": 14.60595703125,
"learning_rate": 1.9900569904207592e-06,
"loss": 0.7695,
"step": 116000
},
{
"epoch": 4.68982730163842,
"grad_norm": 28.349151611328125,
"learning_rate": 1.868800776039772e-06,
"loss": 0.7661,
"step": 116500
},
{
"epoch": 4.709955315808543,
"grad_norm": 10.647957801818848,
"learning_rate": 1.747544561658785e-06,
"loss": 0.7308,
"step": 117000
},
{
"epoch": 4.730083329978664,
"grad_norm": 11.21895980834961,
"learning_rate": 1.6262883472777982e-06,
"loss": 0.7585,
"step": 117500
},
{
"epoch": 4.750211344148786,
"grad_norm": 12.75427532196045,
"learning_rate": 1.505032132896811e-06,
"loss": 0.7553,
"step": 118000
},
{
"epoch": 4.770339358318909,
"grad_norm": 9.93217658996582,
"learning_rate": 1.383775918515824e-06,
"loss": 0.7525,
"step": 118500
},
{
"epoch": 4.79046737248903,
"grad_norm": 13.394769668579102,
"learning_rate": 1.262519704134837e-06,
"loss": 0.7493,
"step": 119000
},
{
"epoch": 4.810595386659152,
"grad_norm": 8.94278335571289,
"learning_rate": 1.1412634897538498e-06,
"loss": 0.7575,
"step": 119500
},
{
"epoch": 4.8307234008292745,
"grad_norm": 16.46908950805664,
"learning_rate": 1.0200072753728628e-06,
"loss": 0.7651,
"step": 120000
},
{
"epoch": 4.850851414999396,
"grad_norm": 27.788360595703125,
"learning_rate": 8.987510609918758e-07,
"loss": 0.7472,
"step": 120500
},
{
"epoch": 4.870979429169518,
"grad_norm": 7.398582458496094,
"learning_rate": 7.774948466108889e-07,
"loss": 0.7696,
"step": 121000
},
{
"epoch": 4.89110744333964,
"grad_norm": 17.573110580444336,
"learning_rate": 6.562386322299018e-07,
"loss": 0.7445,
"step": 121500
},
{
"epoch": 4.911235457509762,
"grad_norm": 5.554362773895264,
"learning_rate": 5.349824178489148e-07,
"loss": 0.7407,
"step": 122000
},
{
"epoch": 4.931363471679884,
"grad_norm": 8.908127784729004,
"learning_rate": 4.137262034679277e-07,
"loss": 0.7352,
"step": 122500
},
{
"epoch": 4.951491485850006,
"grad_norm": 17.096956253051758,
"learning_rate": 2.924699890869407e-07,
"loss": 0.7548,
"step": 123000
},
{
"epoch": 4.971619500020128,
"grad_norm": 15.15579891204834,
"learning_rate": 1.7121377470595367e-07,
"loss": 0.7635,
"step": 123500
},
{
"epoch": 4.99174751419025,
"grad_norm": 14.474600791931152,
"learning_rate": 4.9957560324966654e-08,
"loss": 0.748,
"step": 124000
},
{
"epoch": 5.0,
"step": 124205,
"total_flos": 2.789913716232192e+16,
"train_loss": 1.1867824254838901,
"train_runtime": 21629.6238,
"train_samples_per_second": 91.877,
"train_steps_per_second": 5.742
}
],
"logging_steps": 500,
"max_steps": 124205,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.789913716232192e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}