{
"best_global_step": 26250,
"best_metric": 0.9502699810655684,
"best_model_checkpoint": "D:\\Task_design\\Topic\\strategy_train\\outputs\\qwen7b-lora-topic_strategy\\checkpoint-26250",
"epoch": 0.7518231711901361,
"eval_steps": 1250,
"global_step": 26250,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0014320441356002593,
"grad_norm": 608.0,
"learning_rate": 9.351145038167939e-06,
"loss": 28.2021,
"step": 50
},
{
"epoch": 0.0028640882712005185,
"grad_norm": 326.0,
"learning_rate": 1.8893129770992367e-05,
"loss": 13.4193,
"step": 100
},
{
"epoch": 0.004296132406800777,
"grad_norm": 600.0,
"learning_rate": 2.8435114503816796e-05,
"loss": 8.5573,
"step": 150
},
{
"epoch": 0.005728176542401037,
"grad_norm": 146.0,
"learning_rate": 3.797709923664122e-05,
"loss": 4.1909,
"step": 200
},
{
"epoch": 0.007160220678001296,
"grad_norm": 372.0,
"learning_rate": 4.751908396946565e-05,
"loss": 3.6133,
"step": 250
},
{
"epoch": 0.008592264813601555,
"grad_norm": 36.75,
"learning_rate": 5.7061068702290074e-05,
"loss": 2.8409,
"step": 300
},
{
"epoch": 0.010024308949201815,
"grad_norm": 210.0,
"learning_rate": 6.66030534351145e-05,
"loss": 2.9306,
"step": 350
},
{
"epoch": 0.011456353084802074,
"grad_norm": 0.00037384033203125,
"learning_rate": 7.614503816793893e-05,
"loss": 4.0291,
"step": 400
},
{
"epoch": 0.012888397220402333,
"grad_norm": 24.875,
"learning_rate": 8.568702290076335e-05,
"loss": 4.0621,
"step": 450
},
{
"epoch": 0.014320441356002592,
"grad_norm": 378.0,
"learning_rate": 9.522900763358779e-05,
"loss": 5.0668,
"step": 500
},
{
"epoch": 0.015752485491602852,
"grad_norm": 20.5,
"learning_rate": 0.00010477099236641222,
"loss": 3.8491,
"step": 550
},
{
"epoch": 0.01718452962720311,
"grad_norm": 478.0,
"learning_rate": 0.00011431297709923666,
"loss": 2.9158,
"step": 600
},
{
"epoch": 0.01861657376280337,
"grad_norm": 53.5,
"learning_rate": 0.00012385496183206106,
"loss": 4.3353,
"step": 650
},
{
"epoch": 0.02004861789840363,
"grad_norm": 0.0010986328125,
"learning_rate": 0.0001333969465648855,
"loss": 4.2307,
"step": 700
},
{
"epoch": 0.021480662034003888,
"grad_norm": 44.75,
"learning_rate": 0.0001429389312977099,
"loss": 3.1633,
"step": 750
},
{
"epoch": 0.022912706169604148,
"grad_norm": 396.0,
"learning_rate": 0.00015248091603053436,
"loss": 4.8827,
"step": 800
},
{
"epoch": 0.024344750305204405,
"grad_norm": 3.838539123535156e-05,
"learning_rate": 0.0001620229007633588,
"loss": 3.0475,
"step": 850
},
{
"epoch": 0.025776794440804666,
"grad_norm": 12.625,
"learning_rate": 0.0001715648854961832,
"loss": 4.3648,
"step": 900
},
{
"epoch": 0.027208838576404926,
"grad_norm": 0.009033203125,
"learning_rate": 0.00018110687022900764,
"loss": 3.8295,
"step": 950
},
{
"epoch": 0.028640882712005183,
"grad_norm": 304.0,
"learning_rate": 0.00019064885496183207,
"loss": 5.2518,
"step": 1000
},
{
"epoch": 0.030072926847605444,
"grad_norm": 872.0,
"learning_rate": 0.0001999940947206803,
"loss": 17.7632,
"step": 1050
},
{
"epoch": 0.031504970983205705,
"grad_norm": 328.0,
"learning_rate": 0.00019969883075469472,
"loss": 16.0223,
"step": 1100
},
{
"epoch": 0.03293701511880596,
"grad_norm": 468.0,
"learning_rate": 0.0001994035667887091,
"loss": 10.9938,
"step": 1150
},
{
"epoch": 0.03436905925440622,
"grad_norm": 462.0,
"learning_rate": 0.00019910830282272353,
"loss": 9.1089,
"step": 1200
},
{
"epoch": 0.03580110339000648,
"grad_norm": 43.25,
"learning_rate": 0.00019881303885673795,
"loss": 8.8076,
"step": 1250
},
{
"epoch": 0.03580110339000648,
"eval_accuracy": 0.473,
"eval_loss": 1.0368720293045044,
"eval_macro_f1": 0.3829550887916939,
"eval_runtime": 172.7823,
"eval_samples_per_second": 11.575,
"eval_steps_per_second": 11.575,
"step": 1250
},
{
"epoch": 0.03723314752560674,
"grad_norm": 276.0,
"learning_rate": 0.00019851777489075234,
"loss": 8.0924,
"step": 1300
},
{
"epoch": 0.038665191661207,
"grad_norm": 150.0,
"learning_rate": 0.00019822251092476676,
"loss": 7.168,
"step": 1350
},
{
"epoch": 0.04009723579680726,
"grad_norm": 282.0,
"learning_rate": 0.00019792724695878115,
"loss": 6.6729,
"step": 1400
},
{
"epoch": 0.041529279932407515,
"grad_norm": 155.0,
"learning_rate": 0.00019763198299279557,
"loss": 6.2658,
"step": 1450
},
{
"epoch": 0.042961324068007775,
"grad_norm": 95.0,
"learning_rate": 0.00019733671902680996,
"loss": 4.2749,
"step": 1500
},
{
"epoch": 0.044393368203608036,
"grad_norm": 121.5,
"learning_rate": 0.00019704145506082438,
"loss": 6.0376,
"step": 1550
},
{
"epoch": 0.045825412339208296,
"grad_norm": 484.0,
"learning_rate": 0.0001967461910948388,
"loss": 5.4624,
"step": 1600
},
{
"epoch": 0.04725745647480856,
"grad_norm": 83.0,
"learning_rate": 0.00019645092712885321,
"loss": 4.8571,
"step": 1650
},
{
"epoch": 0.04868950061040881,
"grad_norm": 86.0,
"learning_rate": 0.00019615566316286763,
"loss": 5.2631,
"step": 1700
},
{
"epoch": 0.05012154474600907,
"grad_norm": 0.35546875,
"learning_rate": 0.00019586039919688202,
"loss": 4.2013,
"step": 1750
},
{
"epoch": 0.05155358888160933,
"grad_norm": 58.0,
"learning_rate": 0.00019556513523089644,
"loss": 5.4813,
"step": 1800
},
{
"epoch": 0.05298563301720959,
"grad_norm": 868.0,
"learning_rate": 0.00019526987126491083,
"loss": 4.6324,
"step": 1850
},
{
"epoch": 0.05441767715280985,
"grad_norm": 169.0,
"learning_rate": 0.00019497460729892525,
"loss": 3.9849,
"step": 1900
},
{
"epoch": 0.055849721288410106,
"grad_norm": 4.28125,
"learning_rate": 0.00019467934333293967,
"loss": 3.2505,
"step": 1950
},
{
"epoch": 0.05728176542401037,
"grad_norm": 11.0625,
"learning_rate": 0.00019438407936695406,
"loss": 3.7368,
"step": 2000
},
{
"epoch": 0.05871380955961063,
"grad_norm": 40.75,
"learning_rate": 0.00019408881540096848,
"loss": 4.2252,
"step": 2050
},
{
"epoch": 0.06014585369521089,
"grad_norm": 2240.0,
"learning_rate": 0.00019379355143498287,
"loss": 3.8708,
"step": 2100
},
{
"epoch": 0.06157789783081115,
"grad_norm": 318.0,
"learning_rate": 0.0001934982874689973,
"loss": 3.7427,
"step": 2150
},
{
"epoch": 0.06300994196641141,
"grad_norm": 79.0,
"learning_rate": 0.00019320302350301168,
"loss": 2.5798,
"step": 2200
},
{
"epoch": 0.06444198610201167,
"grad_norm": 188.0,
"learning_rate": 0.0001929077595370261,
"loss": 3.2888,
"step": 2250
},
{
"epoch": 0.06587403023761192,
"grad_norm": 255.0,
"learning_rate": 0.00019261249557104052,
"loss": 3.5956,
"step": 2300
},
{
"epoch": 0.06730607437321218,
"grad_norm": 111.5,
"learning_rate": 0.0001923172316050549,
"loss": 2.6906,
"step": 2350
},
{
"epoch": 0.06873811850881244,
"grad_norm": 8.8125,
"learning_rate": 0.00019202196763906933,
"loss": 2.9821,
"step": 2400
},
{
"epoch": 0.0701701626444127,
"grad_norm": 62.25,
"learning_rate": 0.00019172670367308375,
"loss": 2.9432,
"step": 2450
},
{
"epoch": 0.07160220678001296,
"grad_norm": 268.0,
"learning_rate": 0.00019143143970709817,
"loss": 5.8543,
"step": 2500
},
{
"epoch": 0.07160220678001296,
"eval_accuracy": 0.8855,
"eval_loss": 0.4594672918319702,
"eval_macro_f1": 0.8847948863660271,
"eval_runtime": 174.3198,
"eval_samples_per_second": 11.473,
"eval_steps_per_second": 11.473,
"step": 2500
},
{
"epoch": 0.07303425091561322,
"grad_norm": 1.703125,
"learning_rate": 0.00019113617574111256,
"loss": 4.3718,
"step": 2550
},
{
"epoch": 0.07446629505121348,
"grad_norm": 6.125,
"learning_rate": 0.00019084091177512698,
"loss": 5.6269,
"step": 2600
},
{
"epoch": 0.07589833918681374,
"grad_norm": 2.796875,
"learning_rate": 0.0001905456478091414,
"loss": 4.2341,
"step": 2650
},
{
"epoch": 0.077330383322414,
"grad_norm": 608.0,
"learning_rate": 0.0001902503838431558,
"loss": 3.3186,
"step": 2700
},
{
"epoch": 0.07876242745801426,
"grad_norm": 140.0,
"learning_rate": 0.0001899551198771702,
"loss": 5.9126,
"step": 2750
},
{
"epoch": 0.08019447159361452,
"grad_norm": 824.0,
"learning_rate": 0.0001896598559111846,
"loss": 5.0582,
"step": 2800
},
{
"epoch": 0.08162651572921477,
"grad_norm": 165.0,
"learning_rate": 0.00018936459194519902,
"loss": 3.5105,
"step": 2850
},
{
"epoch": 0.08305855986481503,
"grad_norm": 0.0284423828125,
"learning_rate": 0.0001890693279792134,
"loss": 4.6236,
"step": 2900
},
{
"epoch": 0.08449060400041529,
"grad_norm": 116.5,
"learning_rate": 0.00018877406401322783,
"loss": 3.9021,
"step": 2950
},
{
"epoch": 0.08592264813601555,
"grad_norm": 0.251953125,
"learning_rate": 0.00018847880004724225,
"loss": 3.883,
"step": 3000
},
{
"epoch": 0.08735469227161581,
"grad_norm": 0.0260009765625,
"learning_rate": 0.00018818353608125664,
"loss": 3.9736,
"step": 3050
},
{
"epoch": 0.08878673640721607,
"grad_norm": 290.0,
"learning_rate": 0.00018788827211527106,
"loss": 5.218,
"step": 3100
},
{
"epoch": 0.09021878054281633,
"grad_norm": 1720.0,
"learning_rate": 0.00018759300814928548,
"loss": 3.2961,
"step": 3150
},
{
"epoch": 0.09165082467841659,
"grad_norm": 0.26953125,
"learning_rate": 0.0001872977441832999,
"loss": 3.4482,
"step": 3200
},
{
"epoch": 0.09308286881401685,
"grad_norm": 13.375,
"learning_rate": 0.0001870024802173143,
"loss": 2.928,
"step": 3250
},
{
"epoch": 0.09451491294961711,
"grad_norm": 73.0,
"learning_rate": 0.0001867072162513287,
"loss": 3.4569,
"step": 3300
},
{
"epoch": 0.09594695708521736,
"grad_norm": 9.25,
"learning_rate": 0.00018641195228534313,
"loss": 3.8492,
"step": 3350
},
{
"epoch": 0.09737900122081762,
"grad_norm": 274.0,
"learning_rate": 0.00018611668831935752,
"loss": 3.4008,
"step": 3400
},
{
"epoch": 0.09881104535641788,
"grad_norm": 158.0,
"learning_rate": 0.00018582142435337194,
"loss": 3.6703,
"step": 3450
},
{
"epoch": 0.10024308949201814,
"grad_norm": 264.0,
"learning_rate": 0.00018552616038738633,
"loss": 3.4321,
"step": 3500
},
{
"epoch": 0.1016751336276184,
"grad_norm": 2.015625,
"learning_rate": 0.00018523089642140075,
"loss": 2.4367,
"step": 3550
},
{
"epoch": 0.10310717776321866,
"grad_norm": 270.0,
"learning_rate": 0.00018493563245541514,
"loss": 3.6473,
"step": 3600
},
{
"epoch": 0.10453922189881892,
"grad_norm": 0.0478515625,
"learning_rate": 0.00018464036848942956,
"loss": 2.3759,
"step": 3650
},
{
"epoch": 0.10597126603441918,
"grad_norm": 282.0,
"learning_rate": 0.00018434510452344395,
"loss": 2.5434,
"step": 3700
},
{
"epoch": 0.10740331017001944,
"grad_norm": 100.0,
"learning_rate": 0.00018404984055745837,
"loss": 2.4411,
"step": 3750
},
{
"epoch": 0.10740331017001944,
"eval_accuracy": 0.911,
"eval_loss": 0.6009318232536316,
"eval_macro_f1": 0.9109307309196768,
"eval_runtime": 173.1976,
"eval_samples_per_second": 11.548,
"eval_steps_per_second": 11.548,
"step": 3750
},
{
"epoch": 0.1088353543056197,
"grad_norm": 8.875,
"learning_rate": 0.00018375457659147279,
"loss": 3.7952,
"step": 3800
},
{
"epoch": 0.11026739844121995,
"grad_norm": 408.0,
"learning_rate": 0.00018345931262548718,
"loss": 2.7528,
"step": 3850
},
{
"epoch": 0.11169944257682021,
"grad_norm": 4.65625,
"learning_rate": 0.0001831640486595016,
"loss": 3.0934,
"step": 3900
},
{
"epoch": 0.11313148671242047,
"grad_norm": 0.0274658203125,
"learning_rate": 0.00018286878469351601,
"loss": 3.3618,
"step": 3950
},
{
"epoch": 0.11456353084802073,
"grad_norm": 93.0,
"learning_rate": 0.00018257352072753043,
"loss": 3.635,
"step": 4000
},
{
"epoch": 0.115995574983621,
"grad_norm": 159.0,
"learning_rate": 0.00018227825676154482,
"loss": 2.3589,
"step": 4050
},
{
"epoch": 0.11742761911922125,
"grad_norm": 290.0,
"learning_rate": 0.00018198299279555924,
"loss": 3.9717,
"step": 4100
},
{
"epoch": 0.11885966325482152,
"grad_norm": 8.375,
"learning_rate": 0.00018168772882957366,
"loss": 3.0616,
"step": 4150
},
{
"epoch": 0.12029170739042178,
"grad_norm": 264.0,
"learning_rate": 0.00018139246486358805,
"loss": 3.4315,
"step": 4200
},
{
"epoch": 0.12172375152602204,
"grad_norm": 206.0,
"learning_rate": 0.00018109720089760247,
"loss": 3.3353,
"step": 4250
},
{
"epoch": 0.1231557956616223,
"grad_norm": 0.37109375,
"learning_rate": 0.00018080193693161686,
"loss": 2.7568,
"step": 4300
},
{
"epoch": 0.12458783979722254,
"grad_norm": 314.0,
"learning_rate": 0.00018050667296563128,
"loss": 3.0107,
"step": 4350
},
{
"epoch": 0.12601988393282282,
"grad_norm": 992.0,
"learning_rate": 0.00018021140899964567,
"loss": 2.8247,
"step": 4400
},
{
"epoch": 0.12745192806842306,
"grad_norm": 225.0,
"learning_rate": 0.0001799161450336601,
"loss": 3.3408,
"step": 4450
},
{
"epoch": 0.12888397220402334,
"grad_norm": 0.703125,
"learning_rate": 0.0001796208810676745,
"loss": 2.8974,
"step": 4500
},
{
"epoch": 0.13031601633962359,
"grad_norm": 280.0,
"learning_rate": 0.0001793256171016889,
"loss": 2.8223,
"step": 4550
},
{
"epoch": 0.13174806047522383,
"grad_norm": 180.0,
"learning_rate": 0.00017903035313570332,
"loss": 3.7603,
"step": 4600
},
{
"epoch": 0.1331801046108241,
"grad_norm": 6.28125,
"learning_rate": 0.00017873508916971774,
"loss": 4.2271,
"step": 4650
},
{
"epoch": 0.13461214874642435,
"grad_norm": 338.0,
"learning_rate": 0.00017843982520373216,
"loss": 3.2114,
"step": 4700
},
{
"epoch": 0.13604419288202463,
"grad_norm": 0.6796875,
"learning_rate": 0.00017814456123774655,
"loss": 3.4457,
"step": 4750
},
{
"epoch": 0.13747623701762487,
"grad_norm": 2.34375,
"learning_rate": 0.00017784929727176097,
"loss": 2.2643,
"step": 4800
},
{
"epoch": 0.13890828115322515,
"grad_norm": 288.0,
"learning_rate": 0.0001775540333057754,
"loss": 3.0672,
"step": 4850
},
{
"epoch": 0.1403403252888254,
"grad_norm": 1896.0,
"learning_rate": 0.00017725876933978978,
"loss": 2.8551,
"step": 4900
},
{
"epoch": 0.14177236942442567,
"grad_norm": 88.0,
"learning_rate": 0.0001769635053738042,
"loss": 3.5021,
"step": 4950
},
{
"epoch": 0.14320441356002592,
"grad_norm": 94.0,
"learning_rate": 0.0001766682414078186,
"loss": 2.1413,
"step": 5000
},
{
"epoch": 0.14320441356002592,
"eval_accuracy": 0.917,
"eval_loss": 0.3995007872581482,
"eval_macro_f1": 0.9161602620439439,
"eval_runtime": 179.9592,
"eval_samples_per_second": 11.114,
"eval_steps_per_second": 11.114,
"step": 5000
},
{
"epoch": 0.1446364576956262,
"grad_norm": 0.97265625,
"learning_rate": 0.000176372977441833,
"loss": 2.3626,
"step": 5050
},
{
"epoch": 0.14606850183122644,
"grad_norm": 266.0,
"learning_rate": 0.0001760777134758474,
"loss": 3.3284,
"step": 5100
},
{
"epoch": 0.14750054596682668,
"grad_norm": 0.2314453125,
"learning_rate": 0.00017578244950986182,
"loss": 2.2628,
"step": 5150
},
{
"epoch": 0.14893259010242696,
"grad_norm": 237.0,
"learning_rate": 0.00017548718554387624,
"loss": 2.5359,
"step": 5200
},
{
"epoch": 0.1503646342380272,
"grad_norm": 65.5,
"learning_rate": 0.00017519192157789063,
"loss": 2.5109,
"step": 5250
},
{
"epoch": 0.15179667837362748,
"grad_norm": 0.2197265625,
"learning_rate": 0.00017489665761190505,
"loss": 3.4319,
"step": 5300
},
{
"epoch": 0.15322872250922773,
"grad_norm": 140.0,
"learning_rate": 0.00017460139364591944,
"loss": 2.149,
"step": 5350
},
{
"epoch": 0.154660766644828,
"grad_norm": 74.0,
"learning_rate": 0.00017430612967993386,
"loss": 3.3437,
"step": 5400
},
{
"epoch": 0.15609281078042825,
"grad_norm": 160.0,
"learning_rate": 0.00017401086571394828,
"loss": 3.2952,
"step": 5450
},
{
"epoch": 0.15752485491602852,
"grad_norm": 0.451171875,
"learning_rate": 0.0001737156017479627,
"loss": 2.6442,
"step": 5500
},
{
"epoch": 0.15895689905162877,
"grad_norm": 246.0,
"learning_rate": 0.00017342033778197712,
"loss": 2.1805,
"step": 5550
},
{
"epoch": 0.16038894318722904,
"grad_norm": 0.1669921875,
"learning_rate": 0.0001731250738159915,
"loss": 2.957,
"step": 5600
},
{
"epoch": 0.1618209873228293,
"grad_norm": 11.4375,
"learning_rate": 0.00017282980985000593,
"loss": 3.791,
"step": 5650
},
{
"epoch": 0.16325303145842954,
"grad_norm": 241.0,
"learning_rate": 0.00017253454588402032,
"loss": 2.3945,
"step": 5700
},
{
"epoch": 0.1646850755940298,
"grad_norm": 0.1572265625,
"learning_rate": 0.00017223928191803474,
"loss": 2.3927,
"step": 5750
},
{
"epoch": 0.16611711972963006,
"grad_norm": 0.06494140625,
"learning_rate": 0.00017194401795204913,
"loss": 2.4573,
"step": 5800
},
{
"epoch": 0.16754916386523033,
"grad_norm": 0.34375,
"learning_rate": 0.00017164875398606355,
"loss": 2.6829,
"step": 5850
},
{
"epoch": 0.16898120800083058,
"grad_norm": 7.46875,
"learning_rate": 0.00017135349002007797,
"loss": 3.0156,
"step": 5900
},
{
"epoch": 0.17041325213643085,
"grad_norm": 0.2470703125,
"learning_rate": 0.00017105822605409236,
"loss": 2.5155,
"step": 5950
},
{
"epoch": 0.1718452962720311,
"grad_norm": 3.65625,
"learning_rate": 0.00017076296208810678,
"loss": 2.5886,
"step": 6000
},
{
"epoch": 0.17327734040763138,
"grad_norm": 420.0,
"learning_rate": 0.00017046769812212117,
"loss": 3.7327,
"step": 6050
},
{
"epoch": 0.17470938454323162,
"grad_norm": 88.0,
"learning_rate": 0.00017017243415613559,
"loss": 4.1712,
"step": 6100
},
{
"epoch": 0.17614142867883187,
"grad_norm": 1864.0,
"learning_rate": 0.00016987717019015,
"loss": 3.0617,
"step": 6150
},
{
"epoch": 0.17757347281443214,
"grad_norm": 56.25,
"learning_rate": 0.00016958190622416442,
"loss": 2.6603,
"step": 6200
},
{
"epoch": 0.1790055169500324,
"grad_norm": 25.25,
"learning_rate": 0.00016928664225817884,
"loss": 2.7308,
"step": 6250
},
{
"epoch": 0.1790055169500324,
"eval_accuracy": 0.9195,
"eval_loss": 0.47414371371269226,
"eval_macro_f1": 0.9193664539192946,
"eval_runtime": 182.0886,
"eval_samples_per_second": 10.984,
"eval_steps_per_second": 10.984,
"step": 6250
},
{
"epoch": 0.18043756108563266,
"grad_norm": 66.5,
"learning_rate": 0.00016899137829219323,
"loss": 2.9805,
"step": 6300
},
{
"epoch": 0.1818696052212329,
"grad_norm": 119.0,
"learning_rate": 0.00016869611432620765,
"loss": 2.343,
"step": 6350
},
{
"epoch": 0.18330164935683319,
"grad_norm": 0.14453125,
"learning_rate": 0.00016840085036022204,
"loss": 2.5346,
"step": 6400
},
{
"epoch": 0.18473369349243343,
"grad_norm": 67.5,
"learning_rate": 0.00016810558639423646,
"loss": 2.6565,
"step": 6450
},
{
"epoch": 0.1861657376280337,
"grad_norm": 14.0,
"learning_rate": 0.00016781032242825085,
"loss": 3.2329,
"step": 6500
},
{
"epoch": 0.18759778176363395,
"grad_norm": 1408.0,
"learning_rate": 0.00016751505846226527,
"loss": 2.7886,
"step": 6550
},
{
"epoch": 0.18902982589923423,
"grad_norm": 23.0,
"learning_rate": 0.0001672197944962797,
"loss": 2.2165,
"step": 6600
},
{
"epoch": 0.19046187003483447,
"grad_norm": 88.0,
"learning_rate": 0.00016692453053029408,
"loss": 2.826,
"step": 6650
},
{
"epoch": 0.19189391417043472,
"grad_norm": 7.28125,
"learning_rate": 0.0001666292665643085,
"loss": 2.6884,
"step": 6700
},
{
"epoch": 0.193325958306035,
"grad_norm": 4.3125,
"learning_rate": 0.0001663340025983229,
"loss": 2.3811,
"step": 6750
},
{
"epoch": 0.19475800244163524,
"grad_norm": 2.78125,
"learning_rate": 0.0001660387386323373,
"loss": 2.1648,
"step": 6800
},
{
"epoch": 0.19619004657723552,
"grad_norm": 2.65625,
"learning_rate": 0.0001657434746663517,
"loss": 2.0769,
"step": 6850
},
{
"epoch": 0.19762209071283576,
"grad_norm": 0.337890625,
"learning_rate": 0.00016544821070036612,
"loss": 3.2644,
"step": 6900
},
{
"epoch": 0.19905413484843604,
"grad_norm": 5.15625,
"learning_rate": 0.00016515294673438054,
"loss": 3.1548,
"step": 6950
},
{
"epoch": 0.20048617898403628,
"grad_norm": 52.75,
"learning_rate": 0.00016485768276839496,
"loss": 2.3094,
"step": 7000
},
{
"epoch": 0.20191822311963656,
"grad_norm": 0.15625,
"learning_rate": 0.00016456241880240938,
"loss": 2.2522,
"step": 7050
},
{
"epoch": 0.2033502672552368,
"grad_norm": 0.09521484375,
"learning_rate": 0.00016426715483642377,
"loss": 2.1453,
"step": 7100
},
{
"epoch": 0.20478231139083705,
"grad_norm": 274.0,
"learning_rate": 0.0001639718908704382,
"loss": 2.8386,
"step": 7150
},
{
"epoch": 0.20621435552643733,
"grad_norm": 274.0,
"learning_rate": 0.00016367662690445258,
"loss": 3.5395,
"step": 7200
},
{
"epoch": 0.20764639966203757,
"grad_norm": 81.0,
"learning_rate": 0.000163381362938467,
"loss": 2.668,
"step": 7250
},
{
"epoch": 0.20907844379763785,
"grad_norm": 0.1162109375,
"learning_rate": 0.00016308609897248142,
"loss": 2.2543,
"step": 7300
},
{
"epoch": 0.2105104879332381,
"grad_norm": 0.05517578125,
"learning_rate": 0.0001627908350064958,
"loss": 2.4399,
"step": 7350
},
{
"epoch": 0.21194253206883837,
"grad_norm": 0.283203125,
"learning_rate": 0.00016249557104051023,
"loss": 2.0814,
"step": 7400
},
{
"epoch": 0.21337457620443862,
"grad_norm": 79.0,
"learning_rate": 0.00016220030707452462,
"loss": 3.2041,
"step": 7450
},
{
"epoch": 0.2148066203400389,
"grad_norm": 144.0,
"learning_rate": 0.00016190504310853904,
"loss": 1.962,
"step": 7500
},
{
"epoch": 0.2148066203400389,
"eval_accuracy": 0.93,
"eval_loss": 0.3529609441757202,
"eval_macro_f1": 0.9295120271109343,
"eval_runtime": 175.7548,
"eval_samples_per_second": 11.379,
"eval_steps_per_second": 11.379,
"step": 7500
},
{
"epoch": 0.21623866447563914,
"grad_norm": 2592.0,
"learning_rate": 0.00016160977914255343,
"loss": 2.7684,
"step": 7550
},
{
"epoch": 0.2176707086112394,
"grad_norm": 0.03271484375,
"learning_rate": 0.00016131451517656785,
"loss": 2.5066,
"step": 7600
},
{
"epoch": 0.21910275274683966,
"grad_norm": 0.09423828125,
"learning_rate": 0.00016101925121058227,
"loss": 2.6791,
"step": 7650
},
{
"epoch": 0.2205347968824399,
"grad_norm": 536.0,
"learning_rate": 0.0001607239872445967,
"loss": 3.3268,
"step": 7700
},
{
"epoch": 0.22196684101804018,
"grad_norm": 0.01007080078125,
"learning_rate": 0.0001604287232786111,
"loss": 2.2916,
"step": 7750
},
{
"epoch": 0.22339888515364043,
"grad_norm": 0.1396484375,
"learning_rate": 0.0001601334593126255,
"loss": 2.8402,
"step": 7800
},
{
"epoch": 0.2248309292892407,
"grad_norm": 93.5,
"learning_rate": 0.00015983819534663992,
"loss": 2.5527,
"step": 7850
},
{
"epoch": 0.22626297342484095,
"grad_norm": 0.318359375,
"learning_rate": 0.0001595429313806543,
"loss": 3.0559,
"step": 7900
},
{
"epoch": 0.22769501756044122,
"grad_norm": 276.0,
"learning_rate": 0.00015924766741466873,
"loss": 1.8897,
"step": 7950
},
{
"epoch": 0.22912706169604147,
"grad_norm": 1.7421875,
"learning_rate": 0.00015895240344868315,
"loss": 1.9342,
"step": 8000
},
{
"epoch": 0.23055910583164174,
"grad_norm": 0.036865234375,
"learning_rate": 0.00015865713948269754,
"loss": 2.0979,
"step": 8050
},
{
"epoch": 0.231991149967242,
"grad_norm": 164.0,
"learning_rate": 0.00015836187551671196,
"loss": 2.2929,
"step": 8100
},
{
"epoch": 0.23342319410284226,
"grad_norm": 88.5,
"learning_rate": 0.00015806661155072635,
"loss": 3.0427,
"step": 8150
},
{
"epoch": 0.2348552382384425,
"grad_norm": 1104.0,
"learning_rate": 0.00015777134758474077,
"loss": 2.8966,
"step": 8200
},
{
"epoch": 0.23628728237404276,
"grad_norm": 520.0,
"learning_rate": 0.00015747608361875516,
"loss": 2.0752,
"step": 8250
},
{
"epoch": 0.23771932650964303,
"grad_norm": 0.07568359375,
"learning_rate": 0.00015718081965276958,
"loss": 1.7808,
"step": 8300
},
{
"epoch": 0.23915137064524328,
"grad_norm": 0.06982421875,
"learning_rate": 0.000156885555686784,
"loss": 2.9426,
"step": 8350
},
{
"epoch": 0.24058341478084355,
"grad_norm": 242.0,
"learning_rate": 0.00015659029172079839,
"loss": 2.3159,
"step": 8400
},
{
"epoch": 0.2420154589164438,
"grad_norm": 7.5,
"learning_rate": 0.0001562950277548128,
"loss": 2.6197,
"step": 8450
},
{
"epoch": 0.24344750305204407,
"grad_norm": 57.25,
"learning_rate": 0.00015599976378882722,
"loss": 2.6834,
"step": 8500
},
{
"epoch": 0.24487954718764432,
"grad_norm": 4.0,
"learning_rate": 0.00015570449982284164,
"loss": 2.116,
"step": 8550
},
{
"epoch": 0.2463115913232446,
"grad_norm": 0.11181640625,
"learning_rate": 0.00015540923585685603,
"loss": 3.5668,
"step": 8600
},
{
"epoch": 0.24774363545884484,
"grad_norm": 240.0,
"learning_rate": 0.00015511397189087045,
"loss": 3.1473,
"step": 8650
},
{
"epoch": 0.2491756795944451,
"grad_norm": 117.5,
"learning_rate": 0.00015481870792488487,
"loss": 2.4813,
"step": 8700
},
{
"epoch": 0.25060772373004536,
"grad_norm": 7.46875,
"learning_rate": 0.00015452344395889926,
"loss": 1.8936,
"step": 8750
},
{
"epoch": 0.25060772373004536,
"eval_accuracy": 0.9365,
"eval_loss": 0.328545480966568,
"eval_macro_f1": 0.9360277798015127,
"eval_runtime": 178.5517,
"eval_samples_per_second": 11.201,
"eval_steps_per_second": 11.201,
"step": 8750
},
{
"epoch": 0.25203976786564564,
"grad_norm": 126.0,
"learning_rate": 0.00015422817999291368,
"loss": 2.649,
"step": 8800
},
{
"epoch": 0.25347181200124586,
"grad_norm": 0.06884765625,
"learning_rate": 0.00015393291602692807,
"loss": 2.8102,
"step": 8850
},
{
"epoch": 0.25490385613684613,
"grad_norm": 0.6015625,
"learning_rate": 0.0001536376520609425,
"loss": 2.4762,
"step": 8900
},
{
"epoch": 0.2563359002724464,
"grad_norm": 370.0,
"learning_rate": 0.00015334238809495688,
"loss": 2.1245,
"step": 8950
},
{
"epoch": 0.2577679444080467,
"grad_norm": 238.0,
"learning_rate": 0.0001530471241289713,
"loss": 1.4588,
"step": 9000
},
{
"epoch": 0.2591999885436469,
"grad_norm": 8.5625,
"learning_rate": 0.00015275186016298572,
"loss": 2.7869,
"step": 9050
},
{
"epoch": 0.26063203267924717,
"grad_norm": 118.5,
"learning_rate": 0.0001524565961970001,
"loss": 2.1987,
"step": 9100
},
{
"epoch": 0.26206407681484745,
"grad_norm": 37.25,
"learning_rate": 0.00015216133223101453,
"loss": 2.8539,
"step": 9150
},
{
"epoch": 0.26349612095044767,
"grad_norm": 0.62109375,
"learning_rate": 0.00015186606826502895,
"loss": 2.6421,
"step": 9200
},
{
"epoch": 0.26492816508604794,
"grad_norm": 0.404296875,
"learning_rate": 0.00015157080429904337,
"loss": 3.3623,
"step": 9250
},
{
"epoch": 0.2663602092216482,
"grad_norm": 0.09130859375,
"learning_rate": 0.00015127554033305776,
"loss": 2.6995,
"step": 9300
},
{
"epoch": 0.2677922533572485,
"grad_norm": 82.5,
"learning_rate": 0.00015098027636707218,
"loss": 1.8874,
"step": 9350
},
{
"epoch": 0.2692242974928487,
"grad_norm": 4416.0,
"learning_rate": 0.0001506850124010866,
"loss": 2.2107,
"step": 9400
},
{
"epoch": 0.270656341628449,
"grad_norm": 18.125,
"learning_rate": 0.000150389748435101,
"loss": 3.2056,
"step": 9450
},
{
"epoch": 0.27208838576404926,
"grad_norm": 3.09375,
"learning_rate": 0.0001500944844691154,
"loss": 2.9934,
"step": 9500
},
{
"epoch": 0.27352042989964953,
"grad_norm": 280.0,
"learning_rate": 0.0001497992205031298,
"loss": 2.2205,
"step": 9550
},
{
"epoch": 0.27495247403524975,
"grad_norm": 94.5,
"learning_rate": 0.00014950395653714422,
"loss": 2.5102,
"step": 9600
},
{
"epoch": 0.27638451817085,
"grad_norm": 0.2021484375,
"learning_rate": 0.0001492086925711586,
"loss": 2.0138,
"step": 9650
},
{
"epoch": 0.2778165623064503,
"grad_norm": 1.3359375,
"learning_rate": 0.00014891342860517303,
"loss": 1.556,
"step": 9700
},
{
"epoch": 0.2792486064420505,
"grad_norm": 0.494140625,
"learning_rate": 0.00014861816463918745,
"loss": 2.7351,
"step": 9750
},
{
"epoch": 0.2806806505776508,
"grad_norm": 0.1953125,
"learning_rate": 0.00014832290067320184,
"loss": 2.0641,
"step": 9800
},
{
"epoch": 0.28211269471325107,
"grad_norm": 0.421875,
"learning_rate": 0.00014802763670721626,
"loss": 2.642,
"step": 9850
},
{
"epoch": 0.28354473884885134,
"grad_norm": 292.0,
"learning_rate": 0.00014773237274123065,
"loss": 2.5676,
"step": 9900
},
{
"epoch": 0.28497678298445156,
"grad_norm": 4.40625,
"learning_rate": 0.00014743710877524507,
"loss": 2.5438,
"step": 9950
},
{
"epoch": 0.28640882712005183,
"grad_norm": 129.0,
"learning_rate": 0.0001471418448092595,
"loss": 3.0776,
"step": 10000
},
{
"epoch": 0.28640882712005183,
"eval_accuracy": 0.9335,
"eval_loss": 0.34245818853378296,
"eval_macro_f1": 0.9327568911653952,
"eval_runtime": 181.524,
"eval_samples_per_second": 11.018,
"eval_steps_per_second": 11.018,
"step": 10000
},
{
"epoch": 0.2878408712556521,
"grad_norm": 9.25,
"learning_rate": 0.0001468465808432739,
"loss": 2.2061,
"step": 10050
},
{
"epoch": 0.2892729153912524,
"grad_norm": 0.10693359375,
"learning_rate": 0.00014655131687728832,
"loss": 2.6087,
"step": 10100
},
{
"epoch": 0.2907049595268526,
"grad_norm": 0.10107421875,
"learning_rate": 0.00014625605291130272,
"loss": 2.4579,
"step": 10150
},
{
"epoch": 0.2921370036624529,
"grad_norm": 0.33203125,
"learning_rate": 0.00014596078894531714,
"loss": 2.2037,
"step": 10200
},
{
"epoch": 0.29356904779805315,
"grad_norm": 0.1513671875,
"learning_rate": 0.00014566552497933153,
"loss": 2.3772,
"step": 10250
},
{
"epoch": 0.29500109193365337,
"grad_norm": 80.5,
"learning_rate": 0.00014537026101334595,
"loss": 2.2901,
"step": 10300
},
{
"epoch": 0.29643313606925364,
"grad_norm": 21.0,
"learning_rate": 0.00014507499704736034,
"loss": 2.1736,
"step": 10350
},
{
"epoch": 0.2978651802048539,
"grad_norm": 168.0,
"learning_rate": 0.00014477973308137476,
"loss": 2.3493,
"step": 10400
},
{
"epoch": 0.2992972243404542,
"grad_norm": 229.0,
"learning_rate": 0.00014448446911538915,
"loss": 2.4933,
"step": 10450
},
{
"epoch": 0.3007292684760544,
"grad_norm": 274.0,
"learning_rate": 0.00014418920514940357,
"loss": 2.6936,
"step": 10500
},
{
"epoch": 0.3021613126116547,
"grad_norm": 272.0,
"learning_rate": 0.00014389394118341799,
"loss": 3.3879,
"step": 10550
},
{
"epoch": 0.30359335674725496,
"grad_norm": 0.0103759765625,
"learning_rate": 0.00014359867721743238,
"loss": 1.6445,
"step": 10600
},
{
"epoch": 0.30502540088285524,
"grad_norm": 0.02880859375,
"learning_rate": 0.0001433034132514468,
"loss": 1.5567,
"step": 10650
},
{
"epoch": 0.30645744501845545,
"grad_norm": 292.0,
"learning_rate": 0.00014300814928546121,
"loss": 2.5947,
"step": 10700
},
{
"epoch": 0.30788948915405573,
"grad_norm": 97.5,
"learning_rate": 0.00014271288531947563,
"loss": 2.7865,
"step": 10750
},
{
"epoch": 0.309321533289656,
"grad_norm": 79.5,
"learning_rate": 0.00014241762135349002,
"loss": 2.1275,
"step": 10800
},
{
"epoch": 0.3107535774252562,
"grad_norm": 298.0,
"learning_rate": 0.00014212235738750444,
"loss": 2.0145,
"step": 10850
},
{
"epoch": 0.3121856215608565,
"grad_norm": 0.040771484375,
"learning_rate": 0.00014182709342151886,
"loss": 1.8322,
"step": 10900
},
{
"epoch": 0.31361766569645677,
"grad_norm": 142.0,
"learning_rate": 0.00014153182945553325,
"loss": 1.3864,
"step": 10950
},
{
"epoch": 0.31504970983205705,
"grad_norm": 1.7578125,
"learning_rate": 0.00014123656548954767,
"loss": 2.7755,
"step": 11000
},
{
"epoch": 0.31648175396765726,
"grad_norm": 82.0,
"learning_rate": 0.00014094130152356206,
"loss": 2.5528,
"step": 11050
},
{
"epoch": 0.31791379810325754,
"grad_norm": 80.0,
"learning_rate": 0.00014064603755757648,
"loss": 2.5284,
"step": 11100
},
{
"epoch": 0.3193458422388578,
"grad_norm": 0.05224609375,
"learning_rate": 0.00014035077359159087,
"loss": 2.8708,
"step": 11150
},
{
"epoch": 0.3207778863744581,
"grad_norm": 0.140625,
"learning_rate": 0.0001400555096256053,
"loss": 3.5295,
"step": 11200
},
{
"epoch": 0.3222099305100583,
"grad_norm": 0.050048828125,
"learning_rate": 0.0001397602456596197,
"loss": 3.325,
"step": 11250
},
{
"epoch": 0.3222099305100583,
"eval_accuracy": 0.94,
"eval_loss": 0.2819044888019562,
"eval_macro_f1": 0.9395225640341313,
"eval_runtime": 173.501,
"eval_samples_per_second": 11.527,
"eval_steps_per_second": 11.527,
"step": 11250
},
{
"epoch": 0.3236419746456586,
"grad_norm": 1.078125,
"learning_rate": 0.0001394649816936341,
"loss": 3.0985,
"step": 11300
},
{
"epoch": 0.32507401878125886,
"grad_norm": 116.0,
"learning_rate": 0.00013916971772764852,
"loss": 2.5793,
"step": 11350
},
{
"epoch": 0.3265060629168591,
"grad_norm": 0.1865234375,
"learning_rate": 0.00013887445376166291,
"loss": 2.5646,
"step": 11400
},
{
"epoch": 0.32793810705245935,
"grad_norm": 306.0,
"learning_rate": 0.00013857918979567733,
"loss": 1.9864,
"step": 11450
},
{
"epoch": 0.3293701511880596,
"grad_norm": 274.0,
"learning_rate": 0.00013828392582969175,
"loss": 1.8868,
"step": 11500
},
{
"epoch": 0.3308021953236599,
"grad_norm": 0.08984375,
"learning_rate": 0.00013798866186370617,
"loss": 2.5106,
"step": 11550
},
{
"epoch": 0.3322342394592601,
"grad_norm": 270.0,
"learning_rate": 0.0001376933978977206,
"loss": 1.8537,
"step": 11600
},
{
"epoch": 0.3336662835948604,
"grad_norm": 270.0,
"learning_rate": 0.00013739813393173498,
"loss": 2.3735,
"step": 11650
},
{
"epoch": 0.33509832773046067,
"grad_norm": 0.416015625,
"learning_rate": 0.0001371028699657494,
"loss": 2.0794,
"step": 11700
},
{
"epoch": 0.3365303718660609,
"grad_norm": 0.32421875,
"learning_rate": 0.0001368076059997638,
"loss": 2.5114,
"step": 11750
},
{
"epoch": 0.33796241600166116,
"grad_norm": 0.0595703125,
"learning_rate": 0.0001365123420337782,
"loss": 2.199,
"step": 11800
},
{
"epoch": 0.33939446013726143,
"grad_norm": 61.75,
"learning_rate": 0.0001362170780677926,
"loss": 2.64,
"step": 11850
},
{
"epoch": 0.3408265042728617,
"grad_norm": 9.0,
"learning_rate": 0.00013592181410180702,
"loss": 1.8553,
"step": 11900
},
{
"epoch": 0.3422585484084619,
"grad_norm": 177.0,
"learning_rate": 0.00013562655013582144,
"loss": 1.6963,
"step": 11950
},
{
"epoch": 0.3436905925440622,
"grad_norm": 326.0,
"learning_rate": 0.00013533128616983583,
"loss": 3.007,
"step": 12000
},
{
"epoch": 0.3451226366796625,
"grad_norm": 0.06787109375,
"learning_rate": 0.00013503602220385025,
"loss": 1.6731,
"step": 12050
},
{
"epoch": 0.34655468081526275,
"grad_norm": 5.28125,
"learning_rate": 0.00013474075823786464,
"loss": 2.5167,
"step": 12100
},
{
"epoch": 0.34798672495086297,
"grad_norm": 0.11865234375,
"learning_rate": 0.00013444549427187906,
"loss": 3.4208,
"step": 12150
},
{
"epoch": 0.34941876908646324,
"grad_norm": 0.1337890625,
"learning_rate": 0.00013415023030589348,
"loss": 1.3073,
"step": 12200
},
{
"epoch": 0.3508508132220635,
"grad_norm": 82.5,
"learning_rate": 0.0001338549663399079,
"loss": 2.3618,
"step": 12250
},
{
"epoch": 0.35228285735766374,
"grad_norm": 1.546875,
"learning_rate": 0.00013355970237392232,
"loss": 2.0756,
"step": 12300
},
{
"epoch": 0.353714901493264,
"grad_norm": 120.0,
"learning_rate": 0.0001332644384079367,
"loss": 2.2016,
"step": 12350
},
{
"epoch": 0.3551469456288643,
"grad_norm": 5.34375,
"learning_rate": 0.00013296917444195113,
"loss": 2.7446,
"step": 12400
},
{
"epoch": 0.35657898976446456,
"grad_norm": 0.057861328125,
"learning_rate": 0.00013267391047596552,
"loss": 2.3892,
"step": 12450
},
{
"epoch": 0.3580110339000648,
"grad_norm": 1.6640625,
"learning_rate": 0.00013237864650997994,
"loss": 2.3038,
"step": 12500
},
{
"epoch": 0.3580110339000648,
"eval_accuracy": 0.943,
"eval_loss": 0.2820850610733032,
"eval_macro_f1": 0.9423954094372721,
"eval_runtime": 180.0229,
"eval_samples_per_second": 11.11,
"eval_steps_per_second": 11.11,
"step": 12500
},
{
"epoch": 0.35944307803566505,
"grad_norm": 0.15625,
"learning_rate": 0.00013208338254399433,
"loss": 1.6647,
"step": 12550
},
{
"epoch": 0.36087512217126533,
"grad_norm": 0.06640625,
"learning_rate": 0.00013178811857800875,
"loss": 2.2751,
"step": 12600
},
{
"epoch": 0.3623071663068656,
"grad_norm": 0.984375,
"learning_rate": 0.00013149285461202316,
"loss": 2.7458,
"step": 12650
},
{
"epoch": 0.3637392104424658,
"grad_norm": 420.0,
"learning_rate": 0.00013119759064603756,
"loss": 2.2766,
"step": 12700
},
{
"epoch": 0.3651712545780661,
"grad_norm": 0.1103515625,
"learning_rate": 0.00013090232668005198,
"loss": 1.4627,
"step": 12750
},
{
"epoch": 0.36660329871366637,
"grad_norm": 0.25390625,
"learning_rate": 0.00013060706271406637,
"loss": 1.9492,
"step": 12800
},
{
"epoch": 0.3680353428492666,
"grad_norm": 0.232421875,
"learning_rate": 0.00013031179874808079,
"loss": 1.8883,
"step": 12850
},
{
"epoch": 0.36946738698486686,
"grad_norm": 0.205078125,
"learning_rate": 0.00013001653478209518,
"loss": 2.058,
"step": 12900
},
{
"epoch": 0.37089943112046714,
"grad_norm": 0.53515625,
"learning_rate": 0.0001297212708161096,
"loss": 2.7257,
"step": 12950
},
{
"epoch": 0.3723314752560674,
"grad_norm": 764.0,
"learning_rate": 0.00012942600685012401,
"loss": 2.4923,
"step": 13000
},
{
"epoch": 0.37376351939166763,
"grad_norm": 298.0,
"learning_rate": 0.00012913074288413843,
"loss": 1.8667,
"step": 13050
},
{
"epoch": 0.3751955635272679,
"grad_norm": 282.0,
"learning_rate": 0.00012883547891815285,
"loss": 2.4477,
"step": 13100
},
{
"epoch": 0.3766276076628682,
"grad_norm": 0.02587890625,
"learning_rate": 0.00012854021495216724,
"loss": 1.0699,
"step": 13150
},
{
"epoch": 0.37805965179846845,
"grad_norm": 160.0,
"learning_rate": 0.00012824495098618166,
"loss": 2.8487,
"step": 13200
},
{
"epoch": 0.3794916959340687,
"grad_norm": 216.0,
"learning_rate": 0.00012794968702019605,
"loss": 1.8886,
"step": 13250
},
{
"epoch": 0.38092374006966895,
"grad_norm": 302.0,
"learning_rate": 0.00012765442305421047,
"loss": 2.4619,
"step": 13300
},
{
"epoch": 0.3823557842052692,
"grad_norm": 172.0,
"learning_rate": 0.0001273591590882249,
"loss": 2.8237,
"step": 13350
},
{
"epoch": 0.38378782834086944,
"grad_norm": 91.5,
"learning_rate": 0.00012706389512223928,
"loss": 2.7431,
"step": 13400
},
{
"epoch": 0.3852198724764697,
"grad_norm": 0.275390625,
"learning_rate": 0.0001267686311562537,
"loss": 1.9888,
"step": 13450
},
{
"epoch": 0.38665191661207,
"grad_norm": 194.0,
"learning_rate": 0.0001264733671902681,
"loss": 2.5123,
"step": 13500
},
{
"epoch": 0.38808396074767026,
"grad_norm": 396.0,
"learning_rate": 0.0001261781032242825,
"loss": 1.9384,
"step": 13550
},
{
"epoch": 0.3895160048832705,
"grad_norm": 54.75,
"learning_rate": 0.0001258828392582969,
"loss": 1.9031,
"step": 13600
},
{
"epoch": 0.39094804901887076,
"grad_norm": 0.828125,
"learning_rate": 0.00012558757529231132,
"loss": 2.65,
"step": 13650
},
{
"epoch": 0.39238009315447103,
"grad_norm": 36.25,
"learning_rate": 0.00012529231132632574,
"loss": 2.2794,
"step": 13700
},
{
"epoch": 0.3938121372900713,
"grad_norm": 0.11962890625,
"learning_rate": 0.00012499704736034016,
"loss": 2.1189,
"step": 13750
},
{
"epoch": 0.3938121372900713,
"eval_accuracy": 0.9465,
"eval_loss": 0.31273505091667175,
"eval_macro_f1": 0.9457595736365828,
"eval_runtime": 172.8312,
"eval_samples_per_second": 11.572,
"eval_steps_per_second": 11.572,
"step": 13750
},
{
"epoch": 0.3952441814256715,
"grad_norm": 398.0,
"learning_rate": 0.00012470178339435458,
"loss": 2.5979,
"step": 13800
},
{
"epoch": 0.3966762255612718,
"grad_norm": 266.0,
"learning_rate": 0.00012440651942836897,
"loss": 2.8401,
"step": 13850
},
{
"epoch": 0.3981082696968721,
"grad_norm": 0.1884765625,
"learning_rate": 0.0001241112554623834,
"loss": 1.9365,
"step": 13900
},
{
"epoch": 0.3995403138324723,
"grad_norm": 0.055908203125,
"learning_rate": 0.00012381599149639778,
"loss": 0.9845,
"step": 13950
},
{
"epoch": 0.40097235796807257,
"grad_norm": 130.0,
"learning_rate": 0.0001235207275304122,
"loss": 2.9002,
"step": 14000
},
{
"epoch": 0.40240440210367284,
"grad_norm": 0.0045166015625,
"learning_rate": 0.00012322546356442662,
"loss": 2.1528,
"step": 14050
},
{
"epoch": 0.4038364462392731,
"grad_norm": 0.39453125,
"learning_rate": 0.000122930199598441,
"loss": 2.4575,
"step": 14100
},
{
"epoch": 0.40526849037487334,
"grad_norm": 241.0,
"learning_rate": 0.00012263493563245543,
"loss": 2.475,
"step": 14150
},
{
"epoch": 0.4067005345104736,
"grad_norm": 37.5,
"learning_rate": 0.00012233967166646982,
"loss": 1.6529,
"step": 14200
},
{
"epoch": 0.4081325786460739,
"grad_norm": 272.0,
"learning_rate": 0.00012204440770048424,
"loss": 2.922,
"step": 14250
},
{
"epoch": 0.4095646227816741,
"grad_norm": 3.734375,
"learning_rate": 0.00012174914373449864,
"loss": 2.2361,
"step": 14300
},
{
"epoch": 0.4109966669172744,
"grad_norm": 338.0,
"learning_rate": 0.00012145387976851306,
"loss": 1.5299,
"step": 14350
},
{
"epoch": 0.41242871105287465,
"grad_norm": 0.043212890625,
"learning_rate": 0.00012115861580252748,
"loss": 2.5728,
"step": 14400
},
{
"epoch": 0.4138607551884749,
"grad_norm": 0.15625,
"learning_rate": 0.00012086335183654187,
"loss": 2.122,
"step": 14450
},
{
"epoch": 0.41529279932407515,
"grad_norm": 0.008056640625,
"learning_rate": 0.00012056808787055629,
"loss": 2.4863,
"step": 14500
},
{
"epoch": 0.4167248434596754,
"grad_norm": 32.5,
"learning_rate": 0.00012027282390457068,
"loss": 3.3401,
"step": 14550
},
{
"epoch": 0.4181568875952757,
"grad_norm": 0.2490234375,
"learning_rate": 0.0001199775599385851,
"loss": 2.0398,
"step": 14600
},
{
"epoch": 0.41958893173087597,
"grad_norm": 0.326171875,
"learning_rate": 0.00011968229597259951,
"loss": 2.8364,
"step": 14650
},
{
"epoch": 0.4210209758664762,
"grad_norm": 0.15625,
"learning_rate": 0.00011938703200661391,
"loss": 1.4864,
"step": 14700
},
{
"epoch": 0.42245302000207646,
"grad_norm": 80.0,
"learning_rate": 0.00011909176804062833,
"loss": 2.3397,
"step": 14750
},
{
"epoch": 0.42388506413767674,
"grad_norm": 101.5,
"learning_rate": 0.00011879650407464274,
"loss": 3.3186,
"step": 14800
},
{
"epoch": 0.42531710827327696,
"grad_norm": 2.6875,
"learning_rate": 0.00011850124010865716,
"loss": 2.7605,
"step": 14850
},
{
"epoch": 0.42674915240887723,
"grad_norm": 95.0,
"learning_rate": 0.00011820597614267155,
"loss": 1.9215,
"step": 14900
},
{
"epoch": 0.4281811965444775,
"grad_norm": 0.65625,
"learning_rate": 0.00011791071217668597,
"loss": 2.0689,
"step": 14950
},
{
"epoch": 0.4296132406800778,
"grad_norm": 0.66015625,
"learning_rate": 0.00011761544821070036,
"loss": 3.5633,
"step": 15000
},
{
"epoch": 0.4296132406800778,
"eval_accuracy": 0.942,
"eval_loss": 0.27190178632736206,
"eval_macro_f1": 0.9413541591870516,
"eval_runtime": 181.3524,
"eval_samples_per_second": 11.028,
"eval_steps_per_second": 11.028,
"step": 15000
},
{
"epoch": 0.431045284815678,
"grad_norm": 42.25,
"learning_rate": 0.00011732018424471478,
"loss": 1.8073,
"step": 15050
},
{
"epoch": 0.4324773289512783,
"grad_norm": 0.22265625,
"learning_rate": 0.0001170249202787292,
"loss": 1.777,
"step": 15100
},
{
"epoch": 0.43390937308687855,
"grad_norm": 0.11328125,
"learning_rate": 0.0001167296563127436,
"loss": 1.9598,
"step": 15150
},
{
"epoch": 0.4353414172224788,
"grad_norm": 0.095703125,
"learning_rate": 0.00011643439234675802,
"loss": 2.7789,
"step": 15200
},
{
"epoch": 0.43677346135807904,
"grad_norm": 0.8125,
"learning_rate": 0.00011613912838077241,
"loss": 2.3985,
"step": 15250
},
{
"epoch": 0.4382055054936793,
"grad_norm": 0.22265625,
"learning_rate": 0.00011584386441478683,
"loss": 1.6076,
"step": 15300
},
{
"epoch": 0.4396375496292796,
"grad_norm": 0.578125,
"learning_rate": 0.00011554860044880122,
"loss": 2.9266,
"step": 15350
},
{
"epoch": 0.4410695937648798,
"grad_norm": 0.0130615234375,
"learning_rate": 0.00011525333648281564,
"loss": 1.388,
"step": 15400
},
{
"epoch": 0.4425016379004801,
"grad_norm": 88.5,
"learning_rate": 0.00011495807251683006,
"loss": 2.6264,
"step": 15450
},
{
"epoch": 0.44393368203608036,
"grad_norm": 0.123046875,
"learning_rate": 0.00011466280855084446,
"loss": 1.9447,
"step": 15500
},
{
"epoch": 0.44536572617168063,
"grad_norm": 318.0,
"learning_rate": 0.00011436754458485888,
"loss": 1.4494,
"step": 15550
},
{
"epoch": 0.44679777030728085,
"grad_norm": 0.123046875,
"learning_rate": 0.00011407228061887327,
"loss": 2.4483,
"step": 15600
},
{
"epoch": 0.4482298144428811,
"grad_norm": 260.0,
"learning_rate": 0.00011377701665288769,
"loss": 2.8899,
"step": 15650
},
{
"epoch": 0.4496618585784814,
"grad_norm": 0.07666015625,
"learning_rate": 0.00011348175268690208,
"loss": 2.0935,
"step": 15700
},
{
"epoch": 0.4510939027140817,
"grad_norm": 0.047119140625,
"learning_rate": 0.0001131864887209165,
"loss": 3.0298,
"step": 15750
},
{
"epoch": 0.4525259468496819,
"grad_norm": 18.25,
"learning_rate": 0.00011289122475493092,
"loss": 1.8288,
"step": 15800
},
{
"epoch": 0.45395799098528217,
"grad_norm": 11.75,
"learning_rate": 0.00011259596078894533,
"loss": 2.9182,
"step": 15850
},
{
"epoch": 0.45539003512088244,
"grad_norm": 0.1416015625,
"learning_rate": 0.00011230069682295975,
"loss": 1.9045,
"step": 15900
},
{
"epoch": 0.45682207925648266,
"grad_norm": 0.1982421875,
"learning_rate": 0.00011200543285697414,
"loss": 1.7933,
"step": 15950
},
{
"epoch": 0.45825412339208293,
"grad_norm": 201.0,
"learning_rate": 0.00011171016889098856,
"loss": 2.4752,
"step": 16000
},
{
"epoch": 0.4596861675276832,
"grad_norm": 284.0,
"learning_rate": 0.00011141490492500295,
"loss": 1.7394,
"step": 16050
},
{
"epoch": 0.4611182116632835,
"grad_norm": 0.1162109375,
"learning_rate": 0.00011111964095901737,
"loss": 1.3612,
"step": 16100
},
{
"epoch": 0.4625502557988837,
"grad_norm": 0.30859375,
"learning_rate": 0.00011082437699303178,
"loss": 2.6066,
"step": 16150
},
{
"epoch": 0.463982299934484,
"grad_norm": 23.75,
"learning_rate": 0.00011052911302704618,
"loss": 2.6868,
"step": 16200
},
{
"epoch": 0.46541434407008425,
"grad_norm": 1.5546875,
"learning_rate": 0.0001102338490610606,
"loss": 2.6526,
"step": 16250
},
{
"epoch": 0.46541434407008425,
"eval_accuracy": 0.9445,
"eval_loss": 0.2859993577003479,
"eval_macro_f1": 0.9439942443541156,
"eval_runtime": 175.6054,
"eval_samples_per_second": 11.389,
"eval_steps_per_second": 11.389,
"step": 16250
},
{
"epoch": 0.4668463882056845,
"grad_norm": 0.11181640625,
"learning_rate": 0.000109938585095075,
"loss": 2.041,
"step": 16300
},
{
"epoch": 0.46827843234128474,
"grad_norm": 8.3125,
"learning_rate": 0.00010964332112908942,
"loss": 2.3374,
"step": 16350
},
{
"epoch": 0.469710476476885,
"grad_norm": 268.0,
"learning_rate": 0.00010934805716310381,
"loss": 3.461,
"step": 16400
},
{
"epoch": 0.4711425206124853,
"grad_norm": 270.0,
"learning_rate": 0.00010905279319711823,
"loss": 1.5168,
"step": 16450
},
{
"epoch": 0.4725745647480855,
"grad_norm": 1.6171875,
"learning_rate": 0.00010875752923113265,
"loss": 1.9156,
"step": 16500
},
{
"epoch": 0.4740066088836858,
"grad_norm": 3.25,
"learning_rate": 0.00010846226526514704,
"loss": 1.9988,
"step": 16550
},
{
"epoch": 0.47543865301928606,
"grad_norm": 266.0,
"learning_rate": 0.00010816700129916146,
"loss": 1.9911,
"step": 16600
},
{
"epoch": 0.47687069715488634,
"grad_norm": 0.45703125,
"learning_rate": 0.00010787173733317586,
"loss": 2.2805,
"step": 16650
},
{
"epoch": 0.47830274129048655,
"grad_norm": 0.08740234375,
"learning_rate": 0.00010757647336719028,
"loss": 2.3786,
"step": 16700
},
{
"epoch": 0.47973478542608683,
"grad_norm": 0.0263671875,
"learning_rate": 0.00010728120940120467,
"loss": 2.0964,
"step": 16750
},
{
"epoch": 0.4811668295616871,
"grad_norm": 288.0,
"learning_rate": 0.00010698594543521909,
"loss": 2.5816,
"step": 16800
},
{
"epoch": 0.4825988736972873,
"grad_norm": 0.2138671875,
"learning_rate": 0.00010669068146923348,
"loss": 1.2136,
"step": 16850
},
{
"epoch": 0.4840309178328876,
"grad_norm": 1.1796875,
"learning_rate": 0.0001063954175032479,
"loss": 2.2321,
"step": 16900
},
{
"epoch": 0.48546296196848787,
"grad_norm": 0.259765625,
"learning_rate": 0.00010610015353726232,
"loss": 2.7485,
"step": 16950
},
{
"epoch": 0.48689500610408815,
"grad_norm": 75.5,
"learning_rate": 0.00010580488957127673,
"loss": 3.0909,
"step": 17000
},
{
"epoch": 0.48832705023968836,
"grad_norm": 86.0,
"learning_rate": 0.00010550962560529115,
"loss": 2.0178,
"step": 17050
},
{
"epoch": 0.48975909437528864,
"grad_norm": 0.98046875,
"learning_rate": 0.00010521436163930554,
"loss": 2.0193,
"step": 17100
},
{
"epoch": 0.4911911385108889,
"grad_norm": 68.0,
"learning_rate": 0.00010491909767331996,
"loss": 2.178,
"step": 17150
},
{
"epoch": 0.4926231826464892,
"grad_norm": 0.375,
"learning_rate": 0.00010462383370733435,
"loss": 2.4366,
"step": 17200
},
{
"epoch": 0.4940552267820894,
"grad_norm": 0.322265625,
"learning_rate": 0.00010432856974134877,
"loss": 2.7989,
"step": 17250
},
{
"epoch": 0.4954872709176897,
"grad_norm": 0.1513671875,
"learning_rate": 0.00010403330577536318,
"loss": 1.8681,
"step": 17300
},
{
"epoch": 0.49691931505328996,
"grad_norm": 0.08935546875,
"learning_rate": 0.00010373804180937759,
"loss": 1.9697,
"step": 17350
},
{
"epoch": 0.4983513591888902,
"grad_norm": 0.1328125,
"learning_rate": 0.00010344277784339201,
"loss": 1.7415,
"step": 17400
},
{
"epoch": 0.49978340332449045,
"grad_norm": 0.65234375,
"learning_rate": 0.0001031475138774064,
"loss": 1.6849,
"step": 17450
},
{
"epoch": 0.5012154474600907,
"grad_norm": 86.5,
"learning_rate": 0.00010285224991142082,
"loss": 2.1474,
"step": 17500
},
{
"epoch": 0.5012154474600907,
"eval_accuracy": 0.947,
"eval_loss": 0.2907390892505646,
"eval_macro_f1": 0.9463529866080697,
"eval_runtime": 172.6878,
"eval_samples_per_second": 11.582,
"eval_steps_per_second": 11.582,
"step": 17500
},
{
"epoch": 0.502647491595691,
"grad_norm": 472.0,
"learning_rate": 0.00010255698594543521,
"loss": 1.9173,
"step": 17550
},
{
"epoch": 0.5040795357312913,
"grad_norm": 1.3828125,
"learning_rate": 0.00010226172197944963,
"loss": 3.1869,
"step": 17600
},
{
"epoch": 0.5055115798668915,
"grad_norm": 145.0,
"learning_rate": 0.00010196645801346405,
"loss": 2.5482,
"step": 17650
},
{
"epoch": 0.5069436240024917,
"grad_norm": 0.58203125,
"learning_rate": 0.00010167119404747844,
"loss": 2.8567,
"step": 17700
},
{
"epoch": 0.508375668138092,
"grad_norm": 0.0888671875,
"learning_rate": 0.00010137593008149286,
"loss": 1.7268,
"step": 17750
},
{
"epoch": 0.5098077122736923,
"grad_norm": 286.0,
"learning_rate": 0.00010108066611550726,
"loss": 2.2268,
"step": 17800
},
{
"epoch": 0.5112397564092925,
"grad_norm": 169.0,
"learning_rate": 0.00010078540214952168,
"loss": 1.8245,
"step": 17850
},
{
"epoch": 0.5126718005448928,
"grad_norm": 0.1923828125,
"learning_rate": 0.00010049013818353607,
"loss": 2.3801,
"step": 17900
},
{
"epoch": 0.5141038446804931,
"grad_norm": 77.0,
"learning_rate": 0.00010019487421755049,
"loss": 2.3412,
"step": 17950
},
{
"epoch": 0.5155358888160934,
"grad_norm": 328.0,
"learning_rate": 9.98996102515649e-05,
"loss": 3.1564,
"step": 18000
},
{
"epoch": 0.5169679329516935,
"grad_norm": 0.57421875,
"learning_rate": 9.96043462855793e-05,
"loss": 2.0409,
"step": 18050
},
{
"epoch": 0.5183999770872938,
"grad_norm": 0.8828125,
"learning_rate": 9.930908231959372e-05,
"loss": 1.8093,
"step": 18100
},
{
"epoch": 0.5198320212228941,
"grad_norm": 0.263671875,
"learning_rate": 9.901381835360814e-05,
"loss": 2.1228,
"step": 18150
},
{
"epoch": 0.5212640653584943,
"grad_norm": 13.625,
"learning_rate": 9.871855438762255e-05,
"loss": 1.6072,
"step": 18200
},
{
"epoch": 0.5226961094940946,
"grad_norm": 176.0,
"learning_rate": 9.842329042163695e-05,
"loss": 2.1088,
"step": 18250
},
{
"epoch": 0.5241281536296949,
"grad_norm": 0.09228515625,
"learning_rate": 9.812802645565136e-05,
"loss": 2.2985,
"step": 18300
},
{
"epoch": 0.5255601977652952,
"grad_norm": 5.9375,
"learning_rate": 9.783276248966576e-05,
"loss": 2.8687,
"step": 18350
},
{
"epoch": 0.5269922419008953,
"grad_norm": 0.1142578125,
"learning_rate": 9.753749852368017e-05,
"loss": 1.9855,
"step": 18400
},
{
"epoch": 0.5284242860364956,
"grad_norm": 1.03125,
"learning_rate": 9.724223455769457e-05,
"loss": 2.5827,
"step": 18450
},
{
"epoch": 0.5298563301720959,
"grad_norm": 270.0,
"learning_rate": 9.694697059170899e-05,
"loss": 1.9905,
"step": 18500
},
{
"epoch": 0.5312883743076962,
"grad_norm": 0.2392578125,
"learning_rate": 9.665170662572341e-05,
"loss": 1.9516,
"step": 18550
},
{
"epoch": 0.5327204184432964,
"grad_norm": 116.5,
"learning_rate": 9.635644265973781e-05,
"loss": 1.7887,
"step": 18600
},
{
"epoch": 0.5341524625788967,
"grad_norm": 0.0306396484375,
"learning_rate": 9.606117869375222e-05,
"loss": 1.8686,
"step": 18650
},
{
"epoch": 0.535584506714497,
"grad_norm": 0.7109375,
"learning_rate": 9.576591472776662e-05,
"loss": 1.7828,
"step": 18700
},
{
"epoch": 0.5370165508500973,
"grad_norm": 0.92578125,
"learning_rate": 9.547065076178103e-05,
"loss": 1.8761,
"step": 18750
},
{
"epoch": 0.5370165508500973,
"eval_accuracy": 0.9495,
"eval_loss": 0.26102420687675476,
"eval_macro_f1": 0.9488861373782008,
"eval_runtime": 172.7705,
"eval_samples_per_second": 11.576,
"eval_steps_per_second": 11.576,
"step": 18750
},
{
"epoch": 0.5384485949856974,
"grad_norm": 0.1416015625,
"learning_rate": 9.517538679579543e-05,
"loss": 2.3765,
"step": 18800
},
{
"epoch": 0.5398806391212977,
"grad_norm": 258.0,
"learning_rate": 9.488012282980985e-05,
"loss": 1.5944,
"step": 18850
},
{
"epoch": 0.541312683256898,
"grad_norm": 106.0,
"learning_rate": 9.458485886382427e-05,
"loss": 2.4606,
"step": 18900
},
{
"epoch": 0.5427447273924982,
"grad_norm": 92.5,
"learning_rate": 9.428959489783868e-05,
"loss": 2.6863,
"step": 18950
},
{
"epoch": 0.5441767715280985,
"grad_norm": 207.0,
"learning_rate": 9.399433093185308e-05,
"loss": 1.9462,
"step": 19000
},
{
"epoch": 0.5456088156636988,
"grad_norm": 82.5,
"learning_rate": 9.369906696586749e-05,
"loss": 2.8243,
"step": 19050
},
{
"epoch": 0.5470408597992991,
"grad_norm": 0.1572265625,
"learning_rate": 9.340380299988189e-05,
"loss": 1.8158,
"step": 19100
},
{
"epoch": 0.5484729039348992,
"grad_norm": 0.4609375,
"learning_rate": 9.31085390338963e-05,
"loss": 2.7108,
"step": 19150
},
{
"epoch": 0.5499049480704995,
"grad_norm": 0.62109375,
"learning_rate": 9.281327506791072e-05,
"loss": 1.9835,
"step": 19200
},
{
"epoch": 0.5513369922060998,
"grad_norm": 0.039794921875,
"learning_rate": 9.251801110192512e-05,
"loss": 2.0608,
"step": 19250
},
{
"epoch": 0.5527690363417,
"grad_norm": 1.265625,
"learning_rate": 9.222274713593954e-05,
"loss": 1.9135,
"step": 19300
},
{
"epoch": 0.5542010804773003,
"grad_norm": 0.030517578125,
"learning_rate": 9.192748316995395e-05,
"loss": 2.5224,
"step": 19350
},
{
"epoch": 0.5556331246129006,
"grad_norm": 332.0,
"learning_rate": 9.163221920396835e-05,
"loss": 3.2123,
"step": 19400
},
{
"epoch": 0.5570651687485009,
"grad_norm": 0.046142578125,
"learning_rate": 9.133695523798276e-05,
"loss": 1.9807,
"step": 19450
},
{
"epoch": 0.558497212884101,
"grad_norm": 0.036376953125,
"learning_rate": 9.104169127199716e-05,
"loss": 2.8211,
"step": 19500
},
{
"epoch": 0.5599292570197013,
"grad_norm": 0.2412109375,
"learning_rate": 9.074642730601158e-05,
"loss": 2.7913,
"step": 19550
},
{
"epoch": 0.5613613011553016,
"grad_norm": 0.087890625,
"learning_rate": 9.045116334002599e-05,
"loss": 1.5528,
"step": 19600
},
{
"epoch": 0.5627933452909019,
"grad_norm": 0.21484375,
"learning_rate": 9.01558993740404e-05,
"loss": 2.3024,
"step": 19650
},
{
"epoch": 0.5642253894265021,
"grad_norm": 0.271484375,
"learning_rate": 8.986063540805481e-05,
"loss": 1.305,
"step": 19700
},
{
"epoch": 0.5656574335621024,
"grad_norm": 0.373046875,
"learning_rate": 8.956537144206921e-05,
"loss": 2.1656,
"step": 19750
},
{
"epoch": 0.5670894776977027,
"grad_norm": 84.5,
"learning_rate": 8.927010747608362e-05,
"loss": 1.6671,
"step": 19800
},
{
"epoch": 0.568521521833303,
"grad_norm": 112.0,
"learning_rate": 8.897484351009802e-05,
"loss": 2.4715,
"step": 19850
},
{
"epoch": 0.5699535659689031,
"grad_norm": 0.130859375,
"learning_rate": 8.867957954411244e-05,
"loss": 1.7577,
"step": 19900
},
{
"epoch": 0.5713856101045034,
"grad_norm": 0.1357421875,
"learning_rate": 8.838431557812685e-05,
"loss": 1.6778,
"step": 19950
},
{
"epoch": 0.5728176542401037,
"grad_norm": 0.07080078125,
"learning_rate": 8.808905161214125e-05,
"loss": 1.4789,
"step": 20000
},
{
"epoch": 0.5728176542401037,
"eval_accuracy": 0.949,
"eval_loss": 0.2858292758464813,
"eval_macro_f1": 0.9484543460104051,
"eval_runtime": 172.7421,
"eval_samples_per_second": 11.578,
"eval_steps_per_second": 11.578,
"step": 20000
},
{
"epoch": 0.5742496983757039,
"grad_norm": 0.140625,
"learning_rate": 8.779378764615567e-05,
"loss": 1.6797,
"step": 20050
},
{
"epoch": 0.5756817425113042,
"grad_norm": 0.115234375,
"learning_rate": 8.749852368017008e-05,
"loss": 1.5026,
"step": 20100
},
{
"epoch": 0.5771137866469045,
"grad_norm": 0.02880859375,
"learning_rate": 8.720325971418448e-05,
"loss": 2.1316,
"step": 20150
},
{
"epoch": 0.5785458307825048,
"grad_norm": 0.08544921875,
"learning_rate": 8.690799574819889e-05,
"loss": 1.7517,
"step": 20200
},
{
"epoch": 0.5799778749181049,
"grad_norm": 82.0,
"learning_rate": 8.661273178221331e-05,
"loss": 2.4167,
"step": 20250
},
{
"epoch": 0.5814099190537052,
"grad_norm": 0.138671875,
"learning_rate": 8.631746781622771e-05,
"loss": 1.8565,
"step": 20300
},
{
"epoch": 0.5828419631893055,
"grad_norm": 34.25,
"learning_rate": 8.602220385024212e-05,
"loss": 1.9747,
"step": 20350
},
{
"epoch": 0.5842740073249058,
"grad_norm": 91.0,
"learning_rate": 8.572693988425654e-05,
"loss": 2.3284,
"step": 20400
},
{
"epoch": 0.585706051460506,
"grad_norm": 0.80859375,
"learning_rate": 8.543167591827094e-05,
"loss": 2.1788,
"step": 20450
},
{
"epoch": 0.5871380955961063,
"grad_norm": 446.0,
"learning_rate": 8.513641195228535e-05,
"loss": 1.6187,
"step": 20500
},
{
"epoch": 0.5885701397317066,
"grad_norm": 43.75,
"learning_rate": 8.484114798629975e-05,
"loss": 1.6338,
"step": 20550
},
{
"epoch": 0.5900021838673067,
"grad_norm": 0.91015625,
"learning_rate": 8.454588402031417e-05,
"loss": 2.424,
"step": 20600
},
{
"epoch": 0.591434228002907,
"grad_norm": 0.44921875,
"learning_rate": 8.425062005432858e-05,
"loss": 2.3043,
"step": 20650
},
{
"epoch": 0.5928662721385073,
"grad_norm": 268.0,
"learning_rate": 8.395535608834298e-05,
"loss": 2.5707,
"step": 20700
},
{
"epoch": 0.5942983162741076,
"grad_norm": 4.125,
"learning_rate": 8.366009212235739e-05,
"loss": 1.9577,
"step": 20750
},
{
"epoch": 0.5957303604097078,
"grad_norm": 5.0,
"learning_rate": 8.33648281563718e-05,
"loss": 0.7482,
"step": 20800
},
{
"epoch": 0.5971624045453081,
"grad_norm": 0.04052734375,
"learning_rate": 8.306956419038621e-05,
"loss": 1.5055,
"step": 20850
},
{
"epoch": 0.5985944486809084,
"grad_norm": 1.109375,
"learning_rate": 8.277430022440061e-05,
"loss": 3.3671,
"step": 20900
},
{
"epoch": 0.6000264928165085,
"grad_norm": 0.162109375,
"learning_rate": 8.247903625841503e-05,
"loss": 2.0574,
"step": 20950
},
{
"epoch": 0.6014585369521088,
"grad_norm": 0.03173828125,
"learning_rate": 8.218377229242944e-05,
"loss": 2.1942,
"step": 21000
},
{
"epoch": 0.6028905810877091,
"grad_norm": 0.251953125,
"learning_rate": 8.188850832644384e-05,
"loss": 1.6319,
"step": 21050
},
{
"epoch": 0.6043226252233094,
"grad_norm": 86.5,
"learning_rate": 8.159324436045825e-05,
"loss": 2.1558,
"step": 21100
},
{
"epoch": 0.6057546693589096,
"grad_norm": 2.984375,
"learning_rate": 8.129798039447267e-05,
"loss": 2.2353,
"step": 21150
},
{
"epoch": 0.6071867134945099,
"grad_norm": 0.0908203125,
"learning_rate": 8.100271642848707e-05,
"loss": 1.4975,
"step": 21200
},
{
"epoch": 0.6086187576301102,
"grad_norm": 98.0,
"learning_rate": 8.070745246250148e-05,
"loss": 2.5975,
"step": 21250
},
{
"epoch": 0.6086187576301102,
"eval_accuracy": 0.948,
"eval_loss": 0.2741381525993347,
"eval_macro_f1": 0.9473973559594594,
"eval_runtime": 172.6111,
"eval_samples_per_second": 11.587,
"eval_steps_per_second": 11.587,
"step": 21250
},
{
"epoch": 0.6100508017657105,
"grad_norm": 328.0,
"learning_rate": 8.04121884965159e-05,
"loss": 2.4534,
"step": 21300
},
{
"epoch": 0.6114828459013106,
"grad_norm": 11.125,
"learning_rate": 8.01169245305303e-05,
"loss": 2.1319,
"step": 21350
},
{
"epoch": 0.6129148900369109,
"grad_norm": 0.08642578125,
"learning_rate": 7.982166056454471e-05,
"loss": 2.4199,
"step": 21400
},
{
"epoch": 0.6143469341725112,
"grad_norm": 0.18359375,
"learning_rate": 7.952639659855911e-05,
"loss": 1.7527,
"step": 21450
},
{
"epoch": 0.6157789783081115,
"grad_norm": 0.458984375,
"learning_rate": 7.923113263257352e-05,
"loss": 2.4992,
"step": 21500
},
{
"epoch": 0.6172110224437117,
"grad_norm": 0.671875,
"learning_rate": 7.893586866658794e-05,
"loss": 2.5082,
"step": 21550
},
{
"epoch": 0.618643066579312,
"grad_norm": 272.0,
"learning_rate": 7.864060470060234e-05,
"loss": 2.2187,
"step": 21600
},
{
"epoch": 0.6200751107149123,
"grad_norm": 310.0,
"learning_rate": 7.834534073461675e-05,
"loss": 3.006,
"step": 21650
},
{
"epoch": 0.6215071548505124,
"grad_norm": 536.0,
"learning_rate": 7.805007676863117e-05,
"loss": 2.4535,
"step": 21700
},
{
"epoch": 0.6229391989861127,
"grad_norm": 408.0,
"learning_rate": 7.775481280264557e-05,
"loss": 2.376,
"step": 21750
},
{
"epoch": 0.624371243121713,
"grad_norm": 0.123046875,
"learning_rate": 7.745954883665998e-05,
"loss": 1.3044,
"step": 21800
},
{
"epoch": 0.6258032872573133,
"grad_norm": 0.70703125,
"learning_rate": 7.716428487067438e-05,
"loss": 1.9046,
"step": 21850
},
{
"epoch": 0.6272353313929135,
"grad_norm": 0.189453125,
"learning_rate": 7.68690209046888e-05,
"loss": 1.8825,
"step": 21900
},
{
"epoch": 0.6286673755285138,
"grad_norm": 0.251953125,
"learning_rate": 7.65737569387032e-05,
"loss": 2.353,
"step": 21950
},
{
"epoch": 0.6300994196641141,
"grad_norm": 0.0301513671875,
"learning_rate": 7.627849297271761e-05,
"loss": 1.7222,
"step": 22000
},
{
"epoch": 0.6315314637997143,
"grad_norm": 0.126953125,
"learning_rate": 7.598322900673203e-05,
"loss": 2.4583,
"step": 22050
},
{
"epoch": 0.6329635079353145,
"grad_norm": 1.0234375,
"learning_rate": 7.568796504074643e-05,
"loss": 1.9643,
"step": 22100
},
{
"epoch": 0.6343955520709148,
"grad_norm": 70.5,
"learning_rate": 7.539270107476084e-05,
"loss": 1.6712,
"step": 22150
},
{
"epoch": 0.6358275962065151,
"grad_norm": 8.0,
"learning_rate": 7.509743710877524e-05,
"loss": 2.1964,
"step": 22200
},
{
"epoch": 0.6372596403421154,
"grad_norm": 11.625,
"learning_rate": 7.480217314278965e-05,
"loss": 2.0319,
"step": 22250
},
{
"epoch": 0.6386916844777156,
"grad_norm": 0.09033203125,
"learning_rate": 7.450690917680407e-05,
"loss": 3.1062,
"step": 22300
},
{
"epoch": 0.6401237286133159,
"grad_norm": 490.0,
"learning_rate": 7.421164521081847e-05,
"loss": 2.028,
"step": 22350
},
{
"epoch": 0.6415557727489162,
"grad_norm": 688.0,
"learning_rate": 7.391638124483289e-05,
"loss": 1.6743,
"step": 22400
},
{
"epoch": 0.6429878168845163,
"grad_norm": 0.2578125,
"learning_rate": 7.36211172788473e-05,
"loss": 1.3926,
"step": 22450
},
{
"epoch": 0.6444198610201166,
"grad_norm": 278.0,
"learning_rate": 7.33258533128617e-05,
"loss": 2.073,
"step": 22500
},
{
"epoch": 0.6444198610201166,
"eval_accuracy": 0.9495,
"eval_loss": 0.2617259919643402,
"eval_macro_f1": 0.9489699460568645,
"eval_runtime": 172.6662,
"eval_samples_per_second": 11.583,
"eval_steps_per_second": 11.583,
"step": 22500
},
{
"epoch": 0.6458519051557169,
"grad_norm": 0.07373046875,
"learning_rate": 7.303058934687611e-05,
"loss": 2.099,
"step": 22550
},
{
"epoch": 0.6472839492913172,
"grad_norm": 0.5859375,
"learning_rate": 7.273532538089051e-05,
"loss": 2.2826,
"step": 22600
},
{
"epoch": 0.6487159934269174,
"grad_norm": 79.5,
"learning_rate": 7.244006141490493e-05,
"loss": 1.377,
"step": 22650
},
{
"epoch": 0.6501480375625177,
"grad_norm": 2.265625,
"learning_rate": 7.214479744891934e-05,
"loss": 1.9826,
"step": 22700
},
{
"epoch": 0.651580081698118,
"grad_norm": 0.37109375,
"learning_rate": 7.184953348293376e-05,
"loss": 2.2446,
"step": 22750
},
{
"epoch": 0.6530121258337181,
"grad_norm": 4.25,
"learning_rate": 7.155426951694816e-05,
"loss": 2.0254,
"step": 22800
},
{
"epoch": 0.6544441699693184,
"grad_norm": 0.03564453125,
"learning_rate": 7.125900555096257e-05,
"loss": 2.0871,
"step": 22850
},
{
"epoch": 0.6558762141049187,
"grad_norm": 0.390625,
"learning_rate": 7.096374158497697e-05,
"loss": 2.9276,
"step": 22900
},
{
"epoch": 0.657308258240519,
"grad_norm": 0.384765625,
"learning_rate": 7.066847761899138e-05,
"loss": 1.0622,
"step": 22950
},
{
"epoch": 0.6587403023761192,
"grad_norm": 4576.0,
"learning_rate": 7.037321365300578e-05,
"loss": 3.0808,
"step": 23000
},
{
"epoch": 0.6601723465117195,
"grad_norm": 372.0,
"learning_rate": 7.00779496870202e-05,
"loss": 1.8306,
"step": 23050
},
{
"epoch": 0.6616043906473198,
"grad_norm": 0.208984375,
"learning_rate": 6.978268572103462e-05,
"loss": 2.1282,
"step": 23100
},
{
"epoch": 0.66303643478292,
"grad_norm": 5.65625,
"learning_rate": 6.948742175504902e-05,
"loss": 1.8392,
"step": 23150
},
{
"epoch": 0.6644684789185202,
"grad_norm": 0.24609375,
"learning_rate": 6.919215778906343e-05,
"loss": 2.594,
"step": 23200
},
{
"epoch": 0.6659005230541205,
"grad_norm": 0.123046875,
"learning_rate": 6.889689382307783e-05,
"loss": 2.4234,
"step": 23250
},
{
"epoch": 0.6673325671897208,
"grad_norm": 268.0,
"learning_rate": 6.860162985709224e-05,
"loss": 2.3424,
"step": 23300
},
{
"epoch": 0.6687646113253211,
"grad_norm": 0.427734375,
"learning_rate": 6.830636589110664e-05,
"loss": 2.3216,
"step": 23350
},
{
"epoch": 0.6701966554609213,
"grad_norm": 0.9296875,
"learning_rate": 6.801110192512106e-05,
"loss": 2.4566,
"step": 23400
},
{
"epoch": 0.6716286995965216,
"grad_norm": 88.5,
"learning_rate": 6.771583795913548e-05,
"loss": 1.3767,
"step": 23450
},
{
"epoch": 0.6730607437321218,
"grad_norm": 488.0,
"learning_rate": 6.742057399314989e-05,
"loss": 2.343,
"step": 23500
},
{
"epoch": 0.674492787867722,
"grad_norm": 296.0,
"learning_rate": 6.712531002716429e-05,
"loss": 1.4841,
"step": 23550
},
{
"epoch": 0.6759248320033223,
"grad_norm": 218.0,
"learning_rate": 6.68300460611787e-05,
"loss": 2.4037,
"step": 23600
},
{
"epoch": 0.6773568761389226,
"grad_norm": 0.466796875,
"learning_rate": 6.65347820951931e-05,
"loss": 1.4982,
"step": 23650
},
{
"epoch": 0.6787889202745229,
"grad_norm": 49.25,
"learning_rate": 6.623951812920751e-05,
"loss": 2.2085,
"step": 23700
},
{
"epoch": 0.6802209644101231,
"grad_norm": 0.30078125,
"learning_rate": 6.594425416322191e-05,
"loss": 1.7055,
"step": 23750
},
{
"epoch": 0.6802209644101231,
"eval_accuracy": 0.9505,
"eval_loss": 0.26270824670791626,
"eval_macro_f1": 0.9498080478089564,
"eval_runtime": 172.6664,
"eval_samples_per_second": 11.583,
"eval_steps_per_second": 11.583,
"step": 23750
},
{
"epoch": 0.6816530085457234,
"grad_norm": 95.5,
"learning_rate": 6.564899019723633e-05,
"loss": 2.5024,
"step": 23800
},
{
"epoch": 0.6830850526813237,
"grad_norm": 0.30078125,
"learning_rate": 6.535372623125075e-05,
"loss": 2.2518,
"step": 23850
},
{
"epoch": 0.6845170968169239,
"grad_norm": 116.5,
"learning_rate": 6.505846226526516e-05,
"loss": 2.0539,
"step": 23900
},
{
"epoch": 0.6859491409525241,
"grad_norm": 264.0,
"learning_rate": 6.476319829927956e-05,
"loss": 2.5857,
"step": 23950
},
{
"epoch": 0.6873811850881244,
"grad_norm": 302.0,
"learning_rate": 6.446793433329397e-05,
"loss": 2.1408,
"step": 24000
},
{
"epoch": 0.6888132292237247,
"grad_norm": 0.5390625,
"learning_rate": 6.417267036730837e-05,
"loss": 1.9618,
"step": 24050
},
{
"epoch": 0.690245273359325,
"grad_norm": 164.0,
"learning_rate": 6.387740640132278e-05,
"loss": 2.0112,
"step": 24100
},
{
"epoch": 0.6916773174949252,
"grad_norm": 14.0625,
"learning_rate": 6.35821424353372e-05,
"loss": 2.7256,
"step": 24150
},
{
"epoch": 0.6931093616305255,
"grad_norm": 164.0,
"learning_rate": 6.328687846935161e-05,
"loss": 0.8362,
"step": 24200
},
{
"epoch": 0.6945414057661257,
"grad_norm": 0.271484375,
"learning_rate": 6.299161450336602e-05,
"loss": 2.2874,
"step": 24250
},
{
"epoch": 0.6959734499017259,
"grad_norm": 110.5,
"learning_rate": 6.269635053738042e-05,
"loss": 1.5674,
"step": 24300
},
{
"epoch": 0.6974054940373262,
"grad_norm": 0.1630859375,
"learning_rate": 6.240108657139483e-05,
"loss": 2.5817,
"step": 24350
},
{
"epoch": 0.6988375381729265,
"grad_norm": 0.99609375,
"learning_rate": 6.210582260540923e-05,
"loss": 1.2537,
"step": 24400
},
{
"epoch": 0.7002695823085268,
"grad_norm": 266.0,
"learning_rate": 6.181055863942364e-05,
"loss": 2.4499,
"step": 24450
},
{
"epoch": 0.701701626444127,
"grad_norm": 1.59375,
"learning_rate": 6.151529467343806e-05,
"loss": 2.8047,
"step": 24500
},
{
"epoch": 0.7031336705797273,
"grad_norm": 0.08447265625,
"learning_rate": 6.122003070745246e-05,
"loss": 1.6917,
"step": 24550
},
{
"epoch": 0.7045657147153275,
"grad_norm": 296.0,
"learning_rate": 6.0924766741466875e-05,
"loss": 2.2486,
"step": 24600
},
{
"epoch": 0.7059977588509277,
"grad_norm": 183.0,
"learning_rate": 6.062950277548128e-05,
"loss": 2.5183,
"step": 24650
},
{
"epoch": 0.707429802986528,
"grad_norm": 0.123046875,
"learning_rate": 6.033423880949569e-05,
"loss": 2.2984,
"step": 24700
},
{
"epoch": 0.7088618471221283,
"grad_norm": 0.37890625,
"learning_rate": 6.00389748435101e-05,
"loss": 2.4598,
"step": 24750
},
{
"epoch": 0.7102938912577286,
"grad_norm": 6.25,
"learning_rate": 5.97437108775245e-05,
"loss": 2.0554,
"step": 24800
},
{
"epoch": 0.7117259353933288,
"grad_norm": 1.5234375,
"learning_rate": 5.9448446911538915e-05,
"loss": 1.3688,
"step": 24850
},
{
"epoch": 0.7131579795289291,
"grad_norm": 83.5,
"learning_rate": 5.9153182945553334e-05,
"loss": 2.6434,
"step": 24900
},
{
"epoch": 0.7145900236645294,
"grad_norm": 0.8046875,
"learning_rate": 5.885791897956774e-05,
"loss": 1.1703,
"step": 24950
},
{
"epoch": 0.7160220678001296,
"grad_norm": 0.76953125,
"learning_rate": 5.8562655013582144e-05,
"loss": 1.7433,
"step": 25000
},
{
"epoch": 0.7160220678001296,
"eval_accuracy": 0.9475,
"eval_loss": 0.2805185317993164,
"eval_macro_f1": 0.9469725724830536,
"eval_runtime": 172.6365,
"eval_samples_per_second": 11.585,
"eval_steps_per_second": 11.585,
"step": 25000
},
{
"epoch": 0.7174541119357298,
"grad_norm": 268.0,
"learning_rate": 5.8267391047596556e-05,
"loss": 2.7963,
"step": 25050
},
{
"epoch": 0.7188861560713301,
"grad_norm": 294.0,
"learning_rate": 5.797212708161096e-05,
"loss": 2.3253,
"step": 25100
},
{
"epoch": 0.7203182002069304,
"grad_norm": 0.11181640625,
"learning_rate": 5.7676863115625366e-05,
"loss": 1.0165,
"step": 25150
},
{
"epoch": 0.7217502443425307,
"grad_norm": 756.0,
"learning_rate": 5.738159914963978e-05,
"loss": 1.4844,
"step": 25200
},
{
"epoch": 0.7231822884781309,
"grad_norm": 0.10107421875,
"learning_rate": 5.70863351836542e-05,
"loss": 2.7171,
"step": 25250
},
{
"epoch": 0.7246143326137312,
"grad_norm": 336.0,
"learning_rate": 5.67910712176686e-05,
"loss": 3.1605,
"step": 25300
},
{
"epoch": 0.7260463767493314,
"grad_norm": 177.0,
"learning_rate": 5.649580725168301e-05,
"loss": 1.9816,
"step": 25350
},
{
"epoch": 0.7274784208849316,
"grad_norm": 1.78125,
"learning_rate": 5.620054328569741e-05,
"loss": 1.8129,
"step": 25400
},
{
"epoch": 0.7289104650205319,
"grad_norm": 0.040771484375,
"learning_rate": 5.5905279319711824e-05,
"loss": 1.3484,
"step": 25450
},
{
"epoch": 0.7303425091561322,
"grad_norm": 8.375,
"learning_rate": 5.561001535372623e-05,
"loss": 2.1354,
"step": 25500
},
{
"epoch": 0.7317745532917325,
"grad_norm": 0.0308837890625,
"learning_rate": 5.5314751387740635e-05,
"loss": 1.747,
"step": 25550
},
{
"epoch": 0.7332065974273327,
"grad_norm": 808.0,
"learning_rate": 5.5019487421755053e-05,
"loss": 2.6803,
"step": 25600
},
{
"epoch": 0.734638641562933,
"grad_norm": 0.96484375,
"learning_rate": 5.4724223455769465e-05,
"loss": 2.2422,
"step": 25650
},
{
"epoch": 0.7360706856985332,
"grad_norm": 10.0,
"learning_rate": 5.442895948978387e-05,
"loss": 2.0731,
"step": 25700
},
{
"epoch": 0.7375027298341335,
"grad_norm": 0.27734375,
"learning_rate": 5.4133695523798276e-05,
"loss": 2.9622,
"step": 25750
},
{
"epoch": 0.7389347739697337,
"grad_norm": 200.0,
"learning_rate": 5.383843155781269e-05,
"loss": 2.179,
"step": 25800
},
{
"epoch": 0.740366818105334,
"grad_norm": 118.5,
"learning_rate": 5.354316759182709e-05,
"loss": 2.4152,
"step": 25850
},
{
"epoch": 0.7417988622409343,
"grad_norm": 0.2470703125,
"learning_rate": 5.32479036258415e-05,
"loss": 1.4274,
"step": 25900
},
{
"epoch": 0.7432309063765346,
"grad_norm": 6.71875,
"learning_rate": 5.295263965985592e-05,
"loss": 2.0263,
"step": 25950
},
{
"epoch": 0.7446629505121348,
"grad_norm": 0.28125,
"learning_rate": 5.265737569387033e-05,
"loss": 1.8231,
"step": 26000
},
{
"epoch": 0.746094994647735,
"grad_norm": 0.8046875,
"learning_rate": 5.2362111727884734e-05,
"loss": 1.7974,
"step": 26050
},
{
"epoch": 0.7475270387833353,
"grad_norm": 102.5,
"learning_rate": 5.206684776189914e-05,
"loss": 2.5667,
"step": 26100
},
{
"epoch": 0.7489590829189355,
"grad_norm": 380.0,
"learning_rate": 5.1771583795913544e-05,
"loss": 1.8334,
"step": 26150
},
{
"epoch": 0.7503911270545358,
"grad_norm": 0.74609375,
"learning_rate": 5.1476319829927956e-05,
"loss": 1.2929,
"step": 26200
},
{
"epoch": 0.7518231711901361,
"grad_norm": 7.3125,
"learning_rate": 5.118105586394236e-05,
"loss": 2.2943,
"step": 26250
},
{
"epoch": 0.7518231711901361,
"eval_accuracy": 0.951,
"eval_loss": 0.2633407413959503,
"eval_macro_f1": 0.9502699810655684,
"eval_runtime": 172.7789,
"eval_samples_per_second": 11.575,
"eval_steps_per_second": 11.575,
"step": 26250
}
],
"logging_steps": 50,
"max_steps": 34916,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1250,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.1179332952064e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}