{ "best_global_step": 26250, "best_metric": 0.9502699810655684, "best_model_checkpoint": "D:\\Task_design\\Topic\\strategy_train\\outputs\\qwen7b-lora-topic_strategy\\checkpoint-26250", "epoch": 0.7518231711901361, "eval_steps": 1250, "global_step": 26250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014320441356002593, "grad_norm": 608.0, "learning_rate": 9.351145038167939e-06, "loss": 28.2021, "step": 50 }, { "epoch": 0.0028640882712005185, "grad_norm": 326.0, "learning_rate": 1.8893129770992367e-05, "loss": 13.4193, "step": 100 }, { "epoch": 0.004296132406800777, "grad_norm": 600.0, "learning_rate": 2.8435114503816796e-05, "loss": 8.5573, "step": 150 }, { "epoch": 0.005728176542401037, "grad_norm": 146.0, "learning_rate": 3.797709923664122e-05, "loss": 4.1909, "step": 200 }, { "epoch": 0.007160220678001296, "grad_norm": 372.0, "learning_rate": 4.751908396946565e-05, "loss": 3.6133, "step": 250 }, { "epoch": 0.008592264813601555, "grad_norm": 36.75, "learning_rate": 5.7061068702290074e-05, "loss": 2.8409, "step": 300 }, { "epoch": 0.010024308949201815, "grad_norm": 210.0, "learning_rate": 6.66030534351145e-05, "loss": 2.9306, "step": 350 }, { "epoch": 0.011456353084802074, "grad_norm": 0.00037384033203125, "learning_rate": 7.614503816793893e-05, "loss": 4.0291, "step": 400 }, { "epoch": 0.012888397220402333, "grad_norm": 24.875, "learning_rate": 8.568702290076335e-05, "loss": 4.0621, "step": 450 }, { "epoch": 0.014320441356002592, "grad_norm": 378.0, "learning_rate": 9.522900763358779e-05, "loss": 5.0668, "step": 500 }, { "epoch": 0.015752485491602852, "grad_norm": 20.5, "learning_rate": 0.00010477099236641222, "loss": 3.8491, "step": 550 }, { "epoch": 0.01718452962720311, "grad_norm": 478.0, "learning_rate": 0.00011431297709923666, "loss": 2.9158, "step": 600 }, { "epoch": 0.01861657376280337, "grad_norm": 53.5, "learning_rate": 0.00012385496183206106, "loss": 4.3353, "step": 650 }, { "epoch": 0.02004861789840363, "grad_norm": 0.0010986328125, "learning_rate": 0.0001333969465648855, "loss": 4.2307, "step": 700 }, { "epoch": 0.021480662034003888, "grad_norm": 44.75, "learning_rate": 0.0001429389312977099, "loss": 3.1633, "step": 750 }, { "epoch": 0.022912706169604148, "grad_norm": 396.0, "learning_rate": 0.00015248091603053436, "loss": 4.8827, "step": 800 }, { "epoch": 0.024344750305204405, "grad_norm": 3.838539123535156e-05, "learning_rate": 0.0001620229007633588, "loss": 3.0475, "step": 850 }, { "epoch": 0.025776794440804666, "grad_norm": 12.625, "learning_rate": 0.0001715648854961832, "loss": 4.3648, "step": 900 }, { "epoch": 0.027208838576404926, "grad_norm": 0.009033203125, "learning_rate": 0.00018110687022900764, "loss": 3.8295, "step": 950 }, { "epoch": 0.028640882712005183, "grad_norm": 304.0, "learning_rate": 0.00019064885496183207, "loss": 5.2518, "step": 1000 }, { "epoch": 0.030072926847605444, "grad_norm": 872.0, "learning_rate": 0.0001999940947206803, "loss": 17.7632, "step": 1050 }, { "epoch": 0.031504970983205705, "grad_norm": 328.0, "learning_rate": 0.00019969883075469472, "loss": 16.0223, "step": 1100 }, { "epoch": 0.03293701511880596, "grad_norm": 468.0, "learning_rate": 0.0001994035667887091, "loss": 10.9938, "step": 1150 }, { "epoch": 0.03436905925440622, "grad_norm": 462.0, "learning_rate": 0.00019910830282272353, "loss": 9.1089, "step": 1200 }, { "epoch": 0.03580110339000648, "grad_norm": 43.25, "learning_rate": 0.00019881303885673795, "loss": 8.8076, "step": 1250 }, { "epoch": 0.03580110339000648, "eval_accuracy": 0.473, "eval_loss": 1.0368720293045044, "eval_macro_f1": 0.3829550887916939, "eval_runtime": 172.7823, "eval_samples_per_second": 11.575, "eval_steps_per_second": 11.575, "step": 1250 }, { "epoch": 0.03723314752560674, "grad_norm": 276.0, "learning_rate": 0.00019851777489075234, "loss": 8.0924, "step": 1300 }, { "epoch": 0.038665191661207, "grad_norm": 150.0, "learning_rate": 0.00019822251092476676, "loss": 7.168, "step": 1350 }, { "epoch": 0.04009723579680726, "grad_norm": 282.0, "learning_rate": 0.00019792724695878115, "loss": 6.6729, "step": 1400 }, { "epoch": 0.041529279932407515, "grad_norm": 155.0, "learning_rate": 0.00019763198299279557, "loss": 6.2658, "step": 1450 }, { "epoch": 0.042961324068007775, "grad_norm": 95.0, "learning_rate": 0.00019733671902680996, "loss": 4.2749, "step": 1500 }, { "epoch": 0.044393368203608036, "grad_norm": 121.5, "learning_rate": 0.00019704145506082438, "loss": 6.0376, "step": 1550 }, { "epoch": 0.045825412339208296, "grad_norm": 484.0, "learning_rate": 0.0001967461910948388, "loss": 5.4624, "step": 1600 }, { "epoch": 0.04725745647480856, "grad_norm": 83.0, "learning_rate": 0.00019645092712885321, "loss": 4.8571, "step": 1650 }, { "epoch": 0.04868950061040881, "grad_norm": 86.0, "learning_rate": 0.00019615566316286763, "loss": 5.2631, "step": 1700 }, { "epoch": 0.05012154474600907, "grad_norm": 0.35546875, "learning_rate": 0.00019586039919688202, "loss": 4.2013, "step": 1750 }, { "epoch": 0.05155358888160933, "grad_norm": 58.0, "learning_rate": 0.00019556513523089644, "loss": 5.4813, "step": 1800 }, { "epoch": 0.05298563301720959, "grad_norm": 868.0, "learning_rate": 0.00019526987126491083, "loss": 4.6324, "step": 1850 }, { "epoch": 0.05441767715280985, "grad_norm": 169.0, "learning_rate": 0.00019497460729892525, "loss": 3.9849, "step": 1900 }, { "epoch": 0.055849721288410106, "grad_norm": 4.28125, "learning_rate": 0.00019467934333293967, "loss": 3.2505, "step": 1950 }, { "epoch": 0.05728176542401037, "grad_norm": 11.0625, "learning_rate": 0.00019438407936695406, "loss": 3.7368, "step": 2000 }, { "epoch": 0.05871380955961063, "grad_norm": 40.75, "learning_rate": 0.00019408881540096848, "loss": 4.2252, "step": 2050 }, { "epoch": 0.06014585369521089, "grad_norm": 2240.0, "learning_rate": 0.00019379355143498287, "loss": 3.8708, "step": 2100 }, { "epoch": 0.06157789783081115, "grad_norm": 318.0, "learning_rate": 0.0001934982874689973, "loss": 3.7427, "step": 2150 }, { "epoch": 0.06300994196641141, "grad_norm": 79.0, "learning_rate": 0.00019320302350301168, "loss": 2.5798, "step": 2200 }, { "epoch": 0.06444198610201167, "grad_norm": 188.0, "learning_rate": 0.0001929077595370261, "loss": 3.2888, "step": 2250 }, { "epoch": 0.06587403023761192, "grad_norm": 255.0, "learning_rate": 0.00019261249557104052, "loss": 3.5956, "step": 2300 }, { "epoch": 0.06730607437321218, "grad_norm": 111.5, "learning_rate": 0.0001923172316050549, "loss": 2.6906, "step": 2350 }, { "epoch": 0.06873811850881244, "grad_norm": 8.8125, "learning_rate": 0.00019202196763906933, "loss": 2.9821, "step": 2400 }, { "epoch": 0.0701701626444127, "grad_norm": 62.25, "learning_rate": 0.00019172670367308375, "loss": 2.9432, "step": 2450 }, { "epoch": 0.07160220678001296, "grad_norm": 268.0, "learning_rate": 0.00019143143970709817, "loss": 5.8543, "step": 2500 }, { "epoch": 0.07160220678001296, "eval_accuracy": 0.8855, "eval_loss": 0.4594672918319702, "eval_macro_f1": 0.8847948863660271, "eval_runtime": 174.3198, "eval_samples_per_second": 11.473, "eval_steps_per_second": 11.473, "step": 2500 }, { "epoch": 0.07303425091561322, "grad_norm": 1.703125, "learning_rate": 0.00019113617574111256, "loss": 4.3718, "step": 2550 }, { "epoch": 0.07446629505121348, "grad_norm": 6.125, "learning_rate": 0.00019084091177512698, "loss": 5.6269, "step": 2600 }, { "epoch": 0.07589833918681374, "grad_norm": 2.796875, "learning_rate": 0.0001905456478091414, "loss": 4.2341, "step": 2650 }, { "epoch": 0.077330383322414, "grad_norm": 608.0, "learning_rate": 0.0001902503838431558, "loss": 3.3186, "step": 2700 }, { "epoch": 0.07876242745801426, "grad_norm": 140.0, "learning_rate": 0.0001899551198771702, "loss": 5.9126, "step": 2750 }, { "epoch": 0.08019447159361452, "grad_norm": 824.0, "learning_rate": 0.0001896598559111846, "loss": 5.0582, "step": 2800 }, { "epoch": 0.08162651572921477, "grad_norm": 165.0, "learning_rate": 0.00018936459194519902, "loss": 3.5105, "step": 2850 }, { "epoch": 0.08305855986481503, "grad_norm": 0.0284423828125, "learning_rate": 0.0001890693279792134, "loss": 4.6236, "step": 2900 }, { "epoch": 0.08449060400041529, "grad_norm": 116.5, "learning_rate": 0.00018877406401322783, "loss": 3.9021, "step": 2950 }, { "epoch": 0.08592264813601555, "grad_norm": 0.251953125, "learning_rate": 0.00018847880004724225, "loss": 3.883, "step": 3000 }, { "epoch": 0.08735469227161581, "grad_norm": 0.0260009765625, "learning_rate": 0.00018818353608125664, "loss": 3.9736, "step": 3050 }, { "epoch": 0.08878673640721607, "grad_norm": 290.0, "learning_rate": 0.00018788827211527106, "loss": 5.218, "step": 3100 }, { "epoch": 0.09021878054281633, "grad_norm": 1720.0, "learning_rate": 0.00018759300814928548, "loss": 3.2961, "step": 3150 }, { "epoch": 0.09165082467841659, "grad_norm": 0.26953125, "learning_rate": 0.0001872977441832999, "loss": 3.4482, "step": 3200 }, { "epoch": 0.09308286881401685, "grad_norm": 13.375, "learning_rate": 0.0001870024802173143, "loss": 2.928, "step": 3250 }, { "epoch": 0.09451491294961711, "grad_norm": 73.0, "learning_rate": 0.0001867072162513287, "loss": 3.4569, "step": 3300 }, { "epoch": 0.09594695708521736, "grad_norm": 9.25, "learning_rate": 0.00018641195228534313, "loss": 3.8492, "step": 3350 }, { "epoch": 0.09737900122081762, "grad_norm": 274.0, "learning_rate": 0.00018611668831935752, "loss": 3.4008, "step": 3400 }, { "epoch": 0.09881104535641788, "grad_norm": 158.0, "learning_rate": 0.00018582142435337194, "loss": 3.6703, "step": 3450 }, { "epoch": 0.10024308949201814, "grad_norm": 264.0, "learning_rate": 0.00018552616038738633, "loss": 3.4321, "step": 3500 }, { "epoch": 0.1016751336276184, "grad_norm": 2.015625, "learning_rate": 0.00018523089642140075, "loss": 2.4367, "step": 3550 }, { "epoch": 0.10310717776321866, "grad_norm": 270.0, "learning_rate": 0.00018493563245541514, "loss": 3.6473, "step": 3600 }, { "epoch": 0.10453922189881892, "grad_norm": 0.0478515625, "learning_rate": 0.00018464036848942956, "loss": 2.3759, "step": 3650 }, { "epoch": 0.10597126603441918, "grad_norm": 282.0, "learning_rate": 0.00018434510452344395, "loss": 2.5434, "step": 3700 }, { "epoch": 0.10740331017001944, "grad_norm": 100.0, "learning_rate": 0.00018404984055745837, "loss": 2.4411, "step": 3750 }, { "epoch": 0.10740331017001944, "eval_accuracy": 0.911, "eval_loss": 0.6009318232536316, "eval_macro_f1": 0.9109307309196768, "eval_runtime": 173.1976, "eval_samples_per_second": 11.548, "eval_steps_per_second": 11.548, "step": 3750 }, { "epoch": 0.1088353543056197, "grad_norm": 8.875, "learning_rate": 0.00018375457659147279, "loss": 3.7952, "step": 3800 }, { "epoch": 0.11026739844121995, "grad_norm": 408.0, "learning_rate": 0.00018345931262548718, "loss": 2.7528, "step": 3850 }, { "epoch": 0.11169944257682021, "grad_norm": 4.65625, "learning_rate": 0.0001831640486595016, "loss": 3.0934, "step": 3900 }, { "epoch": 0.11313148671242047, "grad_norm": 0.0274658203125, "learning_rate": 0.00018286878469351601, "loss": 3.3618, "step": 3950 }, { "epoch": 0.11456353084802073, "grad_norm": 93.0, "learning_rate": 0.00018257352072753043, "loss": 3.635, "step": 4000 }, { "epoch": 0.115995574983621, "grad_norm": 159.0, "learning_rate": 0.00018227825676154482, "loss": 2.3589, "step": 4050 }, { "epoch": 0.11742761911922125, "grad_norm": 290.0, "learning_rate": 0.00018198299279555924, "loss": 3.9717, "step": 4100 }, { "epoch": 0.11885966325482152, "grad_norm": 8.375, "learning_rate": 0.00018168772882957366, "loss": 3.0616, "step": 4150 }, { "epoch": 0.12029170739042178, "grad_norm": 264.0, "learning_rate": 0.00018139246486358805, "loss": 3.4315, "step": 4200 }, { "epoch": 0.12172375152602204, "grad_norm": 206.0, "learning_rate": 0.00018109720089760247, "loss": 3.3353, "step": 4250 }, { "epoch": 0.1231557956616223, "grad_norm": 0.37109375, "learning_rate": 0.00018080193693161686, "loss": 2.7568, "step": 4300 }, { "epoch": 0.12458783979722254, "grad_norm": 314.0, "learning_rate": 0.00018050667296563128, "loss": 3.0107, "step": 4350 }, { "epoch": 0.12601988393282282, "grad_norm": 992.0, "learning_rate": 0.00018021140899964567, "loss": 2.8247, "step": 4400 }, { "epoch": 0.12745192806842306, "grad_norm": 225.0, "learning_rate": 0.0001799161450336601, "loss": 3.3408, "step": 4450 }, { "epoch": 0.12888397220402334, "grad_norm": 0.703125, "learning_rate": 0.0001796208810676745, "loss": 2.8974, "step": 4500 }, { "epoch": 0.13031601633962359, "grad_norm": 280.0, "learning_rate": 0.0001793256171016889, "loss": 2.8223, "step": 4550 }, { "epoch": 0.13174806047522383, "grad_norm": 180.0, "learning_rate": 0.00017903035313570332, "loss": 3.7603, "step": 4600 }, { "epoch": 0.1331801046108241, "grad_norm": 6.28125, "learning_rate": 0.00017873508916971774, "loss": 4.2271, "step": 4650 }, { "epoch": 0.13461214874642435, "grad_norm": 338.0, "learning_rate": 0.00017843982520373216, "loss": 3.2114, "step": 4700 }, { "epoch": 0.13604419288202463, "grad_norm": 0.6796875, "learning_rate": 0.00017814456123774655, "loss": 3.4457, "step": 4750 }, { "epoch": 0.13747623701762487, "grad_norm": 2.34375, "learning_rate": 0.00017784929727176097, "loss": 2.2643, "step": 4800 }, { "epoch": 0.13890828115322515, "grad_norm": 288.0, "learning_rate": 0.0001775540333057754, "loss": 3.0672, "step": 4850 }, { "epoch": 0.1403403252888254, "grad_norm": 1896.0, "learning_rate": 0.00017725876933978978, "loss": 2.8551, "step": 4900 }, { "epoch": 0.14177236942442567, "grad_norm": 88.0, "learning_rate": 0.0001769635053738042, "loss": 3.5021, "step": 4950 }, { "epoch": 0.14320441356002592, "grad_norm": 94.0, "learning_rate": 0.0001766682414078186, "loss": 2.1413, "step": 5000 }, { "epoch": 0.14320441356002592, "eval_accuracy": 0.917, "eval_loss": 0.3995007872581482, "eval_macro_f1": 0.9161602620439439, "eval_runtime": 179.9592, "eval_samples_per_second": 11.114, "eval_steps_per_second": 11.114, "step": 5000 }, { "epoch": 0.1446364576956262, "grad_norm": 0.97265625, "learning_rate": 0.000176372977441833, "loss": 2.3626, "step": 5050 }, { "epoch": 0.14606850183122644, "grad_norm": 266.0, "learning_rate": 0.0001760777134758474, "loss": 3.3284, "step": 5100 }, { "epoch": 0.14750054596682668, "grad_norm": 0.2314453125, "learning_rate": 0.00017578244950986182, "loss": 2.2628, "step": 5150 }, { "epoch": 0.14893259010242696, "grad_norm": 237.0, "learning_rate": 0.00017548718554387624, "loss": 2.5359, "step": 5200 }, { "epoch": 0.1503646342380272, "grad_norm": 65.5, "learning_rate": 0.00017519192157789063, "loss": 2.5109, "step": 5250 }, { "epoch": 0.15179667837362748, "grad_norm": 0.2197265625, "learning_rate": 0.00017489665761190505, "loss": 3.4319, "step": 5300 }, { "epoch": 0.15322872250922773, "grad_norm": 140.0, "learning_rate": 0.00017460139364591944, "loss": 2.149, "step": 5350 }, { "epoch": 0.154660766644828, "grad_norm": 74.0, "learning_rate": 0.00017430612967993386, "loss": 3.3437, "step": 5400 }, { "epoch": 0.15609281078042825, "grad_norm": 160.0, "learning_rate": 0.00017401086571394828, "loss": 3.2952, "step": 5450 }, { "epoch": 0.15752485491602852, "grad_norm": 0.451171875, "learning_rate": 0.0001737156017479627, "loss": 2.6442, "step": 5500 }, { "epoch": 0.15895689905162877, "grad_norm": 246.0, "learning_rate": 0.00017342033778197712, "loss": 2.1805, "step": 5550 }, { "epoch": 0.16038894318722904, "grad_norm": 0.1669921875, "learning_rate": 0.0001731250738159915, "loss": 2.957, "step": 5600 }, { "epoch": 0.1618209873228293, "grad_norm": 11.4375, "learning_rate": 0.00017282980985000593, "loss": 3.791, "step": 5650 }, { "epoch": 0.16325303145842954, "grad_norm": 241.0, "learning_rate": 0.00017253454588402032, "loss": 2.3945, "step": 5700 }, { "epoch": 0.1646850755940298, "grad_norm": 0.1572265625, "learning_rate": 0.00017223928191803474, "loss": 2.3927, "step": 5750 }, { "epoch": 0.16611711972963006, "grad_norm": 0.06494140625, "learning_rate": 0.00017194401795204913, "loss": 2.4573, "step": 5800 }, { "epoch": 0.16754916386523033, "grad_norm": 0.34375, "learning_rate": 0.00017164875398606355, "loss": 2.6829, "step": 5850 }, { "epoch": 0.16898120800083058, "grad_norm": 7.46875, "learning_rate": 0.00017135349002007797, "loss": 3.0156, "step": 5900 }, { "epoch": 0.17041325213643085, "grad_norm": 0.2470703125, "learning_rate": 0.00017105822605409236, "loss": 2.5155, "step": 5950 }, { "epoch": 0.1718452962720311, "grad_norm": 3.65625, "learning_rate": 0.00017076296208810678, "loss": 2.5886, "step": 6000 }, { "epoch": 0.17327734040763138, "grad_norm": 420.0, "learning_rate": 0.00017046769812212117, "loss": 3.7327, "step": 6050 }, { "epoch": 0.17470938454323162, "grad_norm": 88.0, "learning_rate": 0.00017017243415613559, "loss": 4.1712, "step": 6100 }, { "epoch": 0.17614142867883187, "grad_norm": 1864.0, "learning_rate": 0.00016987717019015, "loss": 3.0617, "step": 6150 }, { "epoch": 0.17757347281443214, "grad_norm": 56.25, "learning_rate": 0.00016958190622416442, "loss": 2.6603, "step": 6200 }, { "epoch": 0.1790055169500324, "grad_norm": 25.25, "learning_rate": 0.00016928664225817884, "loss": 2.7308, "step": 6250 }, { "epoch": 0.1790055169500324, "eval_accuracy": 0.9195, "eval_loss": 0.47414371371269226, "eval_macro_f1": 0.9193664539192946, "eval_runtime": 182.0886, "eval_samples_per_second": 10.984, "eval_steps_per_second": 10.984, "step": 6250 }, { "epoch": 0.18043756108563266, "grad_norm": 66.5, "learning_rate": 0.00016899137829219323, "loss": 2.9805, "step": 6300 }, { "epoch": 0.1818696052212329, "grad_norm": 119.0, "learning_rate": 0.00016869611432620765, "loss": 2.343, "step": 6350 }, { "epoch": 0.18330164935683319, "grad_norm": 0.14453125, "learning_rate": 0.00016840085036022204, "loss": 2.5346, "step": 6400 }, { "epoch": 0.18473369349243343, "grad_norm": 67.5, "learning_rate": 0.00016810558639423646, "loss": 2.6565, "step": 6450 }, { "epoch": 0.1861657376280337, "grad_norm": 14.0, "learning_rate": 0.00016781032242825085, "loss": 3.2329, "step": 6500 }, { "epoch": 0.18759778176363395, "grad_norm": 1408.0, "learning_rate": 0.00016751505846226527, "loss": 2.7886, "step": 6550 }, { "epoch": 0.18902982589923423, "grad_norm": 23.0, "learning_rate": 0.0001672197944962797, "loss": 2.2165, "step": 6600 }, { "epoch": 0.19046187003483447, "grad_norm": 88.0, "learning_rate": 0.00016692453053029408, "loss": 2.826, "step": 6650 }, { "epoch": 0.19189391417043472, "grad_norm": 7.28125, "learning_rate": 0.0001666292665643085, "loss": 2.6884, "step": 6700 }, { "epoch": 0.193325958306035, "grad_norm": 4.3125, "learning_rate": 0.0001663340025983229, "loss": 2.3811, "step": 6750 }, { "epoch": 0.19475800244163524, "grad_norm": 2.78125, "learning_rate": 0.0001660387386323373, "loss": 2.1648, "step": 6800 }, { "epoch": 0.19619004657723552, "grad_norm": 2.65625, "learning_rate": 0.0001657434746663517, "loss": 2.0769, "step": 6850 }, { "epoch": 0.19762209071283576, "grad_norm": 0.337890625, "learning_rate": 0.00016544821070036612, "loss": 3.2644, "step": 6900 }, { "epoch": 0.19905413484843604, "grad_norm": 5.15625, "learning_rate": 0.00016515294673438054, "loss": 3.1548, "step": 6950 }, { "epoch": 0.20048617898403628, "grad_norm": 52.75, "learning_rate": 0.00016485768276839496, "loss": 2.3094, "step": 7000 }, { "epoch": 0.20191822311963656, "grad_norm": 0.15625, "learning_rate": 0.00016456241880240938, "loss": 2.2522, "step": 7050 }, { "epoch": 0.2033502672552368, "grad_norm": 0.09521484375, "learning_rate": 0.00016426715483642377, "loss": 2.1453, "step": 7100 }, { "epoch": 0.20478231139083705, "grad_norm": 274.0, "learning_rate": 0.0001639718908704382, "loss": 2.8386, "step": 7150 }, { "epoch": 0.20621435552643733, "grad_norm": 274.0, "learning_rate": 0.00016367662690445258, "loss": 3.5395, "step": 7200 }, { "epoch": 0.20764639966203757, "grad_norm": 81.0, "learning_rate": 0.000163381362938467, "loss": 2.668, "step": 7250 }, { "epoch": 0.20907844379763785, "grad_norm": 0.1162109375, "learning_rate": 0.00016308609897248142, "loss": 2.2543, "step": 7300 }, { "epoch": 0.2105104879332381, "grad_norm": 0.05517578125, "learning_rate": 0.0001627908350064958, "loss": 2.4399, "step": 7350 }, { "epoch": 0.21194253206883837, "grad_norm": 0.283203125, "learning_rate": 0.00016249557104051023, "loss": 2.0814, "step": 7400 }, { "epoch": 0.21337457620443862, "grad_norm": 79.0, "learning_rate": 0.00016220030707452462, "loss": 3.2041, "step": 7450 }, { "epoch": 0.2148066203400389, "grad_norm": 144.0, "learning_rate": 0.00016190504310853904, "loss": 1.962, "step": 7500 }, { "epoch": 0.2148066203400389, "eval_accuracy": 0.93, "eval_loss": 0.3529609441757202, "eval_macro_f1": 0.9295120271109343, "eval_runtime": 175.7548, "eval_samples_per_second": 11.379, "eval_steps_per_second": 11.379, "step": 7500 }, { "epoch": 0.21623866447563914, "grad_norm": 2592.0, "learning_rate": 0.00016160977914255343, "loss": 2.7684, "step": 7550 }, { "epoch": 0.2176707086112394, "grad_norm": 0.03271484375, "learning_rate": 0.00016131451517656785, "loss": 2.5066, "step": 7600 }, { "epoch": 0.21910275274683966, "grad_norm": 0.09423828125, "learning_rate": 0.00016101925121058227, "loss": 2.6791, "step": 7650 }, { "epoch": 0.2205347968824399, "grad_norm": 536.0, "learning_rate": 0.0001607239872445967, "loss": 3.3268, "step": 7700 }, { "epoch": 0.22196684101804018, "grad_norm": 0.01007080078125, "learning_rate": 0.0001604287232786111, "loss": 2.2916, "step": 7750 }, { "epoch": 0.22339888515364043, "grad_norm": 0.1396484375, "learning_rate": 0.0001601334593126255, "loss": 2.8402, "step": 7800 }, { "epoch": 0.2248309292892407, "grad_norm": 93.5, "learning_rate": 0.00015983819534663992, "loss": 2.5527, "step": 7850 }, { "epoch": 0.22626297342484095, "grad_norm": 0.318359375, "learning_rate": 0.0001595429313806543, "loss": 3.0559, "step": 7900 }, { "epoch": 0.22769501756044122, "grad_norm": 276.0, "learning_rate": 0.00015924766741466873, "loss": 1.8897, "step": 7950 }, { "epoch": 0.22912706169604147, "grad_norm": 1.7421875, "learning_rate": 0.00015895240344868315, "loss": 1.9342, "step": 8000 }, { "epoch": 0.23055910583164174, "grad_norm": 0.036865234375, "learning_rate": 0.00015865713948269754, "loss": 2.0979, "step": 8050 }, { "epoch": 0.231991149967242, "grad_norm": 164.0, "learning_rate": 0.00015836187551671196, "loss": 2.2929, "step": 8100 }, { "epoch": 0.23342319410284226, "grad_norm": 88.5, "learning_rate": 0.00015806661155072635, "loss": 3.0427, "step": 8150 }, { "epoch": 0.2348552382384425, "grad_norm": 1104.0, "learning_rate": 0.00015777134758474077, "loss": 2.8966, "step": 8200 }, { "epoch": 0.23628728237404276, "grad_norm": 520.0, "learning_rate": 0.00015747608361875516, "loss": 2.0752, "step": 8250 }, { "epoch": 0.23771932650964303, "grad_norm": 0.07568359375, "learning_rate": 0.00015718081965276958, "loss": 1.7808, "step": 8300 }, { "epoch": 0.23915137064524328, "grad_norm": 0.06982421875, "learning_rate": 0.000156885555686784, "loss": 2.9426, "step": 8350 }, { "epoch": 0.24058341478084355, "grad_norm": 242.0, "learning_rate": 0.00015659029172079839, "loss": 2.3159, "step": 8400 }, { "epoch": 0.2420154589164438, "grad_norm": 7.5, "learning_rate": 0.0001562950277548128, "loss": 2.6197, "step": 8450 }, { "epoch": 0.24344750305204407, "grad_norm": 57.25, "learning_rate": 0.00015599976378882722, "loss": 2.6834, "step": 8500 }, { "epoch": 0.24487954718764432, "grad_norm": 4.0, "learning_rate": 0.00015570449982284164, "loss": 2.116, "step": 8550 }, { "epoch": 0.2463115913232446, "grad_norm": 0.11181640625, "learning_rate": 0.00015540923585685603, "loss": 3.5668, "step": 8600 }, { "epoch": 0.24774363545884484, "grad_norm": 240.0, "learning_rate": 0.00015511397189087045, "loss": 3.1473, "step": 8650 }, { "epoch": 0.2491756795944451, "grad_norm": 117.5, "learning_rate": 0.00015481870792488487, "loss": 2.4813, "step": 8700 }, { "epoch": 0.25060772373004536, "grad_norm": 7.46875, "learning_rate": 0.00015452344395889926, "loss": 1.8936, "step": 8750 }, { "epoch": 0.25060772373004536, "eval_accuracy": 0.9365, "eval_loss": 0.328545480966568, "eval_macro_f1": 0.9360277798015127, "eval_runtime": 178.5517, "eval_samples_per_second": 11.201, "eval_steps_per_second": 11.201, "step": 8750 }, { "epoch": 0.25203976786564564, "grad_norm": 126.0, "learning_rate": 0.00015422817999291368, "loss": 2.649, "step": 8800 }, { "epoch": 0.25347181200124586, "grad_norm": 0.06884765625, "learning_rate": 0.00015393291602692807, "loss": 2.8102, "step": 8850 }, { "epoch": 0.25490385613684613, "grad_norm": 0.6015625, "learning_rate": 0.0001536376520609425, "loss": 2.4762, "step": 8900 }, { "epoch": 0.2563359002724464, "grad_norm": 370.0, "learning_rate": 0.00015334238809495688, "loss": 2.1245, "step": 8950 }, { "epoch": 0.2577679444080467, "grad_norm": 238.0, "learning_rate": 0.0001530471241289713, "loss": 1.4588, "step": 9000 }, { "epoch": 0.2591999885436469, "grad_norm": 8.5625, "learning_rate": 0.00015275186016298572, "loss": 2.7869, "step": 9050 }, { "epoch": 0.26063203267924717, "grad_norm": 118.5, "learning_rate": 0.0001524565961970001, "loss": 2.1987, "step": 9100 }, { "epoch": 0.26206407681484745, "grad_norm": 37.25, "learning_rate": 0.00015216133223101453, "loss": 2.8539, "step": 9150 }, { "epoch": 0.26349612095044767, "grad_norm": 0.62109375, "learning_rate": 0.00015186606826502895, "loss": 2.6421, "step": 9200 }, { "epoch": 0.26492816508604794, "grad_norm": 0.404296875, "learning_rate": 0.00015157080429904337, "loss": 3.3623, "step": 9250 }, { "epoch": 0.2663602092216482, "grad_norm": 0.09130859375, "learning_rate": 0.00015127554033305776, "loss": 2.6995, "step": 9300 }, { "epoch": 0.2677922533572485, "grad_norm": 82.5, "learning_rate": 0.00015098027636707218, "loss": 1.8874, "step": 9350 }, { "epoch": 0.2692242974928487, "grad_norm": 4416.0, "learning_rate": 0.0001506850124010866, "loss": 2.2107, "step": 9400 }, { "epoch": 0.270656341628449, "grad_norm": 18.125, "learning_rate": 0.000150389748435101, "loss": 3.2056, "step": 9450 }, { "epoch": 0.27208838576404926, "grad_norm": 3.09375, "learning_rate": 0.0001500944844691154, "loss": 2.9934, "step": 9500 }, { "epoch": 0.27352042989964953, "grad_norm": 280.0, "learning_rate": 0.0001497992205031298, "loss": 2.2205, "step": 9550 }, { "epoch": 0.27495247403524975, "grad_norm": 94.5, "learning_rate": 0.00014950395653714422, "loss": 2.5102, "step": 9600 }, { "epoch": 0.27638451817085, "grad_norm": 0.2021484375, "learning_rate": 0.0001492086925711586, "loss": 2.0138, "step": 9650 }, { "epoch": 0.2778165623064503, "grad_norm": 1.3359375, "learning_rate": 0.00014891342860517303, "loss": 1.556, "step": 9700 }, { "epoch": 0.2792486064420505, "grad_norm": 0.494140625, "learning_rate": 0.00014861816463918745, "loss": 2.7351, "step": 9750 }, { "epoch": 0.2806806505776508, "grad_norm": 0.1953125, "learning_rate": 0.00014832290067320184, "loss": 2.0641, "step": 9800 }, { "epoch": 0.28211269471325107, "grad_norm": 0.421875, "learning_rate": 0.00014802763670721626, "loss": 2.642, "step": 9850 }, { "epoch": 0.28354473884885134, "grad_norm": 292.0, "learning_rate": 0.00014773237274123065, "loss": 2.5676, "step": 9900 }, { "epoch": 0.28497678298445156, "grad_norm": 4.40625, "learning_rate": 0.00014743710877524507, "loss": 2.5438, "step": 9950 }, { "epoch": 0.28640882712005183, "grad_norm": 129.0, "learning_rate": 0.0001471418448092595, "loss": 3.0776, "step": 10000 }, { "epoch": 0.28640882712005183, "eval_accuracy": 0.9335, "eval_loss": 0.34245818853378296, "eval_macro_f1": 0.9327568911653952, "eval_runtime": 181.524, "eval_samples_per_second": 11.018, "eval_steps_per_second": 11.018, "step": 10000 }, { "epoch": 0.2878408712556521, "grad_norm": 9.25, "learning_rate": 0.0001468465808432739, "loss": 2.2061, "step": 10050 }, { "epoch": 0.2892729153912524, "grad_norm": 0.10693359375, "learning_rate": 0.00014655131687728832, "loss": 2.6087, "step": 10100 }, { "epoch": 0.2907049595268526, "grad_norm": 0.10107421875, "learning_rate": 0.00014625605291130272, "loss": 2.4579, "step": 10150 }, { "epoch": 0.2921370036624529, "grad_norm": 0.33203125, "learning_rate": 0.00014596078894531714, "loss": 2.2037, "step": 10200 }, { "epoch": 0.29356904779805315, "grad_norm": 0.1513671875, "learning_rate": 0.00014566552497933153, "loss": 2.3772, "step": 10250 }, { "epoch": 0.29500109193365337, "grad_norm": 80.5, "learning_rate": 0.00014537026101334595, "loss": 2.2901, "step": 10300 }, { "epoch": 0.29643313606925364, "grad_norm": 21.0, "learning_rate": 0.00014507499704736034, "loss": 2.1736, "step": 10350 }, { "epoch": 0.2978651802048539, "grad_norm": 168.0, "learning_rate": 0.00014477973308137476, "loss": 2.3493, "step": 10400 }, { "epoch": 0.2992972243404542, "grad_norm": 229.0, "learning_rate": 0.00014448446911538915, "loss": 2.4933, "step": 10450 }, { "epoch": 0.3007292684760544, "grad_norm": 274.0, "learning_rate": 0.00014418920514940357, "loss": 2.6936, "step": 10500 }, { "epoch": 0.3021613126116547, "grad_norm": 272.0, "learning_rate": 0.00014389394118341799, "loss": 3.3879, "step": 10550 }, { "epoch": 0.30359335674725496, "grad_norm": 0.0103759765625, "learning_rate": 0.00014359867721743238, "loss": 1.6445, "step": 10600 }, { "epoch": 0.30502540088285524, "grad_norm": 0.02880859375, "learning_rate": 0.0001433034132514468, "loss": 1.5567, "step": 10650 }, { "epoch": 0.30645744501845545, "grad_norm": 292.0, "learning_rate": 0.00014300814928546121, "loss": 2.5947, "step": 10700 }, { "epoch": 0.30788948915405573, "grad_norm": 97.5, "learning_rate": 0.00014271288531947563, "loss": 2.7865, "step": 10750 }, { "epoch": 0.309321533289656, "grad_norm": 79.5, "learning_rate": 0.00014241762135349002, "loss": 2.1275, "step": 10800 }, { "epoch": 0.3107535774252562, "grad_norm": 298.0, "learning_rate": 0.00014212235738750444, "loss": 2.0145, "step": 10850 }, { "epoch": 0.3121856215608565, "grad_norm": 0.040771484375, "learning_rate": 0.00014182709342151886, "loss": 1.8322, "step": 10900 }, { "epoch": 0.31361766569645677, "grad_norm": 142.0, "learning_rate": 0.00014153182945553325, "loss": 1.3864, "step": 10950 }, { "epoch": 0.31504970983205705, "grad_norm": 1.7578125, "learning_rate": 0.00014123656548954767, "loss": 2.7755, "step": 11000 }, { "epoch": 0.31648175396765726, "grad_norm": 82.0, "learning_rate": 0.00014094130152356206, "loss": 2.5528, "step": 11050 }, { "epoch": 0.31791379810325754, "grad_norm": 80.0, "learning_rate": 0.00014064603755757648, "loss": 2.5284, "step": 11100 }, { "epoch": 0.3193458422388578, "grad_norm": 0.05224609375, "learning_rate": 0.00014035077359159087, "loss": 2.8708, "step": 11150 }, { "epoch": 0.3207778863744581, "grad_norm": 0.140625, "learning_rate": 0.0001400555096256053, "loss": 3.5295, "step": 11200 }, { "epoch": 0.3222099305100583, "grad_norm": 0.050048828125, "learning_rate": 0.0001397602456596197, "loss": 3.325, "step": 11250 }, { "epoch": 0.3222099305100583, "eval_accuracy": 0.94, "eval_loss": 0.2819044888019562, "eval_macro_f1": 0.9395225640341313, "eval_runtime": 173.501, "eval_samples_per_second": 11.527, "eval_steps_per_second": 11.527, "step": 11250 }, { "epoch": 0.3236419746456586, "grad_norm": 1.078125, "learning_rate": 0.0001394649816936341, "loss": 3.0985, "step": 11300 }, { "epoch": 0.32507401878125886, "grad_norm": 116.0, "learning_rate": 0.00013916971772764852, "loss": 2.5793, "step": 11350 }, { "epoch": 0.3265060629168591, "grad_norm": 0.1865234375, "learning_rate": 0.00013887445376166291, "loss": 2.5646, "step": 11400 }, { "epoch": 0.32793810705245935, "grad_norm": 306.0, "learning_rate": 0.00013857918979567733, "loss": 1.9864, "step": 11450 }, { "epoch": 0.3293701511880596, "grad_norm": 274.0, "learning_rate": 0.00013828392582969175, "loss": 1.8868, "step": 11500 }, { "epoch": 0.3308021953236599, "grad_norm": 0.08984375, "learning_rate": 0.00013798866186370617, "loss": 2.5106, "step": 11550 }, { "epoch": 0.3322342394592601, "grad_norm": 270.0, "learning_rate": 0.0001376933978977206, "loss": 1.8537, "step": 11600 }, { "epoch": 0.3336662835948604, "grad_norm": 270.0, "learning_rate": 0.00013739813393173498, "loss": 2.3735, "step": 11650 }, { "epoch": 0.33509832773046067, "grad_norm": 0.416015625, "learning_rate": 0.0001371028699657494, "loss": 2.0794, "step": 11700 }, { "epoch": 0.3365303718660609, "grad_norm": 0.32421875, "learning_rate": 0.0001368076059997638, "loss": 2.5114, "step": 11750 }, { "epoch": 0.33796241600166116, "grad_norm": 0.0595703125, "learning_rate": 0.0001365123420337782, "loss": 2.199, "step": 11800 }, { "epoch": 0.33939446013726143, "grad_norm": 61.75, "learning_rate": 0.0001362170780677926, "loss": 2.64, "step": 11850 }, { "epoch": 0.3408265042728617, "grad_norm": 9.0, "learning_rate": 0.00013592181410180702, "loss": 1.8553, "step": 11900 }, { "epoch": 0.3422585484084619, "grad_norm": 177.0, "learning_rate": 0.00013562655013582144, "loss": 1.6963, "step": 11950 }, { "epoch": 0.3436905925440622, "grad_norm": 326.0, "learning_rate": 0.00013533128616983583, "loss": 3.007, "step": 12000 }, { "epoch": 0.3451226366796625, "grad_norm": 0.06787109375, "learning_rate": 0.00013503602220385025, "loss": 1.6731, "step": 12050 }, { "epoch": 0.34655468081526275, "grad_norm": 5.28125, "learning_rate": 0.00013474075823786464, "loss": 2.5167, "step": 12100 }, { "epoch": 0.34798672495086297, "grad_norm": 0.11865234375, "learning_rate": 0.00013444549427187906, "loss": 3.4208, "step": 12150 }, { "epoch": 0.34941876908646324, "grad_norm": 0.1337890625, "learning_rate": 0.00013415023030589348, "loss": 1.3073, "step": 12200 }, { "epoch": 0.3508508132220635, "grad_norm": 82.5, "learning_rate": 0.0001338549663399079, "loss": 2.3618, "step": 12250 }, { "epoch": 0.35228285735766374, "grad_norm": 1.546875, "learning_rate": 0.00013355970237392232, "loss": 2.0756, "step": 12300 }, { "epoch": 0.353714901493264, "grad_norm": 120.0, "learning_rate": 0.0001332644384079367, "loss": 2.2016, "step": 12350 }, { "epoch": 0.3551469456288643, "grad_norm": 5.34375, "learning_rate": 0.00013296917444195113, "loss": 2.7446, "step": 12400 }, { "epoch": 0.35657898976446456, "grad_norm": 0.057861328125, "learning_rate": 0.00013267391047596552, "loss": 2.3892, "step": 12450 }, { "epoch": 0.3580110339000648, "grad_norm": 1.6640625, "learning_rate": 0.00013237864650997994, "loss": 2.3038, "step": 12500 }, { "epoch": 0.3580110339000648, "eval_accuracy": 0.943, "eval_loss": 0.2820850610733032, "eval_macro_f1": 0.9423954094372721, "eval_runtime": 180.0229, "eval_samples_per_second": 11.11, "eval_steps_per_second": 11.11, "step": 12500 }, { "epoch": 0.35944307803566505, "grad_norm": 0.15625, "learning_rate": 0.00013208338254399433, "loss": 1.6647, "step": 12550 }, { "epoch": 0.36087512217126533, "grad_norm": 0.06640625, "learning_rate": 0.00013178811857800875, "loss": 2.2751, "step": 12600 }, { "epoch": 0.3623071663068656, "grad_norm": 0.984375, "learning_rate": 0.00013149285461202316, "loss": 2.7458, "step": 12650 }, { "epoch": 0.3637392104424658, "grad_norm": 420.0, "learning_rate": 0.00013119759064603756, "loss": 2.2766, "step": 12700 }, { "epoch": 0.3651712545780661, "grad_norm": 0.1103515625, "learning_rate": 0.00013090232668005198, "loss": 1.4627, "step": 12750 }, { "epoch": 0.36660329871366637, "grad_norm": 0.25390625, "learning_rate": 0.00013060706271406637, "loss": 1.9492, "step": 12800 }, { "epoch": 0.3680353428492666, "grad_norm": 0.232421875, "learning_rate": 0.00013031179874808079, "loss": 1.8883, "step": 12850 }, { "epoch": 0.36946738698486686, "grad_norm": 0.205078125, "learning_rate": 0.00013001653478209518, "loss": 2.058, "step": 12900 }, { "epoch": 0.37089943112046714, "grad_norm": 0.53515625, "learning_rate": 0.0001297212708161096, "loss": 2.7257, "step": 12950 }, { "epoch": 0.3723314752560674, "grad_norm": 764.0, "learning_rate": 0.00012942600685012401, "loss": 2.4923, "step": 13000 }, { "epoch": 0.37376351939166763, "grad_norm": 298.0, "learning_rate": 0.00012913074288413843, "loss": 1.8667, "step": 13050 }, { "epoch": 0.3751955635272679, "grad_norm": 282.0, "learning_rate": 0.00012883547891815285, "loss": 2.4477, "step": 13100 }, { "epoch": 0.3766276076628682, "grad_norm": 0.02587890625, "learning_rate": 0.00012854021495216724, "loss": 1.0699, "step": 13150 }, { "epoch": 0.37805965179846845, "grad_norm": 160.0, "learning_rate": 0.00012824495098618166, "loss": 2.8487, "step": 13200 }, { "epoch": 0.3794916959340687, "grad_norm": 216.0, "learning_rate": 0.00012794968702019605, "loss": 1.8886, "step": 13250 }, { "epoch": 0.38092374006966895, "grad_norm": 302.0, "learning_rate": 0.00012765442305421047, "loss": 2.4619, "step": 13300 }, { "epoch": 0.3823557842052692, "grad_norm": 172.0, "learning_rate": 0.0001273591590882249, "loss": 2.8237, "step": 13350 }, { "epoch": 0.38378782834086944, "grad_norm": 91.5, "learning_rate": 0.00012706389512223928, "loss": 2.7431, "step": 13400 }, { "epoch": 0.3852198724764697, "grad_norm": 0.275390625, "learning_rate": 0.0001267686311562537, "loss": 1.9888, "step": 13450 }, { "epoch": 0.38665191661207, "grad_norm": 194.0, "learning_rate": 0.0001264733671902681, "loss": 2.5123, "step": 13500 }, { "epoch": 0.38808396074767026, "grad_norm": 396.0, "learning_rate": 0.0001261781032242825, "loss": 1.9384, "step": 13550 }, { "epoch": 0.3895160048832705, "grad_norm": 54.75, "learning_rate": 0.0001258828392582969, "loss": 1.9031, "step": 13600 }, { "epoch": 0.39094804901887076, "grad_norm": 0.828125, "learning_rate": 0.00012558757529231132, "loss": 2.65, "step": 13650 }, { "epoch": 0.39238009315447103, "grad_norm": 36.25, "learning_rate": 0.00012529231132632574, "loss": 2.2794, "step": 13700 }, { "epoch": 0.3938121372900713, "grad_norm": 0.11962890625, "learning_rate": 0.00012499704736034016, "loss": 2.1189, "step": 13750 }, { "epoch": 0.3938121372900713, "eval_accuracy": 0.9465, "eval_loss": 0.31273505091667175, "eval_macro_f1": 0.9457595736365828, "eval_runtime": 172.8312, "eval_samples_per_second": 11.572, "eval_steps_per_second": 11.572, "step": 13750 }, { "epoch": 0.3952441814256715, "grad_norm": 398.0, "learning_rate": 0.00012470178339435458, "loss": 2.5979, "step": 13800 }, { "epoch": 0.3966762255612718, "grad_norm": 266.0, "learning_rate": 0.00012440651942836897, "loss": 2.8401, "step": 13850 }, { "epoch": 0.3981082696968721, "grad_norm": 0.1884765625, "learning_rate": 0.0001241112554623834, "loss": 1.9365, "step": 13900 }, { "epoch": 0.3995403138324723, "grad_norm": 0.055908203125, "learning_rate": 0.00012381599149639778, "loss": 0.9845, "step": 13950 }, { "epoch": 0.40097235796807257, "grad_norm": 130.0, "learning_rate": 0.0001235207275304122, "loss": 2.9002, "step": 14000 }, { "epoch": 0.40240440210367284, "grad_norm": 0.0045166015625, "learning_rate": 0.00012322546356442662, "loss": 2.1528, "step": 14050 }, { "epoch": 0.4038364462392731, "grad_norm": 0.39453125, "learning_rate": 0.000122930199598441, "loss": 2.4575, "step": 14100 }, { "epoch": 0.40526849037487334, "grad_norm": 241.0, "learning_rate": 0.00012263493563245543, "loss": 2.475, "step": 14150 }, { "epoch": 0.4067005345104736, "grad_norm": 37.5, "learning_rate": 0.00012233967166646982, "loss": 1.6529, "step": 14200 }, { "epoch": 0.4081325786460739, "grad_norm": 272.0, "learning_rate": 0.00012204440770048424, "loss": 2.922, "step": 14250 }, { "epoch": 0.4095646227816741, "grad_norm": 3.734375, "learning_rate": 0.00012174914373449864, "loss": 2.2361, "step": 14300 }, { "epoch": 0.4109966669172744, "grad_norm": 338.0, "learning_rate": 0.00012145387976851306, "loss": 1.5299, "step": 14350 }, { "epoch": 0.41242871105287465, "grad_norm": 0.043212890625, "learning_rate": 0.00012115861580252748, "loss": 2.5728, "step": 14400 }, { "epoch": 0.4138607551884749, "grad_norm": 0.15625, "learning_rate": 0.00012086335183654187, "loss": 2.122, "step": 14450 }, { "epoch": 0.41529279932407515, "grad_norm": 0.008056640625, "learning_rate": 0.00012056808787055629, "loss": 2.4863, "step": 14500 }, { "epoch": 0.4167248434596754, "grad_norm": 32.5, "learning_rate": 0.00012027282390457068, "loss": 3.3401, "step": 14550 }, { "epoch": 0.4181568875952757, "grad_norm": 0.2490234375, "learning_rate": 0.0001199775599385851, "loss": 2.0398, "step": 14600 }, { "epoch": 0.41958893173087597, "grad_norm": 0.326171875, "learning_rate": 0.00011968229597259951, "loss": 2.8364, "step": 14650 }, { "epoch": 0.4210209758664762, "grad_norm": 0.15625, "learning_rate": 0.00011938703200661391, "loss": 1.4864, "step": 14700 }, { "epoch": 0.42245302000207646, "grad_norm": 80.0, "learning_rate": 0.00011909176804062833, "loss": 2.3397, "step": 14750 }, { "epoch": 0.42388506413767674, "grad_norm": 101.5, "learning_rate": 0.00011879650407464274, "loss": 3.3186, "step": 14800 }, { "epoch": 0.42531710827327696, "grad_norm": 2.6875, "learning_rate": 0.00011850124010865716, "loss": 2.7605, "step": 14850 }, { "epoch": 0.42674915240887723, "grad_norm": 95.0, "learning_rate": 0.00011820597614267155, "loss": 1.9215, "step": 14900 }, { "epoch": 0.4281811965444775, "grad_norm": 0.65625, "learning_rate": 0.00011791071217668597, "loss": 2.0689, "step": 14950 }, { "epoch": 0.4296132406800778, "grad_norm": 0.66015625, "learning_rate": 0.00011761544821070036, "loss": 3.5633, "step": 15000 }, { "epoch": 0.4296132406800778, "eval_accuracy": 0.942, "eval_loss": 0.27190178632736206, "eval_macro_f1": 0.9413541591870516, "eval_runtime": 181.3524, "eval_samples_per_second": 11.028, "eval_steps_per_second": 11.028, "step": 15000 }, { "epoch": 0.431045284815678, "grad_norm": 42.25, "learning_rate": 0.00011732018424471478, "loss": 1.8073, "step": 15050 }, { "epoch": 0.4324773289512783, "grad_norm": 0.22265625, "learning_rate": 0.0001170249202787292, "loss": 1.777, "step": 15100 }, { "epoch": 0.43390937308687855, "grad_norm": 0.11328125, "learning_rate": 0.0001167296563127436, "loss": 1.9598, "step": 15150 }, { "epoch": 0.4353414172224788, "grad_norm": 0.095703125, "learning_rate": 0.00011643439234675802, "loss": 2.7789, "step": 15200 }, { "epoch": 0.43677346135807904, "grad_norm": 0.8125, "learning_rate": 0.00011613912838077241, "loss": 2.3985, "step": 15250 }, { "epoch": 0.4382055054936793, "grad_norm": 0.22265625, "learning_rate": 0.00011584386441478683, "loss": 1.6076, "step": 15300 }, { "epoch": 0.4396375496292796, "grad_norm": 0.578125, "learning_rate": 0.00011554860044880122, "loss": 2.9266, "step": 15350 }, { "epoch": 0.4410695937648798, "grad_norm": 0.0130615234375, "learning_rate": 0.00011525333648281564, "loss": 1.388, "step": 15400 }, { "epoch": 0.4425016379004801, "grad_norm": 88.5, "learning_rate": 0.00011495807251683006, "loss": 2.6264, "step": 15450 }, { "epoch": 0.44393368203608036, "grad_norm": 0.123046875, "learning_rate": 0.00011466280855084446, "loss": 1.9447, "step": 15500 }, { "epoch": 0.44536572617168063, "grad_norm": 318.0, "learning_rate": 0.00011436754458485888, "loss": 1.4494, "step": 15550 }, { "epoch": 0.44679777030728085, "grad_norm": 0.123046875, "learning_rate": 0.00011407228061887327, "loss": 2.4483, "step": 15600 }, { "epoch": 0.4482298144428811, "grad_norm": 260.0, "learning_rate": 0.00011377701665288769, "loss": 2.8899, "step": 15650 }, { "epoch": 0.4496618585784814, "grad_norm": 0.07666015625, "learning_rate": 0.00011348175268690208, "loss": 2.0935, "step": 15700 }, { "epoch": 0.4510939027140817, "grad_norm": 0.047119140625, "learning_rate": 0.0001131864887209165, "loss": 3.0298, "step": 15750 }, { "epoch": 0.4525259468496819, "grad_norm": 18.25, "learning_rate": 0.00011289122475493092, "loss": 1.8288, "step": 15800 }, { "epoch": 0.45395799098528217, "grad_norm": 11.75, "learning_rate": 0.00011259596078894533, "loss": 2.9182, "step": 15850 }, { "epoch": 0.45539003512088244, "grad_norm": 0.1416015625, "learning_rate": 0.00011230069682295975, "loss": 1.9045, "step": 15900 }, { "epoch": 0.45682207925648266, "grad_norm": 0.1982421875, "learning_rate": 0.00011200543285697414, "loss": 1.7933, "step": 15950 }, { "epoch": 0.45825412339208293, "grad_norm": 201.0, "learning_rate": 0.00011171016889098856, "loss": 2.4752, "step": 16000 }, { "epoch": 0.4596861675276832, "grad_norm": 284.0, "learning_rate": 0.00011141490492500295, "loss": 1.7394, "step": 16050 }, { "epoch": 0.4611182116632835, "grad_norm": 0.1162109375, "learning_rate": 0.00011111964095901737, "loss": 1.3612, "step": 16100 }, { "epoch": 0.4625502557988837, "grad_norm": 0.30859375, "learning_rate": 0.00011082437699303178, "loss": 2.6066, "step": 16150 }, { "epoch": 0.463982299934484, "grad_norm": 23.75, "learning_rate": 0.00011052911302704618, "loss": 2.6868, "step": 16200 }, { "epoch": 0.46541434407008425, "grad_norm": 1.5546875, "learning_rate": 0.0001102338490610606, "loss": 2.6526, "step": 16250 }, { "epoch": 0.46541434407008425, "eval_accuracy": 0.9445, "eval_loss": 0.2859993577003479, "eval_macro_f1": 0.9439942443541156, "eval_runtime": 175.6054, "eval_samples_per_second": 11.389, "eval_steps_per_second": 11.389, "step": 16250 }, { "epoch": 0.4668463882056845, "grad_norm": 0.11181640625, "learning_rate": 0.000109938585095075, "loss": 2.041, "step": 16300 }, { "epoch": 0.46827843234128474, "grad_norm": 8.3125, "learning_rate": 0.00010964332112908942, "loss": 2.3374, "step": 16350 }, { "epoch": 0.469710476476885, "grad_norm": 268.0, "learning_rate": 0.00010934805716310381, "loss": 3.461, "step": 16400 }, { "epoch": 0.4711425206124853, "grad_norm": 270.0, "learning_rate": 0.00010905279319711823, "loss": 1.5168, "step": 16450 }, { "epoch": 0.4725745647480855, "grad_norm": 1.6171875, "learning_rate": 0.00010875752923113265, "loss": 1.9156, "step": 16500 }, { "epoch": 0.4740066088836858, "grad_norm": 3.25, "learning_rate": 0.00010846226526514704, "loss": 1.9988, "step": 16550 }, { "epoch": 0.47543865301928606, "grad_norm": 266.0, "learning_rate": 0.00010816700129916146, "loss": 1.9911, "step": 16600 }, { "epoch": 0.47687069715488634, "grad_norm": 0.45703125, "learning_rate": 0.00010787173733317586, "loss": 2.2805, "step": 16650 }, { "epoch": 0.47830274129048655, "grad_norm": 0.08740234375, "learning_rate": 0.00010757647336719028, "loss": 2.3786, "step": 16700 }, { "epoch": 0.47973478542608683, "grad_norm": 0.0263671875, "learning_rate": 0.00010728120940120467, "loss": 2.0964, "step": 16750 }, { "epoch": 0.4811668295616871, "grad_norm": 288.0, "learning_rate": 0.00010698594543521909, "loss": 2.5816, "step": 16800 }, { "epoch": 0.4825988736972873, "grad_norm": 0.2138671875, "learning_rate": 0.00010669068146923348, "loss": 1.2136, "step": 16850 }, { "epoch": 0.4840309178328876, "grad_norm": 1.1796875, "learning_rate": 0.0001063954175032479, "loss": 2.2321, "step": 16900 }, { "epoch": 0.48546296196848787, "grad_norm": 0.259765625, "learning_rate": 0.00010610015353726232, "loss": 2.7485, "step": 16950 }, { "epoch": 0.48689500610408815, "grad_norm": 75.5, "learning_rate": 0.00010580488957127673, "loss": 3.0909, "step": 17000 }, { "epoch": 0.48832705023968836, "grad_norm": 86.0, "learning_rate": 0.00010550962560529115, "loss": 2.0178, "step": 17050 }, { "epoch": 0.48975909437528864, "grad_norm": 0.98046875, "learning_rate": 0.00010521436163930554, "loss": 2.0193, "step": 17100 }, { "epoch": 0.4911911385108889, "grad_norm": 68.0, "learning_rate": 0.00010491909767331996, "loss": 2.178, "step": 17150 }, { "epoch": 0.4926231826464892, "grad_norm": 0.375, "learning_rate": 0.00010462383370733435, "loss": 2.4366, "step": 17200 }, { "epoch": 0.4940552267820894, "grad_norm": 0.322265625, "learning_rate": 0.00010432856974134877, "loss": 2.7989, "step": 17250 }, { "epoch": 0.4954872709176897, "grad_norm": 0.1513671875, "learning_rate": 0.00010403330577536318, "loss": 1.8681, "step": 17300 }, { "epoch": 0.49691931505328996, "grad_norm": 0.08935546875, "learning_rate": 0.00010373804180937759, "loss": 1.9697, "step": 17350 }, { "epoch": 0.4983513591888902, "grad_norm": 0.1328125, "learning_rate": 0.00010344277784339201, "loss": 1.7415, "step": 17400 }, { "epoch": 0.49978340332449045, "grad_norm": 0.65234375, "learning_rate": 0.0001031475138774064, "loss": 1.6849, "step": 17450 }, { "epoch": 0.5012154474600907, "grad_norm": 86.5, "learning_rate": 0.00010285224991142082, "loss": 2.1474, "step": 17500 }, { "epoch": 0.5012154474600907, "eval_accuracy": 0.947, "eval_loss": 0.2907390892505646, "eval_macro_f1": 0.9463529866080697, "eval_runtime": 172.6878, "eval_samples_per_second": 11.582, "eval_steps_per_second": 11.582, "step": 17500 }, { "epoch": 0.502647491595691, "grad_norm": 472.0, "learning_rate": 0.00010255698594543521, "loss": 1.9173, "step": 17550 }, { "epoch": 0.5040795357312913, "grad_norm": 1.3828125, "learning_rate": 0.00010226172197944963, "loss": 3.1869, "step": 17600 }, { "epoch": 0.5055115798668915, "grad_norm": 145.0, "learning_rate": 0.00010196645801346405, "loss": 2.5482, "step": 17650 }, { "epoch": 0.5069436240024917, "grad_norm": 0.58203125, "learning_rate": 0.00010167119404747844, "loss": 2.8567, "step": 17700 }, { "epoch": 0.508375668138092, "grad_norm": 0.0888671875, "learning_rate": 0.00010137593008149286, "loss": 1.7268, "step": 17750 }, { "epoch": 0.5098077122736923, "grad_norm": 286.0, "learning_rate": 0.00010108066611550726, "loss": 2.2268, "step": 17800 }, { "epoch": 0.5112397564092925, "grad_norm": 169.0, "learning_rate": 0.00010078540214952168, "loss": 1.8245, "step": 17850 }, { "epoch": 0.5126718005448928, "grad_norm": 0.1923828125, "learning_rate": 0.00010049013818353607, "loss": 2.3801, "step": 17900 }, { "epoch": 0.5141038446804931, "grad_norm": 77.0, "learning_rate": 0.00010019487421755049, "loss": 2.3412, "step": 17950 }, { "epoch": 0.5155358888160934, "grad_norm": 328.0, "learning_rate": 9.98996102515649e-05, "loss": 3.1564, "step": 18000 }, { "epoch": 0.5169679329516935, "grad_norm": 0.57421875, "learning_rate": 9.96043462855793e-05, "loss": 2.0409, "step": 18050 }, { "epoch": 0.5183999770872938, "grad_norm": 0.8828125, "learning_rate": 9.930908231959372e-05, "loss": 1.8093, "step": 18100 }, { "epoch": 0.5198320212228941, "grad_norm": 0.263671875, "learning_rate": 9.901381835360814e-05, "loss": 2.1228, "step": 18150 }, { "epoch": 0.5212640653584943, "grad_norm": 13.625, "learning_rate": 9.871855438762255e-05, "loss": 1.6072, "step": 18200 }, { "epoch": 0.5226961094940946, "grad_norm": 176.0, "learning_rate": 9.842329042163695e-05, "loss": 2.1088, "step": 18250 }, { "epoch": 0.5241281536296949, "grad_norm": 0.09228515625, "learning_rate": 9.812802645565136e-05, "loss": 2.2985, "step": 18300 }, { "epoch": 0.5255601977652952, "grad_norm": 5.9375, "learning_rate": 9.783276248966576e-05, "loss": 2.8687, "step": 18350 }, { "epoch": 0.5269922419008953, "grad_norm": 0.1142578125, "learning_rate": 9.753749852368017e-05, "loss": 1.9855, "step": 18400 }, { "epoch": 0.5284242860364956, "grad_norm": 1.03125, "learning_rate": 9.724223455769457e-05, "loss": 2.5827, "step": 18450 }, { "epoch": 0.5298563301720959, "grad_norm": 270.0, "learning_rate": 9.694697059170899e-05, "loss": 1.9905, "step": 18500 }, { "epoch": 0.5312883743076962, "grad_norm": 0.2392578125, "learning_rate": 9.665170662572341e-05, "loss": 1.9516, "step": 18550 }, { "epoch": 0.5327204184432964, "grad_norm": 116.5, "learning_rate": 9.635644265973781e-05, "loss": 1.7887, "step": 18600 }, { "epoch": 0.5341524625788967, "grad_norm": 0.0306396484375, "learning_rate": 9.606117869375222e-05, "loss": 1.8686, "step": 18650 }, { "epoch": 0.535584506714497, "grad_norm": 0.7109375, "learning_rate": 9.576591472776662e-05, "loss": 1.7828, "step": 18700 }, { "epoch": 0.5370165508500973, "grad_norm": 0.92578125, "learning_rate": 9.547065076178103e-05, "loss": 1.8761, "step": 18750 }, { "epoch": 0.5370165508500973, "eval_accuracy": 0.9495, "eval_loss": 0.26102420687675476, "eval_macro_f1": 0.9488861373782008, "eval_runtime": 172.7705, "eval_samples_per_second": 11.576, "eval_steps_per_second": 11.576, "step": 18750 }, { "epoch": 0.5384485949856974, "grad_norm": 0.1416015625, "learning_rate": 9.517538679579543e-05, "loss": 2.3765, "step": 18800 }, { "epoch": 0.5398806391212977, "grad_norm": 258.0, "learning_rate": 9.488012282980985e-05, "loss": 1.5944, "step": 18850 }, { "epoch": 0.541312683256898, "grad_norm": 106.0, "learning_rate": 9.458485886382427e-05, "loss": 2.4606, "step": 18900 }, { "epoch": 0.5427447273924982, "grad_norm": 92.5, "learning_rate": 9.428959489783868e-05, "loss": 2.6863, "step": 18950 }, { "epoch": 0.5441767715280985, "grad_norm": 207.0, "learning_rate": 9.399433093185308e-05, "loss": 1.9462, "step": 19000 }, { "epoch": 0.5456088156636988, "grad_norm": 82.5, "learning_rate": 9.369906696586749e-05, "loss": 2.8243, "step": 19050 }, { "epoch": 0.5470408597992991, "grad_norm": 0.1572265625, "learning_rate": 9.340380299988189e-05, "loss": 1.8158, "step": 19100 }, { "epoch": 0.5484729039348992, "grad_norm": 0.4609375, "learning_rate": 9.31085390338963e-05, "loss": 2.7108, "step": 19150 }, { "epoch": 0.5499049480704995, "grad_norm": 0.62109375, "learning_rate": 9.281327506791072e-05, "loss": 1.9835, "step": 19200 }, { "epoch": 0.5513369922060998, "grad_norm": 0.039794921875, "learning_rate": 9.251801110192512e-05, "loss": 2.0608, "step": 19250 }, { "epoch": 0.5527690363417, "grad_norm": 1.265625, "learning_rate": 9.222274713593954e-05, "loss": 1.9135, "step": 19300 }, { "epoch": 0.5542010804773003, "grad_norm": 0.030517578125, "learning_rate": 9.192748316995395e-05, "loss": 2.5224, "step": 19350 }, { "epoch": 0.5556331246129006, "grad_norm": 332.0, "learning_rate": 9.163221920396835e-05, "loss": 3.2123, "step": 19400 }, { "epoch": 0.5570651687485009, "grad_norm": 0.046142578125, "learning_rate": 9.133695523798276e-05, "loss": 1.9807, "step": 19450 }, { "epoch": 0.558497212884101, "grad_norm": 0.036376953125, "learning_rate": 9.104169127199716e-05, "loss": 2.8211, "step": 19500 }, { "epoch": 0.5599292570197013, "grad_norm": 0.2412109375, "learning_rate": 9.074642730601158e-05, "loss": 2.7913, "step": 19550 }, { "epoch": 0.5613613011553016, "grad_norm": 0.087890625, "learning_rate": 9.045116334002599e-05, "loss": 1.5528, "step": 19600 }, { "epoch": 0.5627933452909019, "grad_norm": 0.21484375, "learning_rate": 9.01558993740404e-05, "loss": 2.3024, "step": 19650 }, { "epoch": 0.5642253894265021, "grad_norm": 0.271484375, "learning_rate": 8.986063540805481e-05, "loss": 1.305, "step": 19700 }, { "epoch": 0.5656574335621024, "grad_norm": 0.373046875, "learning_rate": 8.956537144206921e-05, "loss": 2.1656, "step": 19750 }, { "epoch": 0.5670894776977027, "grad_norm": 84.5, "learning_rate": 8.927010747608362e-05, "loss": 1.6671, "step": 19800 }, { "epoch": 0.568521521833303, "grad_norm": 112.0, "learning_rate": 8.897484351009802e-05, "loss": 2.4715, "step": 19850 }, { "epoch": 0.5699535659689031, "grad_norm": 0.130859375, "learning_rate": 8.867957954411244e-05, "loss": 1.7577, "step": 19900 }, { "epoch": 0.5713856101045034, "grad_norm": 0.1357421875, "learning_rate": 8.838431557812685e-05, "loss": 1.6778, "step": 19950 }, { "epoch": 0.5728176542401037, "grad_norm": 0.07080078125, "learning_rate": 8.808905161214125e-05, "loss": 1.4789, "step": 20000 }, { "epoch": 0.5728176542401037, "eval_accuracy": 0.949, "eval_loss": 0.2858292758464813, "eval_macro_f1": 0.9484543460104051, "eval_runtime": 172.7421, "eval_samples_per_second": 11.578, "eval_steps_per_second": 11.578, "step": 20000 }, { "epoch": 0.5742496983757039, "grad_norm": 0.140625, "learning_rate": 8.779378764615567e-05, "loss": 1.6797, "step": 20050 }, { "epoch": 0.5756817425113042, "grad_norm": 0.115234375, "learning_rate": 8.749852368017008e-05, "loss": 1.5026, "step": 20100 }, { "epoch": 0.5771137866469045, "grad_norm": 0.02880859375, "learning_rate": 8.720325971418448e-05, "loss": 2.1316, "step": 20150 }, { "epoch": 0.5785458307825048, "grad_norm": 0.08544921875, "learning_rate": 8.690799574819889e-05, "loss": 1.7517, "step": 20200 }, { "epoch": 0.5799778749181049, "grad_norm": 82.0, "learning_rate": 8.661273178221331e-05, "loss": 2.4167, "step": 20250 }, { "epoch": 0.5814099190537052, "grad_norm": 0.138671875, "learning_rate": 8.631746781622771e-05, "loss": 1.8565, "step": 20300 }, { "epoch": 0.5828419631893055, "grad_norm": 34.25, "learning_rate": 8.602220385024212e-05, "loss": 1.9747, "step": 20350 }, { "epoch": 0.5842740073249058, "grad_norm": 91.0, "learning_rate": 8.572693988425654e-05, "loss": 2.3284, "step": 20400 }, { "epoch": 0.585706051460506, "grad_norm": 0.80859375, "learning_rate": 8.543167591827094e-05, "loss": 2.1788, "step": 20450 }, { "epoch": 0.5871380955961063, "grad_norm": 446.0, "learning_rate": 8.513641195228535e-05, "loss": 1.6187, "step": 20500 }, { "epoch": 0.5885701397317066, "grad_norm": 43.75, "learning_rate": 8.484114798629975e-05, "loss": 1.6338, "step": 20550 }, { "epoch": 0.5900021838673067, "grad_norm": 0.91015625, "learning_rate": 8.454588402031417e-05, "loss": 2.424, "step": 20600 }, { "epoch": 0.591434228002907, "grad_norm": 0.44921875, "learning_rate": 8.425062005432858e-05, "loss": 2.3043, "step": 20650 }, { "epoch": 0.5928662721385073, "grad_norm": 268.0, "learning_rate": 8.395535608834298e-05, "loss": 2.5707, "step": 20700 }, { "epoch": 0.5942983162741076, "grad_norm": 4.125, "learning_rate": 8.366009212235739e-05, "loss": 1.9577, "step": 20750 }, { "epoch": 0.5957303604097078, "grad_norm": 5.0, "learning_rate": 8.33648281563718e-05, "loss": 0.7482, "step": 20800 }, { "epoch": 0.5971624045453081, "grad_norm": 0.04052734375, "learning_rate": 8.306956419038621e-05, "loss": 1.5055, "step": 20850 }, { "epoch": 0.5985944486809084, "grad_norm": 1.109375, "learning_rate": 8.277430022440061e-05, "loss": 3.3671, "step": 20900 }, { "epoch": 0.6000264928165085, "grad_norm": 0.162109375, "learning_rate": 8.247903625841503e-05, "loss": 2.0574, "step": 20950 }, { "epoch": 0.6014585369521088, "grad_norm": 0.03173828125, "learning_rate": 8.218377229242944e-05, "loss": 2.1942, "step": 21000 }, { "epoch": 0.6028905810877091, "grad_norm": 0.251953125, "learning_rate": 8.188850832644384e-05, "loss": 1.6319, "step": 21050 }, { "epoch": 0.6043226252233094, "grad_norm": 86.5, "learning_rate": 8.159324436045825e-05, "loss": 2.1558, "step": 21100 }, { "epoch": 0.6057546693589096, "grad_norm": 2.984375, "learning_rate": 8.129798039447267e-05, "loss": 2.2353, "step": 21150 }, { "epoch": 0.6071867134945099, "grad_norm": 0.0908203125, "learning_rate": 8.100271642848707e-05, "loss": 1.4975, "step": 21200 }, { "epoch": 0.6086187576301102, "grad_norm": 98.0, "learning_rate": 8.070745246250148e-05, "loss": 2.5975, "step": 21250 }, { "epoch": 0.6086187576301102, "eval_accuracy": 0.948, "eval_loss": 0.2741381525993347, "eval_macro_f1": 0.9473973559594594, "eval_runtime": 172.6111, "eval_samples_per_second": 11.587, "eval_steps_per_second": 11.587, "step": 21250 }, { "epoch": 0.6100508017657105, "grad_norm": 328.0, "learning_rate": 8.04121884965159e-05, "loss": 2.4534, "step": 21300 }, { "epoch": 0.6114828459013106, "grad_norm": 11.125, "learning_rate": 8.01169245305303e-05, "loss": 2.1319, "step": 21350 }, { "epoch": 0.6129148900369109, "grad_norm": 0.08642578125, "learning_rate": 7.982166056454471e-05, "loss": 2.4199, "step": 21400 }, { "epoch": 0.6143469341725112, "grad_norm": 0.18359375, "learning_rate": 7.952639659855911e-05, "loss": 1.7527, "step": 21450 }, { "epoch": 0.6157789783081115, "grad_norm": 0.458984375, "learning_rate": 7.923113263257352e-05, "loss": 2.4992, "step": 21500 }, { "epoch": 0.6172110224437117, "grad_norm": 0.671875, "learning_rate": 7.893586866658794e-05, "loss": 2.5082, "step": 21550 }, { "epoch": 0.618643066579312, "grad_norm": 272.0, "learning_rate": 7.864060470060234e-05, "loss": 2.2187, "step": 21600 }, { "epoch": 0.6200751107149123, "grad_norm": 310.0, "learning_rate": 7.834534073461675e-05, "loss": 3.006, "step": 21650 }, { "epoch": 0.6215071548505124, "grad_norm": 536.0, "learning_rate": 7.805007676863117e-05, "loss": 2.4535, "step": 21700 }, { "epoch": 0.6229391989861127, "grad_norm": 408.0, "learning_rate": 7.775481280264557e-05, "loss": 2.376, "step": 21750 }, { "epoch": 0.624371243121713, "grad_norm": 0.123046875, "learning_rate": 7.745954883665998e-05, "loss": 1.3044, "step": 21800 }, { "epoch": 0.6258032872573133, "grad_norm": 0.70703125, "learning_rate": 7.716428487067438e-05, "loss": 1.9046, "step": 21850 }, { "epoch": 0.6272353313929135, "grad_norm": 0.189453125, "learning_rate": 7.68690209046888e-05, "loss": 1.8825, "step": 21900 }, { "epoch": 0.6286673755285138, "grad_norm": 0.251953125, "learning_rate": 7.65737569387032e-05, "loss": 2.353, "step": 21950 }, { "epoch": 0.6300994196641141, "grad_norm": 0.0301513671875, "learning_rate": 7.627849297271761e-05, "loss": 1.7222, "step": 22000 }, { "epoch": 0.6315314637997143, "grad_norm": 0.126953125, "learning_rate": 7.598322900673203e-05, "loss": 2.4583, "step": 22050 }, { "epoch": 0.6329635079353145, "grad_norm": 1.0234375, "learning_rate": 7.568796504074643e-05, "loss": 1.9643, "step": 22100 }, { "epoch": 0.6343955520709148, "grad_norm": 70.5, "learning_rate": 7.539270107476084e-05, "loss": 1.6712, "step": 22150 }, { "epoch": 0.6358275962065151, "grad_norm": 8.0, "learning_rate": 7.509743710877524e-05, "loss": 2.1964, "step": 22200 }, { "epoch": 0.6372596403421154, "grad_norm": 11.625, "learning_rate": 7.480217314278965e-05, "loss": 2.0319, "step": 22250 }, { "epoch": 0.6386916844777156, "grad_norm": 0.09033203125, "learning_rate": 7.450690917680407e-05, "loss": 3.1062, "step": 22300 }, { "epoch": 0.6401237286133159, "grad_norm": 490.0, "learning_rate": 7.421164521081847e-05, "loss": 2.028, "step": 22350 }, { "epoch": 0.6415557727489162, "grad_norm": 688.0, "learning_rate": 7.391638124483289e-05, "loss": 1.6743, "step": 22400 }, { "epoch": 0.6429878168845163, "grad_norm": 0.2578125, "learning_rate": 7.36211172788473e-05, "loss": 1.3926, "step": 22450 }, { "epoch": 0.6444198610201166, "grad_norm": 278.0, "learning_rate": 7.33258533128617e-05, "loss": 2.073, "step": 22500 }, { "epoch": 0.6444198610201166, "eval_accuracy": 0.9495, "eval_loss": 0.2617259919643402, "eval_macro_f1": 0.9489699460568645, "eval_runtime": 172.6662, "eval_samples_per_second": 11.583, "eval_steps_per_second": 11.583, "step": 22500 }, { "epoch": 0.6458519051557169, "grad_norm": 0.07373046875, "learning_rate": 7.303058934687611e-05, "loss": 2.099, "step": 22550 }, { "epoch": 0.6472839492913172, "grad_norm": 0.5859375, "learning_rate": 7.273532538089051e-05, "loss": 2.2826, "step": 22600 }, { "epoch": 0.6487159934269174, "grad_norm": 79.5, "learning_rate": 7.244006141490493e-05, "loss": 1.377, "step": 22650 }, { "epoch": 0.6501480375625177, "grad_norm": 2.265625, "learning_rate": 7.214479744891934e-05, "loss": 1.9826, "step": 22700 }, { "epoch": 0.651580081698118, "grad_norm": 0.37109375, "learning_rate": 7.184953348293376e-05, "loss": 2.2446, "step": 22750 }, { "epoch": 0.6530121258337181, "grad_norm": 4.25, "learning_rate": 7.155426951694816e-05, "loss": 2.0254, "step": 22800 }, { "epoch": 0.6544441699693184, "grad_norm": 0.03564453125, "learning_rate": 7.125900555096257e-05, "loss": 2.0871, "step": 22850 }, { "epoch": 0.6558762141049187, "grad_norm": 0.390625, "learning_rate": 7.096374158497697e-05, "loss": 2.9276, "step": 22900 }, { "epoch": 0.657308258240519, "grad_norm": 0.384765625, "learning_rate": 7.066847761899138e-05, "loss": 1.0622, "step": 22950 }, { "epoch": 0.6587403023761192, "grad_norm": 4576.0, "learning_rate": 7.037321365300578e-05, "loss": 3.0808, "step": 23000 }, { "epoch": 0.6601723465117195, "grad_norm": 372.0, "learning_rate": 7.00779496870202e-05, "loss": 1.8306, "step": 23050 }, { "epoch": 0.6616043906473198, "grad_norm": 0.208984375, "learning_rate": 6.978268572103462e-05, "loss": 2.1282, "step": 23100 }, { "epoch": 0.66303643478292, "grad_norm": 5.65625, "learning_rate": 6.948742175504902e-05, "loss": 1.8392, "step": 23150 }, { "epoch": 0.6644684789185202, "grad_norm": 0.24609375, "learning_rate": 6.919215778906343e-05, "loss": 2.594, "step": 23200 }, { "epoch": 0.6659005230541205, "grad_norm": 0.123046875, "learning_rate": 6.889689382307783e-05, "loss": 2.4234, "step": 23250 }, { "epoch": 0.6673325671897208, "grad_norm": 268.0, "learning_rate": 6.860162985709224e-05, "loss": 2.3424, "step": 23300 }, { "epoch": 0.6687646113253211, "grad_norm": 0.427734375, "learning_rate": 6.830636589110664e-05, "loss": 2.3216, "step": 23350 }, { "epoch": 0.6701966554609213, "grad_norm": 0.9296875, "learning_rate": 6.801110192512106e-05, "loss": 2.4566, "step": 23400 }, { "epoch": 0.6716286995965216, "grad_norm": 88.5, "learning_rate": 6.771583795913548e-05, "loss": 1.3767, "step": 23450 }, { "epoch": 0.6730607437321218, "grad_norm": 488.0, "learning_rate": 6.742057399314989e-05, "loss": 2.343, "step": 23500 }, { "epoch": 0.674492787867722, "grad_norm": 296.0, "learning_rate": 6.712531002716429e-05, "loss": 1.4841, "step": 23550 }, { "epoch": 0.6759248320033223, "grad_norm": 218.0, "learning_rate": 6.68300460611787e-05, "loss": 2.4037, "step": 23600 }, { "epoch": 0.6773568761389226, "grad_norm": 0.466796875, "learning_rate": 6.65347820951931e-05, "loss": 1.4982, "step": 23650 }, { "epoch": 0.6787889202745229, "grad_norm": 49.25, "learning_rate": 6.623951812920751e-05, "loss": 2.2085, "step": 23700 }, { "epoch": 0.6802209644101231, "grad_norm": 0.30078125, "learning_rate": 6.594425416322191e-05, "loss": 1.7055, "step": 23750 }, { "epoch": 0.6802209644101231, "eval_accuracy": 0.9505, "eval_loss": 0.26270824670791626, "eval_macro_f1": 0.9498080478089564, "eval_runtime": 172.6664, "eval_samples_per_second": 11.583, "eval_steps_per_second": 11.583, "step": 23750 }, { "epoch": 0.6816530085457234, "grad_norm": 95.5, "learning_rate": 6.564899019723633e-05, "loss": 2.5024, "step": 23800 }, { "epoch": 0.6830850526813237, "grad_norm": 0.30078125, "learning_rate": 6.535372623125075e-05, "loss": 2.2518, "step": 23850 }, { "epoch": 0.6845170968169239, "grad_norm": 116.5, "learning_rate": 6.505846226526516e-05, "loss": 2.0539, "step": 23900 }, { "epoch": 0.6859491409525241, "grad_norm": 264.0, "learning_rate": 6.476319829927956e-05, "loss": 2.5857, "step": 23950 }, { "epoch": 0.6873811850881244, "grad_norm": 302.0, "learning_rate": 6.446793433329397e-05, "loss": 2.1408, "step": 24000 }, { "epoch": 0.6888132292237247, "grad_norm": 0.5390625, "learning_rate": 6.417267036730837e-05, "loss": 1.9618, "step": 24050 }, { "epoch": 0.690245273359325, "grad_norm": 164.0, "learning_rate": 6.387740640132278e-05, "loss": 2.0112, "step": 24100 }, { "epoch": 0.6916773174949252, "grad_norm": 14.0625, "learning_rate": 6.35821424353372e-05, "loss": 2.7256, "step": 24150 }, { "epoch": 0.6931093616305255, "grad_norm": 164.0, "learning_rate": 6.328687846935161e-05, "loss": 0.8362, "step": 24200 }, { "epoch": 0.6945414057661257, "grad_norm": 0.271484375, "learning_rate": 6.299161450336602e-05, "loss": 2.2874, "step": 24250 }, { "epoch": 0.6959734499017259, "grad_norm": 110.5, "learning_rate": 6.269635053738042e-05, "loss": 1.5674, "step": 24300 }, { "epoch": 0.6974054940373262, "grad_norm": 0.1630859375, "learning_rate": 6.240108657139483e-05, "loss": 2.5817, "step": 24350 }, { "epoch": 0.6988375381729265, "grad_norm": 0.99609375, "learning_rate": 6.210582260540923e-05, "loss": 1.2537, "step": 24400 }, { "epoch": 0.7002695823085268, "grad_norm": 266.0, "learning_rate": 6.181055863942364e-05, "loss": 2.4499, "step": 24450 }, { "epoch": 0.701701626444127, "grad_norm": 1.59375, "learning_rate": 6.151529467343806e-05, "loss": 2.8047, "step": 24500 }, { "epoch": 0.7031336705797273, "grad_norm": 0.08447265625, "learning_rate": 6.122003070745246e-05, "loss": 1.6917, "step": 24550 }, { "epoch": 0.7045657147153275, "grad_norm": 296.0, "learning_rate": 6.0924766741466875e-05, "loss": 2.2486, "step": 24600 }, { "epoch": 0.7059977588509277, "grad_norm": 183.0, "learning_rate": 6.062950277548128e-05, "loss": 2.5183, "step": 24650 }, { "epoch": 0.707429802986528, "grad_norm": 0.123046875, "learning_rate": 6.033423880949569e-05, "loss": 2.2984, "step": 24700 }, { "epoch": 0.7088618471221283, "grad_norm": 0.37890625, "learning_rate": 6.00389748435101e-05, "loss": 2.4598, "step": 24750 }, { "epoch": 0.7102938912577286, "grad_norm": 6.25, "learning_rate": 5.97437108775245e-05, "loss": 2.0554, "step": 24800 }, { "epoch": 0.7117259353933288, "grad_norm": 1.5234375, "learning_rate": 5.9448446911538915e-05, "loss": 1.3688, "step": 24850 }, { "epoch": 0.7131579795289291, "grad_norm": 83.5, "learning_rate": 5.9153182945553334e-05, "loss": 2.6434, "step": 24900 }, { "epoch": 0.7145900236645294, "grad_norm": 0.8046875, "learning_rate": 5.885791897956774e-05, "loss": 1.1703, "step": 24950 }, { "epoch": 0.7160220678001296, "grad_norm": 0.76953125, "learning_rate": 5.8562655013582144e-05, "loss": 1.7433, "step": 25000 }, { "epoch": 0.7160220678001296, "eval_accuracy": 0.9475, "eval_loss": 0.2805185317993164, "eval_macro_f1": 0.9469725724830536, "eval_runtime": 172.6365, "eval_samples_per_second": 11.585, "eval_steps_per_second": 11.585, "step": 25000 }, { "epoch": 0.7174541119357298, "grad_norm": 268.0, "learning_rate": 5.8267391047596556e-05, "loss": 2.7963, "step": 25050 }, { "epoch": 0.7188861560713301, "grad_norm": 294.0, "learning_rate": 5.797212708161096e-05, "loss": 2.3253, "step": 25100 }, { "epoch": 0.7203182002069304, "grad_norm": 0.11181640625, "learning_rate": 5.7676863115625366e-05, "loss": 1.0165, "step": 25150 }, { "epoch": 0.7217502443425307, "grad_norm": 756.0, "learning_rate": 5.738159914963978e-05, "loss": 1.4844, "step": 25200 }, { "epoch": 0.7231822884781309, "grad_norm": 0.10107421875, "learning_rate": 5.70863351836542e-05, "loss": 2.7171, "step": 25250 }, { "epoch": 0.7246143326137312, "grad_norm": 336.0, "learning_rate": 5.67910712176686e-05, "loss": 3.1605, "step": 25300 }, { "epoch": 0.7260463767493314, "grad_norm": 177.0, "learning_rate": 5.649580725168301e-05, "loss": 1.9816, "step": 25350 }, { "epoch": 0.7274784208849316, "grad_norm": 1.78125, "learning_rate": 5.620054328569741e-05, "loss": 1.8129, "step": 25400 }, { "epoch": 0.7289104650205319, "grad_norm": 0.040771484375, "learning_rate": 5.5905279319711824e-05, "loss": 1.3484, "step": 25450 }, { "epoch": 0.7303425091561322, "grad_norm": 8.375, "learning_rate": 5.561001535372623e-05, "loss": 2.1354, "step": 25500 }, { "epoch": 0.7317745532917325, "grad_norm": 0.0308837890625, "learning_rate": 5.5314751387740635e-05, "loss": 1.747, "step": 25550 }, { "epoch": 0.7332065974273327, "grad_norm": 808.0, "learning_rate": 5.5019487421755053e-05, "loss": 2.6803, "step": 25600 }, { "epoch": 0.734638641562933, "grad_norm": 0.96484375, "learning_rate": 5.4724223455769465e-05, "loss": 2.2422, "step": 25650 }, { "epoch": 0.7360706856985332, "grad_norm": 10.0, "learning_rate": 5.442895948978387e-05, "loss": 2.0731, "step": 25700 }, { "epoch": 0.7375027298341335, "grad_norm": 0.27734375, "learning_rate": 5.4133695523798276e-05, "loss": 2.9622, "step": 25750 }, { "epoch": 0.7389347739697337, "grad_norm": 200.0, "learning_rate": 5.383843155781269e-05, "loss": 2.179, "step": 25800 }, { "epoch": 0.740366818105334, "grad_norm": 118.5, "learning_rate": 5.354316759182709e-05, "loss": 2.4152, "step": 25850 }, { "epoch": 0.7417988622409343, "grad_norm": 0.2470703125, "learning_rate": 5.32479036258415e-05, "loss": 1.4274, "step": 25900 }, { "epoch": 0.7432309063765346, "grad_norm": 6.71875, "learning_rate": 5.295263965985592e-05, "loss": 2.0263, "step": 25950 }, { "epoch": 0.7446629505121348, "grad_norm": 0.28125, "learning_rate": 5.265737569387033e-05, "loss": 1.8231, "step": 26000 }, { "epoch": 0.746094994647735, "grad_norm": 0.8046875, "learning_rate": 5.2362111727884734e-05, "loss": 1.7974, "step": 26050 }, { "epoch": 0.7475270387833353, "grad_norm": 102.5, "learning_rate": 5.206684776189914e-05, "loss": 2.5667, "step": 26100 }, { "epoch": 0.7489590829189355, "grad_norm": 380.0, "learning_rate": 5.1771583795913544e-05, "loss": 1.8334, "step": 26150 }, { "epoch": 0.7503911270545358, "grad_norm": 0.74609375, "learning_rate": 5.1476319829927956e-05, "loss": 1.2929, "step": 26200 }, { "epoch": 0.7518231711901361, "grad_norm": 7.3125, "learning_rate": 5.118105586394236e-05, "loss": 2.2943, "step": 26250 }, { "epoch": 0.7518231711901361, "eval_accuracy": 0.951, "eval_loss": 0.2633407413959503, "eval_macro_f1": 0.9502699810655684, "eval_runtime": 172.7789, "eval_samples_per_second": 11.575, "eval_steps_per_second": 11.575, "step": 26250 } ], "logging_steps": 50, "max_steps": 34916, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.1179332952064e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }