diff --git "a/tmp-checkpoint-5335/trainer_state.json" "b/tmp-checkpoint-5335/trainer_state.json" --- "a/tmp-checkpoint-5335/trainer_state.json" +++ "b/tmp-checkpoint-5335/trainer_state.json" @@ -1,6 +1,6 @@ { - "best_metric": 0.3931359052658081, - "best_model_checkpoint": "MarkKisker/RoBERTa-base-RottenTomatoes_v2\\checkpoint-2134", + "best_metric": 0.6987214088439941, + "best_model_checkpoint": "MarkKisker/RoBERTa-base-RottenTomatoes_v2\\checkpoint-1067", "epoch": 5.0, "eval_steps": 500, "global_step": 5335, @@ -10,3793 +10,3793 @@ "log_history": [ { "epoch": 0.01, - "grad_norm": 3.744861602783203, + "grad_norm": 0.18819734454154968, "learning_rate": 1.0000000000000002e-06, - "loss": 0.7101, + "loss": 0.2519, "step": 10 }, { "epoch": 0.02, - "grad_norm": 4.64638090133667, + "grad_norm": 0.9499709010124207, "learning_rate": 2.0000000000000003e-06, - "loss": 0.7158, + "loss": 0.2018, "step": 20 }, { "epoch": 0.03, - "grad_norm": 1.5597444772720337, + "grad_norm": 20.164424896240234, "learning_rate": 3e-06, - "loss": 0.6905, + "loss": 0.472, "step": 30 }, { "epoch": 0.04, - "grad_norm": 2.208892345428467, + "grad_norm": 1.5840502977371216, "learning_rate": 4.000000000000001e-06, - "loss": 0.6901, + "loss": 0.1679, "step": 40 }, { "epoch": 0.05, - "grad_norm": 4.314438819885254, + "grad_norm": 79.08979034423828, "learning_rate": 5e-06, - "loss": 0.6825, + "loss": 0.3838, "step": 50 }, { "epoch": 0.06, - "grad_norm": 3.1201136112213135, + "grad_norm": 3.6526267528533936, "learning_rate": 6e-06, - "loss": 0.7093, + "loss": 0.3122, "step": 60 }, { "epoch": 0.07, - "grad_norm": 1.3445158004760742, + "grad_norm": 88.9712142944336, "learning_rate": 7.000000000000001e-06, - "loss": 0.691, + "loss": 0.2529, "step": 70 }, { "epoch": 0.07, - "grad_norm": 2.153444766998291, + "grad_norm": 1.6937979459762573, "learning_rate": 8.000000000000001e-06, - "loss": 0.7013, + "loss": 0.1963, "step": 80 }, { "epoch": 0.08, - "grad_norm": 1.514472246170044, + "grad_norm": 0.20718297362327576, "learning_rate": 9e-06, - "loss": 0.6843, + "loss": 0.3032, "step": 90 }, { "epoch": 0.09, - "grad_norm": 7.173806190490723, + "grad_norm": 0.26690974831581116, "learning_rate": 1e-05, - "loss": 0.6641, + "loss": 0.2491, "step": 100 }, { "epoch": 0.1, - "grad_norm": 11.494062423706055, + "grad_norm": 60.143089294433594, "learning_rate": 1.1000000000000001e-05, - "loss": 0.5903, + "loss": 0.2035, "step": 110 }, { "epoch": 0.11, - "grad_norm": 28.968042373657227, + "grad_norm": 81.73893737792969, "learning_rate": 1.2e-05, - "loss": 0.4994, + "loss": 0.2426, "step": 120 }, { "epoch": 0.12, - "grad_norm": 44.565860748291016, + "grad_norm": 0.23848050832748413, "learning_rate": 1.3000000000000001e-05, - "loss": 0.4509, + "loss": 0.1964, "step": 130 }, { "epoch": 0.13, - "grad_norm": 28.26981544494629, + "grad_norm": 150.04852294921875, "learning_rate": 1.4000000000000001e-05, - "loss": 0.4284, + "loss": 0.3048, "step": 140 }, { "epoch": 0.14, - "grad_norm": 8.198760032653809, + "grad_norm": 0.18502970039844513, "learning_rate": 1.5e-05, - "loss": 0.4876, + "loss": 0.0782, "step": 150 }, { "epoch": 0.15, - "grad_norm": 50.528987884521484, + "grad_norm": 4.491607666015625, "learning_rate": 1.6000000000000003e-05, - "loss": 0.4672, + "loss": 0.2463, "step": 160 }, { "epoch": 0.16, - "grad_norm": 16.226484298706055, + "grad_norm": 0.17159606516361237, "learning_rate": 1.7000000000000003e-05, - "loss": 0.6434, + "loss": 0.1454, "step": 170 }, { "epoch": 0.17, - "grad_norm": 14.364855766296387, + "grad_norm": 8.306988716125488, "learning_rate": 1.8e-05, - "loss": 0.5164, + "loss": 0.1481, "step": 180 }, { "epoch": 0.18, - "grad_norm": 7.223922252655029, + "grad_norm": 68.59893798828125, "learning_rate": 1.9e-05, - "loss": 0.4885, + "loss": 0.0853, "step": 190 }, { "epoch": 0.19, - "grad_norm": 8.672370910644531, + "grad_norm": 0.32962626218795776, "learning_rate": 2e-05, - "loss": 0.4994, + "loss": 0.2061, "step": 200 }, { "epoch": 0.2, - "grad_norm": 52.54035949707031, + "grad_norm": 0.44911274313926697, "learning_rate": 2.1e-05, - "loss": 0.462, + "loss": 0.2424, "step": 210 }, { "epoch": 0.21, - "grad_norm": 0.855399489402771, + "grad_norm": 0.1533774435520172, "learning_rate": 2.2000000000000003e-05, - "loss": 0.217, + "loss": 0.0086, "step": 220 }, { "epoch": 0.22, - "grad_norm": 66.37751007080078, + "grad_norm": 0.11695315688848495, "learning_rate": 2.3000000000000003e-05, - "loss": 0.6573, + "loss": 0.1169, "step": 230 }, { "epoch": 0.22, - "grad_norm": 6.453437805175781, + "grad_norm": 0.11531256884336472, "learning_rate": 2.4e-05, - "loss": 0.5052, + "loss": 0.0582, "step": 240 }, { "epoch": 0.23, - "grad_norm": 7.456045627593994, + "grad_norm": 0.08368314802646637, "learning_rate": 2.5e-05, - "loss": 0.3942, + "loss": 0.1063, "step": 250 }, { "epoch": 0.24, - "grad_norm": 16.87609100341797, + "grad_norm": 0.08130346238613129, "learning_rate": 2.6000000000000002e-05, - "loss": 0.4712, + "loss": 0.1673, "step": 260 }, { "epoch": 0.25, - "grad_norm": 22.84710693359375, + "grad_norm": 0.07953976094722748, "learning_rate": 2.7000000000000002e-05, - "loss": 0.4618, + "loss": 0.1073, "step": 270 }, { "epoch": 0.26, - "grad_norm": 1.8799303770065308, + "grad_norm": 0.08237360417842865, "learning_rate": 2.8000000000000003e-05, - "loss": 0.59, + "loss": 0.1379, "step": 280 }, { "epoch": 0.27, - "grad_norm": 80.71158599853516, + "grad_norm": 0.09733965992927551, "learning_rate": 2.9e-05, - "loss": 0.392, + "loss": 0.1136, "step": 290 }, { "epoch": 0.28, - "grad_norm": 2.51127290725708, + "grad_norm": 0.9005267024040222, "learning_rate": 3e-05, - "loss": 0.4768, + "loss": 0.2757, "step": 300 }, { "epoch": 0.29, - "grad_norm": 4.434652805328369, + "grad_norm": 216.63461303710938, "learning_rate": 3.1e-05, - "loss": 0.3732, + "loss": 0.1274, "step": 310 }, { "epoch": 0.3, - "grad_norm": 12.18094253540039, + "grad_norm": 0.08846700191497803, "learning_rate": 3.2000000000000005e-05, - "loss": 0.4689, + "loss": 0.1549, "step": 320 }, { "epoch": 0.31, - "grad_norm": 22.681018829345703, + "grad_norm": 0.10107693076133728, "learning_rate": 3.3e-05, - "loss": 0.4172, + "loss": 0.0722, "step": 330 }, { "epoch": 0.32, - "grad_norm": 12.466619491577148, + "grad_norm": 0.11054098606109619, "learning_rate": 3.4000000000000007e-05, - "loss": 0.4661, + "loss": 0.0776, "step": 340 }, { "epoch": 0.33, - "grad_norm": 7.371677398681641, + "grad_norm": 0.1684148609638214, "learning_rate": 3.5e-05, - "loss": 0.3976, + "loss": 0.1409, "step": 350 }, { "epoch": 0.34, - "grad_norm": 4.3304595947265625, + "grad_norm": 13.547722816467285, "learning_rate": 3.6e-05, - "loss": 0.6109, + "loss": 0.1295, "step": 360 }, { "epoch": 0.35, - "grad_norm": 165.4158172607422, + "grad_norm": 0.09100760519504547, "learning_rate": 3.7e-05, - "loss": 0.3678, + "loss": 0.1349, "step": 370 }, { "epoch": 0.36, - "grad_norm": 4.8646016120910645, + "grad_norm": 0.11295680701732635, "learning_rate": 3.8e-05, - "loss": 0.6374, + "loss": 0.2007, "step": 380 }, { "epoch": 0.37, - "grad_norm": 1.7399684190750122, + "grad_norm": 0.28185102343559265, "learning_rate": 3.9000000000000006e-05, - "loss": 0.4887, + "loss": 0.0049, "step": 390 }, { "epoch": 0.37, - "grad_norm": 16.256572723388672, + "grad_norm": 44.035606384277344, "learning_rate": 4e-05, - "loss": 0.5702, + "loss": 0.1354, "step": 400 }, { "epoch": 0.38, - "grad_norm": 45.110679626464844, + "grad_norm": 88.09428405761719, "learning_rate": 4.1e-05, - "loss": 0.5779, + "loss": 0.2693, "step": 410 }, { "epoch": 0.39, - "grad_norm": 10.500941276550293, + "grad_norm": 0.13857559859752655, "learning_rate": 4.2e-05, - "loss": 0.6049, + "loss": 0.309, "step": 420 }, { "epoch": 0.4, - "grad_norm": 46.1656494140625, + "grad_norm": 0.13726216554641724, "learning_rate": 4.3e-05, - "loss": 0.5598, + "loss": 0.2309, "step": 430 }, { "epoch": 0.41, - "grad_norm": 11.01528549194336, + "grad_norm": 0.35133376717567444, "learning_rate": 4.4000000000000006e-05, - "loss": 0.4441, + "loss": 0.3066, "step": 440 }, { "epoch": 0.42, - "grad_norm": 11.193294525146484, + "grad_norm": 2.9189860820770264, "learning_rate": 4.5e-05, - "loss": 0.5585, + "loss": 0.2049, "step": 450 }, { "epoch": 0.43, - "grad_norm": 18.2698917388916, + "grad_norm": 8.981446266174316, "learning_rate": 4.600000000000001e-05, - "loss": 0.4241, + "loss": 0.1587, "step": 460 }, { "epoch": 0.44, - "grad_norm": 166.50296020507812, + "grad_norm": 1.4933860301971436, "learning_rate": 4.7e-05, - "loss": 0.3354, + "loss": 0.1674, "step": 470 }, { "epoch": 0.45, - "grad_norm": 128.95260620117188, + "grad_norm": 0.10708652436733246, "learning_rate": 4.8e-05, - "loss": 0.5357, + "loss": 0.0669, "step": 480 }, { "epoch": 0.46, - "grad_norm": 5.128959655761719, + "grad_norm": 27.164520263671875, "learning_rate": 4.9e-05, - "loss": 0.4072, + "loss": 0.1222, "step": 490 }, { "epoch": 0.47, - "grad_norm": 3.268726348876953, + "grad_norm": 0.1280289888381958, "learning_rate": 5e-05, - "loss": 0.474, + "loss": 0.2507, "step": 500 }, { "epoch": 0.48, - "grad_norm": 37.61716842651367, + "grad_norm": 0.14575959742069244, "learning_rate": 4.9896587383660806e-05, - "loss": 0.6837, + "loss": 0.2695, "step": 510 }, { "epoch": 0.49, - "grad_norm": 5.7842302322387695, + "grad_norm": 1.213277816772461, "learning_rate": 4.9793174767321616e-05, - "loss": 0.4518, + "loss": 0.1962, "step": 520 }, { "epoch": 0.5, - "grad_norm": 17.824024200439453, + "grad_norm": 110.40786743164062, "learning_rate": 4.968976215098242e-05, - "loss": 0.5325, + "loss": 0.1626, "step": 530 }, { "epoch": 0.51, - "grad_norm": 13.727099418640137, + "grad_norm": 3.0125222206115723, "learning_rate": 4.958634953464323e-05, - "loss": 0.4236, + "loss": 0.3283, "step": 540 }, { "epoch": 0.52, - "grad_norm": 11.013121604919434, + "grad_norm": 2.7797441482543945, "learning_rate": 4.948293691830403e-05, - "loss": 0.3925, + "loss": 0.259, "step": 550 }, { "epoch": 0.52, - "grad_norm": 10.758121490478516, + "grad_norm": 0.21556439995765686, "learning_rate": 4.937952430196484e-05, - "loss": 0.4354, + "loss": 0.1826, "step": 560 }, { "epoch": 0.53, - "grad_norm": 63.49281692504883, + "grad_norm": 4.356193542480469, "learning_rate": 4.9276111685625646e-05, - "loss": 0.456, + "loss": 0.1792, "step": 570 }, { "epoch": 0.54, - "grad_norm": 14.087475776672363, + "grad_norm": 10.7008056640625, "learning_rate": 4.9172699069286456e-05, - "loss": 0.514, + "loss": 0.6648, "step": 580 }, { "epoch": 0.55, - "grad_norm": 2.470353841781616, + "grad_norm": 5.572720050811768, "learning_rate": 4.906928645294726e-05, - "loss": 0.5102, + "loss": 0.5023, "step": 590 }, { "epoch": 0.56, - "grad_norm": 43.60289001464844, + "grad_norm": 41.6730842590332, "learning_rate": 4.896587383660807e-05, - "loss": 0.4577, + "loss": 0.2741, "step": 600 }, { "epoch": 0.57, - "grad_norm": 13.848435401916504, + "grad_norm": 39.446434020996094, "learning_rate": 4.886246122026887e-05, - "loss": 0.3769, + "loss": 0.1501, "step": 610 }, { "epoch": 0.58, - "grad_norm": 85.09272003173828, + "grad_norm": 84.94667053222656, "learning_rate": 4.8759048603929683e-05, - "loss": 0.4745, + "loss": 0.2907, "step": 620 }, { "epoch": 0.59, - "grad_norm": 2.0452070236206055, + "grad_norm": 1.8101928234100342, "learning_rate": 4.865563598759049e-05, - "loss": 0.3618, + "loss": 0.2514, "step": 630 }, { "epoch": 0.6, - "grad_norm": 5.641032695770264, + "grad_norm": 102.29499053955078, "learning_rate": 4.855222337125129e-05, - "loss": 0.5703, + "loss": 0.5467, "step": 640 }, { "epoch": 0.61, - "grad_norm": 19.266847610473633, + "grad_norm": 2.9631879329681396, "learning_rate": 4.84488107549121e-05, - "loss": 0.5826, + "loss": 0.2997, "step": 650 }, { "epoch": 0.62, - "grad_norm": 73.6524658203125, + "grad_norm": 0.1405537873506546, "learning_rate": 4.8345398138572904e-05, - "loss": 0.5455, + "loss": 0.2319, "step": 660 }, { "epoch": 0.63, - "grad_norm": 1.602294683456421, + "grad_norm": 0.13494689762592316, "learning_rate": 4.8241985522233714e-05, - "loss": 0.5497, + "loss": 0.4836, "step": 670 }, { "epoch": 0.64, - "grad_norm": 22.117345809936523, + "grad_norm": 3.714601755142212, "learning_rate": 4.813857290589452e-05, - "loss": 0.4378, + "loss": 0.3312, "step": 680 }, { "epoch": 0.65, - "grad_norm": 110.3412094116211, + "grad_norm": 0.274013876914978, "learning_rate": 4.803516028955533e-05, - "loss": 0.4393, + "loss": 0.1327, "step": 690 }, { "epoch": 0.66, - "grad_norm": 3.794416904449463, + "grad_norm": 0.6073850989341736, "learning_rate": 4.793174767321613e-05, - "loss": 0.5334, + "loss": 0.1932, "step": 700 }, { "epoch": 0.67, - "grad_norm": 21.734609603881836, + "grad_norm": 0.08534292876720428, "learning_rate": 4.782833505687694e-05, - "loss": 0.5833, + "loss": 0.0918, "step": 710 }, { "epoch": 0.67, - "grad_norm": 15.466120719909668, + "grad_norm": 0.1106233149766922, "learning_rate": 4.772492244053775e-05, - "loss": 0.6138, + "loss": 0.0718, "step": 720 }, { "epoch": 0.68, - "grad_norm": 15.851093292236328, + "grad_norm": 0.14169247448444366, "learning_rate": 4.7621509824198554e-05, - "loss": 0.474, + "loss": 0.2155, "step": 730 }, { "epoch": 0.69, - "grad_norm": 11.810195922851562, + "grad_norm": 5.554646015167236, "learning_rate": 4.7518097207859365e-05, - "loss": 0.5048, + "loss": 0.2144, "step": 740 }, { "epoch": 0.7, - "grad_norm": 7.658618450164795, + "grad_norm": 0.28406473994255066, "learning_rate": 4.741468459152017e-05, - "loss": 0.3721, + "loss": 0.2004, "step": 750 }, { "epoch": 0.71, - "grad_norm": 5.516314506530762, + "grad_norm": 0.2573547959327698, "learning_rate": 4.731127197518098e-05, - "loss": 0.4057, + "loss": 0.1411, "step": 760 }, { "epoch": 0.72, - "grad_norm": 29.502197265625, + "grad_norm": 0.1661107838153839, "learning_rate": 4.720785935884178e-05, - "loss": 0.4228, + "loss": 0.1108, "step": 770 }, { "epoch": 0.73, - "grad_norm": 5.364101409912109, + "grad_norm": 0.1617022007703781, "learning_rate": 4.710444674250259e-05, - "loss": 0.6065, + "loss": 0.1855, "step": 780 }, { "epoch": 0.74, - "grad_norm": 3.6527419090270996, + "grad_norm": 0.2379370480775833, "learning_rate": 4.7001034126163395e-05, - "loss": 0.4702, + "loss": 0.145, "step": 790 }, { "epoch": 0.75, - "grad_norm": 234.80238342285156, + "grad_norm": 0.22972096502780914, "learning_rate": 4.6897621509824205e-05, - "loss": 0.4136, + "loss": 0.1885, "step": 800 }, { "epoch": 0.76, - "grad_norm": 3.4541990756988525, + "grad_norm": 0.21119792759418488, "learning_rate": 4.679420889348501e-05, - "loss": 0.4633, + "loss": 0.1153, "step": 810 }, { "epoch": 0.77, - "grad_norm": 10.891709327697754, + "grad_norm": 0.5091124773025513, "learning_rate": 4.669079627714581e-05, - "loss": 0.4773, + "loss": 0.2018, "step": 820 }, { "epoch": 0.78, - "grad_norm": 2.062467575073242, + "grad_norm": 0.09518251568078995, "learning_rate": 4.658738366080662e-05, - "loss": 0.4229, + "loss": 0.0668, "step": 830 }, { "epoch": 0.79, - "grad_norm": 1.5813032388687134, + "grad_norm": 0.08498405665159225, "learning_rate": 4.6483971044467425e-05, - "loss": 0.4163, + "loss": 0.1655, "step": 840 }, { "epoch": 0.8, - "grad_norm": 2.950200080871582, + "grad_norm": 0.11825589835643768, "learning_rate": 4.6380558428128236e-05, - "loss": 0.4312, + "loss": 0.3369, "step": 850 }, { "epoch": 0.81, - "grad_norm": 15.027694702148438, + "grad_norm": 0.19106203317642212, "learning_rate": 4.627714581178904e-05, - "loss": 0.4475, + "loss": 0.3149, "step": 860 }, { "epoch": 0.82, - "grad_norm": 6.54260778427124, + "grad_norm": 0.15981145203113556, "learning_rate": 4.617373319544985e-05, - "loss": 0.4871, + "loss": 0.1569, "step": 870 }, { "epoch": 0.82, - "grad_norm": 40.894622802734375, + "grad_norm": 1.3525559902191162, "learning_rate": 4.607032057911065e-05, - "loss": 0.7644, + "loss": 0.0151, "step": 880 }, { "epoch": 0.83, - "grad_norm": 40.53754425048828, + "grad_norm": 0.14920440316200256, "learning_rate": 4.596690796277146e-05, - "loss": 0.6225, + "loss": 0.3345, "step": 890 }, { "epoch": 0.84, - "grad_norm": 2.118955373764038, + "grad_norm": 0.42381948232650757, "learning_rate": 4.5863495346432266e-05, - "loss": 0.38, + "loss": 0.3653, "step": 900 }, { "epoch": 0.85, - "grad_norm": 4.862216472625732, + "grad_norm": 2.133568048477173, "learning_rate": 4.5760082730093076e-05, - "loss": 0.4339, + "loss": 0.2117, "step": 910 }, { "epoch": 0.86, - "grad_norm": 32.316802978515625, + "grad_norm": 0.3465505540370941, "learning_rate": 4.565667011375388e-05, - "loss": 0.6216, + "loss": 0.1258, "step": 920 }, { "epoch": 0.87, - "grad_norm": 8.157034873962402, + "grad_norm": 6.085052967071533, "learning_rate": 4.555325749741469e-05, - "loss": 0.3952, + "loss": 0.0828, "step": 930 }, { "epoch": 0.88, - "grad_norm": 34.675384521484375, + "grad_norm": 43.05193328857422, "learning_rate": 4.544984488107549e-05, - "loss": 0.4752, + "loss": 0.1615, "step": 940 }, { "epoch": 0.89, - "grad_norm": 1.4986870288848877, + "grad_norm": 0.3297998607158661, "learning_rate": 4.5346432264736296e-05, - "loss": 0.5754, + "loss": 0.2717, "step": 950 }, { "epoch": 0.9, - "grad_norm": 5.362887859344482, + "grad_norm": 0.6388803720474243, "learning_rate": 4.5243019648397106e-05, - "loss": 0.4518, + "loss": 0.3202, "step": 960 }, { "epoch": 0.91, - "grad_norm": 8.203104972839355, + "grad_norm": 0.17811286449432373, "learning_rate": 4.513960703205791e-05, - "loss": 0.4244, + "loss": 0.0243, "step": 970 }, { "epoch": 0.92, - "grad_norm": 146.74620056152344, + "grad_norm": 0.07790106534957886, "learning_rate": 4.503619441571872e-05, - "loss": 0.4484, + "loss": 0.1374, "step": 980 }, { "epoch": 0.93, - "grad_norm": 38.64397048950195, + "grad_norm": 4.508092403411865, "learning_rate": 4.493278179937952e-05, - "loss": 0.5375, + "loss": 0.458, "step": 990 }, { "epoch": 0.94, - "grad_norm": 9.168529510498047, + "grad_norm": 0.2888680100440979, "learning_rate": 4.4829369183040333e-05, - "loss": 0.4305, + "loss": 0.315, "step": 1000 }, { "epoch": 0.95, - "grad_norm": 11.166984558105469, + "grad_norm": 0.36161041259765625, "learning_rate": 4.472595656670114e-05, - "loss": 0.5018, + "loss": 0.0882, "step": 1010 }, { "epoch": 0.96, - "grad_norm": 2.078636646270752, + "grad_norm": 0.2502145767211914, "learning_rate": 4.462254395036195e-05, - "loss": 0.5365, + "loss": 0.063, "step": 1020 }, { "epoch": 0.97, - "grad_norm": 2.5594077110290527, + "grad_norm": 0.17374250292778015, "learning_rate": 4.451913133402275e-05, - "loss": 0.2867, + "loss": 0.1916, "step": 1030 }, { "epoch": 0.97, - "grad_norm": 40.97541046142578, + "grad_norm": 2.548640251159668, "learning_rate": 4.441571871768356e-05, - "loss": 0.5615, + "loss": 0.3626, "step": 1040 }, { "epoch": 0.98, - "grad_norm": 5.333548545837402, + "grad_norm": 0.27965047955513, "learning_rate": 4.4312306101344364e-05, - "loss": 0.371, + "loss": 0.2225, "step": 1050 }, { "epoch": 0.99, - "grad_norm": 29.254602432250977, + "grad_norm": 4.920745372772217, "learning_rate": 4.420889348500517e-05, - "loss": 0.7758, + "loss": 0.5524, "step": 1060 }, { "epoch": 1.0, - "eval_accuracy": 0.8180112570356473, - "eval_f1": 0.831304347826087, - "eval_loss": 0.4565158784389496, - "eval_precision": 0.7734627831715211, - "eval_recall": 0.8984962406015038, - "eval_runtime": 1.5762, - "eval_samples_per_second": 338.164, - "eval_steps_per_second": 42.508, + "eval_accuracy": 0.7936210131332082, + "eval_f1": 0.8166666666666667, + "eval_loss": 0.6987214088439941, + "eval_precision": 0.7335329341317365, + "eval_recall": 0.9210526315789473, + "eval_runtime": 1.4067, + "eval_samples_per_second": 378.902, + "eval_steps_per_second": 47.629, "step": 1067 }, { "epoch": 1.0, - "grad_norm": 4.774388313293457, + "grad_norm": 5.866567611694336, "learning_rate": 4.410548086866598e-05, - "loss": 0.4325, + "loss": 0.3374, "step": 1070 }, { "epoch": 1.01, - "grad_norm": 25.7803897857666, + "grad_norm": 0.8940852284431458, "learning_rate": 4.400206825232678e-05, - "loss": 0.3303, + "loss": 0.3523, "step": 1080 }, { "epoch": 1.02, - "grad_norm": 4.480411529541016, + "grad_norm": 0.5038263201713562, "learning_rate": 4.389865563598759e-05, - "loss": 0.425, + "loss": 0.2949, "step": 1090 }, { "epoch": 1.03, - "grad_norm": 2.4002604484558105, + "grad_norm": 4.7829179763793945, "learning_rate": 4.3795243019648394e-05, - "loss": 0.386, + "loss": 0.246, "step": 1100 }, { "epoch": 1.04, - "grad_norm": 61.24464416503906, + "grad_norm": 2.0902047157287598, "learning_rate": 4.369183040330921e-05, - "loss": 0.277, + "loss": 0.3073, "step": 1110 }, { "epoch": 1.05, - "grad_norm": 25.068614959716797, + "grad_norm": 1.7576603889465332, "learning_rate": 4.3588417786970015e-05, - "loss": 0.354, + "loss": 0.4624, "step": 1120 }, { "epoch": 1.06, - "grad_norm": 1.0058590173721313, + "grad_norm": 1.3686978816986084, "learning_rate": 4.3485005170630825e-05, - "loss": 0.3038, + "loss": 0.2415, "step": 1130 }, { "epoch": 1.07, - "grad_norm": 408.1586608886719, + "grad_norm": 0.6715884804725647, "learning_rate": 4.338159255429163e-05, - "loss": 0.5157, + "loss": 0.2274, "step": 1140 }, { "epoch": 1.08, - "grad_norm": 6.401839256286621, + "grad_norm": 2.798624038696289, "learning_rate": 4.327817993795243e-05, - "loss": 0.4993, + "loss": 0.2798, "step": 1150 }, { "epoch": 1.09, - "grad_norm": 6.1133317947387695, + "grad_norm": 0.3907501697540283, "learning_rate": 4.317476732161324e-05, - "loss": 0.4571, + "loss": 0.1533, "step": 1160 }, { "epoch": 1.1, - "grad_norm": 2.3999035358428955, + "grad_norm": 0.4938846230506897, "learning_rate": 4.3071354705274045e-05, - "loss": 0.4561, + "loss": 0.284, "step": 1170 }, { "epoch": 1.11, - "grad_norm": 12.716032028198242, + "grad_norm": 1.830908179283142, "learning_rate": 4.2967942088934855e-05, - "loss": 0.2751, + "loss": 0.1958, "step": 1180 }, { "epoch": 1.12, - "grad_norm": 36.668941497802734, + "grad_norm": 0.467227965593338, "learning_rate": 4.286452947259566e-05, - "loss": 0.328, + "loss": 0.1826, "step": 1190 }, { "epoch": 1.12, - "grad_norm": 8.840494155883789, + "grad_norm": 1.6289818286895752, "learning_rate": 4.276111685625647e-05, - "loss": 0.4708, + "loss": 0.2573, "step": 1200 }, { "epoch": 1.13, - "grad_norm": 2.9924654960632324, + "grad_norm": 0.42744210362434387, "learning_rate": 4.265770423991727e-05, - "loss": 0.2318, + "loss": 0.0986, "step": 1210 }, { "epoch": 1.14, - "grad_norm": 0.4895036816596985, + "grad_norm": 0.48458239436149597, "learning_rate": 4.255429162357808e-05, - "loss": 0.3037, + "loss": 0.2624, "step": 1220 }, { "epoch": 1.15, - "grad_norm": 44.88699722290039, + "grad_norm": 0.6485515832901001, "learning_rate": 4.2450879007238886e-05, - "loss": 0.3585, + "loss": 0.317, "step": 1230 }, { "epoch": 1.16, - "grad_norm": 5.701717376708984, + "grad_norm": 3.006849527359009, "learning_rate": 4.2347466390899696e-05, - "loss": 0.5978, + "loss": 0.2392, "step": 1240 }, { "epoch": 1.17, - "grad_norm": 1.0451136827468872, + "grad_norm": 1.1750074625015259, "learning_rate": 4.22440537745605e-05, - "loss": 0.8103, + "loss": 0.2915, "step": 1250 }, { "epoch": 1.18, - "grad_norm": 2.085301399230957, + "grad_norm": 0.35948407649993896, "learning_rate": 4.21406411582213e-05, - "loss": 0.3038, + "loss": 0.1648, "step": 1260 }, { "epoch": 1.19, - "grad_norm": 66.31008911132812, + "grad_norm": 0.35395777225494385, "learning_rate": 4.203722854188211e-05, - "loss": 0.5508, + "loss": 0.1433, "step": 1270 }, { "epoch": 1.2, - "grad_norm": 19.349102020263672, + "grad_norm": 0.526642382144928, "learning_rate": 4.1933815925542916e-05, - "loss": 0.2065, + "loss": 0.1706, "step": 1280 }, { "epoch": 1.21, - "grad_norm": 2.037997007369995, + "grad_norm": 2.7297651767730713, "learning_rate": 4.1830403309203726e-05, - "loss": 0.4177, + "loss": 0.2143, "step": 1290 }, { "epoch": 1.22, - "grad_norm": 0.6965357065200806, + "grad_norm": 0.2764374911785126, "learning_rate": 4.172699069286453e-05, - "loss": 0.3867, + "loss": 0.2931, "step": 1300 }, { "epoch": 1.23, - "grad_norm": 0.722122848033905, + "grad_norm": 2.820064067840576, "learning_rate": 4.162357807652534e-05, - "loss": 0.4882, + "loss": 0.3882, "step": 1310 }, { "epoch": 1.24, - "grad_norm": 9.377044677734375, + "grad_norm": 3.801469326019287, "learning_rate": 4.152016546018614e-05, - "loss": 0.4661, + "loss": 0.2653, "step": 1320 }, { "epoch": 1.25, - "grad_norm": 0.40828177332878113, + "grad_norm": 0.3793887495994568, "learning_rate": 4.141675284384695e-05, - "loss": 0.4306, + "loss": 0.2912, "step": 1330 }, { "epoch": 1.26, - "grad_norm": 4.25218391418457, + "grad_norm": 1.7214317321777344, "learning_rate": 4.1313340227507756e-05, - "loss": 0.4858, + "loss": 0.3062, "step": 1340 }, { "epoch": 1.27, - "grad_norm": 3.3388795852661133, + "grad_norm": 0.3773093819618225, "learning_rate": 4.1209927611168567e-05, - "loss": 0.5039, + "loss": 0.1882, "step": 1350 }, { "epoch": 1.27, - "grad_norm": 6.391818523406982, + "grad_norm": 4.08633279800415, "learning_rate": 4.110651499482937e-05, - "loss": 0.4893, + "loss": 0.3492, "step": 1360 }, { "epoch": 1.28, - "grad_norm": 27.01968765258789, + "grad_norm": 0.6732409596443176, "learning_rate": 4.100310237849018e-05, - "loss": 0.4502, + "loss": 0.2416, "step": 1370 }, { "epoch": 1.29, - "grad_norm": 29.210132598876953, + "grad_norm": 2.674741268157959, "learning_rate": 4.0899689762150983e-05, - "loss": 0.3657, + "loss": 0.2195, "step": 1380 }, { "epoch": 1.3, - "grad_norm": 54.26066970825195, + "grad_norm": 2.5985941886901855, "learning_rate": 4.079627714581179e-05, - "loss": 0.2958, + "loss": 0.2772, "step": 1390 }, { "epoch": 1.31, - "grad_norm": 36.26032257080078, + "grad_norm": 2.2892799377441406, "learning_rate": 4.06928645294726e-05, - "loss": 0.2768, + "loss": 0.3021, "step": 1400 }, { "epoch": 1.32, - "grad_norm": 1.0523189306259155, + "grad_norm": 1.6137515306472778, "learning_rate": 4.05894519131334e-05, - "loss": 0.3634, + "loss": 0.4407, "step": 1410 }, { "epoch": 1.33, - "grad_norm": 191.5191192626953, + "grad_norm": 1.6823959350585938, "learning_rate": 4.048603929679421e-05, - "loss": 0.3442, + "loss": 0.1717, "step": 1420 }, { "epoch": 1.34, - "grad_norm": 0.6843079924583435, + "grad_norm": 0.7000110745429993, "learning_rate": 4.0382626680455014e-05, - "loss": 0.4025, + "loss": 0.4317, "step": 1430 }, { "epoch": 1.35, - "grad_norm": 0.2217019647359848, + "grad_norm": 14.843025207519531, "learning_rate": 4.0279214064115824e-05, - "loss": 0.2445, + "loss": 0.5217, "step": 1440 }, { "epoch": 1.36, - "grad_norm": 7.919774055480957, + "grad_norm": 1.8638509511947632, "learning_rate": 4.017580144777663e-05, - "loss": 0.548, + "loss": 0.6968, "step": 1450 }, { "epoch": 1.37, - "grad_norm": 1.322029709815979, + "grad_norm": 4.495730876922607, "learning_rate": 4.007238883143744e-05, - "loss": 0.3518, + "loss": 0.3835, "step": 1460 }, { "epoch": 1.38, - "grad_norm": 0.4105088412761688, + "grad_norm": 80.59029388427734, "learning_rate": 3.996897621509824e-05, - "loss": 0.3859, + "loss": 0.3564, "step": 1470 }, { "epoch": 1.39, - "grad_norm": 0.31217288970947266, + "grad_norm": 0.4503944218158722, "learning_rate": 3.986556359875905e-05, - "loss": 0.3287, + "loss": 0.2868, "step": 1480 }, { "epoch": 1.4, - "grad_norm": 39.67789077758789, + "grad_norm": 1.2642120122909546, "learning_rate": 3.9762150982419854e-05, - "loss": 0.4649, + "loss": 0.272, "step": 1490 }, { "epoch": 1.41, - "grad_norm": 17.59617042541504, + "grad_norm": 5.5861005783081055, "learning_rate": 3.965873836608066e-05, - "loss": 0.4667, + "loss": 0.3955, "step": 1500 }, { "epoch": 1.42, - "grad_norm": 14.470635414123535, + "grad_norm": 64.48252868652344, "learning_rate": 3.9555325749741475e-05, - "loss": 0.4061, + "loss": 0.3061, "step": 1510 }, { "epoch": 1.42, - "grad_norm": 62.437618255615234, + "grad_norm": 8.142120361328125, "learning_rate": 3.945191313340228e-05, - "loss": 0.466, + "loss": 0.2618, "step": 1520 }, { "epoch": 1.43, - "grad_norm": 23.588979721069336, + "grad_norm": 3.7314465045928955, "learning_rate": 3.934850051706309e-05, - "loss": 0.3333, + "loss": 0.2106, "step": 1530 }, { "epoch": 1.44, - "grad_norm": 3.491382598876953, + "grad_norm": 1.7706559896469116, "learning_rate": 3.924508790072389e-05, - "loss": 0.2536, + "loss": 0.4735, "step": 1540 }, { "epoch": 1.45, - "grad_norm": 0.3138933479785919, + "grad_norm": 0.5831993222236633, "learning_rate": 3.91416752843847e-05, - "loss": 0.3297, + "loss": 0.474, "step": 1550 }, { "epoch": 1.46, - "grad_norm": 4.9050211906433105, + "grad_norm": 1.258558750152588, "learning_rate": 3.9038262668045505e-05, - "loss": 0.3643, + "loss": 0.4618, "step": 1560 }, { "epoch": 1.47, - "grad_norm": 13.311896324157715, + "grad_norm": 1.3511230945587158, "learning_rate": 3.8934850051706315e-05, - "loss": 0.2224, + "loss": 0.3205, "step": 1570 }, { "epoch": 1.48, - "grad_norm": 136.0790252685547, + "grad_norm": 1.4303970336914062, "learning_rate": 3.883143743536712e-05, - "loss": 0.5125, + "loss": 0.2386, "step": 1580 }, { "epoch": 1.49, - "grad_norm": 1.9536207914352417, + "grad_norm": 2.5400567054748535, "learning_rate": 3.872802481902792e-05, - "loss": 0.3984, + "loss": 0.3399, "step": 1590 }, { "epoch": 1.5, - "grad_norm": 15.781469345092773, + "grad_norm": 1.0604122877120972, "learning_rate": 3.862461220268873e-05, - "loss": 0.4071, + "loss": 0.2023, "step": 1600 }, { "epoch": 1.51, - "grad_norm": 4.010928153991699, + "grad_norm": 5.520498275756836, "learning_rate": 3.8521199586349535e-05, - "loss": 0.3088, + "loss": 0.2842, "step": 1610 }, { "epoch": 1.52, - "grad_norm": 15.730233192443848, + "grad_norm": 0.4855865240097046, "learning_rate": 3.8417786970010346e-05, - "loss": 0.4905, + "loss": 0.2712, "step": 1620 }, { "epoch": 1.53, - "grad_norm": 18.30491065979004, + "grad_norm": 0.35705462098121643, "learning_rate": 3.831437435367115e-05, - "loss": 0.366, + "loss": 0.2409, "step": 1630 }, { "epoch": 1.54, - "grad_norm": 0.39613863825798035, + "grad_norm": 0.26277977228164673, "learning_rate": 3.821096173733196e-05, - "loss": 0.3205, + "loss": 0.2584, "step": 1640 }, { "epoch": 1.55, - "grad_norm": 51.20091247558594, + "grad_norm": 3.6166861057281494, "learning_rate": 3.810754912099276e-05, - "loss": 0.5628, + "loss": 0.4416, "step": 1650 }, { "epoch": 1.56, - "grad_norm": 17.219799041748047, + "grad_norm": 2.64589524269104, "learning_rate": 3.800413650465357e-05, - "loss": 0.6222, + "loss": 0.3003, "step": 1660 }, { "epoch": 1.57, - "grad_norm": 41.373634338378906, + "grad_norm": 62.66883087158203, "learning_rate": 3.7900723888314376e-05, - "loss": 0.293, + "loss": 0.2302, "step": 1670 }, { "epoch": 1.57, - "grad_norm": 77.48905181884766, + "grad_norm": 1.1689012050628662, "learning_rate": 3.7797311271975186e-05, - "loss": 0.3165, + "loss": 0.5299, "step": 1680 }, { "epoch": 1.58, - "grad_norm": 94.227783203125, + "grad_norm": 0.7352681756019592, "learning_rate": 3.769389865563599e-05, - "loss": 0.455, + "loss": 0.1651, "step": 1690 }, { "epoch": 1.59, - "grad_norm": 21.350830078125, + "grad_norm": 0.66200852394104, "learning_rate": 3.759048603929679e-05, - "loss": 0.4369, + "loss": 0.2608, "step": 1700 }, { "epoch": 1.6, - "grad_norm": 17.99802589416504, + "grad_norm": 13.166357040405273, "learning_rate": 3.74870734229576e-05, - "loss": 0.4513, + "loss": 0.3542, "step": 1710 }, { "epoch": 1.61, - "grad_norm": 2.999746322631836, + "grad_norm": 1.8984485864639282, "learning_rate": 3.7383660806618406e-05, - "loss": 0.3328, + "loss": 0.3457, "step": 1720 }, { "epoch": 1.62, - "grad_norm": 1.245676040649414, + "grad_norm": 1.0245816707611084, "learning_rate": 3.7280248190279217e-05, - "loss": 0.5112, + "loss": 0.3729, "step": 1730 }, { "epoch": 1.63, - "grad_norm": 0.8955879211425781, + "grad_norm": 2.527099370956421, "learning_rate": 3.717683557394002e-05, - "loss": 0.2696, + "loss": 0.2096, "step": 1740 }, { "epoch": 1.64, - "grad_norm": 14.041428565979004, + "grad_norm": 0.7097912430763245, "learning_rate": 3.707342295760083e-05, - "loss": 0.3887, + "loss": 0.241, "step": 1750 }, { "epoch": 1.65, - "grad_norm": 10.70749568939209, + "grad_norm": 0.5669069886207581, "learning_rate": 3.6970010341261633e-05, - "loss": 0.5087, + "loss": 0.2875, "step": 1760 }, { "epoch": 1.66, - "grad_norm": 26.841726303100586, + "grad_norm": 2.8181681632995605, "learning_rate": 3.6866597724922444e-05, - "loss": 0.5083, + "loss": 0.3424, "step": 1770 }, { "epoch": 1.67, - "grad_norm": 9.196104049682617, + "grad_norm": 2.679267406463623, "learning_rate": 3.676318510858325e-05, - "loss": 0.3208, + "loss": 0.2514, "step": 1780 }, { "epoch": 1.68, - "grad_norm": 15.969416618347168, + "grad_norm": 5.605787754058838, "learning_rate": 3.665977249224406e-05, - "loss": 0.3257, + "loss": 0.4296, "step": 1790 }, { "epoch": 1.69, - "grad_norm": 27.856632232666016, + "grad_norm": 1.4018586874008179, "learning_rate": 3.655635987590486e-05, - "loss": 0.3663, + "loss": 0.2786, "step": 1800 }, { "epoch": 1.7, - "grad_norm": 1.3687829971313477, + "grad_norm": 0.8706427216529846, "learning_rate": 3.645294725956567e-05, - "loss": 0.3494, + "loss": 0.2925, "step": 1810 }, { "epoch": 1.71, - "grad_norm": 24.28046989440918, + "grad_norm": 0.6901386976242065, "learning_rate": 3.6349534643226474e-05, - "loss": 0.4308, + "loss": 0.1585, "step": 1820 }, { "epoch": 1.72, - "grad_norm": 0.7270779013633728, + "grad_norm": 2.785902500152588, "learning_rate": 3.624612202688728e-05, - "loss": 0.3686, + "loss": 0.2355, "step": 1830 }, { "epoch": 1.72, - "grad_norm": 0.5241263508796692, + "grad_norm": 2.0460734367370605, "learning_rate": 3.614270941054809e-05, - "loss": 0.3533, + "loss": 0.1592, "step": 1840 }, { "epoch": 1.73, - "grad_norm": 120.1414794921875, + "grad_norm": 1.6502751111984253, "learning_rate": 3.603929679420889e-05, - "loss": 0.1999, + "loss": 0.3338, "step": 1850 }, { "epoch": 1.74, - "grad_norm": 26.968687057495117, + "grad_norm": 1.587488055229187, "learning_rate": 3.59358841778697e-05, - "loss": 0.3365, + "loss": 0.4915, "step": 1860 }, { "epoch": 1.75, - "grad_norm": 31.189796447753906, + "grad_norm": 0.7224433422088623, "learning_rate": 3.5832471561530504e-05, - "loss": 0.5752, + "loss": 0.6087, "step": 1870 }, { "epoch": 1.76, - "grad_norm": 3.3193278312683105, + "grad_norm": 0.8321720957756042, "learning_rate": 3.5729058945191315e-05, - "loss": 0.4337, + "loss": 0.2709, "step": 1880 }, { "epoch": 1.77, - "grad_norm": 17.27393913269043, + "grad_norm": 0.8136805891990662, "learning_rate": 3.562564632885212e-05, - "loss": 0.4251, + "loss": 0.1631, "step": 1890 }, { "epoch": 1.78, - "grad_norm": 44.38993835449219, + "grad_norm": 18.07825469970703, "learning_rate": 3.552223371251293e-05, - "loss": 0.3813, + "loss": 0.2628, "step": 1900 }, { "epoch": 1.79, - "grad_norm": 14.854008674621582, + "grad_norm": 0.4381904602050781, "learning_rate": 3.541882109617373e-05, - "loss": 0.349, + "loss": 0.0763, "step": 1910 }, { "epoch": 1.8, - "grad_norm": 6.35029935836792, + "grad_norm": 0.4933386743068695, "learning_rate": 3.531540847983454e-05, - "loss": 0.3621, + "loss": 0.1871, "step": 1920 }, { "epoch": 1.81, - "grad_norm": 38.28048324584961, + "grad_norm": 0.3448088765144348, "learning_rate": 3.521199586349535e-05, - "loss": 0.3143, + "loss": 0.277, "step": 1930 }, { "epoch": 1.82, - "grad_norm": 3.274022340774536, + "grad_norm": 0.33174896240234375, "learning_rate": 3.5108583247156155e-05, - "loss": 0.3147, + "loss": 0.0749, "step": 1940 }, { "epoch": 1.83, - "grad_norm": 24.270750045776367, + "grad_norm": 0.7721774578094482, "learning_rate": 3.5005170630816965e-05, - "loss": 0.486, + "loss": 0.1818, "step": 1950 }, { "epoch": 1.84, - "grad_norm": 26.76897621154785, + "grad_norm": 0.26144295930862427, "learning_rate": 3.490175801447777e-05, - "loss": 0.3353, + "loss": 0.1525, "step": 1960 }, { "epoch": 1.85, - "grad_norm": 28.255657196044922, + "grad_norm": 0.190896674990654, "learning_rate": 3.479834539813858e-05, - "loss": 0.3074, + "loss": 0.0797, "step": 1970 }, { "epoch": 1.86, - "grad_norm": 14.969733238220215, + "grad_norm": 3.312373638153076, "learning_rate": 3.469493278179938e-05, - "loss": 0.3648, + "loss": 0.2365, "step": 1980 }, { "epoch": 1.87, - "grad_norm": 13.978160858154297, + "grad_norm": 0.1972302347421646, "learning_rate": 3.459152016546019e-05, - "loss": 0.3108, + "loss": 0.1235, "step": 1990 }, { "epoch": 1.87, - "grad_norm": 4.878652095794678, + "grad_norm": 3.559814214706421, "learning_rate": 3.4488107549120996e-05, - "loss": 0.3236, + "loss": 0.2742, "step": 2000 }, { "epoch": 1.88, - "grad_norm": 9.4730806350708, + "grad_norm": 1.8290061950683594, "learning_rate": 3.4384694932781806e-05, - "loss": 0.3036, + "loss": 0.1612, "step": 2010 }, { "epoch": 1.89, - "grad_norm": 71.49247741699219, + "grad_norm": 0.30610841512680054, "learning_rate": 3.428128231644261e-05, - "loss": 0.3228, + "loss": 0.1185, "step": 2020 }, { "epoch": 1.9, - "grad_norm": 11.899968147277832, + "grad_norm": 0.32188501954078674, "learning_rate": 3.417786970010341e-05, - "loss": 0.422, + "loss": 0.1858, "step": 2030 }, { "epoch": 1.91, - "grad_norm": 10.609174728393555, + "grad_norm": 0.20377053320407867, "learning_rate": 3.407445708376422e-05, - "loss": 0.2226, + "loss": 0.1092, "step": 2040 }, { "epoch": 1.92, - "grad_norm": 0.9701952338218689, + "grad_norm": 0.182037815451622, "learning_rate": 3.3971044467425026e-05, - "loss": 0.3561, + "loss": 0.1015, "step": 2050 }, { "epoch": 1.93, - "grad_norm": 16.32933235168457, + "grad_norm": 25.248992919921875, "learning_rate": 3.3867631851085836e-05, - "loss": 0.3124, + "loss": 0.2925, "step": 2060 }, { "epoch": 1.94, - "grad_norm": 1.6818139553070068, + "grad_norm": 0.8001343607902527, "learning_rate": 3.376421923474664e-05, - "loss": 0.274, + "loss": 0.4151, "step": 2070 }, { "epoch": 1.95, - "grad_norm": 2.9835000038146973, + "grad_norm": 1.3534541130065918, "learning_rate": 3.366080661840745e-05, - "loss": 0.3897, + "loss": 0.4384, "step": 2080 }, { "epoch": 1.96, - "grad_norm": 0.44764024019241333, + "grad_norm": 0.3796235918998718, "learning_rate": 3.355739400206825e-05, - "loss": 0.3541, + "loss": 0.1112, "step": 2090 }, { "epoch": 1.97, - "grad_norm": 34.775390625, + "grad_norm": 0.280908465385437, "learning_rate": 3.345398138572906e-05, - "loss": 0.3058, + "loss": 0.1345, "step": 2100 }, { "epoch": 1.98, - "grad_norm": 10.107741355895996, + "grad_norm": 0.2541828155517578, "learning_rate": 3.3350568769389867e-05, - "loss": 0.2805, + "loss": 0.1643, "step": 2110 }, { "epoch": 1.99, - "grad_norm": 9.582823753356934, + "grad_norm": 0.25121456384658813, "learning_rate": 3.324715615305068e-05, - "loss": 0.3837, + "loss": 0.064, "step": 2120 }, { "epoch": 2.0, - "grad_norm": 86.71916198730469, + "grad_norm": 0.19068244099617004, "learning_rate": 3.314374353671148e-05, - "loss": 0.4249, + "loss": 0.0118, "step": 2130 }, { "epoch": 2.0, - "eval_accuracy": 0.8555347091932458, - "eval_f1": 0.8560747663551402, - "eval_loss": 0.3931359052658081, - "eval_precision": 0.8513011152416357, - "eval_recall": 0.8609022556390977, - "eval_runtime": 1.5356, - "eval_samples_per_second": 347.106, - "eval_steps_per_second": 43.632, + "eval_accuracy": 0.851782363977486, + "eval_f1": 0.8500948766603416, + "eval_loss": 0.7103251218795776, + "eval_precision": 0.8582375478927203, + "eval_recall": 0.8421052631578947, + "eval_runtime": 1.4454, + "eval_samples_per_second": 368.759, + "eval_steps_per_second": 46.354, "step": 2134 }, { "epoch": 2.01, - "grad_norm": 2.3712680339813232, + "grad_norm": 0.4352416694164276, "learning_rate": 3.3040330920372283e-05, - "loss": 0.1953, + "loss": 0.1945, "step": 2140 }, { "epoch": 2.01, - "grad_norm": 28.71424102783203, + "grad_norm": 3.1081056594848633, "learning_rate": 3.2936918304033094e-05, - "loss": 0.1712, + "loss": 0.192, "step": 2150 }, { "epoch": 2.02, - "grad_norm": 0.18505160510540009, + "grad_norm": 0.20173217356204987, "learning_rate": 3.28335056876939e-05, - "loss": 0.3299, + "loss": 0.1651, "step": 2160 }, { "epoch": 2.03, - "grad_norm": 74.10963439941406, + "grad_norm": 3.0507891178131104, "learning_rate": 3.273009307135471e-05, - "loss": 0.2948, + "loss": 0.1877, "step": 2170 }, { "epoch": 2.04, - "grad_norm": 1.3842068910598755, + "grad_norm": 0.13503152132034302, "learning_rate": 3.262668045501551e-05, - "loss": 0.3453, + "loss": 0.2316, "step": 2180 }, { "epoch": 2.05, - "grad_norm": 10.536425590515137, + "grad_norm": 0.2812797725200653, "learning_rate": 3.252326783867632e-05, - "loss": 0.23, + "loss": 0.3679, "step": 2190 }, { "epoch": 2.06, - "grad_norm": 28.119539260864258, + "grad_norm": 0.2773813307285309, "learning_rate": 3.2419855222337124e-05, - "loss": 0.2037, + "loss": 0.0743, "step": 2200 }, { "epoch": 2.07, - "grad_norm": 0.34018927812576294, + "grad_norm": 0.19849738478660583, "learning_rate": 3.2316442605997934e-05, - "loss": 0.1796, + "loss": 0.1193, "step": 2210 }, { "epoch": 2.08, - "grad_norm": 2.8183701038360596, + "grad_norm": 5.642420768737793, "learning_rate": 3.221302998965874e-05, - "loss": 0.5003, + "loss": 0.2526, "step": 2220 }, { "epoch": 2.09, - "grad_norm": 0.478957861661911, + "grad_norm": 0.19763706624507904, "learning_rate": 3.210961737331955e-05, - "loss": 0.14, + "loss": 0.1146, "step": 2230 }, { "epoch": 2.1, - "grad_norm": 2.9625442028045654, + "grad_norm": 0.19127729535102844, "learning_rate": 3.200620475698035e-05, - "loss": 0.3229, + "loss": 0.1787, "step": 2240 }, { "epoch": 2.11, - "grad_norm": 0.5814473032951355, + "grad_norm": 0.23325544595718384, "learning_rate": 3.190279214064116e-05, - "loss": 0.1369, + "loss": 0.2123, "step": 2250 }, { "epoch": 2.12, - "grad_norm": 1.5362516641616821, + "grad_norm": 3.2584617137908936, "learning_rate": 3.1799379524301965e-05, - "loss": 0.1266, + "loss": 0.1797, "step": 2260 }, { "epoch": 2.13, - "grad_norm": 30.91729736328125, + "grad_norm": 66.83531951904297, "learning_rate": 3.169596690796277e-05, - "loss": 0.4261, + "loss": 0.315, "step": 2270 }, { "epoch": 2.14, - "grad_norm": 6.7020673751831055, + "grad_norm": 13.469511032104492, "learning_rate": 3.159255429162358e-05, - "loss": 0.3651, + "loss": 0.3296, "step": 2280 }, { "epoch": 2.15, - "grad_norm": 3.024595022201538, + "grad_norm": 1.0547977685928345, "learning_rate": 3.148914167528438e-05, - "loss": 0.2789, + "loss": 0.2146, "step": 2290 }, { "epoch": 2.16, - "grad_norm": 387.9389953613281, + "grad_norm": 32.664710998535156, "learning_rate": 3.138572905894519e-05, - "loss": 0.4301, + "loss": 0.2088, "step": 2300 }, { "epoch": 2.16, - "grad_norm": 0.2992786467075348, + "grad_norm": 1.26264226436615, "learning_rate": 3.1282316442605995e-05, - "loss": 0.276, + "loss": 0.1234, "step": 2310 }, { "epoch": 2.17, - "grad_norm": 4.288240432739258, + "grad_norm": 0.5536983609199524, "learning_rate": 3.117890382626681e-05, - "loss": 0.0722, + "loss": 0.1124, "step": 2320 }, { "epoch": 2.18, - "grad_norm": 0.20439238846302032, + "grad_norm": 34.93430709838867, "learning_rate": 3.1075491209927615e-05, - "loss": 0.1687, + "loss": 0.1095, "step": 2330 }, { "epoch": 2.19, - "grad_norm": 0.17426259815692902, + "grad_norm": 0.1729714721441269, "learning_rate": 3.097207859358842e-05, - "loss": 0.3444, + "loss": 0.2375, "step": 2340 }, { "epoch": 2.2, - "grad_norm": 2.802089214324951, + "grad_norm": 0.18604367971420288, "learning_rate": 3.086866597724923e-05, - "loss": 0.4715, + "loss": 0.0587, "step": 2350 }, { "epoch": 2.21, - "grad_norm": 0.27714866399765015, + "grad_norm": 0.1283494383096695, "learning_rate": 3.076525336091003e-05, - "loss": 0.2581, + "loss": 0.1216, "step": 2360 }, { "epoch": 2.22, - "grad_norm": 9.906682014465332, + "grad_norm": 78.32118225097656, "learning_rate": 3.066184074457084e-05, - "loss": 0.1077, + "loss": 0.2107, "step": 2370 }, { "epoch": 2.23, - "grad_norm": 17.114538192749023, + "grad_norm": 2.105651378631592, "learning_rate": 3.0558428128231646e-05, - "loss": 0.3108, + "loss": 0.3923, "step": 2380 }, { "epoch": 2.24, - "grad_norm": 5.7600178718566895, + "grad_norm": 0.33363184332847595, "learning_rate": 3.0455015511892452e-05, - "loss": 0.3055, + "loss": 0.213, "step": 2390 }, { "epoch": 2.25, - "grad_norm": 0.660796582698822, + "grad_norm": 0.2842453122138977, "learning_rate": 3.035160289555326e-05, - "loss": 0.1998, + "loss": 0.051, "step": 2400 }, { "epoch": 2.26, - "grad_norm": 0.19979248940944672, + "grad_norm": 0.2736751139163971, "learning_rate": 3.0248190279214066e-05, - "loss": 0.3523, + "loss": 0.3842, "step": 2410 }, { "epoch": 2.27, - "grad_norm": 0.6000668406486511, + "grad_norm": 0.2922070622444153, "learning_rate": 3.0144777662874873e-05, - "loss": 0.1009, + "loss": 0.2233, "step": 2420 }, { "epoch": 2.28, - "grad_norm": 0.15889067947864532, + "grad_norm": 0.28500527143478394, "learning_rate": 3.004136504653568e-05, - "loss": 0.0861, + "loss": 0.0617, "step": 2430 }, { "epoch": 2.29, - "grad_norm": 0.32534798979759216, + "grad_norm": 7.861285209655762, "learning_rate": 2.9937952430196486e-05, - "loss": 0.1119, + "loss": 0.2619, "step": 2440 }, { "epoch": 2.3, - "grad_norm": 6.747263431549072, + "grad_norm": 0.8819804787635803, "learning_rate": 2.9834539813857293e-05, - "loss": 0.2019, + "loss": 0.1612, "step": 2450 }, { "epoch": 2.31, - "grad_norm": 0.0872272402048111, + "grad_norm": 0.41944587230682373, "learning_rate": 2.97311271975181e-05, - "loss": 0.3013, + "loss": 0.1717, "step": 2460 }, { "epoch": 2.31, - "grad_norm": 0.418996125459671, + "grad_norm": 0.20882445573806763, "learning_rate": 2.9627714581178906e-05, - "loss": 0.3679, + "loss": 0.3839, "step": 2470 }, { "epoch": 2.32, - "grad_norm": 34.88379669189453, + "grad_norm": 3.0659005641937256, "learning_rate": 2.9524301964839713e-05, - "loss": 0.3748, + "loss": 0.2136, "step": 2480 }, { "epoch": 2.33, - "grad_norm": 0.8229822516441345, + "grad_norm": 0.2248445302248001, "learning_rate": 2.942088934850052e-05, - "loss": 0.3046, + "loss": 0.1435, "step": 2490 }, { "epoch": 2.34, - "grad_norm": 1.254721999168396, + "grad_norm": 0.26371681690216064, "learning_rate": 2.9317476732161327e-05, - "loss": 0.4162, + "loss": 0.1847, "step": 2500 }, { "epoch": 2.35, - "grad_norm": 0.805443286895752, + "grad_norm": 554.4179077148438, "learning_rate": 2.921406411582213e-05, - "loss": 0.2995, + "loss": 0.3933, "step": 2510 }, { "epoch": 2.36, - "grad_norm": 0.21225698292255402, + "grad_norm": 0.27695804834365845, "learning_rate": 2.9110651499482937e-05, - "loss": 0.2081, + "loss": 0.1195, "step": 2520 }, { "epoch": 2.37, - "grad_norm": 1.794771671295166, + "grad_norm": 0.5627778172492981, "learning_rate": 2.9007238883143744e-05, - "loss": 0.2845, + "loss": 0.2075, "step": 2530 }, { "epoch": 2.38, - "grad_norm": 0.45042118430137634, + "grad_norm": 0.26693472266197205, "learning_rate": 2.890382626680455e-05, - "loss": 0.1312, + "loss": 0.1704, "step": 2540 }, { "epoch": 2.39, - "grad_norm": 2.9366912841796875, + "grad_norm": 2.9638285636901855, "learning_rate": 2.8800413650465357e-05, - "loss": 0.2553, + "loss": 0.3962, "step": 2550 }, { "epoch": 2.4, - "grad_norm": 0.3621287941932678, + "grad_norm": 0.25458648800849915, "learning_rate": 2.8697001034126164e-05, - "loss": 0.419, + "loss": 0.1056, "step": 2560 }, { "epoch": 2.41, - "grad_norm": 0.33347466588020325, + "grad_norm": 0.3186055123806, "learning_rate": 2.859358841778697e-05, - "loss": 0.1815, + "loss": 0.2689, "step": 2570 }, { "epoch": 2.42, - "grad_norm": 2.7859878540039062, + "grad_norm": 0.272903710603714, "learning_rate": 2.8490175801447777e-05, - "loss": 0.2801, + "loss": 0.0668, "step": 2580 }, { "epoch": 2.43, - "grad_norm": 0.30300045013427734, + "grad_norm": 2.9451920986175537, "learning_rate": 2.8386763185108584e-05, - "loss": 0.1433, + "loss": 0.0815, "step": 2590 }, { "epoch": 2.44, - "grad_norm": 98.14810943603516, + "grad_norm": 193.8448486328125, "learning_rate": 2.828335056876939e-05, - "loss": 0.2292, + "loss": 0.2416, "step": 2600 }, { "epoch": 2.45, - "grad_norm": 4.406097412109375, + "grad_norm": 0.24242287874221802, "learning_rate": 2.8179937952430198e-05, - "loss": 0.1548, + "loss": 0.2184, "step": 2610 }, { "epoch": 2.46, - "grad_norm": 5.7536234855651855, + "grad_norm": 0.1883140653371811, "learning_rate": 2.8076525336091004e-05, - "loss": 0.1126, + "loss": 0.06, "step": 2620 }, { "epoch": 2.46, - "grad_norm": 36.3065299987793, + "grad_norm": 152.6375274658203, "learning_rate": 2.7973112719751808e-05, - "loss": 0.3183, + "loss": 0.2696, "step": 2630 }, { "epoch": 2.47, - "grad_norm": 34.32221603393555, + "grad_norm": 52.7486686706543, "learning_rate": 2.7869700103412615e-05, - "loss": 0.342, + "loss": 0.2919, "step": 2640 }, { "epoch": 2.48, - "grad_norm": 0.14024418592453003, + "grad_norm": 0.2107856720685959, "learning_rate": 2.776628748707342e-05, - "loss": 0.3198, + "loss": 0.2631, "step": 2650 }, { "epoch": 2.49, - "grad_norm": 0.32867252826690674, + "grad_norm": 0.21232011914253235, "learning_rate": 2.7662874870734228e-05, - "loss": 0.34, + "loss": 0.1414, "step": 2660 }, { "epoch": 2.5, - "grad_norm": 4.173491954803467, + "grad_norm": 0.27856922149658203, "learning_rate": 2.7559462254395035e-05, - "loss": 0.2842, + "loss": 0.0647, "step": 2670 }, { "epoch": 2.51, - "grad_norm": 167.7888641357422, + "grad_norm": 18.96074676513672, "learning_rate": 2.745604963805584e-05, - "loss": 0.2955, + "loss": 0.2706, "step": 2680 }, { "epoch": 2.52, - "grad_norm": 0.7819741368293762, + "grad_norm": 0.1531578004360199, "learning_rate": 2.735263702171665e-05, - "loss": 0.0981, + "loss": 0.0589, "step": 2690 }, { "epoch": 2.53, - "grad_norm": 3.086498498916626, + "grad_norm": 0.13904044032096863, "learning_rate": 2.7249224405377455e-05, - "loss": 0.2673, + "loss": 0.1598, "step": 2700 }, { "epoch": 2.54, - "grad_norm": 0.086996890604496, + "grad_norm": 0.5579696893692017, "learning_rate": 2.7145811789038262e-05, - "loss": 0.2067, + "loss": 0.0233, "step": 2710 }, { "epoch": 2.55, - "grad_norm": 0.09881442785263062, + "grad_norm": 3.028477430343628, "learning_rate": 2.7042399172699072e-05, - "loss": 0.365, + "loss": 0.2927, "step": 2720 }, { "epoch": 2.56, - "grad_norm": 43.90235137939453, + "grad_norm": 0.15513668954372406, "learning_rate": 2.693898655635988e-05, - "loss": 0.1421, + "loss": 0.0464, "step": 2730 }, { "epoch": 2.57, - "grad_norm": 0.06362267583608627, + "grad_norm": 0.11886773258447647, "learning_rate": 2.6835573940020685e-05, - "loss": 0.3048, + "loss": 0.0602, "step": 2740 }, { "epoch": 2.58, - "grad_norm": 0.08752346783876419, + "grad_norm": 0.1136392205953598, "learning_rate": 2.6732161323681492e-05, - "loss": 0.0187, + "loss": 0.007, "step": 2750 }, { "epoch": 2.59, - "grad_norm": 0.5300313234329224, + "grad_norm": 0.10946289449930191, "learning_rate": 2.66287487073423e-05, - "loss": 0.1348, + "loss": 0.2097, "step": 2760 }, { "epoch": 2.6, - "grad_norm": 0.04888344556093216, + "grad_norm": 0.10061541199684143, "learning_rate": 2.6525336091003106e-05, - "loss": 0.1468, + "loss": 0.1469, "step": 2770 }, { "epoch": 2.61, - "grad_norm": 13.05820369720459, + "grad_norm": 6.262234687805176, "learning_rate": 2.6421923474663913e-05, - "loss": 0.4363, + "loss": 0.2607, "step": 2780 }, { "epoch": 2.61, - "grad_norm": 3.226611852645874, + "grad_norm": 40.82796096801758, "learning_rate": 2.631851085832472e-05, - "loss": 0.2585, + "loss": 0.3085, "step": 2790 }, { "epoch": 2.62, - "grad_norm": 127.92499542236328, + "grad_norm": 0.13032160699367523, "learning_rate": 2.6215098241985526e-05, - "loss": 0.2791, + "loss": 0.0238, "step": 2800 }, { "epoch": 2.63, - "grad_norm": 4.131041526794434, + "grad_norm": 5.553137302398682, "learning_rate": 2.6111685625646333e-05, - "loss": 0.3342, + "loss": 0.0747, "step": 2810 }, { "epoch": 2.64, - "grad_norm": 9.45926570892334, + "grad_norm": 3.196852684020996, "learning_rate": 2.600827300930714e-05, - "loss": 0.4095, + "loss": 0.3373, "step": 2820 }, { "epoch": 2.65, - "grad_norm": 0.3252735435962677, + "grad_norm": 0.13691984117031097, "learning_rate": 2.5904860392967943e-05, - "loss": 0.1566, + "loss": 0.0726, "step": 2830 }, { "epoch": 2.66, - "grad_norm": 17.8929386138916, + "grad_norm": 0.11953440308570862, "learning_rate": 2.580144777662875e-05, - "loss": 0.2914, + "loss": 0.0064, "step": 2840 }, { "epoch": 2.67, - "grad_norm": 0.4428044557571411, + "grad_norm": 0.11957331746816635, "learning_rate": 2.5698035160289556e-05, - "loss": 0.1937, + "loss": 0.1053, "step": 2850 }, { "epoch": 2.68, - "grad_norm": 0.196681946516037, + "grad_norm": 54.33175277709961, "learning_rate": 2.5594622543950363e-05, - "loss": 0.1334, + "loss": 0.3292, "step": 2860 }, { "epoch": 2.69, - "grad_norm": 14.465700149536133, + "grad_norm": 4.716727256774902, "learning_rate": 2.549120992761117e-05, - "loss": 0.4305, + "loss": 0.3259, "step": 2870 }, { "epoch": 2.7, - "grad_norm": 0.25671684741973877, + "grad_norm": 3.1103293895721436, "learning_rate": 2.5387797311271977e-05, - "loss": 0.3097, + "loss": 0.2598, "step": 2880 }, { "epoch": 2.71, - "grad_norm": 8.047564506530762, + "grad_norm": 3.2426257133483887, "learning_rate": 2.5284384694932783e-05, - "loss": 0.483, + "loss": 0.2396, "step": 2890 }, { "epoch": 2.72, - "grad_norm": 12.367530822753906, + "grad_norm": 28.899869918823242, "learning_rate": 2.518097207859359e-05, - "loss": 0.1829, + "loss": 0.064, "step": 2900 }, { "epoch": 2.73, - "grad_norm": 0.40190449357032776, + "grad_norm": 0.18505991995334625, "learning_rate": 2.5077559462254397e-05, - "loss": 0.2601, + "loss": 0.1202, "step": 2910 }, { "epoch": 2.74, - "grad_norm": 0.18794836103916168, + "grad_norm": 0.1994311809539795, "learning_rate": 2.4974146845915204e-05, - "loss": 0.1182, + "loss": 0.302, "step": 2920 }, { "epoch": 2.75, - "grad_norm": 4.731940269470215, + "grad_norm": 0.22046104073524475, "learning_rate": 2.487073422957601e-05, - "loss": 0.4801, + "loss": 0.2079, "step": 2930 }, { "epoch": 2.76, - "grad_norm": 72.07913208007812, + "grad_norm": 2.033285617828369, "learning_rate": 2.4767321613236817e-05, - "loss": 0.1826, + "loss": 0.2222, "step": 2940 }, { "epoch": 2.76, - "grad_norm": 0.34144583344459534, + "grad_norm": 0.3028343915939331, "learning_rate": 2.466390899689762e-05, - "loss": 0.2458, + "loss": 0.2839, "step": 2950 }, { "epoch": 2.77, - "grad_norm": 2.847762107849121, + "grad_norm": 0.3970238268375397, "learning_rate": 2.4560496380558427e-05, - "loss": 0.2508, + "loss": 0.111, "step": 2960 }, { "epoch": 2.78, - "grad_norm": 1.0531044006347656, + "grad_norm": 38.20304489135742, "learning_rate": 2.4457083764219234e-05, - "loss": 0.3881, + "loss": 0.1972, "step": 2970 }, { "epoch": 2.79, - "grad_norm": 0.8281335830688477, + "grad_norm": 0.2983644902706146, "learning_rate": 2.435367114788004e-05, - "loss": 0.221, + "loss": 0.1076, "step": 2980 }, { "epoch": 2.8, - "grad_norm": 12.635905265808105, + "grad_norm": 0.24695682525634766, "learning_rate": 2.4250258531540848e-05, - "loss": 0.2008, + "loss": 0.0882, "step": 2990 }, { "epoch": 2.81, - "grad_norm": 0.37540680170059204, + "grad_norm": 0.22463589906692505, "learning_rate": 2.4146845915201654e-05, - "loss": 0.3256, + "loss": 0.2107, "step": 3000 }, { "epoch": 2.82, - "grad_norm": 0.306692510843277, + "grad_norm": 41.61905288696289, "learning_rate": 2.404343329886246e-05, - "loss": 0.2346, + "loss": 0.274, "step": 3010 }, { "epoch": 2.83, - "grad_norm": 0.23924803733825684, + "grad_norm": 0.194807231426239, "learning_rate": 2.394002068252327e-05, - "loss": 0.2269, + "loss": 0.1362, "step": 3020 }, { "epoch": 2.84, - "grad_norm": 43.663082122802734, + "grad_norm": 0.16440044343471527, "learning_rate": 2.3836608066184078e-05, - "loss": 0.2109, + "loss": 0.099, "step": 3030 }, { "epoch": 2.85, - "grad_norm": 3.9084434509277344, + "grad_norm": 0.43695831298828125, "learning_rate": 2.3733195449844885e-05, - "loss": 0.4289, + "loss": 0.148, "step": 3040 }, { "epoch": 2.86, - "grad_norm": 1.4723049402236938, + "grad_norm": 0.16196587681770325, "learning_rate": 2.3629782833505688e-05, - "loss": 0.2405, + "loss": 0.3657, "step": 3050 }, { "epoch": 2.87, - "grad_norm": 3.2489137649536133, + "grad_norm": 0.13062936067581177, "learning_rate": 2.3526370217166495e-05, - "loss": 0.2141, + "loss": 0.1996, "step": 3060 }, { "epoch": 2.88, - "grad_norm": 10.305293083190918, + "grad_norm": 17.27379608154297, "learning_rate": 2.3422957600827302e-05, - "loss": 0.4549, + "loss": 0.2486, "step": 3070 }, { "epoch": 2.89, - "grad_norm": 18.369117736816406, + "grad_norm": 2.8673007488250732, "learning_rate": 2.331954498448811e-05, - "loss": 0.3059, + "loss": 0.1317, "step": 3080 }, { "epoch": 2.9, - "grad_norm": 24.559249877929688, + "grad_norm": 156.50790405273438, "learning_rate": 2.3216132368148915e-05, - "loss": 0.1978, + "loss": 0.1618, "step": 3090 }, { "epoch": 2.91, - "grad_norm": 0.5222854018211365, + "grad_norm": 0.14409740269184113, "learning_rate": 2.3112719751809722e-05, - "loss": 0.1677, + "loss": 0.2678, "step": 3100 }, { "epoch": 2.91, - "grad_norm": 7.490653038024902, + "grad_norm": 55.17076873779297, "learning_rate": 2.300930713547053e-05, - "loss": 0.3634, + "loss": 0.2296, "step": 3110 }, { "epoch": 2.92, - "grad_norm": 0.39714524149894714, + "grad_norm": 0.35058945417404175, "learning_rate": 2.2905894519131335e-05, - "loss": 0.2277, + "loss": 0.1965, "step": 3120 }, { "epoch": 2.93, - "grad_norm": 0.16488385200500488, + "grad_norm": 0.17419961094856262, "learning_rate": 2.2802481902792142e-05, - "loss": 0.1301, + "loss": 0.2563, "step": 3130 }, { "epoch": 2.94, - "grad_norm": 3.3054399490356445, + "grad_norm": 0.13027483224868774, "learning_rate": 2.269906928645295e-05, - "loss": 0.3157, + "loss": 0.3372, "step": 3140 }, { "epoch": 2.95, - "grad_norm": 0.1860102266073227, + "grad_norm": 0.11983130872249603, "learning_rate": 2.2595656670113756e-05, - "loss": 0.1737, + "loss": 0.0509, "step": 3150 }, { "epoch": 2.96, - "grad_norm": 6.283505916595459, + "grad_norm": 2.41009783744812, "learning_rate": 2.2492244053774563e-05, - "loss": 0.1935, + "loss": 0.0769, "step": 3160 }, { "epoch": 2.97, - "grad_norm": 0.3177931010723114, + "grad_norm": 0.11595846712589264, "learning_rate": 2.2388831437435366e-05, - "loss": 0.2413, + "loss": 0.1942, "step": 3170 }, { "epoch": 2.98, - "grad_norm": 9.307920455932617, + "grad_norm": 136.9188690185547, "learning_rate": 2.2285418821096173e-05, - "loss": 0.2362, + "loss": 0.0619, "step": 3180 }, { "epoch": 2.99, - "grad_norm": 20.813262939453125, + "grad_norm": 10.190946578979492, "learning_rate": 2.218200620475698e-05, - "loss": 0.2896, + "loss": 0.196, "step": 3190 }, { "epoch": 3.0, - "grad_norm": 0.17595571279525757, + "grad_norm": 0.11687775701284409, "learning_rate": 2.2078593588417786e-05, - "loss": 0.1038, + "loss": 0.1802, "step": 3200 }, { "epoch": 3.0, - "eval_accuracy": 0.8667917448405253, - "eval_f1": 0.864244741873805, - "eval_loss": 0.5857056975364685, - "eval_precision": 0.8793774319066148, - "eval_recall": 0.849624060150376, - "eval_runtime": 1.5831, - "eval_samples_per_second": 336.679, - "eval_steps_per_second": 42.322, + "eval_accuracy": 0.8574108818011257, + "eval_f1": 0.8608058608058609, + "eval_loss": 0.7037833333015442, + "eval_precision": 0.8392857142857143, + "eval_recall": 0.8834586466165414, + "eval_runtime": 1.4332, + "eval_samples_per_second": 371.895, + "eval_steps_per_second": 46.749, "step": 3201 }, { "epoch": 3.01, - "grad_norm": 0.1377851516008377, + "grad_norm": 0.2782648205757141, "learning_rate": 2.1975180972078593e-05, - "loss": 0.0617, + "loss": 0.0062, "step": 3210 }, { "epoch": 3.02, - "grad_norm": 8.743398666381836, + "grad_norm": 32.99894714355469, "learning_rate": 2.1871768355739403e-05, - "loss": 0.2428, + "loss": 0.1245, "step": 3220 }, { "epoch": 3.03, - "grad_norm": 1.2507226467132568, + "grad_norm": 0.12525002658367157, "learning_rate": 2.176835573940021e-05, - "loss": 0.187, + "loss": 0.19, "step": 3230 }, { "epoch": 3.04, - "grad_norm": 0.17167934775352478, + "grad_norm": 0.10308614373207092, "learning_rate": 2.1664943123061017e-05, - "loss": 0.1137, + "loss": 0.0759, "step": 3240 }, { "epoch": 3.05, - "grad_norm": 5.110489845275879, + "grad_norm": 3.754702568054199, "learning_rate": 2.1561530506721823e-05, - "loss": 0.0637, + "loss": 0.0751, "step": 3250 }, { "epoch": 3.06, - "grad_norm": 0.11725780367851257, + "grad_norm": 0.09151796996593475, "learning_rate": 2.145811789038263e-05, - "loss": 0.1933, + "loss": 0.0074, "step": 3260 }, { "epoch": 3.06, - "grad_norm": 0.11883195489645004, + "grad_norm": 311.2441711425781, "learning_rate": 2.1354705274043433e-05, - "loss": 0.1123, + "loss": 0.1058, "step": 3270 }, { "epoch": 3.07, - "grad_norm": 7.5306315422058105, + "grad_norm": 0.10257299989461899, "learning_rate": 2.125129265770424e-05, - "loss": 0.1364, + "loss": 0.0054, "step": 3280 }, { "epoch": 3.08, - "grad_norm": 0.7140617370605469, + "grad_norm": 0.6756200194358826, "learning_rate": 2.1147880041365047e-05, - "loss": 0.148, + "loss": 0.0657, "step": 3290 }, { "epoch": 3.09, - "grad_norm": 0.9037841558456421, + "grad_norm": 0.131881982088089, "learning_rate": 2.1044467425025854e-05, - "loss": 0.0663, + "loss": 0.0735, "step": 3300 }, { "epoch": 3.1, - "grad_norm": 0.05070972070097923, + "grad_norm": 0.0882406160235405, "learning_rate": 2.094105480868666e-05, - "loss": 0.0595, + "loss": 0.0732, "step": 3310 }, { "epoch": 3.11, - "grad_norm": 0.06952597945928574, + "grad_norm": 0.06303223967552185, "learning_rate": 2.0837642192347467e-05, - "loss": 0.0802, + "loss": 0.0034, "step": 3320 }, { "epoch": 3.12, - "grad_norm": 0.09682538360357285, + "grad_norm": 0.06847697496414185, "learning_rate": 2.0734229576008274e-05, - "loss": 0.0756, + "loss": 0.1137, "step": 3330 }, { "epoch": 3.13, - "grad_norm": 7.944037914276123, + "grad_norm": 0.05840703472495079, "learning_rate": 2.063081695966908e-05, - "loss": 0.1648, + "loss": 0.003, "step": 3340 }, { "epoch": 3.14, - "grad_norm": 0.049520719796419144, + "grad_norm": 4.701480388641357, "learning_rate": 2.0527404343329888e-05, - "loss": 0.2814, + "loss": 0.057, "step": 3350 }, { "epoch": 3.15, - "grad_norm": 0.12401607632637024, + "grad_norm": 0.06599827110767365, "learning_rate": 2.0423991726990694e-05, - "loss": 0.1665, + "loss": 0.1948, "step": 3360 }, { "epoch": 3.16, - "grad_norm": 3.3132777214050293, + "grad_norm": 0.07265453785657883, "learning_rate": 2.03205791106515e-05, - "loss": 0.2163, + "loss": 0.2271, "step": 3370 }, { "epoch": 3.17, - "grad_norm": 0.06092935800552368, + "grad_norm": 0.07454103976488113, "learning_rate": 2.0217166494312308e-05, - "loss": 0.055, + "loss": 0.0787, "step": 3380 }, { "epoch": 3.18, - "grad_norm": 0.09410246461629868, + "grad_norm": 0.06254041194915771, "learning_rate": 2.011375387797311e-05, - "loss": 0.0564, + "loss": 0.0035, "step": 3390 }, { "epoch": 3.19, - "grad_norm": 0.09851285070180893, + "grad_norm": 0.07290501892566681, "learning_rate": 2.0010341261633918e-05, - "loss": 0.0996, + "loss": 0.0705, "step": 3400 }, { "epoch": 3.2, - "grad_norm": 0.15248961746692657, + "grad_norm": 0.07300185412168503, "learning_rate": 1.9906928645294725e-05, - "loss": 0.202, + "loss": 0.2761, "step": 3410 }, { "epoch": 3.21, - "grad_norm": 0.06347668915987015, + "grad_norm": 0.09265441447496414, "learning_rate": 1.980351602895553e-05, - "loss": 0.0841, + "loss": 0.2529, "step": 3420 }, { "epoch": 3.21, - "grad_norm": 193.50746154785156, + "grad_norm": 0.3778844475746155, "learning_rate": 1.970010341261634e-05, - "loss": 0.4587, + "loss": 0.1832, "step": 3430 }, { "epoch": 3.22, - "grad_norm": 0.1516660451889038, + "grad_norm": 2.613478899002075, "learning_rate": 1.959669079627715e-05, - "loss": 0.0655, + "loss": 0.1361, "step": 3440 }, { "epoch": 3.23, - "grad_norm": 0.42711296677589417, + "grad_norm": 0.08724066615104675, "learning_rate": 1.9493278179937955e-05, - "loss": 0.1857, + "loss": 0.2884, "step": 3450 }, { "epoch": 3.24, - "grad_norm": 0.11023885011672974, + "grad_norm": 0.09438508003950119, "learning_rate": 1.9389865563598762e-05, - "loss": 0.0828, + "loss": 0.1839, "step": 3460 }, { "epoch": 3.25, - "grad_norm": 0.061248745769262314, + "grad_norm": 0.09784245491027832, "learning_rate": 1.928645294725957e-05, - "loss": 0.1167, + "loss": 0.173, "step": 3470 }, { "epoch": 3.26, - "grad_norm": 0.06203962489962578, + "grad_norm": 0.08967234194278717, "learning_rate": 1.9183040330920375e-05, - "loss": 0.1089, + "loss": 0.1881, "step": 3480 }, { "epoch": 3.27, - "grad_norm": 19.638856887817383, + "grad_norm": 0.09921535849571228, "learning_rate": 1.907962771458118e-05, - "loss": 0.1296, + "loss": 0.1038, "step": 3490 }, { "epoch": 3.28, - "grad_norm": 0.05891411006450653, + "grad_norm": 0.08819156140089035, "learning_rate": 1.8976215098241985e-05, - "loss": 0.0137, + "loss": 0.0053, "step": 3500 }, { "epoch": 3.29, - "grad_norm": 0.10509899258613586, + "grad_norm": 74.00690460205078, "learning_rate": 1.8872802481902792e-05, - "loss": 0.0542, + "loss": 0.014, "step": 3510 }, { "epoch": 3.3, - "grad_norm": 3.27602481842041, + "grad_norm": 2.492738723754883, "learning_rate": 1.87693898655636e-05, - "loss": 0.1655, + "loss": 0.1475, "step": 3520 }, { "epoch": 3.31, - "grad_norm": 0.08927629142999649, + "grad_norm": 0.08610210567712784, "learning_rate": 1.8665977249224406e-05, - "loss": 0.0737, + "loss": 0.0365, "step": 3530 }, { "epoch": 3.32, - "grad_norm": 0.2776730954647064, + "grad_norm": 0.45213979482650757, "learning_rate": 1.8562564632885213e-05, - "loss": 0.1569, + "loss": 0.3655, "step": 3540 }, { "epoch": 3.33, - "grad_norm": 3.598266124725342, + "grad_norm": 4.162100791931152, "learning_rate": 1.845915201654602e-05, - "loss": 0.3042, + "loss": 0.2041, "step": 3550 }, { "epoch": 3.34, - "grad_norm": 3.0984160900115967, + "grad_norm": 2.3223049640655518, "learning_rate": 1.8355739400206826e-05, - "loss": 0.1836, + "loss": 0.0742, "step": 3560 }, { "epoch": 3.35, - "grad_norm": 0.09482892602682114, + "grad_norm": 0.09031692892313004, "learning_rate": 1.8252326783867633e-05, - "loss": 0.0684, + "loss": 0.0773, "step": 3570 }, { "epoch": 3.36, - "grad_norm": 4.153615474700928, + "grad_norm": 0.09234865009784698, "learning_rate": 1.814891416752844e-05, - "loss": 0.188, + "loss": 0.196, "step": 3580 }, { "epoch": 3.36, - "grad_norm": 241.65493774414062, + "grad_norm": 0.1055426225066185, "learning_rate": 1.8045501551189246e-05, - "loss": 0.1741, + "loss": 0.1085, "step": 3590 }, { "epoch": 3.37, - "grad_norm": 0.13454625010490417, + "grad_norm": 0.07118582725524902, "learning_rate": 1.7942088934850053e-05, - "loss": 0.0606, + "loss": 0.0076, "step": 3600 }, { "epoch": 3.38, - "grad_norm": 0.12659063935279846, + "grad_norm": 1.206527829170227, "learning_rate": 1.7838676318510856e-05, - "loss": 0.0743, + "loss": 0.0053, "step": 3610 }, { "epoch": 3.39, - "grad_norm": 0.12111490219831467, + "grad_norm": 446.3911437988281, "learning_rate": 1.7735263702171663e-05, - "loss": 0.0042, + "loss": 0.0745, "step": 3620 }, { "epoch": 3.4, - "grad_norm": 3.1387064456939697, + "grad_norm": 2.56962513923645, "learning_rate": 1.7631851085832473e-05, - "loss": 0.3181, + "loss": 0.2187, "step": 3630 }, { "epoch": 3.41, - "grad_norm": 66.55416870117188, + "grad_norm": 0.36162999272346497, "learning_rate": 1.752843846949328e-05, - "loss": 0.1883, + "loss": 0.0797, "step": 3640 }, { "epoch": 3.42, - "grad_norm": 9.684922218322754, + "grad_norm": 0.1231626346707344, "learning_rate": 1.7425025853154087e-05, - "loss": 0.129, + "loss": 0.0706, "step": 3650 }, { "epoch": 3.43, - "grad_norm": 0.1191384568810463, + "grad_norm": 0.07382514327764511, "learning_rate": 1.7321613236814894e-05, - "loss": 0.1684, + "loss": 0.2472, "step": 3660 }, { "epoch": 3.44, - "grad_norm": 0.09780813008546829, + "grad_norm": 0.08766525983810425, "learning_rate": 1.72182006204757e-05, - "loss": 0.0052, + "loss": 0.0043, "step": 3670 }, { "epoch": 3.45, - "grad_norm": 3.4690449237823486, + "grad_norm": 0.07739522308111191, "learning_rate": 1.7114788004136507e-05, - "loss": 0.2599, + "loss": 0.0544, "step": 3680 }, { "epoch": 3.46, - "grad_norm": 0.1335831582546234, + "grad_norm": 0.07944560796022415, "learning_rate": 1.7011375387797314e-05, - "loss": 0.2327, + "loss": 0.0655, "step": 3690 }, { "epoch": 3.47, - "grad_norm": 0.1467386782169342, + "grad_norm": 9.697904586791992, "learning_rate": 1.690796277145812e-05, - "loss": 0.1308, + "loss": 0.1178, "step": 3700 }, { "epoch": 3.48, - "grad_norm": 373.8140869140625, + "grad_norm": 2.5675041675567627, "learning_rate": 1.6804550155118924e-05, - "loss": 0.2381, + "loss": 0.1142, "step": 3710 }, { "epoch": 3.49, - "grad_norm": 0.08402853459119797, + "grad_norm": 0.14886552095413208, "learning_rate": 1.670113753877973e-05, - "loss": 0.0955, + "loss": 0.0074, "step": 3720 }, { "epoch": 3.5, - "grad_norm": 171.42713928222656, + "grad_norm": 106.72728729248047, "learning_rate": 1.6597724922440538e-05, - "loss": 0.127, + "loss": 0.355, "step": 3730 }, { "epoch": 3.51, - "grad_norm": 0.11677587032318115, + "grad_norm": 0.06539061665534973, "learning_rate": 1.6494312306101344e-05, - "loss": 0.0415, + "loss": 0.0476, "step": 3740 }, { "epoch": 3.51, - "grad_norm": 0.0933157354593277, + "grad_norm": 70.10369873046875, "learning_rate": 1.639089968976215e-05, - "loss": 0.1434, + "loss": 0.0446, "step": 3750 }, { "epoch": 3.52, - "grad_norm": 0.06422335654497147, + "grad_norm": 0.06210116297006607, "learning_rate": 1.6287487073422958e-05, - "loss": 0.2715, + "loss": 0.0033, "step": 3760 }, { "epoch": 3.53, - "grad_norm": 0.08810001611709595, + "grad_norm": 0.05276573449373245, "learning_rate": 1.6184074457083765e-05, - "loss": 0.077, + "loss": 0.003, "step": 3770 }, { "epoch": 3.54, - "grad_norm": 0.08342567831277847, + "grad_norm": 0.053160134702920914, "learning_rate": 1.608066184074457e-05, - "loss": 0.2652, + "loss": 0.1561, "step": 3780 }, { "epoch": 3.55, - "grad_norm": 0.10808378458023071, + "grad_norm": 0.07925640046596527, "learning_rate": 1.5977249224405378e-05, - "loss": 0.0818, + "loss": 0.0036, "step": 3790 }, { "epoch": 3.56, - "grad_norm": 0.06099862605333328, + "grad_norm": 0.053619351238012314, "learning_rate": 1.5873836608066185e-05, - "loss": 0.195, + "loss": 0.0907, "step": 3800 }, { "epoch": 3.57, - "grad_norm": 17.37858772277832, + "grad_norm": 0.6370459198951721, "learning_rate": 1.577042399172699e-05, - "loss": 0.2626, + "loss": 0.0026, "step": 3810 }, { "epoch": 3.58, - "grad_norm": 0.15915565192699432, + "grad_norm": 0.062730573117733, "learning_rate": 1.56670113753878e-05, - "loss": 0.0859, + "loss": 0.191, "step": 3820 }, { "epoch": 3.59, - "grad_norm": 13.893326759338379, + "grad_norm": 0.043475110083818436, "learning_rate": 1.5563598759048605e-05, - "loss": 0.1901, + "loss": 0.1617, "step": 3830 }, { "epoch": 3.6, - "grad_norm": 0.19214822351932526, + "grad_norm": 240.55931091308594, "learning_rate": 1.5460186142709412e-05, - "loss": 0.1584, + "loss": 0.0346, "step": 3840 }, { "epoch": 3.61, - "grad_norm": 0.07823023200035095, + "grad_norm": 0.06091690808534622, "learning_rate": 1.535677352637022e-05, - "loss": 0.1354, + "loss": 0.2554, "step": 3850 }, { "epoch": 3.62, - "grad_norm": 0.2098669558763504, + "grad_norm": 0.8543986678123474, "learning_rate": 1.5253360910031025e-05, - "loss": 0.2055, + "loss": 0.153, "step": 3860 }, { "epoch": 3.63, - "grad_norm": 3.7674732208251953, + "grad_norm": 11.088409423828125, "learning_rate": 1.5149948293691832e-05, - "loss": 0.1174, + "loss": 0.0748, "step": 3870 }, { "epoch": 3.64, - "grad_norm": 0.2199217975139618, + "grad_norm": 0.06965257227420807, "learning_rate": 1.5046535677352639e-05, - "loss": 0.2572, + "loss": 0.2284, "step": 3880 }, { "epoch": 3.65, - "grad_norm": 13.79918098449707, + "grad_norm": 116.66575622558594, "learning_rate": 1.4943123061013446e-05, - "loss": 0.1485, + "loss": 0.2113, "step": 3890 }, { "epoch": 3.66, - "grad_norm": 57.75919723510742, + "grad_norm": 1.2777670621871948, "learning_rate": 1.483971044467425e-05, - "loss": 0.2868, + "loss": 0.2787, "step": 3900 }, { "epoch": 3.66, - "grad_norm": 0.26549792289733887, + "grad_norm": 0.09148576855659485, "learning_rate": 1.4736297828335057e-05, - "loss": 0.1935, + "loss": 0.0842, "step": 3910 }, { "epoch": 3.67, - "grad_norm": 0.11525550484657288, + "grad_norm": 0.07752866297960281, "learning_rate": 1.4632885211995864e-05, - "loss": 0.1079, + "loss": 0.004, "step": 3920 }, { "epoch": 3.68, - "grad_norm": 0.11162678152322769, + "grad_norm": 3.558208703994751, "learning_rate": 1.4529472595656671e-05, - "loss": 0.2091, + "loss": 0.1208, "step": 3930 }, { "epoch": 3.69, - "grad_norm": 0.18426716327667236, + "grad_norm": 0.09885440021753311, "learning_rate": 1.4426059979317478e-05, - "loss": 0.188, + "loss": 0.1466, "step": 3940 }, { "epoch": 3.7, - "grad_norm": 0.4274304509162903, + "grad_norm": 3.113973617553711, "learning_rate": 1.4322647362978284e-05, - "loss": 0.272, + "loss": 0.2648, "step": 3950 }, { "epoch": 3.71, - "grad_norm": 0.33479902148246765, + "grad_norm": 1.2651716470718384, "learning_rate": 1.421923474663909e-05, - "loss": 0.0622, + "loss": 0.1344, "step": 3960 }, { "epoch": 3.72, - "grad_norm": 0.14429700374603271, + "grad_norm": 0.12687820196151733, "learning_rate": 1.4115822130299896e-05, - "loss": 0.3126, + "loss": 0.1268, "step": 3970 }, { "epoch": 3.73, - "grad_norm": 17.044048309326172, + "grad_norm": 3.607964038848877, "learning_rate": 1.4012409513960703e-05, - "loss": 0.1594, + "loss": 0.068, "step": 3980 }, { "epoch": 3.74, - "grad_norm": 0.23532900214195251, + "grad_norm": 0.4581145644187927, "learning_rate": 1.390899689762151e-05, - "loss": 0.1205, + "loss": 0.0703, "step": 3990 }, { "epoch": 3.75, - "grad_norm": 0.15815971791744232, + "grad_norm": 0.23489171266555786, "learning_rate": 1.3805584281282317e-05, - "loss": 0.1607, + "loss": 0.0055, "step": 4000 }, { "epoch": 3.76, - "grad_norm": 0.1729772984981537, + "grad_norm": 0.10075139254331589, "learning_rate": 1.3702171664943123e-05, - "loss": 0.0638, + "loss": 0.0048, "step": 4010 }, { "epoch": 3.77, - "grad_norm": 10.028935432434082, + "grad_norm": 0.10037043690681458, "learning_rate": 1.3598759048603928e-05, - "loss": 0.1221, + "loss": 0.0683, "step": 4020 }, { "epoch": 3.78, - "grad_norm": 0.1198321133852005, + "grad_norm": 0.08124475181102753, "learning_rate": 1.3495346432264739e-05, - "loss": 0.2659, + "loss": 0.004, "step": 4030 }, { "epoch": 3.79, - "grad_norm": 0.061706289649009705, + "grad_norm": 0.07579688727855682, "learning_rate": 1.3391933815925545e-05, - "loss": 0.0927, + "loss": 0.013, "step": 4040 }, { "epoch": 3.8, - "grad_norm": 18.801136016845703, + "grad_norm": 0.07704436779022217, "learning_rate": 1.3288521199586352e-05, - "loss": 0.3013, + "loss": 0.2172, "step": 4050 }, { "epoch": 3.81, - "grad_norm": 3.1690914630889893, + "grad_norm": 39.54595947265625, "learning_rate": 1.3185108583247157e-05, - "loss": 0.1682, + "loss": 0.1718, "step": 4060 }, { "epoch": 3.81, - "grad_norm": 1.2492997646331787, + "grad_norm": 0.06458831578493118, "learning_rate": 1.3081695966907964e-05, - "loss": 0.1623, + "loss": 0.1191, "step": 4070 }, { "epoch": 3.82, - "grad_norm": 0.12981970608234406, + "grad_norm": 0.06719919294118881, "learning_rate": 1.297828335056877e-05, - "loss": 0.091, + "loss": 0.2055, "step": 4080 }, { "epoch": 3.83, - "grad_norm": 1.0257341861724854, + "grad_norm": 462.3251953125, "learning_rate": 1.2874870734229577e-05, - "loss": 0.3142, + "loss": 0.2145, "step": 4090 }, { "epoch": 3.84, - "grad_norm": 0.25799766182899475, + "grad_norm": 6.475925922393799, "learning_rate": 1.2771458117890384e-05, - "loss": 0.1454, + "loss": 0.1391, "step": 4100 }, { "epoch": 3.85, - "grad_norm": 40.982479095458984, + "grad_norm": 2.5063371658325195, "learning_rate": 1.2668045501551191e-05, - "loss": 0.1361, + "loss": 0.1575, "step": 4110 }, { "epoch": 3.86, - "grad_norm": 0.11332831531763077, + "grad_norm": 0.09441405534744263, "learning_rate": 1.2564632885211996e-05, - "loss": 0.0518, + "loss": 0.0038, "step": 4120 }, { "epoch": 3.87, - "grad_norm": 0.04680272936820984, + "grad_norm": 0.06849459558725357, "learning_rate": 1.2461220268872803e-05, - "loss": 0.1327, + "loss": 0.0034, "step": 4130 }, { "epoch": 3.88, - "grad_norm": 0.40334421396255493, + "grad_norm": 0.056417569518089294, "learning_rate": 1.235780765253361e-05, - "loss": 0.2707, + "loss": 0.0033, "step": 4140 }, { "epoch": 3.89, - "grad_norm": 0.09748423099517822, + "grad_norm": 0.06035558879375458, "learning_rate": 1.2254395036194416e-05, - "loss": 0.081, + "loss": 0.0027, "step": 4150 }, { "epoch": 3.9, - "grad_norm": 0.09192584455013275, + "grad_norm": 2.3210813999176025, "learning_rate": 1.2150982419855223e-05, - "loss": 0.2279, + "loss": 0.0739, "step": 4160 }, { "epoch": 3.91, - "grad_norm": 0.05917090177536011, + "grad_norm": 0.052967045456171036, "learning_rate": 1.204756980351603e-05, - "loss": 0.0764, + "loss": 0.0245, "step": 4170 }, { "epoch": 3.92, - "grad_norm": 10.152327537536621, + "grad_norm": 0.05248899757862091, "learning_rate": 1.1944157187176836e-05, - "loss": 0.1237, + "loss": 0.0029, "step": 4180 }, { "epoch": 3.93, - "grad_norm": 0.07578109949827194, + "grad_norm": 0.05719129368662834, "learning_rate": 1.1840744570837643e-05, - "loss": 0.136, + "loss": 0.0805, "step": 4190 }, { "epoch": 3.94, - "grad_norm": 0.13006837666034698, + "grad_norm": 2.66552734375, "learning_rate": 1.173733195449845e-05, - "loss": 0.0878, + "loss": 0.1218, "step": 4200 }, { "epoch": 3.95, - "grad_norm": 0.2893955111503601, + "grad_norm": 24.393230438232422, "learning_rate": 1.1633919338159257e-05, - "loss": 0.1322, + "loss": 0.1188, "step": 4210 }, { "epoch": 3.96, - "grad_norm": 0.04363380745053291, + "grad_norm": 0.0423579104244709, "learning_rate": 1.1530506721820064e-05, - "loss": 0.0566, + "loss": 0.0032, "step": 4220 }, { "epoch": 3.96, - "grad_norm": 17.78626823425293, + "grad_norm": 17.768678665161133, "learning_rate": 1.1427094105480869e-05, - "loss": 0.1348, + "loss": 0.0915, "step": 4230 }, { "epoch": 3.97, - "grad_norm": 0.08856278657913208, + "grad_norm": 0.04556003212928772, "learning_rate": 1.1323681489141675e-05, - "loss": 0.0527, + "loss": 0.0023, "step": 4240 }, { "epoch": 3.98, - "grad_norm": 3.122804880142212, + "grad_norm": 2.7925045490264893, "learning_rate": 1.1220268872802482e-05, - "loss": 0.1003, + "loss": 0.0756, "step": 4250 }, { "epoch": 3.99, - "grad_norm": 0.032136429101228714, + "grad_norm": 0.05207202211022377, "learning_rate": 1.1116856256463289e-05, - "loss": 0.1428, + "loss": 0.0026, "step": 4260 }, { "epoch": 4.0, - "eval_accuracy": 0.8799249530956847, - "eval_f1": 0.8801498127340824, - "eval_loss": 0.6459251046180725, - "eval_precision": 0.8768656716417911, - "eval_recall": 0.8834586466165414, - "eval_runtime": 1.6082, - "eval_samples_per_second": 331.423, - "eval_steps_per_second": 41.661, + "eval_accuracy": 0.8555347091932458, + "eval_f1": 0.8566108007448789, + "eval_loss": 0.7934179306030273, + "eval_precision": 0.8487084870848709, + "eval_recall": 0.8646616541353384, + "eval_runtime": 1.4626, + "eval_samples_per_second": 364.425, + "eval_steps_per_second": 45.809, "step": 4268 }, { "epoch": 4.0, - "grad_norm": 12.53495979309082, + "grad_norm": 5.6651482582092285, "learning_rate": 1.1013443640124096e-05, - "loss": 0.2006, + "loss": 0.2031, "step": 4270 }, { "epoch": 4.01, - "grad_norm": 0.36368077993392944, + "grad_norm": 0.05581251159310341, "learning_rate": 1.0910031023784902e-05, - "loss": 0.0682, + "loss": 0.0802, "step": 4280 }, { "epoch": 4.02, - "grad_norm": 0.036783114075660706, + "grad_norm": 0.06616955250501633, "learning_rate": 1.0806618407445709e-05, - "loss": 0.0578, + "loss": 0.1276, "step": 4290 }, { "epoch": 4.03, - "grad_norm": 0.032453957945108414, + "grad_norm": 0.0667022317647934, "learning_rate": 1.0703205791106516e-05, - "loss": 0.0511, + "loss": 0.0744, "step": 4300 }, { "epoch": 4.04, - "grad_norm": 0.0314195491373539, + "grad_norm": 0.0499153807759285, "learning_rate": 1.0599793174767323e-05, - "loss": 0.0016, + "loss": 0.0026, "step": 4310 }, { "epoch": 4.05, - "grad_norm": 0.34390100836753845, + "grad_norm": 2.668574810028076, "learning_rate": 1.049638055842813e-05, - "loss": 0.063, + "loss": 0.0706, "step": 4320 }, { "epoch": 4.06, - "grad_norm": 0.08080201596021652, + "grad_norm": 0.07278812676668167, "learning_rate": 1.0392967942088936e-05, - "loss": 0.0625, + "loss": 0.0751, "step": 4330 }, { "epoch": 4.07, - "grad_norm": 0.07401977479457855, + "grad_norm": 0.062381431460380554, "learning_rate": 1.0289555325749741e-05, - "loss": 0.1792, + "loss": 0.0459, "step": 4340 }, { "epoch": 4.08, - "grad_norm": 0.02694222889840603, + "grad_norm": 0.07199248671531677, "learning_rate": 1.0186142709410548e-05, - "loss": 0.064, + "loss": 0.0763, "step": 4350 }, { "epoch": 4.09, - "grad_norm": 5.257876396179199, + "grad_norm": 0.04358450695872307, "learning_rate": 1.0082730093071355e-05, - "loss": 0.094, + "loss": 0.0025, "step": 4360 }, { "epoch": 4.1, - "grad_norm": 0.13338564336299896, + "grad_norm": 1.5874137878417969, "learning_rate": 9.979317476732161e-06, - "loss": 0.0827, + "loss": 0.0025, "step": 4370 }, { "epoch": 4.1, - "grad_norm": 0.04957311972975731, + "grad_norm": 0.05725986883044243, "learning_rate": 9.875904860392968e-06, - "loss": 0.224, + "loss": 0.0853, "step": 4380 }, { "epoch": 4.11, - "grad_norm": 0.3861539363861084, + "grad_norm": 2.704402446746826, "learning_rate": 9.772492244053775e-06, - "loss": 0.0612, + "loss": 0.1137, "step": 4390 }, { "epoch": 4.12, - "grad_norm": 0.1312892585992813, + "grad_norm": 0.05556065961718559, "learning_rate": 9.669079627714582e-06, - "loss": 0.1828, + "loss": 0.0309, "step": 4400 }, { "epoch": 4.13, - "grad_norm": 0.11486046761274338, + "grad_norm": 0.041595447808504105, "learning_rate": 9.565667011375389e-06, - "loss": 0.0034, + "loss": 0.0025, "step": 4410 }, { "epoch": 4.14, - "grad_norm": 0.09139719605445862, + "grad_norm": 0.033831994980573654, "learning_rate": 9.462254395036195e-06, - "loss": 0.079, + "loss": 0.002, "step": 4420 }, { "epoch": 4.15, - "grad_norm": 0.033867135643959045, + "grad_norm": 0.046929676085710526, "learning_rate": 9.358841778697002e-06, - "loss": 0.0016, + "loss": 0.0023, "step": 4430 }, { "epoch": 4.16, - "grad_norm": 0.30136638879776, + "grad_norm": 0.04889731854200363, "learning_rate": 9.255429162357807e-06, - "loss": 0.0799, + "loss": 0.1443, "step": 4440 }, { "epoch": 4.17, - "grad_norm": 0.03786735609173775, + "grad_norm": 0.04792765527963638, "learning_rate": 9.152016546018614e-06, - "loss": 0.2652, + "loss": 0.084, "step": 4450 }, { "epoch": 4.18, - "grad_norm": 0.047059349715709686, + "grad_norm": 0.05229383334517479, "learning_rate": 9.04860392967942e-06, - "loss": 0.0014, + "loss": 0.0081, "step": 4460 }, { "epoch": 4.19, - "grad_norm": 0.054901909083127975, + "grad_norm": 0.0474727489054203, "learning_rate": 8.945191313340227e-06, - "loss": 0.0509, + "loss": 0.0022, "step": 4470 }, { "epoch": 4.2, - "grad_norm": 227.820556640625, + "grad_norm": 0.041655492037534714, "learning_rate": 8.841778697001036e-06, - "loss": 0.0222, + "loss": 0.0021, "step": 4480 }, { "epoch": 4.21, - "grad_norm": 0.043635886162519455, + "grad_norm": 0.06119935214519501, "learning_rate": 8.738366080661841e-06, - "loss": 0.0011, + "loss": 0.0021, "step": 4490 }, { "epoch": 4.22, - "grad_norm": 0.0816226452589035, + "grad_norm": 0.05464434251189232, "learning_rate": 8.634953464322648e-06, - "loss": 0.0873, + "loss": 0.0717, "step": 4500 }, { "epoch": 4.23, - "grad_norm": 0.07973457872867584, + "grad_norm": 0.057566653937101364, "learning_rate": 8.531540847983454e-06, - "loss": 0.0016, + "loss": 0.017, "step": 4510 }, { "epoch": 4.24, - "grad_norm": 0.03563067317008972, + "grad_norm": 0.04644325375556946, "learning_rate": 8.428128231644261e-06, - "loss": 0.0758, + "loss": 0.0019, "step": 4520 }, { "epoch": 4.25, - "grad_norm": 0.09928788989782333, + "grad_norm": 0.046763256192207336, "learning_rate": 8.324715615305068e-06, - "loss": 0.0595, + "loss": 0.0019, "step": 4530 }, { "epoch": 4.25, - "grad_norm": 0.027137843891978264, + "grad_norm": 0.030382856726646423, "learning_rate": 8.221302998965875e-06, - "loss": 0.0263, + "loss": 0.0053, "step": 4540 }, { "epoch": 4.26, - "grad_norm": 0.021461045369505882, + "grad_norm": 0.03069426119327545, "learning_rate": 8.11789038262668e-06, - "loss": 0.0635, + "loss": 0.0097, "step": 4550 }, { "epoch": 4.27, - "grad_norm": 0.026392286643385887, + "grad_norm": 0.03841962292790413, "learning_rate": 8.014477766287486e-06, - "loss": 0.2159, + "loss": 0.1735, "step": 4560 }, { "epoch": 4.28, - "grad_norm": 15.914426803588867, + "grad_norm": 0.05885951593518257, "learning_rate": 7.911065149948293e-06, - "loss": 0.0597, + "loss": 0.0048, "step": 4570 }, { "epoch": 4.29, - "grad_norm": 0.028651505708694458, + "grad_norm": 0.032764993607997894, "learning_rate": 7.807652533609102e-06, - "loss": 0.0014, + "loss": 0.0016, "step": 4580 }, { "epoch": 4.3, - "grad_norm": 0.07976741343736649, + "grad_norm": 0.033063892275094986, "learning_rate": 7.704239917269908e-06, - "loss": 0.0019, + "loss": 0.0015, "step": 4590 }, { "epoch": 4.31, - "grad_norm": 0.7245219945907593, + "grad_norm": 121.3236083984375, "learning_rate": 7.600827300930714e-06, - "loss": 0.1196, + "loss": 0.1054, "step": 4600 }, { "epoch": 4.32, - "grad_norm": 3.421379327774048, + "grad_norm": 4.730100631713867, "learning_rate": 7.497414684591521e-06, - "loss": 0.1707, + "loss": 0.0787, "step": 4610 }, { "epoch": 4.33, - "grad_norm": 54.59723663330078, + "grad_norm": 69.30009460449219, "learning_rate": 7.394002068252327e-06, - "loss": 0.1257, + "loss": 0.0041, "step": 4620 }, { "epoch": 4.34, - "grad_norm": 0.02159339189529419, + "grad_norm": 0.03414592146873474, "learning_rate": 7.290589451913134e-06, - "loss": 0.0011, + "loss": 0.0349, "step": 4630 }, { "epoch": 4.35, - "grad_norm": 0.02801990881562233, + "grad_norm": 0.06496775895357132, "learning_rate": 7.1871768355739405e-06, - "loss": 0.0013, + "loss": 0.0017, "step": 4640 }, { "epoch": 4.36, - "grad_norm": 0.12835495173931122, + "grad_norm": 0.03325049951672554, "learning_rate": 7.0837642192347465e-06, - "loss": 0.1343, + "loss": 0.0014, "step": 4650 }, { "epoch": 4.37, - "grad_norm": 0.06059630960226059, + "grad_norm": 0.026971573010087013, "learning_rate": 6.980351602895553e-06, - "loss": 0.0029, + "loss": 0.0014, "step": 4660 }, { "epoch": 4.38, - "grad_norm": 0.021486638113856316, + "grad_norm": 0.028269225731492043, "learning_rate": 6.87693898655636e-06, - "loss": 0.0872, + "loss": 0.0918, "step": 4670 }, { "epoch": 4.39, - "grad_norm": 0.024658987298607826, + "grad_norm": 0.026689806953072548, "learning_rate": 6.773526370217166e-06, - "loss": 0.0013, + "loss": 0.0909, "step": 4680 }, { "epoch": 4.4, - "grad_norm": 0.020257245749235153, + "grad_norm": 0.03431650996208191, "learning_rate": 6.670113753877974e-06, - "loss": 0.0012, + "loss": 0.0015, "step": 4690 }, { "epoch": 4.4, - "grad_norm": 0.022628970444202423, + "grad_norm": 0.031223468482494354, "learning_rate": 6.56670113753878e-06, - "loss": 0.0879, + "loss": 0.0015, "step": 4700 }, { "epoch": 4.41, - "grad_norm": 0.17840439081192017, + "grad_norm": 0.03299291059374809, "learning_rate": 6.463288521199587e-06, - "loss": 0.1423, + "loss": 0.0013, "step": 4710 }, { "epoch": 4.42, - "grad_norm": 248.1060333251953, + "grad_norm": 132.70225524902344, "learning_rate": 6.359875904860394e-06, - "loss": 0.1144, + "loss": 0.2171, "step": 4720 }, { "epoch": 4.43, - "grad_norm": 3.23333740234375, + "grad_norm": 2.8790526390075684, "learning_rate": 6.2564632885212e-06, - "loss": 0.0923, + "loss": 0.0792, "step": 4730 }, { "epoch": 4.44, - "grad_norm": 0.022127114236354828, + "grad_norm": 0.03240809589624405, "learning_rate": 6.153050672182006e-06, - "loss": 0.1237, + "loss": 0.0014, "step": 4740 }, { "epoch": 4.45, - "grad_norm": 0.03299199789762497, + "grad_norm": 0.03706195205450058, "learning_rate": 6.049638055842813e-06, - "loss": 0.0742, + "loss": 0.1544, "step": 4750 }, { "epoch": 4.46, - "grad_norm": 0.041477978229522705, + "grad_norm": 446.900634765625, "learning_rate": 5.94622543950362e-06, - "loss": 0.0012, + "loss": 0.0677, "step": 4760 }, { "epoch": 4.47, - "grad_norm": 0.28576919436454773, + "grad_norm": 3.300240993499756, "learning_rate": 5.842812823164427e-06, - "loss": 0.0737, + "loss": 0.2351, "step": 4770 }, { "epoch": 4.48, - "grad_norm": 0.03505215421319008, + "grad_norm": 0.03419492393732071, "learning_rate": 5.739400206825233e-06, - "loss": 0.168, + "loss": 0.0015, "step": 4780 }, { "epoch": 4.49, - "grad_norm": 0.023467887192964554, + "grad_norm": 0.03599884361028671, "learning_rate": 5.635987590486039e-06, - "loss": 0.0011, + "loss": 0.0019, "step": 4790 }, { "epoch": 4.5, - "grad_norm": 0.0256861113011837, + "grad_norm": 0.028266843408346176, "learning_rate": 5.532574974146846e-06, - "loss": 0.1276, + "loss": 0.0016, "step": 4800 }, { "epoch": 4.51, - "grad_norm": 89.85589599609375, + "grad_norm": 0.11148073524236679, "learning_rate": 5.429162357807653e-06, - "loss": 0.0627, + "loss": 0.0027, "step": 4810 }, { "epoch": 4.52, - "grad_norm": 0.019946351647377014, + "grad_norm": 0.03269073739647865, "learning_rate": 5.32574974146846e-06, - "loss": 0.0012, + "loss": 0.0315, "step": 4820 }, { "epoch": 4.53, - "grad_norm": 0.1361633539199829, + "grad_norm": 0.03238886967301369, "learning_rate": 5.222337125129266e-06, - "loss": 0.0013, + "loss": 0.0449, "step": 4830 }, { "epoch": 4.54, - "grad_norm": 0.06002867594361305, + "grad_norm": 0.0361521914601326, "learning_rate": 5.118924508790072e-06, "loss": 0.0014, "step": 4840 }, { "epoch": 4.55, - "grad_norm": 0.02299017459154129, + "grad_norm": 0.029334863647818565, "learning_rate": 5.015511892450879e-06, - "loss": 0.0895, + "loss": 0.1483, "step": 4850 }, { "epoch": 4.55, - "grad_norm": 0.01912473328411579, + "grad_norm": 0.03407660499215126, "learning_rate": 4.912099276111686e-06, - "loss": 0.0163, + "loss": 0.0015, "step": 4860 }, { "epoch": 4.56, - "grad_norm": 0.020831523463129997, + "grad_norm": 0.031936611980199814, "learning_rate": 4.8086866597724926e-06, - "loss": 0.001, + "loss": 0.0043, "step": 4870 }, { "epoch": 4.57, - "grad_norm": 0.017449194565415382, + "grad_norm": 0.024707814678549767, "learning_rate": 4.705274043433299e-06, - "loss": 0.0637, + "loss": 0.0309, "step": 4880 }, { "epoch": 4.58, - "grad_norm": 0.14713767170906067, + "grad_norm": 0.0379851795732975, "learning_rate": 4.601861427094105e-06, - "loss": 0.0035, + "loss": 0.0016, "step": 4890 }, { "epoch": 4.59, - "grad_norm": 3.178408145904541, + "grad_norm": 0.031074857339262962, "learning_rate": 4.498448810754912e-06, - "loss": 0.091, + "loss": 0.0021, "step": 4900 }, { "epoch": 4.6, - "grad_norm": 0.0492083765566349, + "grad_norm": 0.03219422325491905, "learning_rate": 4.39503619441572e-06, - "loss": 0.24, + "loss": 0.078, "step": 4910 }, { "epoch": 4.61, - "grad_norm": 0.01996445097029209, + "grad_norm": 0.02124185487627983, "learning_rate": 4.2916235780765255e-06, - "loss": 0.1126, + "loss": 0.0909, "step": 4920 }, { "epoch": 4.62, - "grad_norm": 0.021161267533898354, + "grad_norm": 0.028623027727007866, "learning_rate": 4.188210961737332e-06, - "loss": 0.0011, + "loss": 0.0022, "step": 4930 }, { "epoch": 4.63, - "grad_norm": 0.019828295335173607, + "grad_norm": 0.0264423917979002, "learning_rate": 4.084798345398139e-06, - "loss": 0.0025, + "loss": 0.0019, "step": 4940 }, { "epoch": 4.64, - "grad_norm": 0.021803775802254677, + "grad_norm": 0.033193424344062805, "learning_rate": 3.981385729058945e-06, - "loss": 0.1434, + "loss": 0.1499, "step": 4950 }, { "epoch": 4.65, - "grad_norm": 0.13735263049602509, + "grad_norm": 0.03795929625630379, "learning_rate": 3.8779731127197525e-06, - "loss": 0.1412, + "loss": 0.0799, "step": 4960 }, { "epoch": 4.66, - "grad_norm": 0.24444641172885895, + "grad_norm": 0.03771360218524933, "learning_rate": 3.774560496380559e-06, - "loss": 0.0046, + "loss": 0.0015, "step": 4970 }, { "epoch": 4.67, - "grad_norm": 0.023776857182383537, + "grad_norm": 0.03688567876815796, "learning_rate": 3.671147880041365e-06, - "loss": 0.1191, + "loss": 0.0028, "step": 4980 }, { "epoch": 4.68, - "grad_norm": 0.024539895355701447, + "grad_norm": 0.02699306234717369, "learning_rate": 3.5677352637021715e-06, - "loss": 0.1884, + "loss": 0.082, "step": 4990 }, { "epoch": 4.69, - "grad_norm": 0.01984153315424919, + "grad_norm": 0.03286897763609886, "learning_rate": 3.4643226473629783e-06, - "loss": 0.1128, + "loss": 0.0013, "step": 5000 }, { "epoch": 4.7, - "grad_norm": 2.897346258163452, + "grad_norm": 0.03281855955719948, "learning_rate": 3.3609100310237855e-06, - "loss": 0.0782, + "loss": 0.0016, "step": 5010 }, { "epoch": 4.7, - "grad_norm": 0.04367586970329285, + "grad_norm": 0.028866132721304893, "learning_rate": 3.257497414684592e-06, - "loss": 0.1729, + "loss": 0.2369, "step": 5020 }, { "epoch": 4.71, - "grad_norm": 0.022409643977880478, + "grad_norm": 0.031186798587441444, "learning_rate": 3.154084798345398e-06, - "loss": 0.0013, + "loss": 0.0014, "step": 5030 }, { "epoch": 4.72, - "grad_norm": 0.02230711840093136, + "grad_norm": 0.02863004244863987, "learning_rate": 3.050672182006205e-06, - "loss": 0.0605, + "loss": 0.0018, "step": 5040 }, { "epoch": 4.73, - "grad_norm": 0.08053728193044662, + "grad_norm": 0.030251089483499527, "learning_rate": 2.9472595656670117e-06, - "loss": 0.0581, + "loss": 0.0866, "step": 5050 }, { "epoch": 4.74, - "grad_norm": 0.382304847240448, + "grad_norm": 0.11614389717578888, "learning_rate": 2.843846949327818e-06, - "loss": 0.0396, + "loss": 0.1602, "step": 5060 }, { "epoch": 4.75, - "grad_norm": 0.0988391637802124, + "grad_norm": 0.02916896715760231, "learning_rate": 2.7404343329886247e-06, - "loss": 0.0291, + "loss": 0.002, "step": 5070 }, { "epoch": 4.76, - "grad_norm": 0.025470012798905373, + "grad_norm": 0.03606222942471504, "learning_rate": 2.6370217166494315e-06, - "loss": 0.0012, + "loss": 0.0014, "step": 5080 }, { "epoch": 4.77, - "grad_norm": 0.01994398981332779, + "grad_norm": 0.03456837683916092, "learning_rate": 2.533609100310238e-06, - "loss": 0.0021, + "loss": 0.0015, "step": 5090 }, { "epoch": 4.78, - "grad_norm": 0.062303327023983, + "grad_norm": 0.02772245742380619, "learning_rate": 2.4301964839710446e-06, - "loss": 0.1794, + "loss": 0.0022, "step": 5100 }, { "epoch": 4.79, - "grad_norm": 0.02839079312980175, + "grad_norm": 0.0416284054517746, "learning_rate": 2.3267838676318514e-06, - "loss": 0.0827, + "loss": 0.0014, "step": 5110 }, { "epoch": 4.8, - "grad_norm": 0.02093564346432686, + "grad_norm": 0.028300166130065918, "learning_rate": 2.2233712512926577e-06, - "loss": 0.1439, + "loss": 0.002, "step": 5120 }, { "epoch": 4.81, - "grad_norm": 0.024724125862121582, + "grad_norm": 0.03322126343846321, "learning_rate": 2.1199586349534644e-06, - "loss": 0.2626, + "loss": 0.2574, "step": 5130 }, { "epoch": 4.82, - "grad_norm": 0.025263186544179916, + "grad_norm": 0.025889359414577484, "learning_rate": 2.0165460186142708e-06, - "loss": 0.0011, + "loss": 0.0014, "step": 5140 }, { "epoch": 4.83, - "grad_norm": 0.026100702583789825, + "grad_norm": 0.037663087248802185, "learning_rate": 1.913133402275078e-06, - "loss": 0.0012, + "loss": 0.079, "step": 5150 }, { "epoch": 4.84, - "grad_norm": 0.42885711789131165, + "grad_norm": 0.024740027263760567, "learning_rate": 1.8097207859358843e-06, - "loss": 0.0014, + "loss": 0.102, "step": 5160 }, { "epoch": 4.85, - "grad_norm": 0.09349822252988815, + "grad_norm": 0.029313886538147926, "learning_rate": 1.7063081695966906e-06, - "loss": 0.0014, + "loss": 0.0966, "step": 5170 }, { "epoch": 4.85, - "grad_norm": 0.020912567153573036, + "grad_norm": 0.02809065394103527, "learning_rate": 1.6028955532574976e-06, - "loss": 0.2671, + "loss": 0.2161, "step": 5180 }, { "epoch": 4.86, - "grad_norm": 0.0201337318867445, + "grad_norm": 0.02252083458006382, "learning_rate": 1.499482936918304e-06, - "loss": 0.0653, + "loss": 0.0016, "step": 5190 }, { "epoch": 4.87, - "grad_norm": 0.12457222491502762, + "grad_norm": 0.030461903661489487, "learning_rate": 1.3960703205791107e-06, - "loss": 0.0855, + "loss": 0.1729, "step": 5200 }, { "epoch": 4.88, - "grad_norm": 0.06253860145807266, + "grad_norm": 0.032880086451768875, "learning_rate": 1.2926577042399172e-06, - "loss": 0.0576, + "loss": 0.0815, "step": 5210 }, { "epoch": 4.89, - "grad_norm": 0.033292561769485474, + "grad_norm": 0.033105768263339996, "learning_rate": 1.189245087900724e-06, - "loss": 0.1763, + "loss": 0.0186, "step": 5220 }, { "epoch": 4.9, - "grad_norm": 5.004137992858887, + "grad_norm": 0.03529038280248642, "learning_rate": 1.0858324715615305e-06, - "loss": 0.0863, + "loss": 0.1081, "step": 5230 }, { "epoch": 4.91, - "grad_norm": 10.605862617492676, + "grad_norm": 3.352219343185425, "learning_rate": 9.82419855222337e-07, - "loss": 0.1183, + "loss": 0.0897, "step": 5240 }, { "epoch": 4.92, - "grad_norm": 0.02466098591685295, + "grad_norm": 0.035365719348192215, "learning_rate": 8.790072388831437e-07, - "loss": 0.0013, + "loss": 0.0021, "step": 5250 }, { "epoch": 4.93, - "grad_norm": 47.83183288574219, + "grad_norm": 6.3308587074279785, "learning_rate": 7.755946225439504e-07, - "loss": 0.1172, + "loss": 0.092, "step": 5260 }, { "epoch": 4.94, - "grad_norm": 0.05490906536579132, + "grad_norm": 0.03557182475924492, "learning_rate": 6.72182006204757e-07, - "loss": 0.0862, + "loss": 0.0022, "step": 5270 }, { "epoch": 4.95, - "grad_norm": 0.09379950165748596, + "grad_norm": 0.05220606178045273, "learning_rate": 5.687693898655636e-07, - "loss": 0.0013, + "loss": 0.0743, "step": 5280 }, { "epoch": 4.96, - "grad_norm": 0.023985665291547775, + "grad_norm": 248.59735107421875, "learning_rate": 4.6535677352637023e-07, - "loss": 0.0016, + "loss": 0.0558, "step": 5290 }, { "epoch": 4.97, - "grad_norm": 0.18879929184913635, + "grad_norm": 0.029809845611453056, "learning_rate": 3.619441571871769e-07, - "loss": 0.0607, + "loss": 0.0015, "step": 5300 }, { "epoch": 4.98, - "grad_norm": 0.043404027819633484, + "grad_norm": 0.030335601419210434, "learning_rate": 2.585315408479835e-07, - "loss": 0.0933, + "loss": 0.0736, "step": 5310 }, { "epoch": 4.99, - "grad_norm": 10.754716873168945, + "grad_norm": 0.039527297019958496, "learning_rate": 1.5511892450879008e-07, - "loss": 0.0017, + "loss": 0.137, "step": 5320 }, { "epoch": 5.0, - "grad_norm": 0.027110379189252853, + "grad_norm": 0.023937204852700233, "learning_rate": 5.1706308169596694e-08, - "loss": 0.0889, + "loss": 0.0819, "step": 5330 }, { "epoch": 5.0, - "eval_accuracy": 0.8761726078799249, - "eval_f1": 0.8735632183908046, - "eval_loss": 0.7370651364326477, - "eval_precision": 0.890625, - "eval_recall": 0.8571428571428571, - "eval_runtime": 1.5139, - "eval_samples_per_second": 352.061, - "eval_steps_per_second": 44.255, + "eval_accuracy": 0.8667917448405253, + "eval_f1": 0.8672897196261682, + "eval_loss": 0.7846159934997559, + "eval_precision": 0.862453531598513, + "eval_recall": 0.8721804511278195, + "eval_runtime": 1.4269, + "eval_samples_per_second": 373.542, + "eval_steps_per_second": 46.956, "step": 5335 } ],