{ "best_global_step": 1100, "best_metric": 2.467733144760132, "best_model_checkpoint": "./swin-ena24/checkpoint-1100", "epoch": 7.0, "eval_steps": 100, "global_step": 2779, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02518891687657431, "grad_norm": 254123.9375, "learning_rate": 0.00019935228499460238, "loss": 2.9691, "step": 10 }, { "epoch": 0.05037783375314862, "grad_norm": 426010.46875, "learning_rate": 0.00019863260165527168, "loss": 2.8919, "step": 20 }, { "epoch": 0.07556675062972293, "grad_norm": 438250.125, "learning_rate": 0.000197912918315941, "loss": 2.7195, "step": 30 }, { "epoch": 0.10075566750629723, "grad_norm": 643403.375, "learning_rate": 0.00019719323497661032, "loss": 2.567, "step": 40 }, { "epoch": 0.12594458438287154, "grad_norm": 416875.5, "learning_rate": 0.0001964735516372796, "loss": 2.6196, "step": 50 }, { "epoch": 0.15113350125944586, "grad_norm": 412576.78125, "learning_rate": 0.0001957538682979489, "loss": 2.4359, "step": 60 }, { "epoch": 0.17632241813602015, "grad_norm": 371569.9375, "learning_rate": 0.00019503418495861824, "loss": 2.4815, "step": 70 }, { "epoch": 0.20151133501259447, "grad_norm": 297408.5625, "learning_rate": 0.0001943145016192875, "loss": 2.508, "step": 80 }, { "epoch": 0.22670025188916876, "grad_norm": 405330.71875, "learning_rate": 0.00019359481827995682, "loss": 2.385, "step": 90 }, { "epoch": 0.2518891687657431, "grad_norm": 578770.75, "learning_rate": 0.00019287513494062612, "loss": 1.9888, "step": 100 }, { "epoch": 0.2518891687657431, "eval_accuracy": 0.1630859375, "eval_f1_macro": 0.08928735942932195, "eval_loss": 3.412175178527832, "eval_runtime": 11.0442, "eval_samples_per_second": 92.718, "eval_steps_per_second": 5.795, "step": 100 }, { "epoch": 0.2770780856423174, "grad_norm": 510874.125, "learning_rate": 0.00019215545160129545, "loss": 2.188, "step": 110 }, { "epoch": 0.3022670025188917, "grad_norm": 513859.9375, "learning_rate": 0.00019143576826196473, "loss": 2.1743, "step": 120 }, { "epoch": 0.327455919395466, "grad_norm": 527887.5, "learning_rate": 0.00019071608492263404, "loss": 2.2732, "step": 130 }, { "epoch": 0.3526448362720403, "grad_norm": 460399.375, "learning_rate": 0.00018999640158330337, "loss": 2.1564, "step": 140 }, { "epoch": 0.3778337531486146, "grad_norm": 569056.8125, "learning_rate": 0.00018927671824397267, "loss": 1.8225, "step": 150 }, { "epoch": 0.40302267002518893, "grad_norm": 909920.6875, "learning_rate": 0.00018855703490464195, "loss": 1.771, "step": 160 }, { "epoch": 0.4282115869017632, "grad_norm": 494884.5, "learning_rate": 0.00018783735156531128, "loss": 1.8349, "step": 170 }, { "epoch": 0.4534005037783375, "grad_norm": 550597.375, "learning_rate": 0.0001871176682259806, "loss": 1.5891, "step": 180 }, { "epoch": 0.47858942065491183, "grad_norm": 462934.75, "learning_rate": 0.00018639798488664987, "loss": 1.8926, "step": 190 }, { "epoch": 0.5037783375314862, "grad_norm": 724803.5, "learning_rate": 0.00018567830154731917, "loss": 1.6111, "step": 200 }, { "epoch": 0.5037783375314862, "eval_accuracy": 0.2578125, "eval_f1_macro": 0.15353256242114557, "eval_loss": 2.9077086448669434, "eval_runtime": 11.368, "eval_samples_per_second": 90.078, "eval_steps_per_second": 5.63, "step": 200 }, { "epoch": 0.5289672544080605, "grad_norm": 559555.8125, "learning_rate": 0.0001849586182079885, "loss": 1.6636, "step": 210 }, { "epoch": 0.5541561712846348, "grad_norm": 1021320.3125, "learning_rate": 0.0001842389348686578, "loss": 1.462, "step": 220 }, { "epoch": 0.5793450881612091, "grad_norm": 692250.375, "learning_rate": 0.0001835192515293271, "loss": 1.5007, "step": 230 }, { "epoch": 0.6045340050377834, "grad_norm": 774498.1875, "learning_rate": 0.00018279956818999642, "loss": 1.5122, "step": 240 }, { "epoch": 0.6297229219143576, "grad_norm": 497269.53125, "learning_rate": 0.00018207988485066572, "loss": 1.3379, "step": 250 }, { "epoch": 0.654911838790932, "grad_norm": 786711.4375, "learning_rate": 0.00018136020151133503, "loss": 1.5591, "step": 260 }, { "epoch": 0.6801007556675063, "grad_norm": 561642.375, "learning_rate": 0.00018064051817200433, "loss": 1.3499, "step": 270 }, { "epoch": 0.7052896725440806, "grad_norm": 372517.21875, "learning_rate": 0.00017992083483267364, "loss": 1.3296, "step": 280 }, { "epoch": 0.7304785894206549, "grad_norm": 783448.4375, "learning_rate": 0.00017920115149334294, "loss": 1.2106, "step": 290 }, { "epoch": 0.7556675062972292, "grad_norm": 748525.0, "learning_rate": 0.00017848146815401222, "loss": 1.1276, "step": 300 }, { "epoch": 0.7556675062972292, "eval_accuracy": 0.357421875, "eval_f1_macro": 0.28272669387224364, "eval_loss": 2.6503801345825195, "eval_runtime": 11.5522, "eval_samples_per_second": 88.641, "eval_steps_per_second": 5.54, "step": 300 }, { "epoch": 0.7808564231738035, "grad_norm": 481374.1875, "learning_rate": 0.00017776178481468155, "loss": 1.1727, "step": 310 }, { "epoch": 0.8060453400503779, "grad_norm": 1038731.1875, "learning_rate": 0.00017704210147535086, "loss": 1.1783, "step": 320 }, { "epoch": 0.8312342569269522, "grad_norm": 369614.8125, "learning_rate": 0.00017632241813602016, "loss": 1.4703, "step": 330 }, { "epoch": 0.8564231738035264, "grad_norm": 411708.9375, "learning_rate": 0.00017560273479668947, "loss": 1.0237, "step": 340 }, { "epoch": 0.8816120906801007, "grad_norm": 777554.375, "learning_rate": 0.00017488305145735877, "loss": 1.3113, "step": 350 }, { "epoch": 0.906801007556675, "grad_norm": 589113.125, "learning_rate": 0.00017416336811802808, "loss": 1.4282, "step": 360 }, { "epoch": 0.9319899244332494, "grad_norm": 537991.9375, "learning_rate": 0.00017344368477869738, "loss": 1.0802, "step": 370 }, { "epoch": 0.9571788413098237, "grad_norm": 745339.875, "learning_rate": 0.0001727240014393667, "loss": 1.1078, "step": 380 }, { "epoch": 0.982367758186398, "grad_norm": 515597.875, "learning_rate": 0.000172004318100036, "loss": 1.3261, "step": 390 }, { "epoch": 1.0075566750629723, "grad_norm": 438800.96875, "learning_rate": 0.0001712846347607053, "loss": 1.0234, "step": 400 }, { "epoch": 1.0075566750629723, "eval_accuracy": 0.390625, "eval_f1_macro": 0.31557618159267287, "eval_loss": 2.572810173034668, "eval_runtime": 11.7819, "eval_samples_per_second": 86.913, "eval_steps_per_second": 5.432, "step": 400 }, { "epoch": 1.0327455919395465, "grad_norm": 360076.0, "learning_rate": 0.0001705649514213746, "loss": 1.0415, "step": 410 }, { "epoch": 1.057934508816121, "grad_norm": 468955.34375, "learning_rate": 0.0001698452680820439, "loss": 0.8579, "step": 420 }, { "epoch": 1.0831234256926952, "grad_norm": 765571.4375, "learning_rate": 0.0001691255847427132, "loss": 0.9408, "step": 430 }, { "epoch": 1.1083123425692696, "grad_norm": 682154.8125, "learning_rate": 0.00016840590140338252, "loss": 1.0962, "step": 440 }, { "epoch": 1.1335012594458438, "grad_norm": 861960.5, "learning_rate": 0.00016768621806405182, "loss": 0.8124, "step": 450 }, { "epoch": 1.1586901763224182, "grad_norm": 583161.8125, "learning_rate": 0.00016696653472472113, "loss": 1.1014, "step": 460 }, { "epoch": 1.1838790931989924, "grad_norm": 390205.40625, "learning_rate": 0.00016624685138539046, "loss": 0.8475, "step": 470 }, { "epoch": 1.2090680100755669, "grad_norm": 293254.3125, "learning_rate": 0.00016552716804605974, "loss": 1.0482, "step": 480 }, { "epoch": 1.234256926952141, "grad_norm": 571449.25, "learning_rate": 0.00016480748470672904, "loss": 0.8776, "step": 490 }, { "epoch": 1.2594458438287153, "grad_norm": 362773.25, "learning_rate": 0.00016408780136739835, "loss": 0.8909, "step": 500 }, { "epoch": 1.2594458438287153, "eval_accuracy": 0.421875, "eval_f1_macro": 0.3388453334601594, "eval_loss": 2.500704765319824, "eval_runtime": 11.4557, "eval_samples_per_second": 89.388, "eval_steps_per_second": 5.587, "step": 500 }, { "epoch": 1.2846347607052897, "grad_norm": 351467.09375, "learning_rate": 0.00016336811802806765, "loss": 0.8244, "step": 510 }, { "epoch": 1.309823677581864, "grad_norm": 894287.0625, "learning_rate": 0.00016264843468873696, "loss": 1.048, "step": 520 }, { "epoch": 1.3350125944584383, "grad_norm": 474119.75, "learning_rate": 0.00016192875134940626, "loss": 0.9196, "step": 530 }, { "epoch": 1.3602015113350125, "grad_norm": 1069011.125, "learning_rate": 0.0001612090680100756, "loss": 0.804, "step": 540 }, { "epoch": 1.385390428211587, "grad_norm": 587531.6875, "learning_rate": 0.00016048938467074487, "loss": 0.6696, "step": 550 }, { "epoch": 1.4105793450881612, "grad_norm": 879147.25, "learning_rate": 0.00015976970133141418, "loss": 0.7831, "step": 560 }, { "epoch": 1.4357682619647356, "grad_norm": 219215.859375, "learning_rate": 0.0001590500179920835, "loss": 0.5681, "step": 570 }, { "epoch": 1.4609571788413098, "grad_norm": 447798.375, "learning_rate": 0.00015833033465275279, "loss": 0.8663, "step": 580 }, { "epoch": 1.486146095717884, "grad_norm": 708370.625, "learning_rate": 0.0001576106513134221, "loss": 0.7913, "step": 590 }, { "epoch": 1.5113350125944585, "grad_norm": 822008.0625, "learning_rate": 0.0001568909679740914, "loss": 0.8008, "step": 600 }, { "epoch": 1.5113350125944585, "eval_accuracy": 0.404296875, "eval_f1_macro": 0.3618779716937303, "eval_loss": 2.7039053440093994, "eval_runtime": 11.8087, "eval_samples_per_second": 86.716, "eval_steps_per_second": 5.42, "step": 600 }, { "epoch": 1.536523929471033, "grad_norm": 467169.0625, "learning_rate": 0.00015617128463476073, "loss": 0.7902, "step": 610 }, { "epoch": 1.561712846347607, "grad_norm": 385000.3125, "learning_rate": 0.00015545160129543, "loss": 0.7146, "step": 620 }, { "epoch": 1.5869017632241813, "grad_norm": 366275.21875, "learning_rate": 0.0001547319179560993, "loss": 0.6558, "step": 630 }, { "epoch": 1.6120906801007555, "grad_norm": 432902.1875, "learning_rate": 0.00015401223461676864, "loss": 0.7669, "step": 640 }, { "epoch": 1.63727959697733, "grad_norm": 426131.71875, "learning_rate": 0.00015329255127743795, "loss": 0.7121, "step": 650 }, { "epoch": 1.6624685138539044, "grad_norm": 426302.0, "learning_rate": 0.00015257286793810722, "loss": 0.9564, "step": 660 }, { "epoch": 1.6876574307304786, "grad_norm": 405949.34375, "learning_rate": 0.00015185318459877656, "loss": 0.6072, "step": 670 }, { "epoch": 1.7128463476070528, "grad_norm": 319128.53125, "learning_rate": 0.00015113350125944586, "loss": 0.5537, "step": 680 }, { "epoch": 1.7380352644836272, "grad_norm": 405533.625, "learning_rate": 0.00015041381792011514, "loss": 0.8143, "step": 690 }, { "epoch": 1.7632241813602016, "grad_norm": 357302.3125, "learning_rate": 0.00014969413458078447, "loss": 0.6885, "step": 700 }, { "epoch": 1.7632241813602016, "eval_accuracy": 0.3701171875, "eval_f1_macro": 0.2926079223907959, "eval_loss": 3.1089859008789062, "eval_runtime": 11.4817, "eval_samples_per_second": 89.186, "eval_steps_per_second": 5.574, "step": 700 }, { "epoch": 1.7884130982367759, "grad_norm": 448783.3125, "learning_rate": 0.00014897445124145378, "loss": 0.8981, "step": 710 }, { "epoch": 1.81360201511335, "grad_norm": 663808.1875, "learning_rate": 0.00014825476790212308, "loss": 0.6493, "step": 720 }, { "epoch": 1.8387909319899243, "grad_norm": 1166550.75, "learning_rate": 0.00014753508456279236, "loss": 0.6288, "step": 730 }, { "epoch": 1.8639798488664987, "grad_norm": 774232.6875, "learning_rate": 0.0001468154012234617, "loss": 0.8717, "step": 740 }, { "epoch": 1.8891687657430731, "grad_norm": 565763.0, "learning_rate": 0.000146095717884131, "loss": 0.7199, "step": 750 }, { "epoch": 1.9143576826196473, "grad_norm": 365545.625, "learning_rate": 0.0001453760345448003, "loss": 0.721, "step": 760 }, { "epoch": 1.9395465994962215, "grad_norm": 616748.25, "learning_rate": 0.0001446563512054696, "loss": 0.6793, "step": 770 }, { "epoch": 1.964735516372796, "grad_norm": 503789.65625, "learning_rate": 0.0001439366678661389, "loss": 0.5703, "step": 780 }, { "epoch": 1.9899244332493704, "grad_norm": 619143.4375, "learning_rate": 0.00014321698452680822, "loss": 0.679, "step": 790 }, { "epoch": 2.0151133501259446, "grad_norm": 661451.3125, "learning_rate": 0.00014249730118747752, "loss": 0.839, "step": 800 }, { "epoch": 2.0151133501259446, "eval_accuracy": 0.484375, "eval_f1_macro": 0.414907768165119, "eval_loss": 2.584502935409546, "eval_runtime": 11.5153, "eval_samples_per_second": 88.925, "eval_steps_per_second": 5.558, "step": 800 }, { "epoch": 2.040302267002519, "grad_norm": 316634.78125, "learning_rate": 0.00014177761784814683, "loss": 0.4317, "step": 810 }, { "epoch": 2.065491183879093, "grad_norm": 347729.625, "learning_rate": 0.00014105793450881613, "loss": 0.3792, "step": 820 }, { "epoch": 2.0906801007556677, "grad_norm": 659445.4375, "learning_rate": 0.00014033825116948544, "loss": 0.452, "step": 830 }, { "epoch": 2.115869017632242, "grad_norm": 735795.1875, "learning_rate": 0.00013961856783015474, "loss": 0.5447, "step": 840 }, { "epoch": 2.141057934508816, "grad_norm": 107777.5390625, "learning_rate": 0.00013889888449082404, "loss": 0.5247, "step": 850 }, { "epoch": 2.1662468513853903, "grad_norm": 577874.5, "learning_rate": 0.00013817920115149335, "loss": 0.5108, "step": 860 }, { "epoch": 2.1914357682619645, "grad_norm": 441668.625, "learning_rate": 0.00013745951781216265, "loss": 0.4324, "step": 870 }, { "epoch": 2.216624685138539, "grad_norm": 489653.59375, "learning_rate": 0.00013673983447283196, "loss": 0.4427, "step": 880 }, { "epoch": 2.2418136020151134, "grad_norm": 296346.84375, "learning_rate": 0.00013602015113350126, "loss": 0.5105, "step": 890 }, { "epoch": 2.2670025188916876, "grad_norm": 187145.796875, "learning_rate": 0.00013530046779417057, "loss": 0.325, "step": 900 }, { "epoch": 2.2670025188916876, "eval_accuracy": 0.5068359375, "eval_f1_macro": 0.41278416064519763, "eval_loss": 2.5142855644226074, "eval_runtime": 12.173, "eval_samples_per_second": 84.12, "eval_steps_per_second": 5.258, "step": 900 }, { "epoch": 2.292191435768262, "grad_norm": 406714.6875, "learning_rate": 0.00013458078445483987, "loss": 0.4576, "step": 910 }, { "epoch": 2.3173803526448364, "grad_norm": 379892.40625, "learning_rate": 0.00013386110111550918, "loss": 0.4179, "step": 920 }, { "epoch": 2.3425692695214106, "grad_norm": 540661.9375, "learning_rate": 0.00013314141777617848, "loss": 0.3386, "step": 930 }, { "epoch": 2.367758186397985, "grad_norm": 850949.25, "learning_rate": 0.0001324217344368478, "loss": 0.5695, "step": 940 }, { "epoch": 2.392947103274559, "grad_norm": 363627.53125, "learning_rate": 0.0001317020510975171, "loss": 0.3362, "step": 950 }, { "epoch": 2.4181360201511337, "grad_norm": 644468.0625, "learning_rate": 0.0001309823677581864, "loss": 0.444, "step": 960 }, { "epoch": 2.443324937027708, "grad_norm": 164236.78125, "learning_rate": 0.00013026268441885573, "loss": 0.285, "step": 970 }, { "epoch": 2.468513853904282, "grad_norm": 688494.375, "learning_rate": 0.000129543001079525, "loss": 0.4024, "step": 980 }, { "epoch": 2.4937027707808563, "grad_norm": 486211.0625, "learning_rate": 0.0001288233177401943, "loss": 0.4544, "step": 990 }, { "epoch": 2.5188916876574305, "grad_norm": 273390.03125, "learning_rate": 0.00012810363440086365, "loss": 0.4501, "step": 1000 }, { "epoch": 2.5188916876574305, "eval_accuracy": 0.4482421875, "eval_f1_macro": 0.40562802574511003, "eval_loss": 2.7683629989624023, "eval_runtime": 11.8873, "eval_samples_per_second": 86.142, "eval_steps_per_second": 5.384, "step": 1000 }, { "epoch": 2.544080604534005, "grad_norm": 730712.125, "learning_rate": 0.00012738395106153292, "loss": 0.5234, "step": 1010 }, { "epoch": 2.5692695214105794, "grad_norm": 369909.34375, "learning_rate": 0.00012666426772220223, "loss": 0.4051, "step": 1020 }, { "epoch": 2.5944584382871536, "grad_norm": 507635.78125, "learning_rate": 0.00012594458438287153, "loss": 0.2967, "step": 1030 }, { "epoch": 2.619647355163728, "grad_norm": 688013.125, "learning_rate": 0.00012522490104354087, "loss": 0.5321, "step": 1040 }, { "epoch": 2.644836272040302, "grad_norm": 505216.40625, "learning_rate": 0.00012450521770421014, "loss": 0.4807, "step": 1050 }, { "epoch": 2.6700251889168767, "grad_norm": 252679.53125, "learning_rate": 0.00012378553436487945, "loss": 0.4088, "step": 1060 }, { "epoch": 2.695214105793451, "grad_norm": 235546.9375, "learning_rate": 0.00012306585102554878, "loss": 0.393, "step": 1070 }, { "epoch": 2.720403022670025, "grad_norm": 398018.84375, "learning_rate": 0.00012234616768621808, "loss": 0.3694, "step": 1080 }, { "epoch": 2.7455919395465997, "grad_norm": 374467.3125, "learning_rate": 0.00012162648434688738, "loss": 0.3599, "step": 1090 }, { "epoch": 2.770780856423174, "grad_norm": 533788.9375, "learning_rate": 0.0001209068010075567, "loss": 0.3191, "step": 1100 }, { "epoch": 2.770780856423174, "eval_accuracy": 0.5146484375, "eval_f1_macro": 0.432831730682114, "eval_loss": 2.467733144760132, "eval_runtime": 11.9402, "eval_samples_per_second": 85.761, "eval_steps_per_second": 5.36, "step": 1100 }, { "epoch": 2.795969773299748, "grad_norm": 27070.849609375, "learning_rate": 0.00012018711766822599, "loss": 0.2222, "step": 1110 }, { "epoch": 2.8211586901763224, "grad_norm": 742011.0625, "learning_rate": 0.00011946743432889529, "loss": 0.3194, "step": 1120 }, { "epoch": 2.8463476070528966, "grad_norm": 508084.53125, "learning_rate": 0.00011874775098956458, "loss": 0.5068, "step": 1130 }, { "epoch": 2.8715365239294712, "grad_norm": 472090.21875, "learning_rate": 0.00011802806765023391, "loss": 0.3457, "step": 1140 }, { "epoch": 2.8967254408060454, "grad_norm": 345946.625, "learning_rate": 0.0001173083843109032, "loss": 0.4024, "step": 1150 }, { "epoch": 2.9219143576826196, "grad_norm": 116557.78125, "learning_rate": 0.00011658870097157251, "loss": 0.3248, "step": 1160 }, { "epoch": 2.947103274559194, "grad_norm": 544007.125, "learning_rate": 0.00011586901763224183, "loss": 0.2957, "step": 1170 }, { "epoch": 2.972292191435768, "grad_norm": 558989.0, "learning_rate": 0.00011514933429291112, "loss": 0.3386, "step": 1180 }, { "epoch": 2.9974811083123427, "grad_norm": 509623.65625, "learning_rate": 0.00011442965095358043, "loss": 0.4525, "step": 1190 }, { "epoch": 3.022670025188917, "grad_norm": 374462.28125, "learning_rate": 0.00011370996761424974, "loss": 0.1664, "step": 1200 }, { "epoch": 3.022670025188917, "eval_accuracy": 0.5361328125, "eval_f1_macro": 0.4597358092759388, "eval_loss": 2.477670192718506, "eval_runtime": 11.8885, "eval_samples_per_second": 86.134, "eval_steps_per_second": 5.383, "step": 1200 }, { "epoch": 3.047858942065491, "grad_norm": 417895.8125, "learning_rate": 0.00011299028427491905, "loss": 0.2631, "step": 1210 }, { "epoch": 3.0730478589420653, "grad_norm": 307081.21875, "learning_rate": 0.00011227060093558834, "loss": 0.1544, "step": 1220 }, { "epoch": 3.09823677581864, "grad_norm": 1034528.125, "learning_rate": 0.00011155091759625764, "loss": 0.2107, "step": 1230 }, { "epoch": 3.123425692695214, "grad_norm": 659800.0625, "learning_rate": 0.00011083123425692696, "loss": 0.3405, "step": 1240 }, { "epoch": 3.1486146095717884, "grad_norm": 129759.8671875, "learning_rate": 0.00011011155091759627, "loss": 0.2225, "step": 1250 }, { "epoch": 3.1738035264483626, "grad_norm": 560157.25, "learning_rate": 0.00010939186757826556, "loss": 0.1545, "step": 1260 }, { "epoch": 3.1989924433249373, "grad_norm": 253132.671875, "learning_rate": 0.00010867218423893488, "loss": 0.3073, "step": 1270 }, { "epoch": 3.2241813602015115, "grad_norm": 407181.90625, "learning_rate": 0.00010795250089960418, "loss": 0.3247, "step": 1280 }, { "epoch": 3.2493702770780857, "grad_norm": 433259.09375, "learning_rate": 0.00010723281756027347, "loss": 0.127, "step": 1290 }, { "epoch": 3.27455919395466, "grad_norm": 187479.984375, "learning_rate": 0.0001065131342209428, "loss": 0.1469, "step": 1300 }, { "epoch": 3.27455919395466, "eval_accuracy": 0.5205078125, "eval_f1_macro": 0.4495261806770087, "eval_loss": 2.6402528285980225, "eval_runtime": 11.5773, "eval_samples_per_second": 88.449, "eval_steps_per_second": 5.528, "step": 1300 }, { "epoch": 3.299748110831234, "grad_norm": 736520.0625, "learning_rate": 0.0001057934508816121, "loss": 0.2567, "step": 1310 }, { "epoch": 3.3249370277078087, "grad_norm": 376107.03125, "learning_rate": 0.0001050737675422814, "loss": 0.1442, "step": 1320 }, { "epoch": 3.350125944584383, "grad_norm": 723020.3125, "learning_rate": 0.0001043540842029507, "loss": 0.1422, "step": 1330 }, { "epoch": 3.375314861460957, "grad_norm": 60796.48046875, "learning_rate": 0.00010363440086362001, "loss": 0.2077, "step": 1340 }, { "epoch": 3.4005037783375314, "grad_norm": 624072.0, "learning_rate": 0.00010291471752428932, "loss": 0.1422, "step": 1350 }, { "epoch": 3.4256926952141056, "grad_norm": 191143.5625, "learning_rate": 0.00010219503418495862, "loss": 0.2425, "step": 1360 }, { "epoch": 3.4508816120906802, "grad_norm": 291693.75, "learning_rate": 0.00010147535084562794, "loss": 0.2693, "step": 1370 }, { "epoch": 3.4760705289672544, "grad_norm": 516212.4375, "learning_rate": 0.00010075566750629723, "loss": 0.2397, "step": 1380 }, { "epoch": 3.5012594458438286, "grad_norm": 273365.9375, "learning_rate": 0.00010003598416696654, "loss": 0.1774, "step": 1390 }, { "epoch": 3.5264483627204033, "grad_norm": 518260.21875, "learning_rate": 9.931630082763584e-05, "loss": 0.3063, "step": 1400 }, { "epoch": 3.5264483627204033, "eval_accuracy": 0.5009765625, "eval_f1_macro": 0.441499104615912, "eval_loss": 2.7999606132507324, "eval_runtime": 11.7898, "eval_samples_per_second": 86.855, "eval_steps_per_second": 5.428, "step": 1400 }, { "epoch": 3.551637279596977, "grad_norm": 552209.5, "learning_rate": 9.859661748830516e-05, "loss": 0.2254, "step": 1410 }, { "epoch": 3.5768261964735517, "grad_norm": 7195.90625, "learning_rate": 9.787693414897445e-05, "loss": 0.1378, "step": 1420 }, { "epoch": 3.602015113350126, "grad_norm": 879434.0625, "learning_rate": 9.715725080964376e-05, "loss": 0.2466, "step": 1430 }, { "epoch": 3.6272040302267, "grad_norm": 116491.9609375, "learning_rate": 9.643756747031306e-05, "loss": 0.2094, "step": 1440 }, { "epoch": 3.652392947103275, "grad_norm": 524990.0625, "learning_rate": 9.571788413098237e-05, "loss": 0.2522, "step": 1450 }, { "epoch": 3.677581863979849, "grad_norm": 939990.8125, "learning_rate": 9.499820079165168e-05, "loss": 0.2382, "step": 1460 }, { "epoch": 3.702770780856423, "grad_norm": 337334.625, "learning_rate": 9.427851745232098e-05, "loss": 0.299, "step": 1470 }, { "epoch": 3.7279596977329974, "grad_norm": 322309.28125, "learning_rate": 9.35588341129903e-05, "loss": 0.2307, "step": 1480 }, { "epoch": 3.7531486146095716, "grad_norm": 80700.328125, "learning_rate": 9.283915077365959e-05, "loss": 0.1816, "step": 1490 }, { "epoch": 3.7783375314861463, "grad_norm": 667783.125, "learning_rate": 9.21194674343289e-05, "loss": 0.1786, "step": 1500 }, { "epoch": 3.7783375314861463, "eval_accuracy": 0.533203125, "eval_f1_macro": 0.45250014527358284, "eval_loss": 2.8164846897125244, "eval_runtime": 11.728, "eval_samples_per_second": 87.312, "eval_steps_per_second": 5.457, "step": 1500 }, { "epoch": 3.8035264483627205, "grad_norm": 340034.53125, "learning_rate": 9.139978409499821e-05, "loss": 0.1934, "step": 1510 }, { "epoch": 3.8287153652392947, "grad_norm": 569343.8125, "learning_rate": 9.068010075566751e-05, "loss": 0.3571, "step": 1520 }, { "epoch": 3.853904282115869, "grad_norm": 73828.2578125, "learning_rate": 8.996041741633682e-05, "loss": 0.2595, "step": 1530 }, { "epoch": 3.879093198992443, "grad_norm": 610885.5625, "learning_rate": 8.924073407700611e-05, "loss": 0.2454, "step": 1540 }, { "epoch": 3.9042821158690177, "grad_norm": 549705.5, "learning_rate": 8.852105073767543e-05, "loss": 0.2246, "step": 1550 }, { "epoch": 3.929471032745592, "grad_norm": 946495.1875, "learning_rate": 8.780136739834473e-05, "loss": 0.3156, "step": 1560 }, { "epoch": 3.954659949622166, "grad_norm": 89126.0546875, "learning_rate": 8.708168405901404e-05, "loss": 0.127, "step": 1570 }, { "epoch": 3.979848866498741, "grad_norm": 364322.40625, "learning_rate": 8.636200071968334e-05, "loss": 0.2114, "step": 1580 }, { "epoch": 4.005037783375315, "grad_norm": 646444.625, "learning_rate": 8.564231738035265e-05, "loss": 0.1633, "step": 1590 }, { "epoch": 4.030226700251889, "grad_norm": 281335.90625, "learning_rate": 8.492263404102195e-05, "loss": 0.0687, "step": 1600 }, { "epoch": 4.030226700251889, "eval_accuracy": 0.568359375, "eval_f1_macro": 0.49420184108051124, "eval_loss": 2.9026849269866943, "eval_runtime": 11.365, "eval_samples_per_second": 90.101, "eval_steps_per_second": 5.631, "step": 1600 }, { "epoch": 4.055415617128464, "grad_norm": 12626.9794921875, "learning_rate": 8.420295070169126e-05, "loss": 0.0641, "step": 1610 }, { "epoch": 4.080604534005038, "grad_norm": 55986.578125, "learning_rate": 8.348326736236056e-05, "loss": 0.0647, "step": 1620 }, { "epoch": 4.105793450881612, "grad_norm": 509303.3125, "learning_rate": 8.276358402302987e-05, "loss": 0.0746, "step": 1630 }, { "epoch": 4.130982367758186, "grad_norm": 21481.740234375, "learning_rate": 8.204390068369917e-05, "loss": 0.0746, "step": 1640 }, { "epoch": 4.156171284634761, "grad_norm": 11360.7412109375, "learning_rate": 8.132421734436848e-05, "loss": 0.0555, "step": 1650 }, { "epoch": 4.181360201511335, "grad_norm": 396739.0, "learning_rate": 8.06045340050378e-05, "loss": 0.0595, "step": 1660 }, { "epoch": 4.206549118387909, "grad_norm": 8099.65478515625, "learning_rate": 7.988485066570709e-05, "loss": 0.1391, "step": 1670 }, { "epoch": 4.231738035264484, "grad_norm": 208596.65625, "learning_rate": 7.916516732637639e-05, "loss": 0.1038, "step": 1680 }, { "epoch": 4.2569269521410575, "grad_norm": 359488.34375, "learning_rate": 7.84454839870457e-05, "loss": 0.025, "step": 1690 }, { "epoch": 4.282115869017632, "grad_norm": 247004.875, "learning_rate": 7.7725800647715e-05, "loss": 0.0427, "step": 1700 }, { "epoch": 4.282115869017632, "eval_accuracy": 0.4912109375, "eval_f1_macro": 0.4362345681944286, "eval_loss": 3.321627616882324, "eval_runtime": 11.8095, "eval_samples_per_second": 86.71, "eval_steps_per_second": 5.419, "step": 1700 }, { "epoch": 4.307304785894207, "grad_norm": 22134.611328125, "learning_rate": 7.700611730838432e-05, "loss": 0.1015, "step": 1710 }, { "epoch": 4.332493702770781, "grad_norm": 163005.84375, "learning_rate": 7.628643396905361e-05, "loss": 0.0524, "step": 1720 }, { "epoch": 4.357682619647355, "grad_norm": 25011.78515625, "learning_rate": 7.556675062972293e-05, "loss": 0.1, "step": 1730 }, { "epoch": 4.382871536523929, "grad_norm": 606683.75, "learning_rate": 7.484706729039224e-05, "loss": 0.1503, "step": 1740 }, { "epoch": 4.408060453400504, "grad_norm": 411167.28125, "learning_rate": 7.412738395106154e-05, "loss": 0.1075, "step": 1750 }, { "epoch": 4.433249370277078, "grad_norm": 20846.5390625, "learning_rate": 7.340770061173085e-05, "loss": 0.0793, "step": 1760 }, { "epoch": 4.458438287153652, "grad_norm": 35871.06640625, "learning_rate": 7.268801727240015e-05, "loss": 0.0607, "step": 1770 }, { "epoch": 4.483627204030227, "grad_norm": 58382.01171875, "learning_rate": 7.196833393306946e-05, "loss": 0.082, "step": 1780 }, { "epoch": 4.508816120906801, "grad_norm": 13096.5205078125, "learning_rate": 7.124865059373876e-05, "loss": 0.0754, "step": 1790 }, { "epoch": 4.534005037783375, "grad_norm": 21618.96875, "learning_rate": 7.052896725440807e-05, "loss": 0.1825, "step": 1800 }, { "epoch": 4.534005037783375, "eval_accuracy": 0.53125, "eval_f1_macro": 0.4663662196170286, "eval_loss": 3.1456074714660645, "eval_runtime": 11.9035, "eval_samples_per_second": 86.025, "eval_steps_per_second": 5.377, "step": 1800 }, { "epoch": 4.55919395465995, "grad_norm": 97136.0546875, "learning_rate": 6.980928391507737e-05, "loss": 0.0821, "step": 1810 }, { "epoch": 4.584382871536524, "grad_norm": 14197.41796875, "learning_rate": 6.908960057574667e-05, "loss": 0.1245, "step": 1820 }, { "epoch": 4.609571788413098, "grad_norm": 792662.5, "learning_rate": 6.836991723641598e-05, "loss": 0.0678, "step": 1830 }, { "epoch": 4.634760705289673, "grad_norm": 70302.1484375, "learning_rate": 6.765023389708528e-05, "loss": 0.0369, "step": 1840 }, { "epoch": 4.659949622166247, "grad_norm": 315541.75, "learning_rate": 6.693055055775459e-05, "loss": 0.086, "step": 1850 }, { "epoch": 4.685138539042821, "grad_norm": 426322.5625, "learning_rate": 6.62108672184239e-05, "loss": 0.1367, "step": 1860 }, { "epoch": 4.710327455919396, "grad_norm": 5201.7265625, "learning_rate": 6.54911838790932e-05, "loss": 0.0518, "step": 1870 }, { "epoch": 4.73551637279597, "grad_norm": 16552.916015625, "learning_rate": 6.47715005397625e-05, "loss": 0.0857, "step": 1880 }, { "epoch": 4.760705289672544, "grad_norm": 74746.0234375, "learning_rate": 6.405181720043182e-05, "loss": 0.178, "step": 1890 }, { "epoch": 4.785894206549118, "grad_norm": 30216.1328125, "learning_rate": 6.333213386110111e-05, "loss": 0.0758, "step": 1900 }, { "epoch": 4.785894206549118, "eval_accuracy": 0.5546875, "eval_f1_macro": 0.4578418057053371, "eval_loss": 3.2782468795776367, "eval_runtime": 11.4304, "eval_samples_per_second": 89.586, "eval_steps_per_second": 5.599, "step": 1900 }, { "epoch": 4.811083123425693, "grad_norm": 122862.0703125, "learning_rate": 6.261245052177043e-05, "loss": 0.1008, "step": 1910 }, { "epoch": 4.836272040302267, "grad_norm": 355039.96875, "learning_rate": 6.189276718243972e-05, "loss": 0.0619, "step": 1920 }, { "epoch": 4.861460957178841, "grad_norm": 180546.546875, "learning_rate": 6.117308384310904e-05, "loss": 0.212, "step": 1930 }, { "epoch": 4.886649874055416, "grad_norm": 1149145.875, "learning_rate": 6.045340050377835e-05, "loss": 0.1821, "step": 1940 }, { "epoch": 4.91183879093199, "grad_norm": 197082.046875, "learning_rate": 5.9733717164447645e-05, "loss": 0.0539, "step": 1950 }, { "epoch": 4.937027707808564, "grad_norm": 361.24676513671875, "learning_rate": 5.901403382511696e-05, "loss": 0.1526, "step": 1960 }, { "epoch": 4.962216624685139, "grad_norm": 362799.34375, "learning_rate": 5.8294350485786255e-05, "loss": 0.0838, "step": 1970 }, { "epoch": 4.987405541561713, "grad_norm": 4683.50732421875, "learning_rate": 5.757466714645556e-05, "loss": 0.0995, "step": 1980 }, { "epoch": 5.012594458438287, "grad_norm": 5118.85986328125, "learning_rate": 5.685498380712487e-05, "loss": 0.0488, "step": 1990 }, { "epoch": 5.037783375314861, "grad_norm": 6831.2880859375, "learning_rate": 5.613530046779417e-05, "loss": 0.0471, "step": 2000 }, { "epoch": 5.037783375314861, "eval_accuracy": 0.5517578125, "eval_f1_macro": 0.4725336026660133, "eval_loss": 3.334784507751465, "eval_runtime": 11.4554, "eval_samples_per_second": 89.39, "eval_steps_per_second": 5.587, "step": 2000 }, { "epoch": 5.062972292191436, "grad_norm": 88275.8359375, "learning_rate": 5.541561712846348e-05, "loss": 0.0503, "step": 2010 }, { "epoch": 5.08816120906801, "grad_norm": 1161589.25, "learning_rate": 5.469593378913278e-05, "loss": 0.0326, "step": 2020 }, { "epoch": 5.113350125944584, "grad_norm": 263008.125, "learning_rate": 5.397625044980209e-05, "loss": 0.103, "step": 2030 }, { "epoch": 5.138539042821159, "grad_norm": 2268.762939453125, "learning_rate": 5.32565671104714e-05, "loss": 0.0093, "step": 2040 }, { "epoch": 5.163727959697733, "grad_norm": 530844.6875, "learning_rate": 5.25368837711407e-05, "loss": 0.0919, "step": 2050 }, { "epoch": 5.188916876574307, "grad_norm": 181022.703125, "learning_rate": 5.1817200431810006e-05, "loss": 0.049, "step": 2060 }, { "epoch": 5.214105793450882, "grad_norm": 1360.029541015625, "learning_rate": 5.109751709247931e-05, "loss": 0.0656, "step": 2070 }, { "epoch": 5.239294710327456, "grad_norm": 541578.0625, "learning_rate": 5.0377833753148616e-05, "loss": 0.0287, "step": 2080 }, { "epoch": 5.26448362720403, "grad_norm": 8820.35546875, "learning_rate": 4.965815041381792e-05, "loss": 0.0252, "step": 2090 }, { "epoch": 5.289672544080605, "grad_norm": 50239.13671875, "learning_rate": 4.8938467074487226e-05, "loss": 0.0512, "step": 2100 }, { "epoch": 5.289672544080605, "eval_accuracy": 0.5283203125, "eval_f1_macro": 0.45143425422893263, "eval_loss": 3.718236207962036, "eval_runtime": 11.3333, "eval_samples_per_second": 90.353, "eval_steps_per_second": 5.647, "step": 2100 }, { "epoch": 5.314861460957179, "grad_norm": 1060627.125, "learning_rate": 4.821878373515653e-05, "loss": 0.0412, "step": 2110 }, { "epoch": 5.340050377833753, "grad_norm": 258190.28125, "learning_rate": 4.749910039582584e-05, "loss": 0.1177, "step": 2120 }, { "epoch": 5.365239294710327, "grad_norm": 126000.2265625, "learning_rate": 4.677941705649515e-05, "loss": 0.0049, "step": 2130 }, { "epoch": 5.390428211586902, "grad_norm": 2938.684814453125, "learning_rate": 4.605973371716445e-05, "loss": 0.1016, "step": 2140 }, { "epoch": 5.415617128463476, "grad_norm": 191.32302856445312, "learning_rate": 4.534005037783376e-05, "loss": 0.0051, "step": 2150 }, { "epoch": 5.44080604534005, "grad_norm": 70.82958984375, "learning_rate": 4.4620367038503055e-05, "loss": 0.0335, "step": 2160 }, { "epoch": 5.465994962216625, "grad_norm": 2224.07568359375, "learning_rate": 4.390068369917237e-05, "loss": 0.0019, "step": 2170 }, { "epoch": 5.491183879093199, "grad_norm": 7065.70703125, "learning_rate": 4.318100035984167e-05, "loss": 0.0489, "step": 2180 }, { "epoch": 5.516372795969773, "grad_norm": 218.8759765625, "learning_rate": 4.246131702051098e-05, "loss": 0.0761, "step": 2190 }, { "epoch": 5.541561712846348, "grad_norm": 2789.82763671875, "learning_rate": 4.174163368118028e-05, "loss": 0.0095, "step": 2200 }, { "epoch": 5.541561712846348, "eval_accuracy": 0.5341796875, "eval_f1_macro": 0.478510457034419, "eval_loss": 3.902801036834717, "eval_runtime": 11.7659, "eval_samples_per_second": 87.031, "eval_steps_per_second": 5.439, "step": 2200 }, { "epoch": 5.566750629722922, "grad_norm": 33521.52734375, "learning_rate": 4.1021950341849587e-05, "loss": 0.0445, "step": 2210 }, { "epoch": 5.591939546599496, "grad_norm": 339091.78125, "learning_rate": 4.03022670025189e-05, "loss": 0.0717, "step": 2220 }, { "epoch": 5.617128463476071, "grad_norm": 570797.6875, "learning_rate": 3.9582583663188196e-05, "loss": 0.0664, "step": 2230 }, { "epoch": 5.642317380352645, "grad_norm": 638298.25, "learning_rate": 3.88629003238575e-05, "loss": 0.0152, "step": 2240 }, { "epoch": 5.667506297229219, "grad_norm": 1308.215087890625, "learning_rate": 3.8143216984526806e-05, "loss": 0.0029, "step": 2250 }, { "epoch": 5.692695214105793, "grad_norm": 47.38713836669922, "learning_rate": 3.742353364519612e-05, "loss": 0.0199, "step": 2260 }, { "epoch": 5.717884130982368, "grad_norm": 56138.89453125, "learning_rate": 3.670385030586542e-05, "loss": 0.0748, "step": 2270 }, { "epoch": 5.7430730478589425, "grad_norm": 49.50386047363281, "learning_rate": 3.598416696653473e-05, "loss": 0.0174, "step": 2280 }, { "epoch": 5.768261964735516, "grad_norm": 7627.2783203125, "learning_rate": 3.526448362720403e-05, "loss": 0.002, "step": 2290 }, { "epoch": 5.793450881612091, "grad_norm": 123.52435302734375, "learning_rate": 3.454480028787334e-05, "loss": 0.0247, "step": 2300 }, { "epoch": 5.793450881612091, "eval_accuracy": 0.5712890625, "eval_f1_macro": 0.48791698609028533, "eval_loss": 3.960580348968506, "eval_runtime": 11.8204, "eval_samples_per_second": 86.63, "eval_steps_per_second": 5.414, "step": 2300 }, { "epoch": 5.818639798488665, "grad_norm": 534.7195434570312, "learning_rate": 3.382511694854264e-05, "loss": 0.0496, "step": 2310 }, { "epoch": 5.843828715365239, "grad_norm": 17011.57421875, "learning_rate": 3.310543360921195e-05, "loss": 0.1476, "step": 2320 }, { "epoch": 5.869017632241814, "grad_norm": 386354.28125, "learning_rate": 3.238575026988125e-05, "loss": 0.0593, "step": 2330 }, { "epoch": 5.894206549118388, "grad_norm": 102287.3828125, "learning_rate": 3.166606693055056e-05, "loss": 0.0117, "step": 2340 }, { "epoch": 5.919395465994962, "grad_norm": 29.17066764831543, "learning_rate": 3.094638359121986e-05, "loss": 0.0702, "step": 2350 }, { "epoch": 5.944584382871536, "grad_norm": 1611.7230224609375, "learning_rate": 3.0226700251889174e-05, "loss": 0.0687, "step": 2360 }, { "epoch": 5.969773299748111, "grad_norm": 168.02320861816406, "learning_rate": 2.950701691255848e-05, "loss": 0.0872, "step": 2370 }, { "epoch": 5.994962216624685, "grad_norm": 1058.51904296875, "learning_rate": 2.878733357322778e-05, "loss": 0.0008, "step": 2380 }, { "epoch": 6.020151133501259, "grad_norm": 23.99247932434082, "learning_rate": 2.8067650233897085e-05, "loss": 0.0526, "step": 2390 }, { "epoch": 6.045340050377834, "grad_norm": 2630.893798828125, "learning_rate": 2.734796689456639e-05, "loss": 0.0008, "step": 2400 }, { "epoch": 6.045340050377834, "eval_accuracy": 0.5654296875, "eval_f1_macro": 0.49182304765519874, "eval_loss": 4.129029750823975, "eval_runtime": 11.7471, "eval_samples_per_second": 87.17, "eval_steps_per_second": 5.448, "step": 2400 }, { "epoch": 6.0705289672544085, "grad_norm": 3671.589111328125, "learning_rate": 2.66282835552357e-05, "loss": 0.0009, "step": 2410 }, { "epoch": 6.095717884130982, "grad_norm": 246.3783721923828, "learning_rate": 2.5908600215905003e-05, "loss": 0.0279, "step": 2420 }, { "epoch": 6.120906801007557, "grad_norm": 250481.90625, "learning_rate": 2.5188916876574308e-05, "loss": 0.0367, "step": 2430 }, { "epoch": 6.146095717884131, "grad_norm": 5997.31396484375, "learning_rate": 2.4469233537243613e-05, "loss": 0.0002, "step": 2440 }, { "epoch": 6.171284634760705, "grad_norm": 294020.6875, "learning_rate": 2.374955019791292e-05, "loss": 0.0447, "step": 2450 }, { "epoch": 6.19647355163728, "grad_norm": 5173.7607421875, "learning_rate": 2.3029866858582226e-05, "loss": 0.0006, "step": 2460 }, { "epoch": 6.221662468513854, "grad_norm": 4.966667175292969, "learning_rate": 2.2310183519251528e-05, "loss": 0.0394, "step": 2470 }, { "epoch": 6.246851385390428, "grad_norm": 328896.6875, "learning_rate": 2.1590500179920836e-05, "loss": 0.0725, "step": 2480 }, { "epoch": 6.272040302267002, "grad_norm": 855.643798828125, "learning_rate": 2.087081684059014e-05, "loss": 0.0001, "step": 2490 }, { "epoch": 6.297229219143577, "grad_norm": 18.617643356323242, "learning_rate": 2.015113350125945e-05, "loss": 0.0024, "step": 2500 }, { "epoch": 6.297229219143577, "eval_accuracy": 0.5654296875, "eval_f1_macro": 0.4862593826724424, "eval_loss": 4.414713382720947, "eval_runtime": 11.3987, "eval_samples_per_second": 89.835, "eval_steps_per_second": 5.615, "step": 2500 }, { "epoch": 6.3224181360201515, "grad_norm": 43.19185256958008, "learning_rate": 1.943145016192875e-05, "loss": 0.005, "step": 2510 }, { "epoch": 6.347607052896725, "grad_norm": 636742.8125, "learning_rate": 1.871176682259806e-05, "loss": 0.0187, "step": 2520 }, { "epoch": 6.3727959697733, "grad_norm": 14.576433181762695, "learning_rate": 1.7992083483267364e-05, "loss": 0.0008, "step": 2530 }, { "epoch": 6.3979848866498745, "grad_norm": 22.115917205810547, "learning_rate": 1.727240014393667e-05, "loss": 0.0038, "step": 2540 }, { "epoch": 6.423173803526448, "grad_norm": 618.1704711914062, "learning_rate": 1.6552716804605974e-05, "loss": 0.0685, "step": 2550 }, { "epoch": 6.448362720403023, "grad_norm": 279531.71875, "learning_rate": 1.583303346527528e-05, "loss": 0.0035, "step": 2560 }, { "epoch": 6.473551637279597, "grad_norm": 265.0247802734375, "learning_rate": 1.5113350125944587e-05, "loss": 0.0245, "step": 2570 }, { "epoch": 6.498740554156171, "grad_norm": 635805.875, "learning_rate": 1.439366678661389e-05, "loss": 0.0052, "step": 2580 }, { "epoch": 6.523929471032746, "grad_norm": 8.843326568603516, "learning_rate": 1.3673983447283195e-05, "loss": 0.0, "step": 2590 }, { "epoch": 6.54911838790932, "grad_norm": 87.60523986816406, "learning_rate": 1.2954300107952502e-05, "loss": 0.0002, "step": 2600 }, { "epoch": 6.54911838790932, "eval_accuracy": 0.5654296875, "eval_f1_macro": 0.4913330578351924, "eval_loss": 4.520939826965332, "eval_runtime": 11.8845, "eval_samples_per_second": 86.163, "eval_steps_per_second": 5.385, "step": 2600 }, { "epoch": 6.574307304785894, "grad_norm": 8.957763671875, "learning_rate": 1.2234616768621806e-05, "loss": 0.0, "step": 2610 }, { "epoch": 6.599496221662468, "grad_norm": 6.1030144691467285, "learning_rate": 1.1514933429291113e-05, "loss": 0.0009, "step": 2620 }, { "epoch": 6.624685138539043, "grad_norm": 4.862893581390381, "learning_rate": 1.0795250089960418e-05, "loss": 0.1231, "step": 2630 }, { "epoch": 6.6498740554156175, "grad_norm": 598.0420532226562, "learning_rate": 1.0075566750629725e-05, "loss": 0.0001, "step": 2640 }, { "epoch": 6.675062972292191, "grad_norm": 5.682479381561279, "learning_rate": 9.35588341129903e-06, "loss": 0.0365, "step": 2650 }, { "epoch": 6.700251889168766, "grad_norm": 1.800890564918518, "learning_rate": 8.636200071968334e-06, "loss": 0.133, "step": 2660 }, { "epoch": 6.72544080604534, "grad_norm": 718.6954345703125, "learning_rate": 7.91651673263764e-06, "loss": 0.0, "step": 2670 }, { "epoch": 6.750629722921914, "grad_norm": 338166.46875, "learning_rate": 7.196833393306945e-06, "loss": 0.034, "step": 2680 }, { "epoch": 6.775818639798489, "grad_norm": 471.62005615234375, "learning_rate": 6.477150053976251e-06, "loss": 0.0599, "step": 2690 }, { "epoch": 6.801007556675063, "grad_norm": 308.09417724609375, "learning_rate": 5.7574667146455565e-06, "loss": 0.0055, "step": 2700 }, { "epoch": 6.801007556675063, "eval_accuracy": 0.58203125, "eval_f1_macro": 0.5067161880167751, "eval_loss": 4.515384197235107, "eval_runtime": 11.8883, "eval_samples_per_second": 86.135, "eval_steps_per_second": 5.383, "step": 2700 }, { "epoch": 6.826196473551637, "grad_norm": 407.3280334472656, "learning_rate": 5.037783375314862e-06, "loss": 0.0, "step": 2710 }, { "epoch": 6.851385390428211, "grad_norm": 396.4019470214844, "learning_rate": 4.318100035984167e-06, "loss": 0.0015, "step": 2720 }, { "epoch": 6.876574307304786, "grad_norm": 2.2462317943573, "learning_rate": 3.5984166966534725e-06, "loss": 0.0007, "step": 2730 }, { "epoch": 6.9017632241813605, "grad_norm": 40.78224182128906, "learning_rate": 2.8787333573227783e-06, "loss": 0.0001, "step": 2740 }, { "epoch": 6.926952141057934, "grad_norm": 11996.0771484375, "learning_rate": 2.1590500179920836e-06, "loss": 0.0002, "step": 2750 }, { "epoch": 6.952141057934509, "grad_norm": 2.2435834407806396, "learning_rate": 1.4393666786613891e-06, "loss": 0.0001, "step": 2760 }, { "epoch": 6.977329974811083, "grad_norm": 2109.901123046875, "learning_rate": 7.196833393306946e-07, "loss": 0.0832, "step": 2770 }, { "epoch": 7.0, "step": 2779, "total_flos": 2.5585840915697664e+18, "train_loss": 0.4836694428009911, "train_runtime": 1538.4023, "train_samples_per_second": 28.894, "train_steps_per_second": 1.806 } ], "logging_steps": 10, "max_steps": 2779, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.5585840915697664e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }