{ "best_global_step": 2464, "best_metric": 0.7966146756469337, "best_model_checkpoint": "Obstacle/dinov2/checkpoint-2464", "epoch": 22.0, "eval_steps": 500, "global_step": 2464, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0894854586129754, "grad_norm": 68.26221466064453, "learning_rate": 3.214285714285714e-07, "loss": 0.9281, "step": 10 }, { "epoch": 0.1789709172259508, "grad_norm": 49.921424865722656, "learning_rate": 6.785714285714286e-07, "loss": 0.8782, "step": 20 }, { "epoch": 0.2684563758389262, "grad_norm": 63.9870491027832, "learning_rate": 1.0357142857142859e-06, "loss": 0.769, "step": 30 }, { "epoch": 0.3579418344519016, "grad_norm": 27.94622230529785, "learning_rate": 1.392857142857143e-06, "loss": 0.7484, "step": 40 }, { "epoch": 0.44742729306487694, "grad_norm": 31.261272430419922, "learning_rate": 1.75e-06, "loss": 0.7224, "step": 50 }, { "epoch": 0.5369127516778524, "grad_norm": 30.373384475708008, "learning_rate": 2.1071428571428572e-06, "loss": 0.6935, "step": 60 }, { "epoch": 0.6263982102908278, "grad_norm": 25.922801971435547, "learning_rate": 2.4642857142857147e-06, "loss": 0.656, "step": 70 }, { "epoch": 0.7158836689038032, "grad_norm": 21.445323944091797, "learning_rate": 2.8214285714285718e-06, "loss": 0.6086, "step": 80 }, { "epoch": 0.8053691275167785, "grad_norm": 47.42407989501953, "learning_rate": 3.178571428571429e-06, "loss": 0.7216, "step": 90 }, { "epoch": 0.8948545861297539, "grad_norm": 23.003158569335938, "learning_rate": 3.5357142857142863e-06, "loss": 0.6132, "step": 100 }, { "epoch": 0.9843400447427293, "grad_norm": 31.152210235595703, "learning_rate": 3.892857142857143e-06, "loss": 0.6263, "step": 110 }, { "epoch": 1.0, "eval_loss": 0.572486162185669, "eval_macro_f1": 0.6811600432857552, "eval_runtime": 118.5033, "eval_samples_per_second": 6.456, "eval_steps_per_second": 0.81, "step": 112 }, { "epoch": 1.0715883668903803, "grad_norm": 19.629968643188477, "learning_rate": 4.25e-06, "loss": 0.5971, "step": 120 }, { "epoch": 1.1610738255033557, "grad_norm": 38.47784423828125, "learning_rate": 4.6071428571428574e-06, "loss": 0.5379, "step": 130 }, { "epoch": 1.250559284116331, "grad_norm": 24.479284286499023, "learning_rate": 4.964285714285715e-06, "loss": 0.529, "step": 140 }, { "epoch": 1.3400447427293065, "grad_norm": 20.997512817382812, "learning_rate": 5.3214285714285715e-06, "loss": 0.6006, "step": 150 }, { "epoch": 1.429530201342282, "grad_norm": 52.60095977783203, "learning_rate": 5.678571428571429e-06, "loss": 0.7468, "step": 160 }, { "epoch": 1.5190156599552571, "grad_norm": 15.642265319824219, "learning_rate": 6.035714285714286e-06, "loss": 0.559, "step": 170 }, { "epoch": 1.6085011185682325, "grad_norm": 70.16302490234375, "learning_rate": 6.392857142857143e-06, "loss": 0.6043, "step": 180 }, { "epoch": 1.697986577181208, "grad_norm": 26.643476486206055, "learning_rate": 6.750000000000001e-06, "loss": 0.5636, "step": 190 }, { "epoch": 1.7874720357941833, "grad_norm": 177.88034057617188, "learning_rate": 7.107142857142858e-06, "loss": 0.5427, "step": 200 }, { "epoch": 1.8769574944071588, "grad_norm": 38.7111701965332, "learning_rate": 7.464285714285715e-06, "loss": 0.5486, "step": 210 }, { "epoch": 1.9664429530201342, "grad_norm": 21.803434371948242, "learning_rate": 7.821428571428571e-06, "loss": 0.609, "step": 220 }, { "epoch": 2.0, "eval_loss": 0.5907321572303772, "eval_macro_f1": 0.7001507404932478, "eval_runtime": 121.4035, "eval_samples_per_second": 6.301, "eval_steps_per_second": 0.791, "step": 224 }, { "epoch": 2.053691275167785, "grad_norm": 23.68389320373535, "learning_rate": 8.17857142857143e-06, "loss": 0.4989, "step": 230 }, { "epoch": 2.1431767337807606, "grad_norm": 26.48926544189453, "learning_rate": 8.535714285714286e-06, "loss": 0.5119, "step": 240 }, { "epoch": 2.232662192393736, "grad_norm": 25.900455474853516, "learning_rate": 8.892857142857143e-06, "loss": 0.5046, "step": 250 }, { "epoch": 2.3221476510067114, "grad_norm": 26.200103759765625, "learning_rate": 9.250000000000001e-06, "loss": 0.4793, "step": 260 }, { "epoch": 2.411633109619687, "grad_norm": 36.33053970336914, "learning_rate": 9.607142857142858e-06, "loss": 0.5127, "step": 270 }, { "epoch": 2.501118568232662, "grad_norm": 51.3528938293457, "learning_rate": 9.964285714285714e-06, "loss": 0.5188, "step": 280 }, { "epoch": 2.5906040268456376, "grad_norm": 28.10676383972168, "learning_rate": 9.964285714285714e-06, "loss": 0.4858, "step": 290 }, { "epoch": 2.680089485458613, "grad_norm": 25.275537490844727, "learning_rate": 9.924603174603175e-06, "loss": 0.497, "step": 300 }, { "epoch": 2.7695749440715884, "grad_norm": 23.899168014526367, "learning_rate": 9.884920634920636e-06, "loss": 0.4843, "step": 310 }, { "epoch": 2.859060402684564, "grad_norm": 21.87393569946289, "learning_rate": 9.845238095238097e-06, "loss": 0.5006, "step": 320 }, { "epoch": 2.9485458612975393, "grad_norm": 27.74087905883789, "learning_rate": 9.805555555555556e-06, "loss": 0.4471, "step": 330 }, { "epoch": 3.0, "eval_loss": 0.5459941029548645, "eval_macro_f1": 0.726109493936894, "eval_runtime": 121.5217, "eval_samples_per_second": 6.295, "eval_steps_per_second": 0.79, "step": 336 }, { "epoch": 3.0357941834451903, "grad_norm": 41.323997497558594, "learning_rate": 9.765873015873017e-06, "loss": 0.5304, "step": 340 }, { "epoch": 3.1252796420581657, "grad_norm": 31.197467803955078, "learning_rate": 9.726190476190477e-06, "loss": 0.4863, "step": 350 }, { "epoch": 3.214765100671141, "grad_norm": 15.407756805419922, "learning_rate": 9.686507936507938e-06, "loss": 0.4116, "step": 360 }, { "epoch": 3.3042505592841165, "grad_norm": 19.091278076171875, "learning_rate": 9.646825396825397e-06, "loss": 0.4092, "step": 370 }, { "epoch": 3.393736017897092, "grad_norm": 42.068511962890625, "learning_rate": 9.607142857142858e-06, "loss": 0.4627, "step": 380 }, { "epoch": 3.4832214765100673, "grad_norm": 26.687232971191406, "learning_rate": 9.567460317460319e-06, "loss": 0.4487, "step": 390 }, { "epoch": 3.5727069351230423, "grad_norm": 22.164098739624023, "learning_rate": 9.527777777777778e-06, "loss": 0.48, "step": 400 }, { "epoch": 3.662192393736018, "grad_norm": 22.220373153686523, "learning_rate": 9.488095238095238e-06, "loss": 0.53, "step": 410 }, { "epoch": 3.751677852348993, "grad_norm": 8.822561264038086, "learning_rate": 9.4484126984127e-06, "loss": 0.4999, "step": 420 }, { "epoch": 3.841163310961969, "grad_norm": 11.927675247192383, "learning_rate": 9.40873015873016e-06, "loss": 0.4642, "step": 430 }, { "epoch": 3.930648769574944, "grad_norm": 14.933186531066895, "learning_rate": 9.36904761904762e-06, "loss": 0.4145, "step": 440 }, { "epoch": 4.0, "eval_loss": 0.5586118102073669, "eval_macro_f1": 0.741893986276926, "eval_runtime": 120.2217, "eval_samples_per_second": 6.363, "eval_steps_per_second": 0.799, "step": 448 }, { "epoch": 4.017897091722595, "grad_norm": 23.335527420043945, "learning_rate": 9.32936507936508e-06, "loss": 0.4222, "step": 450 }, { "epoch": 4.10738255033557, "grad_norm": 28.71408462524414, "learning_rate": 9.28968253968254e-06, "loss": 0.3959, "step": 460 }, { "epoch": 4.196868008948546, "grad_norm": 20.4088077545166, "learning_rate": 9.250000000000001e-06, "loss": 0.3423, "step": 470 }, { "epoch": 4.286353467561521, "grad_norm": 39.36516189575195, "learning_rate": 9.21031746031746e-06, "loss": 0.3851, "step": 480 }, { "epoch": 4.375838926174497, "grad_norm": 32.29376983642578, "learning_rate": 9.170634920634921e-06, "loss": 0.3766, "step": 490 }, { "epoch": 4.465324384787472, "grad_norm": 13.637434959411621, "learning_rate": 9.130952380952382e-06, "loss": 0.3328, "step": 500 }, { "epoch": 4.554809843400448, "grad_norm": 23.359638214111328, "learning_rate": 9.091269841269843e-06, "loss": 0.3988, "step": 510 }, { "epoch": 4.644295302013423, "grad_norm": 25.244564056396484, "learning_rate": 9.051587301587302e-06, "loss": 0.3149, "step": 520 }, { "epoch": 4.733780760626399, "grad_norm": 273.609619140625, "learning_rate": 9.011904761904762e-06, "loss": 0.3685, "step": 530 }, { "epoch": 4.823266219239374, "grad_norm": 18.818504333496094, "learning_rate": 8.972222222222223e-06, "loss": 0.2867, "step": 540 }, { "epoch": 4.912751677852349, "grad_norm": 19.458040237426758, "learning_rate": 8.932539682539684e-06, "loss": 0.3952, "step": 550 }, { "epoch": 5.0, "grad_norm": 11.257533073425293, "learning_rate": 8.892857142857143e-06, "loss": 0.3553, "step": 560 }, { "epoch": 5.0, "eval_loss": 0.5237393379211426, "eval_macro_f1": 0.7685140098500236, "eval_runtime": 124.8114, "eval_samples_per_second": 6.129, "eval_steps_per_second": 0.769, "step": 560 }, { "epoch": 5.089485458612975, "grad_norm": 16.261404037475586, "learning_rate": 8.853174603174604e-06, "loss": 0.3108, "step": 570 }, { "epoch": 5.178970917225951, "grad_norm": 16.059083938598633, "learning_rate": 8.813492063492064e-06, "loss": 0.2986, "step": 580 }, { "epoch": 5.268456375838926, "grad_norm": 22.006534576416016, "learning_rate": 8.773809523809525e-06, "loss": 0.2952, "step": 590 }, { "epoch": 5.357941834451902, "grad_norm": 16.75338363647461, "learning_rate": 8.734126984126984e-06, "loss": 0.2512, "step": 600 }, { "epoch": 5.447427293064877, "grad_norm": 36.52522659301758, "learning_rate": 8.694444444444445e-06, "loss": 0.2308, "step": 610 }, { "epoch": 5.5369127516778525, "grad_norm": 14.535757064819336, "learning_rate": 8.654761904761906e-06, "loss": 0.3012, "step": 620 }, { "epoch": 5.626398210290827, "grad_norm": 22.867900848388672, "learning_rate": 8.615079365079366e-06, "loss": 0.3232, "step": 630 }, { "epoch": 5.715883668903803, "grad_norm": 19.417451858520508, "learning_rate": 8.575396825396826e-06, "loss": 0.3173, "step": 640 }, { "epoch": 5.805369127516778, "grad_norm": 21.25806427001953, "learning_rate": 8.535714285714286e-06, "loss": 0.3376, "step": 650 }, { "epoch": 5.894854586129754, "grad_norm": 11.842672348022461, "learning_rate": 8.496031746031747e-06, "loss": 0.3756, "step": 660 }, { "epoch": 5.984340044742729, "grad_norm": 16.4525203704834, "learning_rate": 8.456349206349208e-06, "loss": 0.2923, "step": 670 }, { "epoch": 6.0, "eval_loss": 0.5606415271759033, "eval_macro_f1": 0.7435031036046287, "eval_runtime": 115.5398, "eval_samples_per_second": 6.621, "eval_steps_per_second": 0.831, "step": 672 }, { "epoch": 6.0715883668903805, "grad_norm": 20.93574333190918, "learning_rate": 8.416666666666667e-06, "loss": 0.2473, "step": 680 }, { "epoch": 6.1610738255033555, "grad_norm": 26.189205169677734, "learning_rate": 8.376984126984128e-06, "loss": 0.2403, "step": 690 }, { "epoch": 6.250559284116331, "grad_norm": 13.84333610534668, "learning_rate": 8.337301587301588e-06, "loss": 0.231, "step": 700 }, { "epoch": 6.340044742729306, "grad_norm": 22.295377731323242, "learning_rate": 8.297619047619049e-06, "loss": 0.2416, "step": 710 }, { "epoch": 6.429530201342282, "grad_norm": 14.893708229064941, "learning_rate": 8.257936507936508e-06, "loss": 0.2409, "step": 720 }, { "epoch": 6.519015659955257, "grad_norm": 14.828768730163574, "learning_rate": 8.218253968253969e-06, "loss": 0.1927, "step": 730 }, { "epoch": 6.608501118568233, "grad_norm": 23.54037094116211, "learning_rate": 8.17857142857143e-06, "loss": 0.2373, "step": 740 }, { "epoch": 6.697986577181208, "grad_norm": 20.361324310302734, "learning_rate": 8.138888888888889e-06, "loss": 0.2266, "step": 750 }, { "epoch": 6.787472035794184, "grad_norm": 32.86820602416992, "learning_rate": 8.09920634920635e-06, "loss": 0.3065, "step": 760 }, { "epoch": 6.876957494407159, "grad_norm": 25.08152198791504, "learning_rate": 8.05952380952381e-06, "loss": 0.307, "step": 770 }, { "epoch": 6.966442953020135, "grad_norm": 21.257457733154297, "learning_rate": 8.019841269841271e-06, "loss": 0.2943, "step": 780 }, { "epoch": 7.0, "eval_loss": 0.5504983067512512, "eval_macro_f1": 0.7728941735285386, "eval_runtime": 119.5498, "eval_samples_per_second": 6.399, "eval_steps_per_second": 0.803, "step": 784 }, { "epoch": 7.053691275167785, "grad_norm": 13.639359474182129, "learning_rate": 7.980158730158732e-06, "loss": 0.2103, "step": 790 }, { "epoch": 7.143176733780761, "grad_norm": 27.568639755249023, "learning_rate": 7.94047619047619e-06, "loss": 0.1676, "step": 800 }, { "epoch": 7.232662192393736, "grad_norm": 17.152692794799805, "learning_rate": 7.900793650793652e-06, "loss": 0.1818, "step": 810 }, { "epoch": 7.322147651006711, "grad_norm": 16.47798728942871, "learning_rate": 7.861111111111112e-06, "loss": 0.2058, "step": 820 }, { "epoch": 7.411633109619687, "grad_norm": 17.176942825317383, "learning_rate": 7.821428571428571e-06, "loss": 0.1508, "step": 830 }, { "epoch": 7.501118568232663, "grad_norm": 29.886573791503906, "learning_rate": 7.781746031746032e-06, "loss": 0.2434, "step": 840 }, { "epoch": 7.590604026845638, "grad_norm": 22.851221084594727, "learning_rate": 7.742063492063493e-06, "loss": 0.2431, "step": 850 }, { "epoch": 7.680089485458613, "grad_norm": 9.74971866607666, "learning_rate": 7.702380952380954e-06, "loss": 0.1905, "step": 860 }, { "epoch": 7.769574944071588, "grad_norm": 22.99750328063965, "learning_rate": 7.662698412698414e-06, "loss": 0.2215, "step": 870 }, { "epoch": 7.859060402684563, "grad_norm": 29.295093536376953, "learning_rate": 7.623015873015873e-06, "loss": 0.2222, "step": 880 }, { "epoch": 7.948545861297539, "grad_norm": 13.15281867980957, "learning_rate": 7.583333333333333e-06, "loss": 0.2172, "step": 890 }, { "epoch": 8.0, "eval_loss": 0.5693853497505188, "eval_macro_f1": 0.7617472396955967, "eval_runtime": 71.7674, "eval_samples_per_second": 10.659, "eval_steps_per_second": 1.338, "step": 896 }, { "epoch": 8.03579418344519, "grad_norm": 17.01239585876465, "learning_rate": 7.543650793650794e-06, "loss": 0.2183, "step": 900 }, { "epoch": 8.125279642058166, "grad_norm": 16.31254768371582, "learning_rate": 7.503968253968255e-06, "loss": 0.1393, "step": 910 }, { "epoch": 8.21476510067114, "grad_norm": 21.926393508911133, "learning_rate": 7.464285714285715e-06, "loss": 0.2104, "step": 920 }, { "epoch": 8.304250559284116, "grad_norm": 16.77183723449707, "learning_rate": 7.4246031746031754e-06, "loss": 0.1808, "step": 930 }, { "epoch": 8.393736017897092, "grad_norm": 10.125628471374512, "learning_rate": 7.384920634920636e-06, "loss": 0.142, "step": 940 }, { "epoch": 8.483221476510067, "grad_norm": 8.853920936584473, "learning_rate": 7.345238095238096e-06, "loss": 0.1674, "step": 950 }, { "epoch": 8.572706935123042, "grad_norm": 16.407033920288086, "learning_rate": 7.305555555555556e-06, "loss": 0.1684, "step": 960 }, { "epoch": 8.662192393736017, "grad_norm": 19.892669677734375, "learning_rate": 7.265873015873016e-06, "loss": 0.1673, "step": 970 }, { "epoch": 8.751677852348994, "grad_norm": 5.113985061645508, "learning_rate": 7.226190476190477e-06, "loss": 0.2087, "step": 980 }, { "epoch": 8.841163310961969, "grad_norm": 10.283279418945312, "learning_rate": 7.186507936507937e-06, "loss": 0.1728, "step": 990 }, { "epoch": 8.930648769574944, "grad_norm": 11.258045196533203, "learning_rate": 7.146825396825397e-06, "loss": 0.2067, "step": 1000 }, { "epoch": 9.0, "eval_loss": 0.6666957139968872, "eval_macro_f1": 0.7585227272727273, "eval_runtime": 75.2405, "eval_samples_per_second": 10.167, "eval_steps_per_second": 1.276, "step": 1008 }, { "epoch": 9.017897091722595, "grad_norm": 31.886018753051758, "learning_rate": 7.107142857142858e-06, "loss": 0.1853, "step": 1010 }, { "epoch": 9.10738255033557, "grad_norm": 21.961132049560547, "learning_rate": 7.067460317460319e-06, "loss": 0.1823, "step": 1020 }, { "epoch": 9.196868008948545, "grad_norm": 9.536689758300781, "learning_rate": 7.027777777777778e-06, "loss": 0.1289, "step": 1030 }, { "epoch": 9.286353467561522, "grad_norm": 17.52619171142578, "learning_rate": 6.988095238095239e-06, "loss": 0.1421, "step": 1040 }, { "epoch": 9.375838926174497, "grad_norm": 5.5908379554748535, "learning_rate": 6.9484126984126985e-06, "loss": 0.1024, "step": 1050 }, { "epoch": 9.465324384787472, "grad_norm": 15.923222541809082, "learning_rate": 6.908730158730159e-06, "loss": 0.1363, "step": 1060 }, { "epoch": 9.554809843400447, "grad_norm": 7.426005840301514, "learning_rate": 6.86904761904762e-06, "loss": 0.1851, "step": 1070 }, { "epoch": 9.644295302013422, "grad_norm": 2.362064838409424, "learning_rate": 6.82936507936508e-06, "loss": 0.1177, "step": 1080 }, { "epoch": 9.733780760626399, "grad_norm": 16.235544204711914, "learning_rate": 6.789682539682541e-06, "loss": 0.0991, "step": 1090 }, { "epoch": 9.823266219239374, "grad_norm": 8.337503433227539, "learning_rate": 6.750000000000001e-06, "loss": 0.13, "step": 1100 }, { "epoch": 9.912751677852349, "grad_norm": 34.09331130981445, "learning_rate": 6.7103174603174605e-06, "loss": 0.1171, "step": 1110 }, { "epoch": 10.0, "grad_norm": 25.452791213989258, "learning_rate": 6.67063492063492e-06, "loss": 0.1817, "step": 1120 }, { "epoch": 10.0, "eval_loss": 0.8301987051963806, "eval_macro_f1": 0.7432961635470596, "eval_runtime": 71.6774, "eval_samples_per_second": 10.673, "eval_steps_per_second": 1.339, "step": 1120 }, { "epoch": 10.089485458612975, "grad_norm": 19.931293487548828, "learning_rate": 6.630952380952381e-06, "loss": 0.1405, "step": 1130 }, { "epoch": 10.17897091722595, "grad_norm": 20.297443389892578, "learning_rate": 6.591269841269842e-06, "loss": 0.1359, "step": 1140 }, { "epoch": 10.268456375838927, "grad_norm": 11.013289451599121, "learning_rate": 6.551587301587302e-06, "loss": 0.0918, "step": 1150 }, { "epoch": 10.357941834451902, "grad_norm": 19.394319534301758, "learning_rate": 6.5119047619047626e-06, "loss": 0.1392, "step": 1160 }, { "epoch": 10.447427293064877, "grad_norm": 28.343791961669922, "learning_rate": 6.472222222222223e-06, "loss": 0.1866, "step": 1170 }, { "epoch": 10.536912751677852, "grad_norm": 14.38354778289795, "learning_rate": 6.432539682539683e-06, "loss": 0.1381, "step": 1180 }, { "epoch": 10.626398210290828, "grad_norm": 24.752470016479492, "learning_rate": 6.392857142857143e-06, "loss": 0.202, "step": 1190 }, { "epoch": 10.715883668903803, "grad_norm": 19.56192970275879, "learning_rate": 6.353174603174603e-06, "loss": 0.1566, "step": 1200 }, { "epoch": 10.805369127516778, "grad_norm": 20.234485626220703, "learning_rate": 6.313492063492064e-06, "loss": 0.1061, "step": 1210 }, { "epoch": 10.894854586129753, "grad_norm": 19.531757354736328, "learning_rate": 6.2738095238095245e-06, "loss": 0.1873, "step": 1220 }, { "epoch": 10.98434004474273, "grad_norm": 16.655208587646484, "learning_rate": 6.2341269841269844e-06, "loss": 0.1102, "step": 1230 }, { "epoch": 11.0, "eval_loss": 0.6939279437065125, "eval_macro_f1": 0.769257323981233, "eval_runtime": 73.198, "eval_samples_per_second": 10.451, "eval_steps_per_second": 1.312, "step": 1232 }, { "epoch": 11.07158836689038, "grad_norm": 5.695398330688477, "learning_rate": 6.194444444444445e-06, "loss": 0.1283, "step": 1240 }, { "epoch": 11.161073825503356, "grad_norm": 9.851438522338867, "learning_rate": 6.154761904761906e-06, "loss": 0.1065, "step": 1250 }, { "epoch": 11.250559284116331, "grad_norm": 29.838871002197266, "learning_rate": 6.115079365079366e-06, "loss": 0.1426, "step": 1260 }, { "epoch": 11.340044742729306, "grad_norm": 7.144505500793457, "learning_rate": 6.075396825396826e-06, "loss": 0.093, "step": 1270 }, { "epoch": 11.429530201342281, "grad_norm": 15.980908393859863, "learning_rate": 6.035714285714286e-06, "loss": 0.1508, "step": 1280 }, { "epoch": 11.519015659955258, "grad_norm": 23.028108596801758, "learning_rate": 5.996031746031746e-06, "loss": 0.1719, "step": 1290 }, { "epoch": 11.608501118568233, "grad_norm": 14.000625610351562, "learning_rate": 5.956349206349207e-06, "loss": 0.1063, "step": 1300 }, { "epoch": 11.697986577181208, "grad_norm": 13.606029510498047, "learning_rate": 5.916666666666667e-06, "loss": 0.1194, "step": 1310 }, { "epoch": 11.787472035794183, "grad_norm": 13.779529571533203, "learning_rate": 5.876984126984128e-06, "loss": 0.1728, "step": 1320 }, { "epoch": 11.87695749440716, "grad_norm": 10.599024772644043, "learning_rate": 5.8373015873015886e-06, "loss": 0.0797, "step": 1330 }, { "epoch": 11.966442953020135, "grad_norm": 11.666370391845703, "learning_rate": 5.7976190476190485e-06, "loss": 0.1175, "step": 1340 }, { "epoch": 12.0, "eval_loss": 0.6560544967651367, "eval_macro_f1": 0.7722084367245657, "eval_runtime": 65.1441, "eval_samples_per_second": 11.743, "eval_steps_per_second": 1.474, "step": 1344 }, { "epoch": 12.053691275167786, "grad_norm": 12.950461387634277, "learning_rate": 5.7579365079365075e-06, "loss": 0.1042, "step": 1350 }, { "epoch": 12.143176733780761, "grad_norm": 9.609066009521484, "learning_rate": 5.718253968253968e-06, "loss": 0.12, "step": 1360 }, { "epoch": 12.232662192393736, "grad_norm": 23.64732551574707, "learning_rate": 5.678571428571429e-06, "loss": 0.1145, "step": 1370 }, { "epoch": 12.322147651006711, "grad_norm": 32.57529067993164, "learning_rate": 5.638888888888889e-06, "loss": 0.0992, "step": 1380 }, { "epoch": 12.411633109619686, "grad_norm": 7.419304370880127, "learning_rate": 5.59920634920635e-06, "loss": 0.0988, "step": 1390 }, { "epoch": 12.501118568232663, "grad_norm": 6.912314414978027, "learning_rate": 5.5595238095238104e-06, "loss": 0.1157, "step": 1400 }, { "epoch": 12.590604026845638, "grad_norm": 19.71913719177246, "learning_rate": 5.51984126984127e-06, "loss": 0.1478, "step": 1410 }, { "epoch": 12.680089485458613, "grad_norm": 20.64476203918457, "learning_rate": 5.480158730158731e-06, "loss": 0.1196, "step": 1420 }, { "epoch": 12.769574944071588, "grad_norm": 15.437020301818848, "learning_rate": 5.44047619047619e-06, "loss": 0.0766, "step": 1430 }, { "epoch": 12.859060402684564, "grad_norm": 9.825302124023438, "learning_rate": 5.400793650793651e-06, "loss": 0.1254, "step": 1440 }, { "epoch": 12.94854586129754, "grad_norm": 26.381696701049805, "learning_rate": 5.361111111111112e-06, "loss": 0.1025, "step": 1450 }, { "epoch": 13.0, "eval_loss": 0.7549577951431274, "eval_macro_f1": 0.7658037701779237, "eval_runtime": 75.4344, "eval_samples_per_second": 10.141, "eval_steps_per_second": 1.273, "step": 1456 }, { "epoch": 13.03579418344519, "grad_norm": 25.113908767700195, "learning_rate": 5.3214285714285715e-06, "loss": 0.1438, "step": 1460 }, { "epoch": 13.125279642058166, "grad_norm": 10.833968162536621, "learning_rate": 5.281746031746032e-06, "loss": 0.1749, "step": 1470 }, { "epoch": 13.21476510067114, "grad_norm": 22.27955436706543, "learning_rate": 5.242063492063493e-06, "loss": 0.1608, "step": 1480 }, { "epoch": 13.304250559284116, "grad_norm": 15.125386238098145, "learning_rate": 5.202380952380953e-06, "loss": 0.1503, "step": 1490 }, { "epoch": 13.393736017897092, "grad_norm": 3.4376182556152344, "learning_rate": 5.162698412698414e-06, "loss": 0.0819, "step": 1500 }, { "epoch": 13.483221476510067, "grad_norm": 15.98349380493164, "learning_rate": 5.123015873015873e-06, "loss": 0.0837, "step": 1510 }, { "epoch": 13.572706935123042, "grad_norm": 11.065319061279297, "learning_rate": 5.0833333333333335e-06, "loss": 0.1035, "step": 1520 }, { "epoch": 13.662192393736017, "grad_norm": 10.306619644165039, "learning_rate": 5.043650793650794e-06, "loss": 0.1075, "step": 1530 }, { "epoch": 13.751677852348994, "grad_norm": 15.42297077178955, "learning_rate": 5.003968253968254e-06, "loss": 0.1208, "step": 1540 }, { "epoch": 13.841163310961969, "grad_norm": 20.406225204467773, "learning_rate": 4.964285714285715e-06, "loss": 0.1014, "step": 1550 }, { "epoch": 13.930648769574944, "grad_norm": 16.1427059173584, "learning_rate": 4.924603174603176e-06, "loss": 0.1593, "step": 1560 }, { "epoch": 14.0, "eval_loss": 0.7316491007804871, "eval_macro_f1": 0.7694998475634216, "eval_runtime": 69.4982, "eval_samples_per_second": 11.007, "eval_steps_per_second": 1.381, "step": 1568 }, { "epoch": 14.017897091722595, "grad_norm": 9.524744987487793, "learning_rate": 4.8849206349206356e-06, "loss": 0.1199, "step": 1570 }, { "epoch": 14.10738255033557, "grad_norm": 14.937419891357422, "learning_rate": 4.8452380952380955e-06, "loss": 0.1189, "step": 1580 }, { "epoch": 14.196868008948545, "grad_norm": 19.528654098510742, "learning_rate": 4.805555555555556e-06, "loss": 0.1114, "step": 1590 }, { "epoch": 14.286353467561522, "grad_norm": 2.1810483932495117, "learning_rate": 4.765873015873016e-06, "loss": 0.0805, "step": 1600 }, { "epoch": 14.375838926174497, "grad_norm": 5.020853042602539, "learning_rate": 4.726190476190476e-06, "loss": 0.1097, "step": 1610 }, { "epoch": 14.465324384787472, "grad_norm": 12.4754638671875, "learning_rate": 4.686507936507937e-06, "loss": 0.1231, "step": 1620 }, { "epoch": 14.554809843400447, "grad_norm": 25.44110679626465, "learning_rate": 4.6468253968253975e-06, "loss": 0.0903, "step": 1630 }, { "epoch": 14.644295302013422, "grad_norm": 27.849111557006836, "learning_rate": 4.6071428571428574e-06, "loss": 0.0905, "step": 1640 }, { "epoch": 14.733780760626399, "grad_norm": 18.283781051635742, "learning_rate": 4.567460317460317e-06, "loss": 0.0617, "step": 1650 }, { "epoch": 14.823266219239374, "grad_norm": 17.507619857788086, "learning_rate": 4.527777777777778e-06, "loss": 0.13, "step": 1660 }, { "epoch": 14.912751677852349, "grad_norm": 21.756675720214844, "learning_rate": 4.488095238095239e-06, "loss": 0.0926, "step": 1670 }, { "epoch": 15.0, "grad_norm": 19.568700790405273, "learning_rate": 4.448412698412699e-06, "loss": 0.0954, "step": 1680 }, { "epoch": 15.0, "eval_loss": 0.6716505885124207, "eval_macro_f1": 0.7774340887550624, "eval_runtime": 72.4554, "eval_samples_per_second": 10.558, "eval_steps_per_second": 1.325, "step": 1680 }, { "epoch": 15.089485458612975, "grad_norm": 18.1299991607666, "learning_rate": 4.408730158730159e-06, "loss": 0.0678, "step": 1690 }, { "epoch": 15.17897091722595, "grad_norm": 4.439563751220703, "learning_rate": 4.369047619047619e-06, "loss": 0.0767, "step": 1700 }, { "epoch": 15.268456375838927, "grad_norm": 14.997693061828613, "learning_rate": 4.32936507936508e-06, "loss": 0.0664, "step": 1710 }, { "epoch": 15.357941834451902, "grad_norm": 7.552863597869873, "learning_rate": 4.28968253968254e-06, "loss": 0.0842, "step": 1720 }, { "epoch": 15.447427293064877, "grad_norm": 8.728134155273438, "learning_rate": 4.25e-06, "loss": 0.1015, "step": 1730 }, { "epoch": 15.536912751677852, "grad_norm": 11.534658432006836, "learning_rate": 4.210317460317461e-06, "loss": 0.1085, "step": 1740 }, { "epoch": 15.626398210290828, "grad_norm": 22.487648010253906, "learning_rate": 4.1706349206349215e-06, "loss": 0.0993, "step": 1750 }, { "epoch": 15.715883668903803, "grad_norm": 4.885320663452148, "learning_rate": 4.130952380952381e-06, "loss": 0.093, "step": 1760 }, { "epoch": 15.805369127516778, "grad_norm": 16.688884735107422, "learning_rate": 4.091269841269841e-06, "loss": 0.0732, "step": 1770 }, { "epoch": 15.894854586129753, "grad_norm": 30.58871841430664, "learning_rate": 4.051587301587302e-06, "loss": 0.0867, "step": 1780 }, { "epoch": 15.98434004474273, "grad_norm": 17.19178009033203, "learning_rate": 4.011904761904763e-06, "loss": 0.0676, "step": 1790 }, { "epoch": 16.0, "eval_loss": 0.7994140386581421, "eval_macro_f1": 0.7741348066298342, "eval_runtime": 75.3551, "eval_samples_per_second": 10.152, "eval_steps_per_second": 1.274, "step": 1792 }, { "epoch": 16.07158836689038, "grad_norm": 10.005722999572754, "learning_rate": 3.972222222222223e-06, "loss": 0.0858, "step": 1800 }, { "epoch": 16.161073825503355, "grad_norm": 20.64703369140625, "learning_rate": 3.932539682539683e-06, "loss": 0.0836, "step": 1810 }, { "epoch": 16.25055928411633, "grad_norm": 8.218932151794434, "learning_rate": 3.892857142857143e-06, "loss": 0.0969, "step": 1820 }, { "epoch": 16.340044742729308, "grad_norm": 0.8766313195228577, "learning_rate": 3.853174603174604e-06, "loss": 0.0432, "step": 1830 }, { "epoch": 16.42953020134228, "grad_norm": 12.166719436645508, "learning_rate": 3.8134920634920636e-06, "loss": 0.0844, "step": 1840 }, { "epoch": 16.519015659955258, "grad_norm": 8.594962120056152, "learning_rate": 3.773809523809524e-06, "loss": 0.1276, "step": 1850 }, { "epoch": 16.60850111856823, "grad_norm": 2.2263548374176025, "learning_rate": 3.7341269841269846e-06, "loss": 0.0661, "step": 1860 }, { "epoch": 16.697986577181208, "grad_norm": 6.170251846313477, "learning_rate": 3.694444444444445e-06, "loss": 0.1007, "step": 1870 }, { "epoch": 16.787472035794185, "grad_norm": 8.359641075134277, "learning_rate": 3.654761904761905e-06, "loss": 0.0993, "step": 1880 }, { "epoch": 16.876957494407158, "grad_norm": 12.583647727966309, "learning_rate": 3.615079365079365e-06, "loss": 0.0826, "step": 1890 }, { "epoch": 16.966442953020135, "grad_norm": 6.046731948852539, "learning_rate": 3.575396825396826e-06, "loss": 0.0444, "step": 1900 }, { "epoch": 17.0, "eval_loss": 0.747003436088562, "eval_macro_f1": 0.7839196689592837, "eval_runtime": 75.1906, "eval_samples_per_second": 10.174, "eval_steps_per_second": 1.277, "step": 1904 }, { "epoch": 17.053691275167786, "grad_norm": 20.228485107421875, "learning_rate": 3.5357142857142863e-06, "loss": 0.0961, "step": 1910 }, { "epoch": 17.14317673378076, "grad_norm": 26.031396865844727, "learning_rate": 3.496031746031746e-06, "loss": 0.0873, "step": 1920 }, { "epoch": 17.232662192393736, "grad_norm": 12.642768859863281, "learning_rate": 3.4563492063492065e-06, "loss": 0.1064, "step": 1930 }, { "epoch": 17.322147651006713, "grad_norm": 18.097814559936523, "learning_rate": 3.416666666666667e-06, "loss": 0.0768, "step": 1940 }, { "epoch": 17.411633109619686, "grad_norm": 10.280755043029785, "learning_rate": 3.3769841269841276e-06, "loss": 0.1336, "step": 1950 }, { "epoch": 17.501118568232663, "grad_norm": 11.690203666687012, "learning_rate": 3.3373015873015875e-06, "loss": 0.0707, "step": 1960 }, { "epoch": 17.59060402684564, "grad_norm": 22.514507293701172, "learning_rate": 3.297619047619048e-06, "loss": 0.1073, "step": 1970 }, { "epoch": 17.680089485458613, "grad_norm": 10.396025657653809, "learning_rate": 3.257936507936508e-06, "loss": 0.0543, "step": 1980 }, { "epoch": 17.76957494407159, "grad_norm": 12.947179794311523, "learning_rate": 3.218253968253969e-06, "loss": 0.0768, "step": 1990 }, { "epoch": 17.859060402684563, "grad_norm": 2.5873477458953857, "learning_rate": 3.178571428571429e-06, "loss": 0.0766, "step": 2000 }, { "epoch": 17.94854586129754, "grad_norm": 6.163917064666748, "learning_rate": 3.138888888888889e-06, "loss": 0.0455, "step": 2010 }, { "epoch": 18.0, "eval_loss": 0.8047569990158081, "eval_macro_f1": 0.7781014378358804, "eval_runtime": 66.3878, "eval_samples_per_second": 11.523, "eval_steps_per_second": 1.446, "step": 2016 }, { "epoch": 18.03579418344519, "grad_norm": 4.654578685760498, "learning_rate": 3.0992063492063495e-06, "loss": 0.0684, "step": 2020 }, { "epoch": 18.125279642058164, "grad_norm": 1.1822065114974976, "learning_rate": 3.05952380952381e-06, "loss": 0.0961, "step": 2030 }, { "epoch": 18.21476510067114, "grad_norm": 21.223234176635742, "learning_rate": 3.0198412698412697e-06, "loss": 0.1265, "step": 2040 }, { "epoch": 18.304250559284117, "grad_norm": 12.335346221923828, "learning_rate": 2.9801587301587305e-06, "loss": 0.1095, "step": 2050 }, { "epoch": 18.39373601789709, "grad_norm": 4.7988715171813965, "learning_rate": 2.9404761904761908e-06, "loss": 0.0944, "step": 2060 }, { "epoch": 18.483221476510067, "grad_norm": 28.275365829467773, "learning_rate": 2.900793650793651e-06, "loss": 0.0549, "step": 2070 }, { "epoch": 18.572706935123044, "grad_norm": 7.988637447357178, "learning_rate": 2.861111111111111e-06, "loss": 0.0881, "step": 2080 }, { "epoch": 18.662192393736017, "grad_norm": 9.342594146728516, "learning_rate": 2.8214285714285718e-06, "loss": 0.0528, "step": 2090 }, { "epoch": 18.751677852348994, "grad_norm": 12.687505722045898, "learning_rate": 2.781746031746032e-06, "loss": 0.0711, "step": 2100 }, { "epoch": 18.841163310961967, "grad_norm": 7.692240238189697, "learning_rate": 2.7420634920634924e-06, "loss": 0.072, "step": 2110 }, { "epoch": 18.930648769574944, "grad_norm": 14.148133277893066, "learning_rate": 2.7023809523809523e-06, "loss": 0.0677, "step": 2120 }, { "epoch": 19.0, "eval_loss": 0.8003770709037781, "eval_macro_f1": 0.766772035108551, "eval_runtime": 25.3025, "eval_samples_per_second": 30.234, "eval_steps_per_second": 3.794, "step": 2128 }, { "epoch": 19.017897091722595, "grad_norm": 17.935680389404297, "learning_rate": 2.662698412698413e-06, "loss": 0.0416, "step": 2130 }, { "epoch": 19.107382550335572, "grad_norm": 7.1221537590026855, "learning_rate": 2.6230158730158734e-06, "loss": 0.1003, "step": 2140 }, { "epoch": 19.196868008948545, "grad_norm": 17.375965118408203, "learning_rate": 2.5833333333333337e-06, "loss": 0.0854, "step": 2150 }, { "epoch": 19.286353467561522, "grad_norm": 13.114810943603516, "learning_rate": 2.5436507936507936e-06, "loss": 0.0759, "step": 2160 }, { "epoch": 19.375838926174495, "grad_norm": 20.650806427001953, "learning_rate": 2.503968253968254e-06, "loss": 0.0576, "step": 2170 }, { "epoch": 19.465324384787472, "grad_norm": 1.7908034324645996, "learning_rate": 2.4642857142857147e-06, "loss": 0.0314, "step": 2180 }, { "epoch": 19.55480984340045, "grad_norm": 15.814742088317871, "learning_rate": 2.4246031746031746e-06, "loss": 0.0754, "step": 2190 }, { "epoch": 19.644295302013422, "grad_norm": 18.996606826782227, "learning_rate": 2.3849206349206354e-06, "loss": 0.0502, "step": 2200 }, { "epoch": 19.7337807606264, "grad_norm": 24.4049015045166, "learning_rate": 2.3452380952380953e-06, "loss": 0.0502, "step": 2210 }, { "epoch": 19.823266219239372, "grad_norm": 6.1143879890441895, "learning_rate": 2.305555555555556e-06, "loss": 0.054, "step": 2220 }, { "epoch": 19.91275167785235, "grad_norm": 12.98304557800293, "learning_rate": 2.265873015873016e-06, "loss": 0.0737, "step": 2230 }, { "epoch": 20.0, "grad_norm": 0.3279534876346588, "learning_rate": 2.2261904761904763e-06, "loss": 0.0353, "step": 2240 }, { "epoch": 20.0, "eval_loss": 0.8695369958877563, "eval_macro_f1": 0.7699478748997595, "eval_runtime": 25.1606, "eval_samples_per_second": 30.405, "eval_steps_per_second": 3.815, "step": 2240 }, { "epoch": 20.089485458612977, "grad_norm": 5.161200523376465, "learning_rate": 2.1865079365079366e-06, "loss": 0.0489, "step": 2250 }, { "epoch": 20.17897091722595, "grad_norm": 24.046892166137695, "learning_rate": 2.146825396825397e-06, "loss": 0.0887, "step": 2260 }, { "epoch": 20.268456375838927, "grad_norm": 3.9411873817443848, "learning_rate": 2.1071428571428572e-06, "loss": 0.0804, "step": 2270 }, { "epoch": 20.3579418344519, "grad_norm": 12.20919418334961, "learning_rate": 2.0674603174603176e-06, "loss": 0.0318, "step": 2280 }, { "epoch": 20.447427293064877, "grad_norm": 5.021272659301758, "learning_rate": 2.027777777777778e-06, "loss": 0.0983, "step": 2290 }, { "epoch": 20.536912751677853, "grad_norm": 15.741971969604492, "learning_rate": 1.9880952380952382e-06, "loss": 0.0335, "step": 2300 }, { "epoch": 20.626398210290827, "grad_norm": 16.82331085205078, "learning_rate": 1.9484126984126985e-06, "loss": 0.0744, "step": 2310 }, { "epoch": 20.715883668903803, "grad_norm": 0.5343822240829468, "learning_rate": 1.908730158730159e-06, "loss": 0.0432, "step": 2320 }, { "epoch": 20.80536912751678, "grad_norm": 18.707128524780273, "learning_rate": 1.8690476190476192e-06, "loss": 0.0517, "step": 2330 }, { "epoch": 20.894854586129753, "grad_norm": 10.502820014953613, "learning_rate": 1.8293650793650793e-06, "loss": 0.0596, "step": 2340 }, { "epoch": 20.98434004474273, "grad_norm": 3.260993242263794, "learning_rate": 1.7896825396825399e-06, "loss": 0.0262, "step": 2350 }, { "epoch": 21.0, "eval_loss": 0.8509567975997925, "eval_macro_f1": 0.7710241615578796, "eval_runtime": 21.2423, "eval_samples_per_second": 36.013, "eval_steps_per_second": 4.519, "step": 2352 }, { "epoch": 21.07158836689038, "grad_norm": 21.054967880249023, "learning_rate": 1.75e-06, "loss": 0.0623, "step": 2360 }, { "epoch": 21.161073825503355, "grad_norm": 7.076012134552002, "learning_rate": 1.7103174603174605e-06, "loss": 0.0567, "step": 2370 }, { "epoch": 21.25055928411633, "grad_norm": 9.261219024658203, "learning_rate": 1.6706349206349206e-06, "loss": 0.0805, "step": 2380 }, { "epoch": 21.340044742729308, "grad_norm": 21.940967559814453, "learning_rate": 1.6309523809523812e-06, "loss": 0.062, "step": 2390 }, { "epoch": 21.42953020134228, "grad_norm": 0.5533654093742371, "learning_rate": 1.5912698412698413e-06, "loss": 0.0443, "step": 2400 }, { "epoch": 21.519015659955258, "grad_norm": 1.7748634815216064, "learning_rate": 1.5515873015873018e-06, "loss": 0.0465, "step": 2410 }, { "epoch": 21.60850111856823, "grad_norm": 14.389286994934082, "learning_rate": 1.511904761904762e-06, "loss": 0.0426, "step": 2420 }, { "epoch": 21.697986577181208, "grad_norm": 0.1624564379453659, "learning_rate": 1.4722222222222225e-06, "loss": 0.0399, "step": 2430 }, { "epoch": 21.787472035794185, "grad_norm": 16.362260818481445, "learning_rate": 1.4325396825396826e-06, "loss": 0.0737, "step": 2440 }, { "epoch": 21.876957494407158, "grad_norm": 9.44295883178711, "learning_rate": 1.392857142857143e-06, "loss": 0.1062, "step": 2450 }, { "epoch": 21.966442953020135, "grad_norm": 12.541874885559082, "learning_rate": 1.3531746031746033e-06, "loss": 0.0628, "step": 2460 }, { "epoch": 22.0, "eval_loss": 0.8052472472190857, "eval_macro_f1": 0.7966146756469337, "eval_runtime": 25.2623, "eval_samples_per_second": 30.282, "eval_steps_per_second": 3.8, "step": 2464 } ], "logging_steps": 10, "max_steps": 2800, "num_input_tokens_seen": 0, "num_train_epochs": 25, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.8247393356405015e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }