{
  "best_global_step": 2464,
  "best_metric": 0.7966146756469337,
  "best_model_checkpoint": "Obstacle/dinov2/checkpoint-2464",
  "epoch": 22.0,
  "eval_steps": 500,
  "global_step": 2464,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0894854586129754,
      "grad_norm": 68.26221466064453,
      "learning_rate": 3.214285714285714e-07,
      "loss": 0.9281,
      "step": 10
    },
    {
      "epoch": 0.1789709172259508,
      "grad_norm": 49.921424865722656,
      "learning_rate": 6.785714285714286e-07,
      "loss": 0.8782,
      "step": 20
    },
    {
      "epoch": 0.2684563758389262,
      "grad_norm": 63.9870491027832,
      "learning_rate": 1.0357142857142859e-06,
      "loss": 0.769,
      "step": 30
    },
    {
      "epoch": 0.3579418344519016,
      "grad_norm": 27.94622230529785,
      "learning_rate": 1.392857142857143e-06,
      "loss": 0.7484,
      "step": 40
    },
    {
      "epoch": 0.44742729306487694,
      "grad_norm": 31.261272430419922,
      "learning_rate": 1.75e-06,
      "loss": 0.7224,
      "step": 50
    },
    {
      "epoch": 0.5369127516778524,
      "grad_norm": 30.373384475708008,
      "learning_rate": 2.1071428571428572e-06,
      "loss": 0.6935,
      "step": 60
    },
    {
      "epoch": 0.6263982102908278,
      "grad_norm": 25.922801971435547,
      "learning_rate": 2.4642857142857147e-06,
      "loss": 0.656,
      "step": 70
    },
    {
      "epoch": 0.7158836689038032,
      "grad_norm": 21.445323944091797,
      "learning_rate": 2.8214285714285718e-06,
      "loss": 0.6086,
      "step": 80
    },
    {
      "epoch": 0.8053691275167785,
      "grad_norm": 47.42407989501953,
      "learning_rate": 3.178571428571429e-06,
      "loss": 0.7216,
      "step": 90
    },
    {
      "epoch": 0.8948545861297539,
      "grad_norm": 23.003158569335938,
      "learning_rate": 3.5357142857142863e-06,
      "loss": 0.6132,
      "step": 100
    },
    {
      "epoch": 0.9843400447427293,
      "grad_norm": 31.152210235595703,
      "learning_rate": 3.892857142857143e-06,
      "loss": 0.6263,
      "step": 110
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.572486162185669,
      "eval_macro_f1": 0.6811600432857552,
      "eval_runtime": 118.5033,
      "eval_samples_per_second": 6.456,
      "eval_steps_per_second": 0.81,
      "step": 112
    },
    {
      "epoch": 1.0715883668903803,
      "grad_norm": 19.629968643188477,
      "learning_rate": 4.25e-06,
      "loss": 0.5971,
      "step": 120
    },
    {
      "epoch": 1.1610738255033557,
      "grad_norm": 38.47784423828125,
      "learning_rate": 4.6071428571428574e-06,
      "loss": 0.5379,
      "step": 130
    },
    {
      "epoch": 1.250559284116331,
      "grad_norm": 24.479284286499023,
      "learning_rate": 4.964285714285715e-06,
      "loss": 0.529,
      "step": 140
    },
    {
      "epoch": 1.3400447427293065,
      "grad_norm": 20.997512817382812,
      "learning_rate": 5.3214285714285715e-06,
      "loss": 0.6006,
      "step": 150
    },
    {
      "epoch": 1.429530201342282,
      "grad_norm": 52.60095977783203,
      "learning_rate": 5.678571428571429e-06,
      "loss": 0.7468,
      "step": 160
    },
    {
      "epoch": 1.5190156599552571,
      "grad_norm": 15.642265319824219,
      "learning_rate": 6.035714285714286e-06,
      "loss": 0.559,
      "step": 170
    },
    {
      "epoch": 1.6085011185682325,
      "grad_norm": 70.16302490234375,
      "learning_rate": 6.392857142857143e-06,
      "loss": 0.6043,
      "step": 180
    },
    {
      "epoch": 1.697986577181208,
      "grad_norm": 26.643476486206055,
      "learning_rate": 6.750000000000001e-06,
      "loss": 0.5636,
      "step": 190
    },
    {
      "epoch": 1.7874720357941833,
      "grad_norm": 177.88034057617188,
      "learning_rate": 7.107142857142858e-06,
      "loss": 0.5427,
      "step": 200
    },
    {
      "epoch": 1.8769574944071588,
      "grad_norm": 38.7111701965332,
      "learning_rate": 7.464285714285715e-06,
      "loss": 0.5486,
      "step": 210
    },
    {
      "epoch": 1.9664429530201342,
      "grad_norm": 21.803434371948242,
      "learning_rate": 7.821428571428571e-06,
      "loss": 0.609,
      "step": 220
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.5907321572303772,
      "eval_macro_f1": 0.7001507404932478,
      "eval_runtime": 121.4035,
      "eval_samples_per_second": 6.301,
      "eval_steps_per_second": 0.791,
      "step": 224
    },
    {
      "epoch": 2.053691275167785,
      "grad_norm": 23.68389320373535,
      "learning_rate": 8.17857142857143e-06,
      "loss": 0.4989,
      "step": 230
    },
    {
      "epoch": 2.1431767337807606,
      "grad_norm": 26.48926544189453,
      "learning_rate": 8.535714285714286e-06,
      "loss": 0.5119,
      "step": 240
    },
    {
      "epoch": 2.232662192393736,
      "grad_norm": 25.900455474853516,
      "learning_rate": 8.892857142857143e-06,
      "loss": 0.5046,
      "step": 250
    },
    {
      "epoch": 2.3221476510067114,
      "grad_norm": 26.200103759765625,
      "learning_rate": 9.250000000000001e-06,
      "loss": 0.4793,
      "step": 260
    },
    {
      "epoch": 2.411633109619687,
      "grad_norm": 36.33053970336914,
      "learning_rate": 9.607142857142858e-06,
      "loss": 0.5127,
      "step": 270
    },
    {
      "epoch": 2.501118568232662,
      "grad_norm": 51.3528938293457,
      "learning_rate": 9.964285714285714e-06,
      "loss": 0.5188,
      "step": 280
    },
    {
      "epoch": 2.5906040268456376,
      "grad_norm": 28.10676383972168,
      "learning_rate": 9.964285714285714e-06,
      "loss": 0.4858,
      "step": 290
    },
    {
      "epoch": 2.680089485458613,
      "grad_norm": 25.275537490844727,
      "learning_rate": 9.924603174603175e-06,
      "loss": 0.497,
      "step": 300
    },
    {
      "epoch": 2.7695749440715884,
      "grad_norm": 23.899168014526367,
      "learning_rate": 9.884920634920636e-06,
      "loss": 0.4843,
      "step": 310
    },
    {
      "epoch": 2.859060402684564,
      "grad_norm": 21.87393569946289,
      "learning_rate": 9.845238095238097e-06,
      "loss": 0.5006,
      "step": 320
    },
    {
      "epoch": 2.9485458612975393,
      "grad_norm": 27.74087905883789,
      "learning_rate": 9.805555555555556e-06,
      "loss": 0.4471,
      "step": 330
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.5459941029548645,
      "eval_macro_f1": 0.726109493936894,
      "eval_runtime": 121.5217,
      "eval_samples_per_second": 6.295,
      "eval_steps_per_second": 0.79,
      "step": 336
    },
    {
      "epoch": 3.0357941834451903,
      "grad_norm": 41.323997497558594,
      "learning_rate": 9.765873015873017e-06,
      "loss": 0.5304,
      "step": 340
    },
    {
      "epoch": 3.1252796420581657,
      "grad_norm": 31.197467803955078,
      "learning_rate": 9.726190476190477e-06,
      "loss": 0.4863,
      "step": 350
    },
    {
      "epoch": 3.214765100671141,
      "grad_norm": 15.407756805419922,
      "learning_rate": 9.686507936507938e-06,
      "loss": 0.4116,
      "step": 360
    },
    {
      "epoch": 3.3042505592841165,
      "grad_norm": 19.091278076171875,
      "learning_rate": 9.646825396825397e-06,
      "loss": 0.4092,
      "step": 370
    },
    {
      "epoch": 3.393736017897092,
      "grad_norm": 42.068511962890625,
      "learning_rate": 9.607142857142858e-06,
      "loss": 0.4627,
      "step": 380
    },
    {
      "epoch": 3.4832214765100673,
      "grad_norm": 26.687232971191406,
      "learning_rate": 9.567460317460319e-06,
      "loss": 0.4487,
      "step": 390
    },
    {
      "epoch": 3.5727069351230423,
      "grad_norm": 22.164098739624023,
      "learning_rate": 9.527777777777778e-06,
      "loss": 0.48,
      "step": 400
    },
    {
      "epoch": 3.662192393736018,
      "grad_norm": 22.220373153686523,
      "learning_rate": 9.488095238095238e-06,
      "loss": 0.53,
      "step": 410
    },
    {
      "epoch": 3.751677852348993,
      "grad_norm": 8.822561264038086,
      "learning_rate": 9.4484126984127e-06,
      "loss": 0.4999,
      "step": 420
    },
    {
      "epoch": 3.841163310961969,
      "grad_norm": 11.927675247192383,
      "learning_rate": 9.40873015873016e-06,
      "loss": 0.4642,
      "step": 430
    },
    {
      "epoch": 3.930648769574944,
      "grad_norm": 14.933186531066895,
      "learning_rate": 9.36904761904762e-06,
      "loss": 0.4145,
      "step": 440
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.5586118102073669,
      "eval_macro_f1": 0.741893986276926,
      "eval_runtime": 120.2217,
      "eval_samples_per_second": 6.363,
      "eval_steps_per_second": 0.799,
      "step": 448
    },
    {
      "epoch": 4.017897091722595,
      "grad_norm": 23.335527420043945,
      "learning_rate": 9.32936507936508e-06,
      "loss": 0.4222,
      "step": 450
    },
    {
      "epoch": 4.10738255033557,
      "grad_norm": 28.71408462524414,
      "learning_rate": 9.28968253968254e-06,
      "loss": 0.3959,
      "step": 460
    },
    {
      "epoch": 4.196868008948546,
      "grad_norm": 20.4088077545166,
      "learning_rate": 9.250000000000001e-06,
      "loss": 0.3423,
      "step": 470
    },
    {
      "epoch": 4.286353467561521,
      "grad_norm": 39.36516189575195,
      "learning_rate": 9.21031746031746e-06,
      "loss": 0.3851,
      "step": 480
    },
    {
      "epoch": 4.375838926174497,
      "grad_norm": 32.29376983642578,
      "learning_rate": 9.170634920634921e-06,
      "loss": 0.3766,
      "step": 490
    },
    {
      "epoch": 4.465324384787472,
      "grad_norm": 13.637434959411621,
      "learning_rate": 9.130952380952382e-06,
      "loss": 0.3328,
      "step": 500
    },
    {
      "epoch": 4.554809843400448,
      "grad_norm": 23.359638214111328,
      "learning_rate": 9.091269841269843e-06,
      "loss": 0.3988,
      "step": 510
    },
    {
      "epoch": 4.644295302013423,
      "grad_norm": 25.244564056396484,
      "learning_rate": 9.051587301587302e-06,
      "loss": 0.3149,
      "step": 520
    },
    {
      "epoch": 4.733780760626399,
      "grad_norm": 273.609619140625,
      "learning_rate": 9.011904761904762e-06,
      "loss": 0.3685,
      "step": 530
    },
    {
      "epoch": 4.823266219239374,
      "grad_norm": 18.818504333496094,
      "learning_rate": 8.972222222222223e-06,
      "loss": 0.2867,
      "step": 540
    },
    {
      "epoch": 4.912751677852349,
      "grad_norm": 19.458040237426758,
      "learning_rate": 8.932539682539684e-06,
      "loss": 0.3952,
      "step": 550
    },
    {
      "epoch": 5.0,
      "grad_norm": 11.257533073425293,
      "learning_rate": 8.892857142857143e-06,
      "loss": 0.3553,
      "step": 560
    },
    {
      "epoch": 5.0,
      "eval_loss": 0.5237393379211426,
      "eval_macro_f1": 0.7685140098500236,
      "eval_runtime": 124.8114,
      "eval_samples_per_second": 6.129,
      "eval_steps_per_second": 0.769,
      "step": 560
    },
    {
      "epoch": 5.089485458612975,
      "grad_norm": 16.261404037475586,
      "learning_rate": 8.853174603174604e-06,
      "loss": 0.3108,
      "step": 570
    },
    {
      "epoch": 5.178970917225951,
      "grad_norm": 16.059083938598633,
      "learning_rate": 8.813492063492064e-06,
      "loss": 0.2986,
      "step": 580
    },
    {
      "epoch": 5.268456375838926,
      "grad_norm": 22.006534576416016,
      "learning_rate": 8.773809523809525e-06,
      "loss": 0.2952,
      "step": 590
    },
    {
      "epoch": 5.357941834451902,
      "grad_norm": 16.75338363647461,
      "learning_rate": 8.734126984126984e-06,
      "loss": 0.2512,
      "step": 600
    },
    {
      "epoch": 5.447427293064877,
      "grad_norm": 36.52522659301758,
      "learning_rate": 8.694444444444445e-06,
      "loss": 0.2308,
      "step": 610
    },
    {
      "epoch": 5.5369127516778525,
      "grad_norm": 14.535757064819336,
      "learning_rate": 8.654761904761906e-06,
      "loss": 0.3012,
      "step": 620
    },
    {
      "epoch": 5.626398210290827,
      "grad_norm": 22.867900848388672,
      "learning_rate": 8.615079365079366e-06,
      "loss": 0.3232,
      "step": 630
    },
    {
      "epoch": 5.715883668903803,
      "grad_norm": 19.417451858520508,
      "learning_rate": 8.575396825396826e-06,
      "loss": 0.3173,
      "step": 640
    },
    {
      "epoch": 5.805369127516778,
      "grad_norm": 21.25806427001953,
      "learning_rate": 8.535714285714286e-06,
      "loss": 0.3376,
      "step": 650
    },
    {
      "epoch": 5.894854586129754,
      "grad_norm": 11.842672348022461,
      "learning_rate": 8.496031746031747e-06,
      "loss": 0.3756,
      "step": 660
    },
    {
      "epoch": 5.984340044742729,
      "grad_norm": 16.4525203704834,
      "learning_rate": 8.456349206349208e-06,
      "loss": 0.2923,
      "step": 670
    },
    {
      "epoch": 6.0,
      "eval_loss": 0.5606415271759033,
      "eval_macro_f1": 0.7435031036046287,
      "eval_runtime": 115.5398,
      "eval_samples_per_second": 6.621,
      "eval_steps_per_second": 0.831,
      "step": 672
    },
    {
      "epoch": 6.0715883668903805,
      "grad_norm": 20.93574333190918,
      "learning_rate": 8.416666666666667e-06,
      "loss": 0.2473,
      "step": 680
    },
    {
      "epoch": 6.1610738255033555,
      "grad_norm": 26.189205169677734,
      "learning_rate": 8.376984126984128e-06,
      "loss": 0.2403,
      "step": 690
    },
    {
      "epoch": 6.250559284116331,
      "grad_norm": 13.84333610534668,
      "learning_rate": 8.337301587301588e-06,
      "loss": 0.231,
      "step": 700
    },
    {
      "epoch": 6.340044742729306,
      "grad_norm": 22.295377731323242,
      "learning_rate": 8.297619047619049e-06,
      "loss": 0.2416,
      "step": 710
    },
    {
      "epoch": 6.429530201342282,
      "grad_norm": 14.893708229064941,
      "learning_rate": 8.257936507936508e-06,
      "loss": 0.2409,
      "step": 720
    },
    {
      "epoch": 6.519015659955257,
      "grad_norm": 14.828768730163574,
      "learning_rate": 8.218253968253969e-06,
      "loss": 0.1927,
      "step": 730
    },
    {
      "epoch": 6.608501118568233,
      "grad_norm": 23.54037094116211,
      "learning_rate": 8.17857142857143e-06,
      "loss": 0.2373,
      "step": 740
    },
    {
      "epoch": 6.697986577181208,
      "grad_norm": 20.361324310302734,
      "learning_rate": 8.138888888888889e-06,
      "loss": 0.2266,
      "step": 750
    },
    {
      "epoch": 6.787472035794184,
      "grad_norm": 32.86820602416992,
      "learning_rate": 8.09920634920635e-06,
      "loss": 0.3065,
      "step": 760
    },
    {
      "epoch": 6.876957494407159,
      "grad_norm": 25.08152198791504,
      "learning_rate": 8.05952380952381e-06,
      "loss": 0.307,
      "step": 770
    },
    {
      "epoch": 6.966442953020135,
      "grad_norm": 21.257457733154297,
      "learning_rate": 8.019841269841271e-06,
      "loss": 0.2943,
      "step": 780
    },
    {
      "epoch": 7.0,
      "eval_loss": 0.5504983067512512,
      "eval_macro_f1": 0.7728941735285386,
      "eval_runtime": 119.5498,
      "eval_samples_per_second": 6.399,
      "eval_steps_per_second": 0.803,
      "step": 784
    },
    {
      "epoch": 7.053691275167785,
      "grad_norm": 13.639359474182129,
      "learning_rate": 7.980158730158732e-06,
      "loss": 0.2103,
      "step": 790
    },
    {
      "epoch": 7.143176733780761,
      "grad_norm": 27.568639755249023,
      "learning_rate": 7.94047619047619e-06,
      "loss": 0.1676,
      "step": 800
    },
    {
      "epoch": 7.232662192393736,
      "grad_norm": 17.152692794799805,
      "learning_rate": 7.900793650793652e-06,
      "loss": 0.1818,
      "step": 810
    },
    {
      "epoch": 7.322147651006711,
      "grad_norm": 16.47798728942871,
      "learning_rate": 7.861111111111112e-06,
      "loss": 0.2058,
      "step": 820
    },
    {
      "epoch": 7.411633109619687,
      "grad_norm": 17.176942825317383,
      "learning_rate": 7.821428571428571e-06,
      "loss": 0.1508,
      "step": 830
    },
    {
      "epoch": 7.501118568232663,
      "grad_norm": 29.886573791503906,
      "learning_rate": 7.781746031746032e-06,
      "loss": 0.2434,
      "step": 840
    },
    {
      "epoch": 7.590604026845638,
      "grad_norm": 22.851221084594727,
      "learning_rate": 7.742063492063493e-06,
      "loss": 0.2431,
      "step": 850
    },
    {
      "epoch": 7.680089485458613,
      "grad_norm": 9.74971866607666,
      "learning_rate": 7.702380952380954e-06,
      "loss": 0.1905,
      "step": 860
    },
    {
      "epoch": 7.769574944071588,
      "grad_norm": 22.99750328063965,
      "learning_rate": 7.662698412698414e-06,
      "loss": 0.2215,
      "step": 870
    },
    {
      "epoch": 7.859060402684563,
      "grad_norm": 29.295093536376953,
      "learning_rate": 7.623015873015873e-06,
      "loss": 0.2222,
      "step": 880
    },
    {
      "epoch": 7.948545861297539,
      "grad_norm": 13.15281867980957,
      "learning_rate": 7.583333333333333e-06,
      "loss": 0.2172,
      "step": 890
    },
    {
      "epoch": 8.0,
      "eval_loss": 0.5693853497505188,
      "eval_macro_f1": 0.7617472396955967,
      "eval_runtime": 71.7674,
      "eval_samples_per_second": 10.659,
      "eval_steps_per_second": 1.338,
      "step": 896
    },
    {
      "epoch": 8.03579418344519,
      "grad_norm": 17.01239585876465,
      "learning_rate": 7.543650793650794e-06,
      "loss": 0.2183,
      "step": 900
    },
    {
      "epoch": 8.125279642058166,
      "grad_norm": 16.31254768371582,
      "learning_rate": 7.503968253968255e-06,
      "loss": 0.1393,
      "step": 910
    },
    {
      "epoch": 8.21476510067114,
      "grad_norm": 21.926393508911133,
      "learning_rate": 7.464285714285715e-06,
      "loss": 0.2104,
      "step": 920
    },
    {
      "epoch": 8.304250559284116,
      "grad_norm": 16.77183723449707,
      "learning_rate": 7.4246031746031754e-06,
      "loss": 0.1808,
      "step": 930
    },
    {
      "epoch": 8.393736017897092,
      "grad_norm": 10.125628471374512,
      "learning_rate": 7.384920634920636e-06,
      "loss": 0.142,
      "step": 940
    },
    {
      "epoch": 8.483221476510067,
      "grad_norm": 8.853920936584473,
      "learning_rate": 7.345238095238096e-06,
      "loss": 0.1674,
      "step": 950
    },
    {
      "epoch": 8.572706935123042,
      "grad_norm": 16.407033920288086,
      "learning_rate": 7.305555555555556e-06,
      "loss": 0.1684,
      "step": 960
    },
    {
      "epoch": 8.662192393736017,
      "grad_norm": 19.892669677734375,
      "learning_rate": 7.265873015873016e-06,
      "loss": 0.1673,
      "step": 970
    },
    {
      "epoch": 8.751677852348994,
      "grad_norm": 5.113985061645508,
      "learning_rate": 7.226190476190477e-06,
      "loss": 0.2087,
      "step": 980
    },
    {
      "epoch": 8.841163310961969,
      "grad_norm": 10.283279418945312,
      "learning_rate": 7.186507936507937e-06,
      "loss": 0.1728,
      "step": 990
    },
    {
      "epoch": 8.930648769574944,
      "grad_norm": 11.258045196533203,
      "learning_rate": 7.146825396825397e-06,
      "loss": 0.2067,
      "step": 1000
    },
    {
      "epoch": 9.0,
      "eval_loss": 0.6666957139968872,
      "eval_macro_f1": 0.7585227272727273,
      "eval_runtime": 75.2405,
      "eval_samples_per_second": 10.167,
      "eval_steps_per_second": 1.276,
      "step": 1008
    },
    {
      "epoch": 9.017897091722595,
      "grad_norm": 31.886018753051758,
      "learning_rate": 7.107142857142858e-06,
      "loss": 0.1853,
      "step": 1010
    },
    {
      "epoch": 9.10738255033557,
      "grad_norm": 21.961132049560547,
      "learning_rate": 7.067460317460319e-06,
      "loss": 0.1823,
      "step": 1020
    },
    {
      "epoch": 9.196868008948545,
      "grad_norm": 9.536689758300781,
      "learning_rate": 7.027777777777778e-06,
      "loss": 0.1289,
      "step": 1030
    },
    {
      "epoch": 9.286353467561522,
      "grad_norm": 17.52619171142578,
      "learning_rate": 6.988095238095239e-06,
      "loss": 0.1421,
      "step": 1040
    },
    {
      "epoch": 9.375838926174497,
      "grad_norm": 5.5908379554748535,
      "learning_rate": 6.9484126984126985e-06,
      "loss": 0.1024,
      "step": 1050
    },
    {
      "epoch": 9.465324384787472,
      "grad_norm": 15.923222541809082,
      "learning_rate": 6.908730158730159e-06,
      "loss": 0.1363,
      "step": 1060
    },
    {
      "epoch": 9.554809843400447,
      "grad_norm": 7.426005840301514,
      "learning_rate": 6.86904761904762e-06,
      "loss": 0.1851,
      "step": 1070
    },
    {
      "epoch": 9.644295302013422,
      "grad_norm": 2.362064838409424,
      "learning_rate": 6.82936507936508e-06,
      "loss": 0.1177,
      "step": 1080
    },
    {
      "epoch": 9.733780760626399,
      "grad_norm": 16.235544204711914,
      "learning_rate": 6.789682539682541e-06,
      "loss": 0.0991,
      "step": 1090
    },
    {
      "epoch": 9.823266219239374,
      "grad_norm": 8.337503433227539,
      "learning_rate": 6.750000000000001e-06,
      "loss": 0.13,
      "step": 1100
    },
    {
      "epoch": 9.912751677852349,
      "grad_norm": 34.09331130981445,
      "learning_rate": 6.7103174603174605e-06,
      "loss": 0.1171,
      "step": 1110
    },
    {
      "epoch": 10.0,
      "grad_norm": 25.452791213989258,
      "learning_rate": 6.67063492063492e-06,
      "loss": 0.1817,
      "step": 1120
    },
    {
      "epoch": 10.0,
      "eval_loss": 0.8301987051963806,
      "eval_macro_f1": 0.7432961635470596,
      "eval_runtime": 71.6774,
      "eval_samples_per_second": 10.673,
      "eval_steps_per_second": 1.339,
      "step": 1120
    },
    {
      "epoch": 10.089485458612975,
      "grad_norm": 19.931293487548828,
      "learning_rate": 6.630952380952381e-06,
      "loss": 0.1405,
      "step": 1130
    },
    {
      "epoch": 10.17897091722595,
      "grad_norm": 20.297443389892578,
      "learning_rate": 6.591269841269842e-06,
      "loss": 0.1359,
      "step": 1140
    },
    {
      "epoch": 10.268456375838927,
      "grad_norm": 11.013289451599121,
      "learning_rate": 6.551587301587302e-06,
      "loss": 0.0918,
      "step": 1150
    },
    {
      "epoch": 10.357941834451902,
      "grad_norm": 19.394319534301758,
      "learning_rate": 6.5119047619047626e-06,
      "loss": 0.1392,
      "step": 1160
    },
    {
      "epoch": 10.447427293064877,
      "grad_norm": 28.343791961669922,
      "learning_rate": 6.472222222222223e-06,
      "loss": 0.1866,
      "step": 1170
    },
    {
      "epoch": 10.536912751677852,
      "grad_norm": 14.38354778289795,
      "learning_rate": 6.432539682539683e-06,
      "loss": 0.1381,
      "step": 1180
    },
    {
      "epoch": 10.626398210290828,
      "grad_norm": 24.752470016479492,
      "learning_rate": 6.392857142857143e-06,
      "loss": 0.202,
      "step": 1190
    },
    {
      "epoch": 10.715883668903803,
      "grad_norm": 19.56192970275879,
      "learning_rate": 6.353174603174603e-06,
      "loss": 0.1566,
      "step": 1200
    },
    {
      "epoch": 10.805369127516778,
      "grad_norm": 20.234485626220703,
      "learning_rate": 6.313492063492064e-06,
      "loss": 0.1061,
      "step": 1210
    },
    {
      "epoch": 10.894854586129753,
      "grad_norm": 19.531757354736328,
      "learning_rate": 6.2738095238095245e-06,
      "loss": 0.1873,
      "step": 1220
    },
    {
      "epoch": 10.98434004474273,
      "grad_norm": 16.655208587646484,
      "learning_rate": 6.2341269841269844e-06,
      "loss": 0.1102,
      "step": 1230
    },
    {
      "epoch": 11.0,
      "eval_loss": 0.6939279437065125,
      "eval_macro_f1": 0.769257323981233,
      "eval_runtime": 73.198,
      "eval_samples_per_second": 10.451,
      "eval_steps_per_second": 1.312,
      "step": 1232
    },
    {
      "epoch": 11.07158836689038,
      "grad_norm": 5.695398330688477,
      "learning_rate": 6.194444444444445e-06,
      "loss": 0.1283,
      "step": 1240
    },
    {
      "epoch": 11.161073825503356,
      "grad_norm": 9.851438522338867,
      "learning_rate": 6.154761904761906e-06,
      "loss": 0.1065,
      "step": 1250
    },
    {
      "epoch": 11.250559284116331,
      "grad_norm": 29.838871002197266,
      "learning_rate": 6.115079365079366e-06,
      "loss": 0.1426,
      "step": 1260
    },
    {
      "epoch": 11.340044742729306,
      "grad_norm": 7.144505500793457,
      "learning_rate": 6.075396825396826e-06,
      "loss": 0.093,
      "step": 1270
    },
    {
      "epoch": 11.429530201342281,
      "grad_norm": 15.980908393859863,
      "learning_rate": 6.035714285714286e-06,
      "loss": 0.1508,
      "step": 1280
    },
    {
      "epoch": 11.519015659955258,
      "grad_norm": 23.028108596801758,
      "learning_rate": 5.996031746031746e-06,
      "loss": 0.1719,
      "step": 1290
    },
    {
      "epoch": 11.608501118568233,
      "grad_norm": 14.000625610351562,
      "learning_rate": 5.956349206349207e-06,
      "loss": 0.1063,
      "step": 1300
    },
    {
      "epoch": 11.697986577181208,
      "grad_norm": 13.606029510498047,
      "learning_rate": 5.916666666666667e-06,
      "loss": 0.1194,
      "step": 1310
    },
    {
      "epoch": 11.787472035794183,
      "grad_norm": 13.779529571533203,
      "learning_rate": 5.876984126984128e-06,
      "loss": 0.1728,
      "step": 1320
    },
    {
      "epoch": 11.87695749440716,
      "grad_norm": 10.599024772644043,
      "learning_rate": 5.8373015873015886e-06,
      "loss": 0.0797,
      "step": 1330
    },
    {
      "epoch": 11.966442953020135,
      "grad_norm": 11.666370391845703,
      "learning_rate": 5.7976190476190485e-06,
      "loss": 0.1175,
      "step": 1340
    },
    {
      "epoch": 12.0,
      "eval_loss": 0.6560544967651367,
      "eval_macro_f1": 0.7722084367245657,
      "eval_runtime": 65.1441,
      "eval_samples_per_second": 11.743,
      "eval_steps_per_second": 1.474,
      "step": 1344
    },
    {
      "epoch": 12.053691275167786,
      "grad_norm": 12.950461387634277,
      "learning_rate": 5.7579365079365075e-06,
      "loss": 0.1042,
      "step": 1350
    },
    {
      "epoch": 12.143176733780761,
      "grad_norm": 9.609066009521484,
      "learning_rate": 5.718253968253968e-06,
      "loss": 0.12,
      "step": 1360
    },
    {
      "epoch": 12.232662192393736,
      "grad_norm": 23.64732551574707,
      "learning_rate": 5.678571428571429e-06,
      "loss": 0.1145,
      "step": 1370
    },
    {
      "epoch": 12.322147651006711,
      "grad_norm": 32.57529067993164,
      "learning_rate": 5.638888888888889e-06,
      "loss": 0.0992,
      "step": 1380
    },
    {
      "epoch": 12.411633109619686,
      "grad_norm": 7.419304370880127,
      "learning_rate": 5.59920634920635e-06,
      "loss": 0.0988,
      "step": 1390
    },
    {
      "epoch": 12.501118568232663,
      "grad_norm": 6.912314414978027,
      "learning_rate": 5.5595238095238104e-06,
      "loss": 0.1157,
      "step": 1400
    },
    {
      "epoch": 12.590604026845638,
      "grad_norm": 19.71913719177246,
      "learning_rate": 5.51984126984127e-06,
      "loss": 0.1478,
      "step": 1410
    },
    {
      "epoch": 12.680089485458613,
      "grad_norm": 20.64476203918457,
      "learning_rate": 5.480158730158731e-06,
      "loss": 0.1196,
      "step": 1420
    },
    {
      "epoch": 12.769574944071588,
      "grad_norm": 15.437020301818848,
      "learning_rate": 5.44047619047619e-06,
      "loss": 0.0766,
      "step": 1430
    },
    {
      "epoch": 12.859060402684564,
      "grad_norm": 9.825302124023438,
      "learning_rate": 5.400793650793651e-06,
      "loss": 0.1254,
      "step": 1440
    },
    {
      "epoch": 12.94854586129754,
      "grad_norm": 26.381696701049805,
      "learning_rate": 5.361111111111112e-06,
      "loss": 0.1025,
      "step": 1450
    },
    {
      "epoch": 13.0,
      "eval_loss": 0.7549577951431274,
      "eval_macro_f1": 0.7658037701779237,
      "eval_runtime": 75.4344,
      "eval_samples_per_second": 10.141,
      "eval_steps_per_second": 1.273,
      "step": 1456
    },
    {
      "epoch": 13.03579418344519,
      "grad_norm": 25.113908767700195,
      "learning_rate": 5.3214285714285715e-06,
      "loss": 0.1438,
      "step": 1460
    },
    {
      "epoch": 13.125279642058166,
      "grad_norm": 10.833968162536621,
      "learning_rate": 5.281746031746032e-06,
      "loss": 0.1749,
      "step": 1470
    },
    {
      "epoch": 13.21476510067114,
      "grad_norm": 22.27955436706543,
      "learning_rate": 5.242063492063493e-06,
      "loss": 0.1608,
      "step": 1480
    },
    {
      "epoch": 13.304250559284116,
      "grad_norm": 15.125386238098145,
      "learning_rate": 5.202380952380953e-06,
      "loss": 0.1503,
      "step": 1490
    },
    {
      "epoch": 13.393736017897092,
      "grad_norm": 3.4376182556152344,
      "learning_rate": 5.162698412698414e-06,
      "loss": 0.0819,
      "step": 1500
    },
    {
      "epoch": 13.483221476510067,
      "grad_norm": 15.98349380493164,
      "learning_rate": 5.123015873015873e-06,
      "loss": 0.0837,
      "step": 1510
    },
    {
      "epoch": 13.572706935123042,
      "grad_norm": 11.065319061279297,
      "learning_rate": 5.0833333333333335e-06,
      "loss": 0.1035,
      "step": 1520
    },
    {
      "epoch": 13.662192393736017,
      "grad_norm": 10.306619644165039,
      "learning_rate": 5.043650793650794e-06,
      "loss": 0.1075,
      "step": 1530
    },
    {
      "epoch": 13.751677852348994,
      "grad_norm": 15.42297077178955,
      "learning_rate": 5.003968253968254e-06,
      "loss": 0.1208,
      "step": 1540
    },
    {
      "epoch": 13.841163310961969,
      "grad_norm": 20.406225204467773,
      "learning_rate": 4.964285714285715e-06,
      "loss": 0.1014,
      "step": 1550
    },
    {
      "epoch": 13.930648769574944,
      "grad_norm": 16.1427059173584,
      "learning_rate": 4.924603174603176e-06,
      "loss": 0.1593,
      "step": 1560
    },
    {
      "epoch": 14.0,
      "eval_loss": 0.7316491007804871,
      "eval_macro_f1": 0.7694998475634216,
      "eval_runtime": 69.4982,
      "eval_samples_per_second": 11.007,
      "eval_steps_per_second": 1.381,
      "step": 1568
    },
    {
      "epoch": 14.017897091722595,
      "grad_norm": 9.524744987487793,
      "learning_rate": 4.8849206349206356e-06,
      "loss": 0.1199,
      "step": 1570
    },
    {
      "epoch": 14.10738255033557,
      "grad_norm": 14.937419891357422,
      "learning_rate": 4.8452380952380955e-06,
      "loss": 0.1189,
      "step": 1580
    },
    {
      "epoch": 14.196868008948545,
      "grad_norm": 19.528654098510742,
      "learning_rate": 4.805555555555556e-06,
      "loss": 0.1114,
      "step": 1590
    },
    {
      "epoch": 14.286353467561522,
      "grad_norm": 2.1810483932495117,
      "learning_rate": 4.765873015873016e-06,
      "loss": 0.0805,
      "step": 1600
    },
    {
      "epoch": 14.375838926174497,
      "grad_norm": 5.020853042602539,
      "learning_rate": 4.726190476190476e-06,
      "loss": 0.1097,
      "step": 1610
    },
    {
      "epoch": 14.465324384787472,
      "grad_norm": 12.4754638671875,
      "learning_rate": 4.686507936507937e-06,
      "loss": 0.1231,
      "step": 1620
    },
    {
      "epoch": 14.554809843400447,
      "grad_norm": 25.44110679626465,
      "learning_rate": 4.6468253968253975e-06,
      "loss": 0.0903,
      "step": 1630
    },
    {
      "epoch": 14.644295302013422,
      "grad_norm": 27.849111557006836,
      "learning_rate": 4.6071428571428574e-06,
      "loss": 0.0905,
      "step": 1640
    },
    {
      "epoch": 14.733780760626399,
      "grad_norm": 18.283781051635742,
      "learning_rate": 4.567460317460317e-06,
      "loss": 0.0617,
      "step": 1650
    },
    {
      "epoch": 14.823266219239374,
      "grad_norm": 17.507619857788086,
      "learning_rate": 4.527777777777778e-06,
      "loss": 0.13,
      "step": 1660
    },
    {
      "epoch": 14.912751677852349,
      "grad_norm": 21.756675720214844,
      "learning_rate": 4.488095238095239e-06,
      "loss": 0.0926,
      "step": 1670
    },
    {
      "epoch": 15.0,
      "grad_norm": 19.568700790405273,
      "learning_rate": 4.448412698412699e-06,
      "loss": 0.0954,
      "step": 1680
    },
    {
      "epoch": 15.0,
      "eval_loss": 0.6716505885124207,
      "eval_macro_f1": 0.7774340887550624,
      "eval_runtime": 72.4554,
      "eval_samples_per_second": 10.558,
      "eval_steps_per_second": 1.325,
      "step": 1680
    },
    {
      "epoch": 15.089485458612975,
      "grad_norm": 18.1299991607666,
      "learning_rate": 4.408730158730159e-06,
      "loss": 0.0678,
      "step": 1690
    },
    {
      "epoch": 15.17897091722595,
      "grad_norm": 4.439563751220703,
      "learning_rate": 4.369047619047619e-06,
      "loss": 0.0767,
      "step": 1700
    },
    {
      "epoch": 15.268456375838927,
      "grad_norm": 14.997693061828613,
      "learning_rate": 4.32936507936508e-06,
      "loss": 0.0664,
      "step": 1710
    },
    {
      "epoch": 15.357941834451902,
      "grad_norm": 7.552863597869873,
      "learning_rate": 4.28968253968254e-06,
      "loss": 0.0842,
      "step": 1720
    },
    {
      "epoch": 15.447427293064877,
      "grad_norm": 8.728134155273438,
      "learning_rate": 4.25e-06,
      "loss": 0.1015,
      "step": 1730
    },
    {
      "epoch": 15.536912751677852,
      "grad_norm": 11.534658432006836,
      "learning_rate": 4.210317460317461e-06,
      "loss": 0.1085,
      "step": 1740
    },
    {
      "epoch": 15.626398210290828,
      "grad_norm": 22.487648010253906,
      "learning_rate": 4.1706349206349215e-06,
      "loss": 0.0993,
      "step": 1750
    },
    {
      "epoch": 15.715883668903803,
      "grad_norm": 4.885320663452148,
      "learning_rate": 4.130952380952381e-06,
      "loss": 0.093,
      "step": 1760
    },
    {
      "epoch": 15.805369127516778,
      "grad_norm": 16.688884735107422,
      "learning_rate": 4.091269841269841e-06,
      "loss": 0.0732,
      "step": 1770
    },
    {
      "epoch": 15.894854586129753,
      "grad_norm": 30.58871841430664,
      "learning_rate": 4.051587301587302e-06,
      "loss": 0.0867,
      "step": 1780
    },
    {
      "epoch": 15.98434004474273,
      "grad_norm": 17.19178009033203,
      "learning_rate": 4.011904761904763e-06,
      "loss": 0.0676,
      "step": 1790
    },
    {
      "epoch": 16.0,
      "eval_loss": 0.7994140386581421,
      "eval_macro_f1": 0.7741348066298342,
      "eval_runtime": 75.3551,
      "eval_samples_per_second": 10.152,
      "eval_steps_per_second": 1.274,
      "step": 1792
    },
    {
      "epoch": 16.07158836689038,
      "grad_norm": 10.005722999572754,
      "learning_rate": 3.972222222222223e-06,
      "loss": 0.0858,
      "step": 1800
    },
    {
      "epoch": 16.161073825503355,
      "grad_norm": 20.64703369140625,
      "learning_rate": 3.932539682539683e-06,
      "loss": 0.0836,
      "step": 1810
    },
    {
      "epoch": 16.25055928411633,
      "grad_norm": 8.218932151794434,
      "learning_rate": 3.892857142857143e-06,
      "loss": 0.0969,
      "step": 1820
    },
    {
      "epoch": 16.340044742729308,
      "grad_norm": 0.8766313195228577,
      "learning_rate": 3.853174603174604e-06,
      "loss": 0.0432,
      "step": 1830
    },
    {
      "epoch": 16.42953020134228,
      "grad_norm": 12.166719436645508,
      "learning_rate": 3.8134920634920636e-06,
      "loss": 0.0844,
      "step": 1840
    },
    {
      "epoch": 16.519015659955258,
      "grad_norm": 8.594962120056152,
      "learning_rate": 3.773809523809524e-06,
      "loss": 0.1276,
      "step": 1850
    },
    {
      "epoch": 16.60850111856823,
      "grad_norm": 2.2263548374176025,
      "learning_rate": 3.7341269841269846e-06,
      "loss": 0.0661,
      "step": 1860
    },
    {
      "epoch": 16.697986577181208,
      "grad_norm": 6.170251846313477,
      "learning_rate": 3.694444444444445e-06,
      "loss": 0.1007,
      "step": 1870
    },
    {
      "epoch": 16.787472035794185,
      "grad_norm": 8.359641075134277,
      "learning_rate": 3.654761904761905e-06,
      "loss": 0.0993,
      "step": 1880
    },
    {
      "epoch": 16.876957494407158,
      "grad_norm": 12.583647727966309,
      "learning_rate": 3.615079365079365e-06,
      "loss": 0.0826,
      "step": 1890
    },
    {
      "epoch": 16.966442953020135,
      "grad_norm": 6.046731948852539,
      "learning_rate": 3.575396825396826e-06,
      "loss": 0.0444,
      "step": 1900
    },
    {
      "epoch": 17.0,
      "eval_loss": 0.747003436088562,
      "eval_macro_f1": 0.7839196689592837,
      "eval_runtime": 75.1906,
      "eval_samples_per_second": 10.174,
      "eval_steps_per_second": 1.277,
      "step": 1904
    },
    {
      "epoch": 17.053691275167786,
      "grad_norm": 20.228485107421875,
      "learning_rate": 3.5357142857142863e-06,
      "loss": 0.0961,
      "step": 1910
    },
    {
      "epoch": 17.14317673378076,
      "grad_norm": 26.031396865844727,
      "learning_rate": 3.496031746031746e-06,
      "loss": 0.0873,
      "step": 1920
    },
    {
      "epoch": 17.232662192393736,
      "grad_norm": 12.642768859863281,
      "learning_rate": 3.4563492063492065e-06,
      "loss": 0.1064,
      "step": 1930
    },
    {
      "epoch": 17.322147651006713,
      "grad_norm": 18.097814559936523,
      "learning_rate": 3.416666666666667e-06,
      "loss": 0.0768,
      "step": 1940
    },
    {
      "epoch": 17.411633109619686,
      "grad_norm": 10.280755043029785,
      "learning_rate": 3.3769841269841276e-06,
      "loss": 0.1336,
      "step": 1950
    },
    {
      "epoch": 17.501118568232663,
      "grad_norm": 11.690203666687012,
      "learning_rate": 3.3373015873015875e-06,
      "loss": 0.0707,
      "step": 1960
    },
    {
      "epoch": 17.59060402684564,
      "grad_norm": 22.514507293701172,
      "learning_rate": 3.297619047619048e-06,
      "loss": 0.1073,
      "step": 1970
    },
    {
      "epoch": 17.680089485458613,
      "grad_norm": 10.396025657653809,
      "learning_rate": 3.257936507936508e-06,
      "loss": 0.0543,
      "step": 1980
    },
    {
      "epoch": 17.76957494407159,
      "grad_norm": 12.947179794311523,
      "learning_rate": 3.218253968253969e-06,
      "loss": 0.0768,
      "step": 1990
    },
    {
      "epoch": 17.859060402684563,
      "grad_norm": 2.5873477458953857,
      "learning_rate": 3.178571428571429e-06,
      "loss": 0.0766,
      "step": 2000
    },
    {
      "epoch": 17.94854586129754,
      "grad_norm": 6.163917064666748,
      "learning_rate": 3.138888888888889e-06,
      "loss": 0.0455,
      "step": 2010
    },
    {
      "epoch": 18.0,
      "eval_loss": 0.8047569990158081,
      "eval_macro_f1": 0.7781014378358804,
      "eval_runtime": 66.3878,
      "eval_samples_per_second": 11.523,
      "eval_steps_per_second": 1.446,
      "step": 2016
    },
    {
      "epoch": 18.03579418344519,
      "grad_norm": 4.654578685760498,
      "learning_rate": 3.0992063492063495e-06,
      "loss": 0.0684,
      "step": 2020
    },
    {
      "epoch": 18.125279642058164,
      "grad_norm": 1.1822065114974976,
      "learning_rate": 3.05952380952381e-06,
      "loss": 0.0961,
      "step": 2030
    },
    {
      "epoch": 18.21476510067114,
      "grad_norm": 21.223234176635742,
      "learning_rate": 3.0198412698412697e-06,
      "loss": 0.1265,
      "step": 2040
    },
    {
      "epoch": 18.304250559284117,
      "grad_norm": 12.335346221923828,
      "learning_rate": 2.9801587301587305e-06,
      "loss": 0.1095,
      "step": 2050
    },
    {
      "epoch": 18.39373601789709,
      "grad_norm": 4.7988715171813965,
      "learning_rate": 2.9404761904761908e-06,
      "loss": 0.0944,
      "step": 2060
    },
    {
      "epoch": 18.483221476510067,
      "grad_norm": 28.275365829467773,
      "learning_rate": 2.900793650793651e-06,
      "loss": 0.0549,
      "step": 2070
    },
    {
      "epoch": 18.572706935123044,
      "grad_norm": 7.988637447357178,
      "learning_rate": 2.861111111111111e-06,
      "loss": 0.0881,
      "step": 2080
    },
    {
      "epoch": 18.662192393736017,
      "grad_norm": 9.342594146728516,
      "learning_rate": 2.8214285714285718e-06,
      "loss": 0.0528,
      "step": 2090
    },
    {
      "epoch": 18.751677852348994,
      "grad_norm": 12.687505722045898,
      "learning_rate": 2.781746031746032e-06,
      "loss": 0.0711,
      "step": 2100
    },
    {
      "epoch": 18.841163310961967,
      "grad_norm": 7.692240238189697,
      "learning_rate": 2.7420634920634924e-06,
      "loss": 0.072,
      "step": 2110
    },
    {
      "epoch": 18.930648769574944,
      "grad_norm": 14.148133277893066,
      "learning_rate": 2.7023809523809523e-06,
      "loss": 0.0677,
      "step": 2120
    },
    {
      "epoch": 19.0,
      "eval_loss": 0.8003770709037781,
      "eval_macro_f1": 0.766772035108551,
      "eval_runtime": 25.3025,
      "eval_samples_per_second": 30.234,
      "eval_steps_per_second": 3.794,
      "step": 2128
    },
    {
      "epoch": 19.017897091722595,
      "grad_norm": 17.935680389404297,
      "learning_rate": 2.662698412698413e-06,
      "loss": 0.0416,
      "step": 2130
    },
    {
      "epoch": 19.107382550335572,
      "grad_norm": 7.1221537590026855,
      "learning_rate": 2.6230158730158734e-06,
      "loss": 0.1003,
      "step": 2140
    },
    {
      "epoch": 19.196868008948545,
      "grad_norm": 17.375965118408203,
      "learning_rate": 2.5833333333333337e-06,
      "loss": 0.0854,
      "step": 2150
    },
    {
      "epoch": 19.286353467561522,
      "grad_norm": 13.114810943603516,
      "learning_rate": 2.5436507936507936e-06,
      "loss": 0.0759,
      "step": 2160
    },
    {
      "epoch": 19.375838926174495,
      "grad_norm": 20.650806427001953,
      "learning_rate": 2.503968253968254e-06,
      "loss": 0.0576,
      "step": 2170
    },
    {
      "epoch": 19.465324384787472,
      "grad_norm": 1.7908034324645996,
      "learning_rate": 2.4642857142857147e-06,
      "loss": 0.0314,
      "step": 2180
    },
    {
      "epoch": 19.55480984340045,
      "grad_norm": 15.814742088317871,
      "learning_rate": 2.4246031746031746e-06,
      "loss": 0.0754,
      "step": 2190
    },
    {
      "epoch": 19.644295302013422,
      "grad_norm": 18.996606826782227,
      "learning_rate": 2.3849206349206354e-06,
      "loss": 0.0502,
      "step": 2200
    },
    {
      "epoch": 19.7337807606264,
      "grad_norm": 24.4049015045166,
      "learning_rate": 2.3452380952380953e-06,
      "loss": 0.0502,
      "step": 2210
    },
    {
      "epoch": 19.823266219239372,
      "grad_norm": 6.1143879890441895,
      "learning_rate": 2.305555555555556e-06,
      "loss": 0.054,
      "step": 2220
    },
    {
      "epoch": 19.91275167785235,
      "grad_norm": 12.98304557800293,
      "learning_rate": 2.265873015873016e-06,
      "loss": 0.0737,
      "step": 2230
    },
    {
      "epoch": 20.0,
      "grad_norm": 0.3279534876346588,
      "learning_rate": 2.2261904761904763e-06,
      "loss": 0.0353,
      "step": 2240
    },
    {
      "epoch": 20.0,
      "eval_loss": 0.8695369958877563,
      "eval_macro_f1": 0.7699478748997595,
      "eval_runtime": 25.1606,
      "eval_samples_per_second": 30.405,
      "eval_steps_per_second": 3.815,
      "step": 2240
    },
    {
      "epoch": 20.089485458612977,
      "grad_norm": 5.161200523376465,
      "learning_rate": 2.1865079365079366e-06,
      "loss": 0.0489,
      "step": 2250
    },
    {
      "epoch": 20.17897091722595,
      "grad_norm": 24.046892166137695,
      "learning_rate": 2.146825396825397e-06,
      "loss": 0.0887,
      "step": 2260
    },
    {
      "epoch": 20.268456375838927,
      "grad_norm": 3.9411873817443848,
      "learning_rate": 2.1071428571428572e-06,
      "loss": 0.0804,
      "step": 2270
    },
    {
      "epoch": 20.3579418344519,
      "grad_norm": 12.20919418334961,
      "learning_rate": 2.0674603174603176e-06,
      "loss": 0.0318,
      "step": 2280
    },
    {
      "epoch": 20.447427293064877,
      "grad_norm": 5.021272659301758,
      "learning_rate": 2.027777777777778e-06,
      "loss": 0.0983,
      "step": 2290
    },
    {
      "epoch": 20.536912751677853,
      "grad_norm": 15.741971969604492,
      "learning_rate": 1.9880952380952382e-06,
      "loss": 0.0335,
      "step": 2300
    },
    {
      "epoch": 20.626398210290827,
      "grad_norm": 16.82331085205078,
      "learning_rate": 1.9484126984126985e-06,
      "loss": 0.0744,
      "step": 2310
    },
    {
      "epoch": 20.715883668903803,
      "grad_norm": 0.5343822240829468,
      "learning_rate": 1.908730158730159e-06,
      "loss": 0.0432,
      "step": 2320
    },
    {
      "epoch": 20.80536912751678,
      "grad_norm": 18.707128524780273,
      "learning_rate": 1.8690476190476192e-06,
      "loss": 0.0517,
      "step": 2330
    },
    {
      "epoch": 20.894854586129753,
      "grad_norm": 10.502820014953613,
      "learning_rate": 1.8293650793650793e-06,
      "loss": 0.0596,
      "step": 2340
    },
    {
      "epoch": 20.98434004474273,
      "grad_norm": 3.260993242263794,
      "learning_rate": 1.7896825396825399e-06,
      "loss": 0.0262,
      "step": 2350
    },
    {
      "epoch": 21.0,
      "eval_loss": 0.8509567975997925,
      "eval_macro_f1": 0.7710241615578796,
      "eval_runtime": 21.2423,
      "eval_samples_per_second": 36.013,
      "eval_steps_per_second": 4.519,
      "step": 2352
    },
    {
      "epoch": 21.07158836689038,
      "grad_norm": 21.054967880249023,
      "learning_rate": 1.75e-06,
      "loss": 0.0623,
      "step": 2360
    },
    {
      "epoch": 21.161073825503355,
      "grad_norm": 7.076012134552002,
      "learning_rate": 1.7103174603174605e-06,
      "loss": 0.0567,
      "step": 2370
    },
    {
      "epoch": 21.25055928411633,
      "grad_norm": 9.261219024658203,
      "learning_rate": 1.6706349206349206e-06,
      "loss": 0.0805,
      "step": 2380
    },
    {
      "epoch": 21.340044742729308,
      "grad_norm": 21.940967559814453,
      "learning_rate": 1.6309523809523812e-06,
      "loss": 0.062,
      "step": 2390
    },
    {
      "epoch": 21.42953020134228,
      "grad_norm": 0.5533654093742371,
      "learning_rate": 1.5912698412698413e-06,
      "loss": 0.0443,
      "step": 2400
    },
    {
      "epoch": 21.519015659955258,
      "grad_norm": 1.7748634815216064,
      "learning_rate": 1.5515873015873018e-06,
      "loss": 0.0465,
      "step": 2410
    },
    {
      "epoch": 21.60850111856823,
      "grad_norm": 14.389286994934082,
      "learning_rate": 1.511904761904762e-06,
      "loss": 0.0426,
      "step": 2420
    },
    {
      "epoch": 21.697986577181208,
      "grad_norm": 0.1624564379453659,
      "learning_rate": 1.4722222222222225e-06,
      "loss": 0.0399,
      "step": 2430
    },
    {
      "epoch": 21.787472035794185,
      "grad_norm": 16.362260818481445,
      "learning_rate": 1.4325396825396826e-06,
      "loss": 0.0737,
      "step": 2440
    },
    {
      "epoch": 21.876957494407158,
      "grad_norm": 9.44295883178711,
      "learning_rate": 1.392857142857143e-06,
      "loss": 0.1062,
      "step": 2450
    },
    {
      "epoch": 21.966442953020135,
      "grad_norm": 12.541874885559082,
      "learning_rate": 1.3531746031746033e-06,
      "loss": 0.0628,
      "step": 2460
    },
    {
      "epoch": 22.0,
      "eval_loss": 0.8052472472190857,
      "eval_macro_f1": 0.7966146756469337,
      "eval_runtime": 25.2623,
      "eval_samples_per_second": 30.282,
      "eval_steps_per_second": 3.8,
      "step": 2464
    }
  ],
  "logging_steps": 10,
  "max_steps": 2800,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 25,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.8247393356405015e+19,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}