| { | |
| "best_metric": 0.981574539363484, | |
| "best_model_checkpoint": "convnext-tiny-224-finetuned-biopsy/checkpoint-1092", | |
| "epoch": 50.0, | |
| "eval_steps": 500, | |
| "global_step": 2100, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.23809523809523808, | |
| "grad_norm": 3.011354446411133, | |
| "learning_rate": 2.3809523809523808e-06, | |
| "loss": 1.3999, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.47619047619047616, | |
| "grad_norm": 2.8402059078216553, | |
| "learning_rate": 4.7619047619047615e-06, | |
| "loss": 1.3646, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.7142857142857143, | |
| "grad_norm": 2.7643537521362305, | |
| "learning_rate": 7.142857142857143e-06, | |
| "loss": 1.3146, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.9523809523809523, | |
| "grad_norm": 2.404897689819336, | |
| "learning_rate": 9.523809523809523e-06, | |
| "loss": 1.2488, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_accuracy": 0.5360134003350083, | |
| "eval_loss": 1.1914242506027222, | |
| "eval_runtime": 99.5389, | |
| "eval_samples_per_second": 5.998, | |
| "eval_steps_per_second": 0.191, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 1.1904761904761905, | |
| "grad_norm": 1.9510854482650757, | |
| "learning_rate": 1.1904761904761905e-05, | |
| "loss": 1.1624, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 1.4285714285714286, | |
| "grad_norm": 2.0629866123199463, | |
| "learning_rate": 1.4285714285714285e-05, | |
| "loss": 1.0846, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 1.6666666666666665, | |
| "grad_norm": 1.6558785438537598, | |
| "learning_rate": 1.6666666666666667e-05, | |
| "loss": 0.9784, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 1.9047619047619047, | |
| "grad_norm": 1.7638639211654663, | |
| "learning_rate": 1.9047619047619046e-05, | |
| "loss": 0.9265, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_accuracy": 0.5544388609715243, | |
| "eval_loss": 0.8633517026901245, | |
| "eval_runtime": 100.448, | |
| "eval_samples_per_second": 5.943, | |
| "eval_steps_per_second": 0.189, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 2.142857142857143, | |
| "grad_norm": 1.5686501264572144, | |
| "learning_rate": 2.1428571428571428e-05, | |
| "loss": 0.8297, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 2.380952380952381, | |
| "grad_norm": 1.9327245950698853, | |
| "learning_rate": 2.380952380952381e-05, | |
| "loss": 0.7616, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 2.619047619047619, | |
| "grad_norm": 2.8427188396453857, | |
| "learning_rate": 2.6190476190476192e-05, | |
| "loss": 0.6351, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 2.857142857142857, | |
| "grad_norm": 4.029513835906982, | |
| "learning_rate": 2.857142857142857e-05, | |
| "loss": 0.5701, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_accuracy": 0.8542713567839196, | |
| "eval_loss": 0.48339542746543884, | |
| "eval_runtime": 100.6672, | |
| "eval_samples_per_second": 5.93, | |
| "eval_steps_per_second": 0.189, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 3.0952380952380953, | |
| "grad_norm": 2.445913314819336, | |
| "learning_rate": 3.095238095238095e-05, | |
| "loss": 0.477, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 3.3333333333333335, | |
| "grad_norm": 5.690745830535889, | |
| "learning_rate": 3.3333333333333335e-05, | |
| "loss": 0.4355, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 3.571428571428571, | |
| "grad_norm": 3.973672389984131, | |
| "learning_rate": 3.571428571428572e-05, | |
| "loss": 0.3901, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 3.8095238095238093, | |
| "grad_norm": 2.3706932067871094, | |
| "learning_rate": 3.809523809523809e-05, | |
| "loss": 0.4041, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_accuracy": 0.9212730318257957, | |
| "eval_loss": 0.2996470332145691, | |
| "eval_runtime": 110.7307, | |
| "eval_samples_per_second": 5.391, | |
| "eval_steps_per_second": 0.172, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 4.0476190476190474, | |
| "grad_norm": 4.306156635284424, | |
| "learning_rate": 4.047619047619048e-05, | |
| "loss": 0.3426, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 4.285714285714286, | |
| "grad_norm": 3.5428152084350586, | |
| "learning_rate": 4.2857142857142856e-05, | |
| "loss": 0.3512, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 4.523809523809524, | |
| "grad_norm": 8.013957977294922, | |
| "learning_rate": 4.523809523809524e-05, | |
| "loss": 0.3105, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 4.761904761904762, | |
| "grad_norm": 4.20527458190918, | |
| "learning_rate": 4.761904761904762e-05, | |
| "loss": 0.2608, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 5.909843921661377, | |
| "learning_rate": 5e-05, | |
| "loss": 0.2747, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_accuracy": 0.914572864321608, | |
| "eval_loss": 0.2743017077445984, | |
| "eval_runtime": 98.6089, | |
| "eval_samples_per_second": 6.054, | |
| "eval_steps_per_second": 0.193, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 5.238095238095238, | |
| "grad_norm": 6.073720932006836, | |
| "learning_rate": 4.973544973544973e-05, | |
| "loss": 0.2715, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 5.476190476190476, | |
| "grad_norm": 3.8292412757873535, | |
| "learning_rate": 4.9470899470899475e-05, | |
| "loss": 0.2474, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 5.714285714285714, | |
| "grad_norm": 4.192967891693115, | |
| "learning_rate": 4.9206349206349204e-05, | |
| "loss": 0.2322, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 5.9523809523809526, | |
| "grad_norm": 6.533767223358154, | |
| "learning_rate": 4.894179894179895e-05, | |
| "loss": 0.2518, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_accuracy": 0.949748743718593, | |
| "eval_loss": 0.1826142966747284, | |
| "eval_runtime": 100.6269, | |
| "eval_samples_per_second": 5.933, | |
| "eval_steps_per_second": 0.189, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 6.190476190476191, | |
| "grad_norm": 5.2794036865234375, | |
| "learning_rate": 4.8677248677248676e-05, | |
| "loss": 0.2071, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 6.428571428571429, | |
| "grad_norm": 5.7268266677856445, | |
| "learning_rate": 4.841269841269841e-05, | |
| "loss": 0.2343, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 6.666666666666667, | |
| "grad_norm": 20.241403579711914, | |
| "learning_rate": 4.814814814814815e-05, | |
| "loss": 0.1984, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 6.904761904761905, | |
| "grad_norm": 14.405659675598145, | |
| "learning_rate": 4.7883597883597884e-05, | |
| "loss": 0.2363, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_accuracy": 0.949748743718593, | |
| "eval_loss": 0.17308001220226288, | |
| "eval_runtime": 100.6297, | |
| "eval_samples_per_second": 5.933, | |
| "eval_steps_per_second": 0.189, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 7.142857142857143, | |
| "grad_norm": 3.933133125305176, | |
| "learning_rate": 4.761904761904762e-05, | |
| "loss": 0.2608, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 7.380952380952381, | |
| "grad_norm": 6.623403549194336, | |
| "learning_rate": 4.7354497354497356e-05, | |
| "loss": 0.2313, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 7.619047619047619, | |
| "grad_norm": 15.30415153503418, | |
| "learning_rate": 4.708994708994709e-05, | |
| "loss": 0.1701, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 7.857142857142857, | |
| "grad_norm": 3.3041625022888184, | |
| "learning_rate": 4.682539682539683e-05, | |
| "loss": 0.1782, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_accuracy": 0.9363484087102177, | |
| "eval_loss": 0.18696695566177368, | |
| "eval_runtime": 102.6363, | |
| "eval_samples_per_second": 5.817, | |
| "eval_steps_per_second": 0.185, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 8.095238095238095, | |
| "grad_norm": 12.961448669433594, | |
| "learning_rate": 4.656084656084656e-05, | |
| "loss": 0.2039, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 8.333333333333334, | |
| "grad_norm": 3.7345833778381348, | |
| "learning_rate": 4.62962962962963e-05, | |
| "loss": 0.2163, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 8.571428571428571, | |
| "grad_norm": 6.517577171325684, | |
| "learning_rate": 4.603174603174603e-05, | |
| "loss": 0.1678, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 8.80952380952381, | |
| "grad_norm": 22.639217376708984, | |
| "learning_rate": 4.576719576719577e-05, | |
| "loss": 0.2122, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_accuracy": 0.9614740368509213, | |
| "eval_loss": 0.1327419877052307, | |
| "eval_runtime": 101.4693, | |
| "eval_samples_per_second": 5.884, | |
| "eval_steps_per_second": 0.187, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 9.047619047619047, | |
| "grad_norm": 8.08673095703125, | |
| "learning_rate": 4.55026455026455e-05, | |
| "loss": 0.1976, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 9.285714285714286, | |
| "grad_norm": 4.341285228729248, | |
| "learning_rate": 4.523809523809524e-05, | |
| "loss": 0.2176, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 9.523809523809524, | |
| "grad_norm": 4.342065334320068, | |
| "learning_rate": 4.4973544973544974e-05, | |
| "loss": 0.1604, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 9.761904761904763, | |
| "grad_norm": 5.911900997161865, | |
| "learning_rate": 4.470899470899471e-05, | |
| "loss": 0.1294, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 14.639168739318848, | |
| "learning_rate": 4.4444444444444447e-05, | |
| "loss": 0.1856, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_accuracy": 0.931323283082077, | |
| "eval_loss": 0.20822733640670776, | |
| "eval_runtime": 97.7108, | |
| "eval_samples_per_second": 6.11, | |
| "eval_steps_per_second": 0.194, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 10.238095238095237, | |
| "grad_norm": 2.8513264656066895, | |
| "learning_rate": 4.417989417989418e-05, | |
| "loss": 0.1524, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 10.476190476190476, | |
| "grad_norm": 4.05119514465332, | |
| "learning_rate": 4.391534391534391e-05, | |
| "loss": 0.1527, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 10.714285714285714, | |
| "grad_norm": 5.783977031707764, | |
| "learning_rate": 4.3650793650793655e-05, | |
| "loss": 0.1718, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 10.952380952380953, | |
| "grad_norm": 3.3016180992126465, | |
| "learning_rate": 4.3386243386243384e-05, | |
| "loss": 0.1736, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "eval_accuracy": 0.9564489112227805, | |
| "eval_loss": 0.13064023852348328, | |
| "eval_runtime": 99.9343, | |
| "eval_samples_per_second": 5.974, | |
| "eval_steps_per_second": 0.19, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 11.19047619047619, | |
| "grad_norm": 8.341907501220703, | |
| "learning_rate": 4.312169312169313e-05, | |
| "loss": 0.1628, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 11.428571428571429, | |
| "grad_norm": 3.418076753616333, | |
| "learning_rate": 4.2857142857142856e-05, | |
| "loss": 0.1215, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 11.666666666666666, | |
| "grad_norm": 5.960474967956543, | |
| "learning_rate": 4.259259259259259e-05, | |
| "loss": 0.1381, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 11.904761904761905, | |
| "grad_norm": 16.451749801635742, | |
| "learning_rate": 4.232804232804233e-05, | |
| "loss": 0.1423, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_accuracy": 0.9731993299832495, | |
| "eval_loss": 0.09885474294424057, | |
| "eval_runtime": 100.1371, | |
| "eval_samples_per_second": 5.962, | |
| "eval_steps_per_second": 0.19, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 12.142857142857142, | |
| "grad_norm": 4.7117180824279785, | |
| "learning_rate": 4.2063492063492065e-05, | |
| "loss": 0.1579, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 12.380952380952381, | |
| "grad_norm": 3.3007049560546875, | |
| "learning_rate": 4.17989417989418e-05, | |
| "loss": 0.1393, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 12.619047619047619, | |
| "grad_norm": 5.64157247543335, | |
| "learning_rate": 4.153439153439154e-05, | |
| "loss": 0.1167, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 12.857142857142858, | |
| "grad_norm": 3.9055404663085938, | |
| "learning_rate": 4.126984126984127e-05, | |
| "loss": 0.1296, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "eval_accuracy": 0.9731993299832495, | |
| "eval_loss": 0.09491746872663498, | |
| "eval_runtime": 100.8651, | |
| "eval_samples_per_second": 5.919, | |
| "eval_steps_per_second": 0.188, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 13.095238095238095, | |
| "grad_norm": 3.2484312057495117, | |
| "learning_rate": 4.100529100529101e-05, | |
| "loss": 0.1174, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 13.333333333333334, | |
| "grad_norm": 14.672613143920898, | |
| "learning_rate": 4.074074074074074e-05, | |
| "loss": 0.1499, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 13.571428571428571, | |
| "grad_norm": 11.733295440673828, | |
| "learning_rate": 4.047619047619048e-05, | |
| "loss": 0.1225, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 13.80952380952381, | |
| "grad_norm": 4.323381423950195, | |
| "learning_rate": 4.021164021164021e-05, | |
| "loss": 0.1158, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "eval_accuracy": 0.9631490787269682, | |
| "eval_loss": 0.10841543227434158, | |
| "eval_runtime": 99.5304, | |
| "eval_samples_per_second": 5.998, | |
| "eval_steps_per_second": 0.191, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 14.047619047619047, | |
| "grad_norm": 5.578769683837891, | |
| "learning_rate": 3.9947089947089946e-05, | |
| "loss": 0.1032, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 14.285714285714286, | |
| "grad_norm": 6.882154941558838, | |
| "learning_rate": 3.968253968253968e-05, | |
| "loss": 0.1488, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 14.523809523809524, | |
| "grad_norm": 10.363306045532227, | |
| "learning_rate": 3.941798941798942e-05, | |
| "loss": 0.1312, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 14.761904761904763, | |
| "grad_norm": 2.7422566413879395, | |
| "learning_rate": 3.9153439153439155e-05, | |
| "loss": 0.1225, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "grad_norm": 3.881643533706665, | |
| "learning_rate": 3.888888888888889e-05, | |
| "loss": 0.1383, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "eval_accuracy": 0.9715242881072027, | |
| "eval_loss": 0.08646578341722488, | |
| "eval_runtime": 99.7018, | |
| "eval_samples_per_second": 5.988, | |
| "eval_steps_per_second": 0.191, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 15.238095238095237, | |
| "grad_norm": 6.1451311111450195, | |
| "learning_rate": 3.862433862433863e-05, | |
| "loss": 0.1081, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 15.476190476190476, | |
| "grad_norm": 5.906555652618408, | |
| "learning_rate": 3.835978835978836e-05, | |
| "loss": 0.1396, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 15.714285714285714, | |
| "grad_norm": 6.843247413635254, | |
| "learning_rate": 3.809523809523809e-05, | |
| "loss": 0.1108, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 15.952380952380953, | |
| "grad_norm": 11.019234657287598, | |
| "learning_rate": 3.7830687830687835e-05, | |
| "loss": 0.1384, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "eval_accuracy": 0.9715242881072027, | |
| "eval_loss": 0.08793748170137405, | |
| "eval_runtime": 101.06, | |
| "eval_samples_per_second": 5.907, | |
| "eval_steps_per_second": 0.188, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 16.19047619047619, | |
| "grad_norm": 5.210529327392578, | |
| "learning_rate": 3.7566137566137564e-05, | |
| "loss": 0.0953, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 16.428571428571427, | |
| "grad_norm": 5.584816932678223, | |
| "learning_rate": 3.730158730158731e-05, | |
| "loss": 0.1035, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 16.666666666666668, | |
| "grad_norm": 3.315423011779785, | |
| "learning_rate": 3.7037037037037037e-05, | |
| "loss": 0.1092, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 16.904761904761905, | |
| "grad_norm": 7.032583713531494, | |
| "learning_rate": 3.677248677248677e-05, | |
| "loss": 0.0924, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "eval_accuracy": 0.9782244556113903, | |
| "eval_loss": 0.07580773532390594, | |
| "eval_runtime": 102.351, | |
| "eval_samples_per_second": 5.833, | |
| "eval_steps_per_second": 0.186, | |
| "step": 714 | |
| }, | |
| { | |
| "epoch": 17.142857142857142, | |
| "grad_norm": 17.09100341796875, | |
| "learning_rate": 3.650793650793651e-05, | |
| "loss": 0.0955, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 17.38095238095238, | |
| "grad_norm": 9.608597755432129, | |
| "learning_rate": 3.6243386243386245e-05, | |
| "loss": 0.1377, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 17.61904761904762, | |
| "grad_norm": 2.7079122066497803, | |
| "learning_rate": 3.597883597883598e-05, | |
| "loss": 0.1027, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 17.857142857142858, | |
| "grad_norm": 2.485712766647339, | |
| "learning_rate": 3.571428571428572e-05, | |
| "loss": 0.0966, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "eval_accuracy": 0.9681742043551089, | |
| "eval_loss": 0.08659480512142181, | |
| "eval_runtime": 100.9252, | |
| "eval_samples_per_second": 5.915, | |
| "eval_steps_per_second": 0.188, | |
| "step": 756 | |
| }, | |
| { | |
| "epoch": 18.095238095238095, | |
| "grad_norm": 4.8402252197265625, | |
| "learning_rate": 3.5449735449735446e-05, | |
| "loss": 0.0941, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 18.333333333333332, | |
| "grad_norm": 5.546189785003662, | |
| "learning_rate": 3.518518518518519e-05, | |
| "loss": 0.0891, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 18.571428571428573, | |
| "grad_norm": 9.732258796691895, | |
| "learning_rate": 3.492063492063492e-05, | |
| "loss": 0.1018, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 18.80952380952381, | |
| "grad_norm": 13.696222305297852, | |
| "learning_rate": 3.465608465608466e-05, | |
| "loss": 0.1324, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "eval_accuracy": 0.9715242881072027, | |
| "eval_loss": 0.08763264119625092, | |
| "eval_runtime": 99.5155, | |
| "eval_samples_per_second": 5.999, | |
| "eval_steps_per_second": 0.191, | |
| "step": 798 | |
| }, | |
| { | |
| "epoch": 19.047619047619047, | |
| "grad_norm": 8.419419288635254, | |
| "learning_rate": 3.439153439153439e-05, | |
| "loss": 0.0911, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 19.285714285714285, | |
| "grad_norm": 3.410423755645752, | |
| "learning_rate": 3.412698412698413e-05, | |
| "loss": 0.1086, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 19.523809523809526, | |
| "grad_norm": 11.417459487915039, | |
| "learning_rate": 3.386243386243386e-05, | |
| "loss": 0.0953, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 19.761904761904763, | |
| "grad_norm": 6.009758472442627, | |
| "learning_rate": 3.35978835978836e-05, | |
| "loss": 0.1068, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "grad_norm": 8.808171272277832, | |
| "learning_rate": 3.3333333333333335e-05, | |
| "loss": 0.0995, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_accuracy": 0.964824120603015, | |
| "eval_loss": 0.09903094917535782, | |
| "eval_runtime": 99.7352, | |
| "eval_samples_per_second": 5.986, | |
| "eval_steps_per_second": 0.191, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 20.238095238095237, | |
| "grad_norm": 3.4568874835968018, | |
| "learning_rate": 3.306878306878307e-05, | |
| "loss": 0.0992, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 20.476190476190474, | |
| "grad_norm": 2.8322699069976807, | |
| "learning_rate": 3.280423280423281e-05, | |
| "loss": 0.0893, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 20.714285714285715, | |
| "grad_norm": 6.81901216506958, | |
| "learning_rate": 3.253968253968254e-05, | |
| "loss": 0.0882, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 20.952380952380953, | |
| "grad_norm": 11.737163543701172, | |
| "learning_rate": 3.227513227513227e-05, | |
| "loss": 0.083, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 21.0, | |
| "eval_accuracy": 0.9698492462311558, | |
| "eval_loss": 0.09110881388187408, | |
| "eval_runtime": 98.7423, | |
| "eval_samples_per_second": 6.046, | |
| "eval_steps_per_second": 0.192, | |
| "step": 882 | |
| }, | |
| { | |
| "epoch": 21.19047619047619, | |
| "grad_norm": 5.511984348297119, | |
| "learning_rate": 3.2010582010582015e-05, | |
| "loss": 0.1076, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 21.428571428571427, | |
| "grad_norm": 3.369694709777832, | |
| "learning_rate": 3.1746031746031745e-05, | |
| "loss": 0.093, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 21.666666666666668, | |
| "grad_norm": 10.551066398620605, | |
| "learning_rate": 3.148148148148148e-05, | |
| "loss": 0.1042, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 21.904761904761905, | |
| "grad_norm": 4.625993251800537, | |
| "learning_rate": 3.121693121693122e-05, | |
| "loss": 0.082, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 22.0, | |
| "eval_accuracy": 0.9798994974874372, | |
| "eval_loss": 0.08163543045520782, | |
| "eval_runtime": 99.0332, | |
| "eval_samples_per_second": 6.028, | |
| "eval_steps_per_second": 0.192, | |
| "step": 924 | |
| }, | |
| { | |
| "epoch": 22.142857142857142, | |
| "grad_norm": 4.648172378540039, | |
| "learning_rate": 3.095238095238095e-05, | |
| "loss": 0.0826, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 22.38095238095238, | |
| "grad_norm": 3.161362409591675, | |
| "learning_rate": 3.068783068783069e-05, | |
| "loss": 0.0912, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 22.61904761904762, | |
| "grad_norm": 4.312536239624023, | |
| "learning_rate": 3.0423280423280425e-05, | |
| "loss": 0.0923, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 22.857142857142858, | |
| "grad_norm": 11.137120246887207, | |
| "learning_rate": 3.0158730158730158e-05, | |
| "loss": 0.1038, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 23.0, | |
| "eval_accuracy": 0.9430485762144054, | |
| "eval_loss": 0.14531147480010986, | |
| "eval_runtime": 100.399, | |
| "eval_samples_per_second": 5.946, | |
| "eval_steps_per_second": 0.189, | |
| "step": 966 | |
| }, | |
| { | |
| "epoch": 23.095238095238095, | |
| "grad_norm": 7.795422077178955, | |
| "learning_rate": 2.9894179894179897e-05, | |
| "loss": 0.1162, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 23.333333333333332, | |
| "grad_norm": 3.744706392288208, | |
| "learning_rate": 2.962962962962963e-05, | |
| "loss": 0.0755, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 23.571428571428573, | |
| "grad_norm": 1.1532397270202637, | |
| "learning_rate": 2.9365079365079366e-05, | |
| "loss": 0.0725, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 23.80952380952381, | |
| "grad_norm": 7.470936298370361, | |
| "learning_rate": 2.91005291005291e-05, | |
| "loss": 0.0751, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "eval_accuracy": 0.9731993299832495, | |
| "eval_loss": 0.08765166252851486, | |
| "eval_runtime": 99.5874, | |
| "eval_samples_per_second": 5.995, | |
| "eval_steps_per_second": 0.191, | |
| "step": 1008 | |
| }, | |
| { | |
| "epoch": 24.047619047619047, | |
| "grad_norm": 3.4710640907287598, | |
| "learning_rate": 2.8835978835978838e-05, | |
| "loss": 0.0946, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 24.285714285714285, | |
| "grad_norm": 3.0858073234558105, | |
| "learning_rate": 2.857142857142857e-05, | |
| "loss": 0.0744, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 24.523809523809526, | |
| "grad_norm": 3.6983649730682373, | |
| "learning_rate": 2.830687830687831e-05, | |
| "loss": 0.0717, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 24.761904761904763, | |
| "grad_norm": 5.484372138977051, | |
| "learning_rate": 2.8042328042328043e-05, | |
| "loss": 0.0981, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "grad_norm": 14.718037605285645, | |
| "learning_rate": 2.777777777777778e-05, | |
| "loss": 0.0733, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "eval_accuracy": 0.9681742043551089, | |
| "eval_loss": 0.08776049315929413, | |
| "eval_runtime": 100.772, | |
| "eval_samples_per_second": 5.924, | |
| "eval_steps_per_second": 0.189, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 25.238095238095237, | |
| "grad_norm": 2.7761025428771973, | |
| "learning_rate": 2.7513227513227512e-05, | |
| "loss": 0.0696, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 25.476190476190474, | |
| "grad_norm": 3.9167935848236084, | |
| "learning_rate": 2.724867724867725e-05, | |
| "loss": 0.0792, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 25.714285714285715, | |
| "grad_norm": 6.069251537322998, | |
| "learning_rate": 2.6984126984126984e-05, | |
| "loss": 0.0797, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 25.952380952380953, | |
| "grad_norm": 4.076193332672119, | |
| "learning_rate": 2.6719576719576723e-05, | |
| "loss": 0.0813, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 26.0, | |
| "eval_accuracy": 0.981574539363484, | |
| "eval_loss": 0.06883959472179413, | |
| "eval_runtime": 98.9533, | |
| "eval_samples_per_second": 6.033, | |
| "eval_steps_per_second": 0.192, | |
| "step": 1092 | |
| }, | |
| { | |
| "epoch": 26.19047619047619, | |
| "grad_norm": 2.7729415893554688, | |
| "learning_rate": 2.6455026455026456e-05, | |
| "loss": 0.0827, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 26.428571428571427, | |
| "grad_norm": 5.632189750671387, | |
| "learning_rate": 2.6190476190476192e-05, | |
| "loss": 0.0967, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 26.666666666666668, | |
| "grad_norm": 26.66261100769043, | |
| "learning_rate": 2.5925925925925925e-05, | |
| "loss": 0.0699, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 26.904761904761905, | |
| "grad_norm": 8.61750602722168, | |
| "learning_rate": 2.5661375661375664e-05, | |
| "loss": 0.0788, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 27.0, | |
| "eval_accuracy": 0.9782244556113903, | |
| "eval_loss": 0.0731692910194397, | |
| "eval_runtime": 100.6453, | |
| "eval_samples_per_second": 5.932, | |
| "eval_steps_per_second": 0.189, | |
| "step": 1134 | |
| }, | |
| { | |
| "epoch": 27.142857142857142, | |
| "grad_norm": 6.395318031311035, | |
| "learning_rate": 2.5396825396825397e-05, | |
| "loss": 0.0916, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 27.38095238095238, | |
| "grad_norm": 9.09482192993164, | |
| "learning_rate": 2.5132275132275137e-05, | |
| "loss": 0.0733, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 27.61904761904762, | |
| "grad_norm": 6.092742919921875, | |
| "learning_rate": 2.4867724867724866e-05, | |
| "loss": 0.0663, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 27.857142857142858, | |
| "grad_norm": 4.222706317901611, | |
| "learning_rate": 2.4603174603174602e-05, | |
| "loss": 0.0617, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 28.0, | |
| "eval_accuracy": 0.9748743718592965, | |
| "eval_loss": 0.07219748198986053, | |
| "eval_runtime": 100.6876, | |
| "eval_samples_per_second": 5.929, | |
| "eval_steps_per_second": 0.189, | |
| "step": 1176 | |
| }, | |
| { | |
| "epoch": 28.095238095238095, | |
| "grad_norm": 3.090153932571411, | |
| "learning_rate": 2.4338624338624338e-05, | |
| "loss": 0.0636, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 28.333333333333332, | |
| "grad_norm": 7.900331020355225, | |
| "learning_rate": 2.4074074074074074e-05, | |
| "loss": 0.0601, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 28.571428571428573, | |
| "grad_norm": 2.6271705627441406, | |
| "learning_rate": 2.380952380952381e-05, | |
| "loss": 0.0707, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 28.80952380952381, | |
| "grad_norm": 8.997330665588379, | |
| "learning_rate": 2.3544973544973546e-05, | |
| "loss": 0.0568, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 29.0, | |
| "eval_accuracy": 0.964824120603015, | |
| "eval_loss": 0.08830268681049347, | |
| "eval_runtime": 101.2279, | |
| "eval_samples_per_second": 5.898, | |
| "eval_steps_per_second": 0.188, | |
| "step": 1218 | |
| }, | |
| { | |
| "epoch": 29.047619047619047, | |
| "grad_norm": 4.938997745513916, | |
| "learning_rate": 2.328042328042328e-05, | |
| "loss": 0.0745, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 29.285714285714285, | |
| "grad_norm": 4.161396503448486, | |
| "learning_rate": 2.3015873015873015e-05, | |
| "loss": 0.0582, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 29.523809523809526, | |
| "grad_norm": 2.22847318649292, | |
| "learning_rate": 2.275132275132275e-05, | |
| "loss": 0.0686, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 29.761904761904763, | |
| "grad_norm": 4.214893341064453, | |
| "learning_rate": 2.2486772486772487e-05, | |
| "loss": 0.0793, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "grad_norm": 6.174160003662109, | |
| "learning_rate": 2.2222222222222223e-05, | |
| "loss": 0.0701, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "eval_accuracy": 0.9765494137353434, | |
| "eval_loss": 0.07026898860931396, | |
| "eval_runtime": 100.0757, | |
| "eval_samples_per_second": 5.965, | |
| "eval_steps_per_second": 0.19, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 30.238095238095237, | |
| "grad_norm": 4.761576175689697, | |
| "learning_rate": 2.1957671957671956e-05, | |
| "loss": 0.0886, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 30.476190476190474, | |
| "grad_norm": 6.844214916229248, | |
| "learning_rate": 2.1693121693121692e-05, | |
| "loss": 0.0685, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 30.714285714285715, | |
| "grad_norm": 6.561805725097656, | |
| "learning_rate": 2.1428571428571428e-05, | |
| "loss": 0.0853, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 30.952380952380953, | |
| "grad_norm": 4.84083890914917, | |
| "learning_rate": 2.1164021164021164e-05, | |
| "loss": 0.0535, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 31.0, | |
| "eval_accuracy": 0.9782244556113903, | |
| "eval_loss": 0.07920973747968674, | |
| "eval_runtime": 101.0786, | |
| "eval_samples_per_second": 5.906, | |
| "eval_steps_per_second": 0.188, | |
| "step": 1302 | |
| }, | |
| { | |
| "epoch": 31.19047619047619, | |
| "grad_norm": 5.74992561340332, | |
| "learning_rate": 2.08994708994709e-05, | |
| "loss": 0.0627, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 31.428571428571427, | |
| "grad_norm": 5.586116790771484, | |
| "learning_rate": 2.0634920634920636e-05, | |
| "loss": 0.0824, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 31.666666666666668, | |
| "grad_norm": 5.378215312957764, | |
| "learning_rate": 2.037037037037037e-05, | |
| "loss": 0.0619, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 31.904761904761905, | |
| "grad_norm": 6.4354448318481445, | |
| "learning_rate": 2.0105820105820105e-05, | |
| "loss": 0.0716, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 32.0, | |
| "eval_accuracy": 0.9798994974874372, | |
| "eval_loss": 0.06844783574342728, | |
| "eval_runtime": 96.5819, | |
| "eval_samples_per_second": 6.181, | |
| "eval_steps_per_second": 0.197, | |
| "step": 1344 | |
| }, | |
| { | |
| "epoch": 32.142857142857146, | |
| "grad_norm": 15.240459442138672, | |
| "learning_rate": 1.984126984126984e-05, | |
| "loss": 0.0697, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 32.38095238095238, | |
| "grad_norm": 6.0155110359191895, | |
| "learning_rate": 1.9576719576719577e-05, | |
| "loss": 0.0607, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 32.61904761904762, | |
| "grad_norm": 3.0747318267822266, | |
| "learning_rate": 1.9312169312169313e-05, | |
| "loss": 0.0619, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 32.857142857142854, | |
| "grad_norm": 2.2313997745513916, | |
| "learning_rate": 1.9047619047619046e-05, | |
| "loss": 0.0419, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 33.0, | |
| "eval_accuracy": 0.981574539363484, | |
| "eval_loss": 0.06662608683109283, | |
| "eval_runtime": 100.317, | |
| "eval_samples_per_second": 5.951, | |
| "eval_steps_per_second": 0.189, | |
| "step": 1386 | |
| }, | |
| { | |
| "epoch": 33.095238095238095, | |
| "grad_norm": 4.3953752517700195, | |
| "learning_rate": 1.8783068783068782e-05, | |
| "loss": 0.0469, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 33.333333333333336, | |
| "grad_norm": 3.364664077758789, | |
| "learning_rate": 1.8518518518518518e-05, | |
| "loss": 0.0566, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 33.57142857142857, | |
| "grad_norm": 5.303763389587402, | |
| "learning_rate": 1.8253968253968254e-05, | |
| "loss": 0.0535, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 33.80952380952381, | |
| "grad_norm": 2.8668274879455566, | |
| "learning_rate": 1.798941798941799e-05, | |
| "loss": 0.054, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 34.0, | |
| "eval_accuracy": 0.9748743718592965, | |
| "eval_loss": 0.07679982483386993, | |
| "eval_runtime": 98.4202, | |
| "eval_samples_per_second": 6.066, | |
| "eval_steps_per_second": 0.193, | |
| "step": 1428 | |
| }, | |
| { | |
| "epoch": 34.04761904761905, | |
| "grad_norm": 4.746896266937256, | |
| "learning_rate": 1.7724867724867723e-05, | |
| "loss": 0.063, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 34.285714285714285, | |
| "grad_norm": 4.740354537963867, | |
| "learning_rate": 1.746031746031746e-05, | |
| "loss": 0.0477, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 34.523809523809526, | |
| "grad_norm": 5.7020182609558105, | |
| "learning_rate": 1.7195767195767195e-05, | |
| "loss": 0.0608, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 34.76190476190476, | |
| "grad_norm": 1.4976569414138794, | |
| "learning_rate": 1.693121693121693e-05, | |
| "loss": 0.0511, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 35.0, | |
| "grad_norm": 5.835646152496338, | |
| "learning_rate": 1.6666666666666667e-05, | |
| "loss": 0.0332, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 35.0, | |
| "eval_accuracy": 0.9798994974874372, | |
| "eval_loss": 0.07169447094202042, | |
| "eval_runtime": 99.5255, | |
| "eval_samples_per_second": 5.998, | |
| "eval_steps_per_second": 0.191, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 35.23809523809524, | |
| "grad_norm": 4.712400913238525, | |
| "learning_rate": 1.6402116402116404e-05, | |
| "loss": 0.0579, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 35.476190476190474, | |
| "grad_norm": 5.839983940124512, | |
| "learning_rate": 1.6137566137566136e-05, | |
| "loss": 0.0835, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 35.714285714285715, | |
| "grad_norm": 3.9647841453552246, | |
| "learning_rate": 1.5873015873015872e-05, | |
| "loss": 0.068, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 35.95238095238095, | |
| "grad_norm": 6.082459449768066, | |
| "learning_rate": 1.560846560846561e-05, | |
| "loss": 0.0524, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 36.0, | |
| "eval_accuracy": 0.9715242881072027, | |
| "eval_loss": 0.1067463755607605, | |
| "eval_runtime": 98.3457, | |
| "eval_samples_per_second": 6.07, | |
| "eval_steps_per_second": 0.193, | |
| "step": 1512 | |
| }, | |
| { | |
| "epoch": 36.19047619047619, | |
| "grad_norm": 1.7107481956481934, | |
| "learning_rate": 1.5343915343915344e-05, | |
| "loss": 0.0572, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 36.42857142857143, | |
| "grad_norm": 3.3297202587127686, | |
| "learning_rate": 1.5079365079365079e-05, | |
| "loss": 0.0607, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 36.666666666666664, | |
| "grad_norm": 3.2910244464874268, | |
| "learning_rate": 1.4814814814814815e-05, | |
| "loss": 0.0613, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 36.904761904761905, | |
| "grad_norm": 2.077385663986206, | |
| "learning_rate": 1.455026455026455e-05, | |
| "loss": 0.0372, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 37.0, | |
| "eval_accuracy": 0.981574539363484, | |
| "eval_loss": 0.06044450402259827, | |
| "eval_runtime": 102.3291, | |
| "eval_samples_per_second": 5.834, | |
| "eval_steps_per_second": 0.186, | |
| "step": 1554 | |
| }, | |
| { | |
| "epoch": 37.142857142857146, | |
| "grad_norm": 3.205519914627075, | |
| "learning_rate": 1.4285714285714285e-05, | |
| "loss": 0.0643, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 37.38095238095238, | |
| "grad_norm": 5.730500221252441, | |
| "learning_rate": 1.4021164021164022e-05, | |
| "loss": 0.0521, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 37.61904761904762, | |
| "grad_norm": 4.390468120574951, | |
| "learning_rate": 1.3756613756613756e-05, | |
| "loss": 0.0451, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 37.857142857142854, | |
| "grad_norm": 3.4064576625823975, | |
| "learning_rate": 1.3492063492063492e-05, | |
| "loss": 0.0692, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 38.0, | |
| "eval_accuracy": 0.9798994974874372, | |
| "eval_loss": 0.057894375175237656, | |
| "eval_runtime": 100.3152, | |
| "eval_samples_per_second": 5.951, | |
| "eval_steps_per_second": 0.189, | |
| "step": 1596 | |
| }, | |
| { | |
| "epoch": 38.095238095238095, | |
| "grad_norm": 3.356318950653076, | |
| "learning_rate": 1.3227513227513228e-05, | |
| "loss": 0.0483, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 38.333333333333336, | |
| "grad_norm": 6.842693328857422, | |
| "learning_rate": 1.2962962962962962e-05, | |
| "loss": 0.0633, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 38.57142857142857, | |
| "grad_norm": 6.391998767852783, | |
| "learning_rate": 1.2698412698412699e-05, | |
| "loss": 0.055, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 38.80952380952381, | |
| "grad_norm": 8.841273307800293, | |
| "learning_rate": 1.2433862433862433e-05, | |
| "loss": 0.038, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 39.0, | |
| "eval_accuracy": 0.9731993299832495, | |
| "eval_loss": 0.08243392407894135, | |
| "eval_runtime": 99.8594, | |
| "eval_samples_per_second": 5.978, | |
| "eval_steps_per_second": 0.19, | |
| "step": 1638 | |
| }, | |
| { | |
| "epoch": 39.04761904761905, | |
| "grad_norm": 4.4886603355407715, | |
| "learning_rate": 1.2169312169312169e-05, | |
| "loss": 0.0557, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 39.285714285714285, | |
| "grad_norm": 3.220594644546509, | |
| "learning_rate": 1.1904761904761905e-05, | |
| "loss": 0.0368, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 39.523809523809526, | |
| "grad_norm": 5.886962890625, | |
| "learning_rate": 1.164021164021164e-05, | |
| "loss": 0.05, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 39.76190476190476, | |
| "grad_norm": 6.625497817993164, | |
| "learning_rate": 1.1375661375661376e-05, | |
| "loss": 0.0446, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 40.0, | |
| "grad_norm": 2.696031093597412, | |
| "learning_rate": 1.1111111111111112e-05, | |
| "loss": 0.0524, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 40.0, | |
| "eval_accuracy": 0.9765494137353434, | |
| "eval_loss": 0.06349373608827591, | |
| "eval_runtime": 99.349, | |
| "eval_samples_per_second": 6.009, | |
| "eval_steps_per_second": 0.191, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 40.23809523809524, | |
| "grad_norm": 9.071761131286621, | |
| "learning_rate": 1.0846560846560846e-05, | |
| "loss": 0.0509, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 40.476190476190474, | |
| "grad_norm": 5.18645715713501, | |
| "learning_rate": 1.0582010582010582e-05, | |
| "loss": 0.0663, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 40.714285714285715, | |
| "grad_norm": 9.29127311706543, | |
| "learning_rate": 1.0317460317460318e-05, | |
| "loss": 0.0451, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 40.95238095238095, | |
| "grad_norm": 1.8370898962020874, | |
| "learning_rate": 1.0052910052910053e-05, | |
| "loss": 0.0429, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 41.0, | |
| "eval_accuracy": 0.981574539363484, | |
| "eval_loss": 0.06439382582902908, | |
| "eval_runtime": 100.3354, | |
| "eval_samples_per_second": 5.95, | |
| "eval_steps_per_second": 0.189, | |
| "step": 1722 | |
| }, | |
| { | |
| "epoch": 41.19047619047619, | |
| "grad_norm": 3.81706166267395, | |
| "learning_rate": 9.788359788359789e-06, | |
| "loss": 0.045, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 41.42857142857143, | |
| "grad_norm": 4.299212455749512, | |
| "learning_rate": 9.523809523809523e-06, | |
| "loss": 0.0521, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 41.666666666666664, | |
| "grad_norm": 2.0019755363464355, | |
| "learning_rate": 9.259259259259259e-06, | |
| "loss": 0.0483, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 41.904761904761905, | |
| "grad_norm": 3.840075731277466, | |
| "learning_rate": 8.994708994708995e-06, | |
| "loss": 0.0705, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 42.0, | |
| "eval_accuracy": 0.9765494137353434, | |
| "eval_loss": 0.07467743009328842, | |
| "eval_runtime": 98.8969, | |
| "eval_samples_per_second": 6.037, | |
| "eval_steps_per_second": 0.192, | |
| "step": 1764 | |
| }, | |
| { | |
| "epoch": 42.142857142857146, | |
| "grad_norm": 4.549776077270508, | |
| "learning_rate": 8.73015873015873e-06, | |
| "loss": 0.0471, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 42.38095238095238, | |
| "grad_norm": 6.040684223175049, | |
| "learning_rate": 8.465608465608466e-06, | |
| "loss": 0.0279, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 42.61904761904762, | |
| "grad_norm": 5.306166172027588, | |
| "learning_rate": 8.201058201058202e-06, | |
| "loss": 0.0586, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 42.857142857142854, | |
| "grad_norm": 3.0557098388671875, | |
| "learning_rate": 7.936507936507936e-06, | |
| "loss": 0.0325, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 43.0, | |
| "eval_accuracy": 0.981574539363484, | |
| "eval_loss": 0.06849457323551178, | |
| "eval_runtime": 100.3214, | |
| "eval_samples_per_second": 5.951, | |
| "eval_steps_per_second": 0.189, | |
| "step": 1806 | |
| }, | |
| { | |
| "epoch": 43.095238095238095, | |
| "grad_norm": 2.943039894104004, | |
| "learning_rate": 7.671957671957672e-06, | |
| "loss": 0.0319, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 43.333333333333336, | |
| "grad_norm": 4.270468235015869, | |
| "learning_rate": 7.4074074074074075e-06, | |
| "loss": 0.0633, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 43.57142857142857, | |
| "grad_norm": 4.981551170349121, | |
| "learning_rate": 7.142857142857143e-06, | |
| "loss": 0.0504, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 43.80952380952381, | |
| "grad_norm": 3.0280566215515137, | |
| "learning_rate": 6.878306878306878e-06, | |
| "loss": 0.0446, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 44.0, | |
| "eval_accuracy": 0.9782244556113903, | |
| "eval_loss": 0.06833869963884354, | |
| "eval_runtime": 99.8026, | |
| "eval_samples_per_second": 5.982, | |
| "eval_steps_per_second": 0.19, | |
| "step": 1848 | |
| }, | |
| { | |
| "epoch": 44.04761904761905, | |
| "grad_norm": 1.9423714876174927, | |
| "learning_rate": 6.613756613756614e-06, | |
| "loss": 0.046, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 44.285714285714285, | |
| "grad_norm": 1.1068910360336304, | |
| "learning_rate": 6.349206349206349e-06, | |
| "loss": 0.028, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 44.523809523809526, | |
| "grad_norm": 6.38804817199707, | |
| "learning_rate": 6.0846560846560845e-06, | |
| "loss": 0.0422, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 44.76190476190476, | |
| "grad_norm": 4.064125061035156, | |
| "learning_rate": 5.82010582010582e-06, | |
| "loss": 0.0464, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 45.0, | |
| "grad_norm": 3.527822494506836, | |
| "learning_rate": 5.555555555555556e-06, | |
| "loss": 0.0439, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 45.0, | |
| "eval_accuracy": 0.9748743718592965, | |
| "eval_loss": 0.07074515521526337, | |
| "eval_runtime": 98.623, | |
| "eval_samples_per_second": 6.053, | |
| "eval_steps_per_second": 0.193, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 45.23809523809524, | |
| "grad_norm": 2.8777387142181396, | |
| "learning_rate": 5.291005291005291e-06, | |
| "loss": 0.0258, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 45.476190476190474, | |
| "grad_norm": 5.76155424118042, | |
| "learning_rate": 5.026455026455026e-06, | |
| "loss": 0.0505, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 45.714285714285715, | |
| "grad_norm": 3.660327196121216, | |
| "learning_rate": 4.7619047619047615e-06, | |
| "loss": 0.0439, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 45.95238095238095, | |
| "grad_norm": 1.7132253646850586, | |
| "learning_rate": 4.497354497354498e-06, | |
| "loss": 0.0346, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 46.0, | |
| "eval_accuracy": 0.9782244556113903, | |
| "eval_loss": 0.0642278715968132, | |
| "eval_runtime": 100.3418, | |
| "eval_samples_per_second": 5.95, | |
| "eval_steps_per_second": 0.189, | |
| "step": 1932 | |
| }, | |
| { | |
| "epoch": 46.19047619047619, | |
| "grad_norm": 5.524164199829102, | |
| "learning_rate": 4.232804232804233e-06, | |
| "loss": 0.0485, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 46.42857142857143, | |
| "grad_norm": 1.0166205167770386, | |
| "learning_rate": 3.968253968253968e-06, | |
| "loss": 0.0399, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 46.666666666666664, | |
| "grad_norm": 5.7505388259887695, | |
| "learning_rate": 3.7037037037037037e-06, | |
| "loss": 0.026, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 46.904761904761905, | |
| "grad_norm": 7.123528003692627, | |
| "learning_rate": 3.439153439153439e-06, | |
| "loss": 0.0504, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 47.0, | |
| "eval_accuracy": 0.9798994974874372, | |
| "eval_loss": 0.06543267518281937, | |
| "eval_runtime": 100.7676, | |
| "eval_samples_per_second": 5.925, | |
| "eval_steps_per_second": 0.189, | |
| "step": 1974 | |
| }, | |
| { | |
| "epoch": 47.142857142857146, | |
| "grad_norm": 1.3328098058700562, | |
| "learning_rate": 3.1746031746031746e-06, | |
| "loss": 0.0332, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 47.38095238095238, | |
| "grad_norm": 7.107743263244629, | |
| "learning_rate": 2.91005291005291e-06, | |
| "loss": 0.0488, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 47.61904761904762, | |
| "grad_norm": 6.963500499725342, | |
| "learning_rate": 2.6455026455026455e-06, | |
| "loss": 0.0459, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 47.857142857142854, | |
| "grad_norm": 1.7525726556777954, | |
| "learning_rate": 2.3809523809523808e-06, | |
| "loss": 0.0379, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 48.0, | |
| "eval_accuracy": 0.9765494137353434, | |
| "eval_loss": 0.0651329830288887, | |
| "eval_runtime": 99.5773, | |
| "eval_samples_per_second": 5.995, | |
| "eval_steps_per_second": 0.191, | |
| "step": 2016 | |
| }, | |
| { | |
| "epoch": 48.095238095238095, | |
| "grad_norm": 4.4532365798950195, | |
| "learning_rate": 2.1164021164021164e-06, | |
| "loss": 0.0472, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 48.333333333333336, | |
| "grad_norm": 0.9908486604690552, | |
| "learning_rate": 1.8518518518518519e-06, | |
| "loss": 0.039, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 48.57142857142857, | |
| "grad_norm": 5.510822772979736, | |
| "learning_rate": 1.5873015873015873e-06, | |
| "loss": 0.0414, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 48.80952380952381, | |
| "grad_norm": 2.4685218334198, | |
| "learning_rate": 1.3227513227513228e-06, | |
| "loss": 0.0433, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 49.0, | |
| "eval_accuracy": 0.9765494137353434, | |
| "eval_loss": 0.06536927819252014, | |
| "eval_runtime": 101.1448, | |
| "eval_samples_per_second": 5.902, | |
| "eval_steps_per_second": 0.188, | |
| "step": 2058 | |
| }, | |
| { | |
| "epoch": 49.04761904761905, | |
| "grad_norm": 1.254361867904663, | |
| "learning_rate": 1.0582010582010582e-06, | |
| "loss": 0.0567, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 49.285714285714285, | |
| "grad_norm": 5.534576892852783, | |
| "learning_rate": 7.936507936507937e-07, | |
| "loss": 0.0567, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 49.523809523809526, | |
| "grad_norm": 2.032499313354492, | |
| "learning_rate": 5.291005291005291e-07, | |
| "loss": 0.0259, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 49.76190476190476, | |
| "grad_norm": 4.391397476196289, | |
| "learning_rate": 2.6455026455026455e-07, | |
| "loss": 0.0376, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 50.0, | |
| "grad_norm": 2.142780303955078, | |
| "learning_rate": 0.0, | |
| "loss": 0.0337, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 50.0, | |
| "eval_accuracy": 0.9765494137353434, | |
| "eval_loss": 0.06553396582603455, | |
| "eval_runtime": 102.5062, | |
| "eval_samples_per_second": 5.824, | |
| "eval_steps_per_second": 0.185, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 50.0, | |
| "step": 2100, | |
| "total_flos": 6.745887435132518e+18, | |
| "train_loss": 0.15604650461957567, | |
| "train_runtime": 46928.7528, | |
| "train_samples_per_second": 5.72, | |
| "train_steps_per_second": 0.045 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 2100, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 50, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 6.745887435132518e+18, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |