| { | |
| "best_metric": 0.926367461430575, | |
| "best_model_checkpoint": "swin-tiny-patch4-window7-224-classification/checkpoint-1403", | |
| "epoch": 14.962593516209477, | |
| "eval_steps": 500, | |
| "global_step": 1500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 3.234764814376831, | |
| "learning_rate": 6.666666666666667e-06, | |
| "loss": 0.1047, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 4.926308631896973, | |
| "learning_rate": 1.3333333333333333e-05, | |
| "loss": 0.1257, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 4.245235919952393, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1047, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 4.806109428405762, | |
| "learning_rate": 2.6666666666666667e-05, | |
| "loss": 0.1283, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 4.090646743774414, | |
| "learning_rate": 3.3333333333333335e-05, | |
| "loss": 0.1193, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 3.47908353805542, | |
| "learning_rate": 4e-05, | |
| "loss": 0.1233, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 3.029428720474243, | |
| "learning_rate": 4.666666666666667e-05, | |
| "loss": 0.1364, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 5.012099742889404, | |
| "learning_rate": 5.333333333333333e-05, | |
| "loss": 0.1371, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 4.080081462860107, | |
| "learning_rate": 6e-05, | |
| "loss": 0.1219, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 4.989726543426514, | |
| "learning_rate": 6.666666666666667e-05, | |
| "loss": 0.1469, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_accuracy": 0.9126928471248247, | |
| "eval_loss": 0.302682101726532, | |
| "eval_runtime": 38.9184, | |
| "eval_samples_per_second": 73.282, | |
| "eval_steps_per_second": 1.156, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 4.7255024909973145, | |
| "learning_rate": 7.333333333333333e-05, | |
| "loss": 0.1329, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 7.446033477783203, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1485, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 7.688333988189697, | |
| "learning_rate": 8.666666666666667e-05, | |
| "loss": 0.1558, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 6.560573577880859, | |
| "learning_rate": 9.333333333333334e-05, | |
| "loss": 0.1516, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 4.632976055145264, | |
| "learning_rate": 0.0001, | |
| "loss": 0.153, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 6.091333389282227, | |
| "learning_rate": 9.925925925925926e-05, | |
| "loss": 0.1659, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 4.933795928955078, | |
| "learning_rate": 9.851851851851852e-05, | |
| "loss": 0.1837, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 7.728570938110352, | |
| "learning_rate": 9.777777777777778e-05, | |
| "loss": 0.1817, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 5.771063327789307, | |
| "learning_rate": 9.703703703703704e-05, | |
| "loss": 0.1667, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 4.893702030181885, | |
| "learning_rate": 9.62962962962963e-05, | |
| "loss": 0.1677, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_accuracy": 0.9000701262272089, | |
| "eval_loss": 0.335124135017395, | |
| "eval_runtime": 39.1392, | |
| "eval_samples_per_second": 72.868, | |
| "eval_steps_per_second": 1.15, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 2.09, | |
| "grad_norm": 5.826776027679443, | |
| "learning_rate": 9.555555555555557e-05, | |
| "loss": 0.1539, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 3.6135647296905518, | |
| "learning_rate": 9.481481481481483e-05, | |
| "loss": 0.144, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 4.992943286895752, | |
| "learning_rate": 9.407407407407408e-05, | |
| "loss": 0.1626, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 5.117826461791992, | |
| "learning_rate": 9.333333333333334e-05, | |
| "loss": 0.1459, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 2.49, | |
| "grad_norm": 5.909987926483154, | |
| "learning_rate": 9.25925925925926e-05, | |
| "loss": 0.1577, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 7.713306903839111, | |
| "learning_rate": 9.185185185185186e-05, | |
| "loss": 0.1829, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 5.385562419891357, | |
| "learning_rate": 9.111111111111112e-05, | |
| "loss": 0.1298, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 5.207605838775635, | |
| "learning_rate": 9.037037037037038e-05, | |
| "loss": 0.1794, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 2.89, | |
| "grad_norm": 5.401705265045166, | |
| "learning_rate": 8.962962962962963e-05, | |
| "loss": 0.1632, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 2.99, | |
| "grad_norm": 4.565079212188721, | |
| "learning_rate": 8.888888888888889e-05, | |
| "loss": 0.167, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 2.99, | |
| "eval_accuracy": 0.8930575035063114, | |
| "eval_loss": 0.3874761760234833, | |
| "eval_runtime": 39.183, | |
| "eval_samples_per_second": 72.787, | |
| "eval_steps_per_second": 1.148, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 3.09, | |
| "grad_norm": 6.44505500793457, | |
| "learning_rate": 8.814814814814815e-05, | |
| "loss": 0.1483, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 3.19, | |
| "grad_norm": 7.901595592498779, | |
| "learning_rate": 8.740740740740741e-05, | |
| "loss": 0.1422, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 3.29, | |
| "grad_norm": 6.531421184539795, | |
| "learning_rate": 8.666666666666667e-05, | |
| "loss": 0.145, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 3.39, | |
| "grad_norm": 3.7486908435821533, | |
| "learning_rate": 8.592592592592593e-05, | |
| "loss": 0.1519, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 3.49, | |
| "grad_norm": 4.0806403160095215, | |
| "learning_rate": 8.518518518518518e-05, | |
| "loss": 0.1519, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 3.59, | |
| "grad_norm": 4.170175552368164, | |
| "learning_rate": 8.444444444444444e-05, | |
| "loss": 0.1287, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 3.69, | |
| "grad_norm": 3.4781312942504883, | |
| "learning_rate": 8.37037037037037e-05, | |
| "loss": 0.1338, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 3.79, | |
| "grad_norm": 5.322227478027344, | |
| "learning_rate": 8.296296296296296e-05, | |
| "loss": 0.1653, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 3.89, | |
| "grad_norm": 6.252974510192871, | |
| "learning_rate": 8.222222222222222e-05, | |
| "loss": 0.1527, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 3.99, | |
| "grad_norm": 3.7561535835266113, | |
| "learning_rate": 8.148148148148148e-05, | |
| "loss": 0.1556, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_accuracy": 0.896914446002805, | |
| "eval_loss": 0.3814311921596527, | |
| "eval_runtime": 39.4669, | |
| "eval_samples_per_second": 72.263, | |
| "eval_steps_per_second": 1.14, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 4.09, | |
| "grad_norm": 5.396999359130859, | |
| "learning_rate": 8.074074074074075e-05, | |
| "loss": 0.1274, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 4.19, | |
| "grad_norm": 4.547370910644531, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1089, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 4.29, | |
| "grad_norm": 6.839280605316162, | |
| "learning_rate": 7.925925925925926e-05, | |
| "loss": 0.1402, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 4.39, | |
| "grad_norm": 8.286632537841797, | |
| "learning_rate": 7.851851851851852e-05, | |
| "loss": 0.1313, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 4.49, | |
| "grad_norm": 3.8082900047302246, | |
| "learning_rate": 7.777777777777778e-05, | |
| "loss": 0.1231, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 4.59, | |
| "grad_norm": 6.801741123199463, | |
| "learning_rate": 7.703703703703704e-05, | |
| "loss": 0.1387, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 4.69, | |
| "grad_norm": 4.6010823249816895, | |
| "learning_rate": 7.62962962962963e-05, | |
| "loss": 0.1273, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 4.79, | |
| "grad_norm": 4.252594947814941, | |
| "learning_rate": 7.555555555555556e-05, | |
| "loss": 0.1276, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 4.89, | |
| "grad_norm": 4.800199031829834, | |
| "learning_rate": 7.481481481481481e-05, | |
| "loss": 0.1282, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 4.99, | |
| "grad_norm": 4.848538875579834, | |
| "learning_rate": 7.407407407407407e-05, | |
| "loss": 0.1328, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_accuracy": 0.9046283309957924, | |
| "eval_loss": 0.32806462049484253, | |
| "eval_runtime": 39.0575, | |
| "eval_samples_per_second": 73.02, | |
| "eval_steps_per_second": 1.152, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 5.09, | |
| "grad_norm": 5.300040245056152, | |
| "learning_rate": 7.333333333333333e-05, | |
| "loss": 0.1063, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 5.19, | |
| "grad_norm": 4.112715721130371, | |
| "learning_rate": 7.25925925925926e-05, | |
| "loss": 0.1425, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 5.29, | |
| "grad_norm": 4.367955684661865, | |
| "learning_rate": 7.185185185185186e-05, | |
| "loss": 0.1198, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 5.39, | |
| "grad_norm": 3.767223834991455, | |
| "learning_rate": 7.111111111111112e-05, | |
| "loss": 0.1199, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 5.49, | |
| "grad_norm": 4.640177249908447, | |
| "learning_rate": 7.037037037037038e-05, | |
| "loss": 0.1175, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 5.59, | |
| "grad_norm": 4.267022609710693, | |
| "learning_rate": 6.962962962962964e-05, | |
| "loss": 0.1312, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 5.69, | |
| "grad_norm": 3.086017608642578, | |
| "learning_rate": 6.88888888888889e-05, | |
| "loss": 0.1055, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 5.79, | |
| "grad_norm": 6.53023099899292, | |
| "learning_rate": 6.814814814814815e-05, | |
| "loss": 0.126, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 5.89, | |
| "grad_norm": 3.820864677429199, | |
| "learning_rate": 6.740740740740741e-05, | |
| "loss": 0.1197, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 5.99, | |
| "grad_norm": 3.039929151535034, | |
| "learning_rate": 6.666666666666667e-05, | |
| "loss": 0.1, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_accuracy": 0.9004207573632539, | |
| "eval_loss": 0.3726123869419098, | |
| "eval_runtime": 39.249, | |
| "eval_samples_per_second": 72.664, | |
| "eval_steps_per_second": 1.147, | |
| "step": 601 | |
| }, | |
| { | |
| "epoch": 6.08, | |
| "grad_norm": 5.862295150756836, | |
| "learning_rate": 6.592592592592593e-05, | |
| "loss": 0.1248, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 6.18, | |
| "grad_norm": 3.471050977706909, | |
| "learning_rate": 6.51851851851852e-05, | |
| "loss": 0.1107, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 6.28, | |
| "grad_norm": 3.645636558532715, | |
| "learning_rate": 6.444444444444446e-05, | |
| "loss": 0.0999, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 6.38, | |
| "grad_norm": 4.186192512512207, | |
| "learning_rate": 6.37037037037037e-05, | |
| "loss": 0.1281, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 6.48, | |
| "grad_norm": 5.014218807220459, | |
| "learning_rate": 6.296296296296296e-05, | |
| "loss": 0.115, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 6.58, | |
| "grad_norm": 4.402308464050293, | |
| "learning_rate": 6.222222222222222e-05, | |
| "loss": 0.1246, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 6.68, | |
| "grad_norm": 4.047673225402832, | |
| "learning_rate": 6.148148148148148e-05, | |
| "loss": 0.1128, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 6.78, | |
| "grad_norm": 5.06707763671875, | |
| "learning_rate": 6.074074074074074e-05, | |
| "loss": 0.1315, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 6.88, | |
| "grad_norm": 3.584838628768921, | |
| "learning_rate": 6e-05, | |
| "loss": 0.1052, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 6.98, | |
| "grad_norm": 4.084716320037842, | |
| "learning_rate": 5.925925925925926e-05, | |
| "loss": 0.1188, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 6.99, | |
| "eval_accuracy": 0.9046283309957924, | |
| "eval_loss": 0.373639851808548, | |
| "eval_runtime": 39.2003, | |
| "eval_samples_per_second": 72.754, | |
| "eval_steps_per_second": 1.148, | |
| "step": 701 | |
| }, | |
| { | |
| "epoch": 7.08, | |
| "grad_norm": 5.199974536895752, | |
| "learning_rate": 5.851851851851852e-05, | |
| "loss": 0.1068, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 7.18, | |
| "grad_norm": 3.842390775680542, | |
| "learning_rate": 5.7777777777777776e-05, | |
| "loss": 0.1011, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 7.28, | |
| "grad_norm": 4.7214837074279785, | |
| "learning_rate": 5.703703703703704e-05, | |
| "loss": 0.1045, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 7.38, | |
| "grad_norm": 4.736299514770508, | |
| "learning_rate": 5.62962962962963e-05, | |
| "loss": 0.1071, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 7.48, | |
| "grad_norm": 5.003107070922852, | |
| "learning_rate": 5.555555555555556e-05, | |
| "loss": 0.0928, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 7.58, | |
| "grad_norm": 5.801096439361572, | |
| "learning_rate": 5.4814814814814817e-05, | |
| "loss": 0.118, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 7.68, | |
| "grad_norm": 5.874922752380371, | |
| "learning_rate": 5.4074074074074075e-05, | |
| "loss": 0.108, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 7.78, | |
| "grad_norm": 3.476767063140869, | |
| "learning_rate": 5.333333333333333e-05, | |
| "loss": 0.1034, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 7.88, | |
| "grad_norm": 4.573951244354248, | |
| "learning_rate": 5.259259259259259e-05, | |
| "loss": 0.121, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 7.98, | |
| "grad_norm": 4.446974754333496, | |
| "learning_rate": 5.185185185185185e-05, | |
| "loss": 0.1257, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_accuracy": 0.9102384291725105, | |
| "eval_loss": 0.33806487917900085, | |
| "eval_runtime": 39.6445, | |
| "eval_samples_per_second": 71.939, | |
| "eval_steps_per_second": 1.135, | |
| "step": 802 | |
| }, | |
| { | |
| "epoch": 8.08, | |
| "grad_norm": 3.8721048831939697, | |
| "learning_rate": 5.111111111111111e-05, | |
| "loss": 0.1036, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 8.18, | |
| "grad_norm": 4.111603736877441, | |
| "learning_rate": 5.0370370370370366e-05, | |
| "loss": 0.1031, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 8.28, | |
| "grad_norm": 5.5718674659729, | |
| "learning_rate": 4.962962962962963e-05, | |
| "loss": 0.1057, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 8.38, | |
| "grad_norm": 5.053532123565674, | |
| "learning_rate": 4.888888888888889e-05, | |
| "loss": 0.1038, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 8.48, | |
| "grad_norm": 5.147861480712891, | |
| "learning_rate": 4.814814814814815e-05, | |
| "loss": 0.1089, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 8.58, | |
| "grad_norm": 3.2430307865142822, | |
| "learning_rate": 4.740740740740741e-05, | |
| "loss": 0.0912, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 8.68, | |
| "grad_norm": 4.054327011108398, | |
| "learning_rate": 4.666666666666667e-05, | |
| "loss": 0.0956, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 8.78, | |
| "grad_norm": 4.733587741851807, | |
| "learning_rate": 4.592592592592593e-05, | |
| "loss": 0.1178, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 8.88, | |
| "grad_norm": 3.2026824951171875, | |
| "learning_rate": 4.518518518518519e-05, | |
| "loss": 0.0937, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 8.98, | |
| "grad_norm": 4.621606349945068, | |
| "learning_rate": 4.4444444444444447e-05, | |
| "loss": 0.1017, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_accuracy": 0.9214586255259467, | |
| "eval_loss": 0.2871844172477722, | |
| "eval_runtime": 39.1384, | |
| "eval_samples_per_second": 72.87, | |
| "eval_steps_per_second": 1.15, | |
| "step": 902 | |
| }, | |
| { | |
| "epoch": 9.08, | |
| "grad_norm": 4.2190141677856445, | |
| "learning_rate": 4.3703703703703705e-05, | |
| "loss": 0.0941, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 9.18, | |
| "grad_norm": 4.141015529632568, | |
| "learning_rate": 4.296296296296296e-05, | |
| "loss": 0.1028, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 9.28, | |
| "grad_norm": 2.8188929557800293, | |
| "learning_rate": 4.222222222222222e-05, | |
| "loss": 0.0947, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 9.38, | |
| "grad_norm": 4.885405540466309, | |
| "learning_rate": 4.148148148148148e-05, | |
| "loss": 0.0984, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 9.48, | |
| "grad_norm": 3.4018211364746094, | |
| "learning_rate": 4.074074074074074e-05, | |
| "loss": 0.1083, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 9.58, | |
| "grad_norm": 5.3954548835754395, | |
| "learning_rate": 4e-05, | |
| "loss": 0.0934, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 9.68, | |
| "grad_norm": 3.0029079914093018, | |
| "learning_rate": 3.925925925925926e-05, | |
| "loss": 0.1042, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 9.78, | |
| "grad_norm": 3.6331303119659424, | |
| "learning_rate": 3.851851851851852e-05, | |
| "loss": 0.1075, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 9.88, | |
| "grad_norm": 4.058956623077393, | |
| "learning_rate": 3.777777777777778e-05, | |
| "loss": 0.0988, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 9.98, | |
| "grad_norm": 4.427227020263672, | |
| "learning_rate": 3.7037037037037037e-05, | |
| "loss": 0.0987, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_accuracy": 0.917601683029453, | |
| "eval_loss": 0.30670177936553955, | |
| "eval_runtime": 39.1776, | |
| "eval_samples_per_second": 72.797, | |
| "eval_steps_per_second": 1.149, | |
| "step": 1002 | |
| }, | |
| { | |
| "epoch": 10.07, | |
| "grad_norm": 3.0564310550689697, | |
| "learning_rate": 3.62962962962963e-05, | |
| "loss": 0.099, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 10.17, | |
| "grad_norm": 4.081722736358643, | |
| "learning_rate": 3.555555555555556e-05, | |
| "loss": 0.1069, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 10.27, | |
| "grad_norm": 5.021610260009766, | |
| "learning_rate": 3.481481481481482e-05, | |
| "loss": 0.0963, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 10.37, | |
| "grad_norm": 3.7664260864257812, | |
| "learning_rate": 3.4074074074074077e-05, | |
| "loss": 0.0857, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 10.47, | |
| "grad_norm": 4.532868385314941, | |
| "learning_rate": 3.3333333333333335e-05, | |
| "loss": 0.0945, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 10.57, | |
| "grad_norm": 3.1503536701202393, | |
| "learning_rate": 3.25925925925926e-05, | |
| "loss": 0.1018, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 10.67, | |
| "grad_norm": 4.589663982391357, | |
| "learning_rate": 3.185185185185185e-05, | |
| "loss": 0.0924, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 10.77, | |
| "grad_norm": 2.68914794921875, | |
| "learning_rate": 3.111111111111111e-05, | |
| "loss": 0.0945, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 10.87, | |
| "grad_norm": 3.398358106613159, | |
| "learning_rate": 3.037037037037037e-05, | |
| "loss": 0.0877, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 10.97, | |
| "grad_norm": 4.018301010131836, | |
| "learning_rate": 2.962962962962963e-05, | |
| "loss": 0.0874, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 10.99, | |
| "eval_accuracy": 0.9165497896213184, | |
| "eval_loss": 0.29185158014297485, | |
| "eval_runtime": 39.4428, | |
| "eval_samples_per_second": 72.307, | |
| "eval_steps_per_second": 1.141, | |
| "step": 1102 | |
| }, | |
| { | |
| "epoch": 11.07, | |
| "grad_norm": 4.864216327667236, | |
| "learning_rate": 2.8888888888888888e-05, | |
| "loss": 0.0979, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 11.17, | |
| "grad_norm": 5.510034084320068, | |
| "learning_rate": 2.814814814814815e-05, | |
| "loss": 0.0857, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 11.27, | |
| "grad_norm": 6.076953887939453, | |
| "learning_rate": 2.7407407407407408e-05, | |
| "loss": 0.099, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 11.37, | |
| "grad_norm": 3.9851274490356445, | |
| "learning_rate": 2.6666666666666667e-05, | |
| "loss": 0.0967, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 11.47, | |
| "grad_norm": 4.604448318481445, | |
| "learning_rate": 2.5925925925925925e-05, | |
| "loss": 0.0975, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 11.57, | |
| "grad_norm": 1.658200740814209, | |
| "learning_rate": 2.5185185185185183e-05, | |
| "loss": 0.0849, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 11.67, | |
| "grad_norm": 5.351607322692871, | |
| "learning_rate": 2.4444444444444445e-05, | |
| "loss": 0.0979, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 11.77, | |
| "grad_norm": 4.404117584228516, | |
| "learning_rate": 2.3703703703703707e-05, | |
| "loss": 0.0953, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 11.87, | |
| "grad_norm": 5.584921836853027, | |
| "learning_rate": 2.2962962962962965e-05, | |
| "loss": 0.0803, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 11.97, | |
| "grad_norm": 11.780359268188477, | |
| "learning_rate": 2.2222222222222223e-05, | |
| "loss": 0.0901, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_accuracy": 0.9228611500701263, | |
| "eval_loss": 0.2942492961883545, | |
| "eval_runtime": 39.3265, | |
| "eval_samples_per_second": 72.521, | |
| "eval_steps_per_second": 1.144, | |
| "step": 1203 | |
| }, | |
| { | |
| "epoch": 12.07, | |
| "grad_norm": 3.900620460510254, | |
| "learning_rate": 2.148148148148148e-05, | |
| "loss": 0.09, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 12.17, | |
| "grad_norm": 3.0629093647003174, | |
| "learning_rate": 2.074074074074074e-05, | |
| "loss": 0.0769, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 12.27, | |
| "grad_norm": 3.7691752910614014, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0925, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 12.37, | |
| "grad_norm": 3.8815600872039795, | |
| "learning_rate": 1.925925925925926e-05, | |
| "loss": 0.0736, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 12.47, | |
| "grad_norm": 3.6754331588745117, | |
| "learning_rate": 1.8518518518518518e-05, | |
| "loss": 0.0826, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 12.57, | |
| "grad_norm": 4.773692607879639, | |
| "learning_rate": 1.777777777777778e-05, | |
| "loss": 0.0821, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 12.67, | |
| "grad_norm": 4.411365985870361, | |
| "learning_rate": 1.7037037037037038e-05, | |
| "loss": 0.0754, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 12.77, | |
| "grad_norm": 3.2157444953918457, | |
| "learning_rate": 1.62962962962963e-05, | |
| "loss": 0.0736, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 12.87, | |
| "grad_norm": 3.8522512912750244, | |
| "learning_rate": 1.5555555555555555e-05, | |
| "loss": 0.0806, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 12.97, | |
| "grad_norm": 3.8069818019866943, | |
| "learning_rate": 1.4814814814814815e-05, | |
| "loss": 0.0831, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "eval_accuracy": 0.9232117812061711, | |
| "eval_loss": 0.29743239283561707, | |
| "eval_runtime": 39.4899, | |
| "eval_samples_per_second": 72.221, | |
| "eval_steps_per_second": 1.14, | |
| "step": 1303 | |
| }, | |
| { | |
| "epoch": 13.07, | |
| "grad_norm": 2.675555467605591, | |
| "learning_rate": 1.4074074074074075e-05, | |
| "loss": 0.0687, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 13.17, | |
| "grad_norm": 4.165126800537109, | |
| "learning_rate": 1.3333333333333333e-05, | |
| "loss": 0.0694, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 13.27, | |
| "grad_norm": 3.973789930343628, | |
| "learning_rate": 1.2592592592592592e-05, | |
| "loss": 0.0831, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 13.37, | |
| "grad_norm": 3.488872528076172, | |
| "learning_rate": 1.1851851851851853e-05, | |
| "loss": 0.0761, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 13.47, | |
| "grad_norm": 3.375296115875244, | |
| "learning_rate": 1.1111111111111112e-05, | |
| "loss": 0.0607, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 13.57, | |
| "grad_norm": 1.8006842136383057, | |
| "learning_rate": 1.037037037037037e-05, | |
| "loss": 0.0681, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 13.67, | |
| "grad_norm": 3.174525260925293, | |
| "learning_rate": 9.62962962962963e-06, | |
| "loss": 0.0622, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 13.77, | |
| "grad_norm": 3.6712663173675537, | |
| "learning_rate": 8.88888888888889e-06, | |
| "loss": 0.082, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 13.87, | |
| "grad_norm": 2.8986759185791016, | |
| "learning_rate": 8.14814814814815e-06, | |
| "loss": 0.0879, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 13.97, | |
| "grad_norm": 4.151699066162109, | |
| "learning_rate": 7.4074074074074075e-06, | |
| "loss": 0.0838, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "eval_accuracy": 0.926367461430575, | |
| "eval_loss": 0.27871325612068176, | |
| "eval_runtime": 39.5001, | |
| "eval_samples_per_second": 72.202, | |
| "eval_steps_per_second": 1.139, | |
| "step": 1403 | |
| }, | |
| { | |
| "epoch": 14.06, | |
| "grad_norm": 4.287971019744873, | |
| "learning_rate": 6.666666666666667e-06, | |
| "loss": 0.0731, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 14.16, | |
| "grad_norm": 2.9228641986846924, | |
| "learning_rate": 5.925925925925927e-06, | |
| "loss": 0.0659, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 14.26, | |
| "grad_norm": 1.4059169292449951, | |
| "learning_rate": 5.185185185185185e-06, | |
| "loss": 0.0654, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 14.36, | |
| "grad_norm": 4.815260887145996, | |
| "learning_rate": 4.444444444444445e-06, | |
| "loss": 0.081, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 14.46, | |
| "grad_norm": 2.322667121887207, | |
| "learning_rate": 3.7037037037037037e-06, | |
| "loss": 0.0689, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 14.56, | |
| "grad_norm": 4.125411033630371, | |
| "learning_rate": 2.9629629629629633e-06, | |
| "loss": 0.0716, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 14.66, | |
| "grad_norm": 3.8639822006225586, | |
| "learning_rate": 2.2222222222222225e-06, | |
| "loss": 0.072, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 14.76, | |
| "grad_norm": 1.4972394704818726, | |
| "learning_rate": 1.4814814814814817e-06, | |
| "loss": 0.0728, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 14.86, | |
| "grad_norm": 1.8535915613174438, | |
| "learning_rate": 7.407407407407408e-07, | |
| "loss": 0.077, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 14.96, | |
| "grad_norm": 4.621140003204346, | |
| "learning_rate": 0.0, | |
| "loss": 0.0603, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 14.96, | |
| "eval_accuracy": 0.926367461430575, | |
| "eval_loss": 0.2779853940010071, | |
| "eval_runtime": 39.6501, | |
| "eval_samples_per_second": 71.929, | |
| "eval_steps_per_second": 1.135, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 14.96, | |
| "step": 1500, | |
| "total_flos": 9.546299981955072e+18, | |
| "train_loss": 0.11206140112876892, | |
| "train_runtime": 7336.5108, | |
| "train_samples_per_second": 52.472, | |
| "train_steps_per_second": 0.204 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 15, | |
| "save_steps": 500, | |
| "total_flos": 9.546299981955072e+18, | |
| "train_batch_size": 64, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |