{
  "best_metric": 0.9142091152815014,
  "best_model_checkpoint": "pokemon_models\\checkpoint-1610",
  "epoch": 23.0,
  "eval_steps": 500,
  "global_step": 1610,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.29,
      "learning_rate": 5e-06,
      "loss": 5.0145,
      "step": 20
    },
    {
      "epoch": 0.57,
      "learning_rate": 1e-05,
      "loss": 5.0039,
      "step": 40
    },
    {
      "epoch": 0.86,
      "learning_rate": 1.5e-05,
      "loss": 4.9942,
      "step": 60
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.01876675603217158,
      "eval_loss": 4.973499298095703,
      "eval_runtime": 102.0829,
      "eval_samples_per_second": 10.962,
      "eval_steps_per_second": 0.686,
      "step": 70
    },
    {
      "epoch": 1.14,
      "learning_rate": 2e-05,
      "loss": 4.97,
      "step": 80
    },
    {
      "epoch": 1.43,
      "learning_rate": 2.5e-05,
      "loss": 4.9313,
      "step": 100
    },
    {
      "epoch": 1.71,
      "learning_rate": 3e-05,
      "loss": 4.893,
      "step": 120
    },
    {
      "epoch": 2.0,
      "learning_rate": 3.5e-05,
      "loss": 4.8374,
      "step": 140
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.20196604110813227,
      "eval_loss": 4.816006660461426,
      "eval_runtime": 124.2897,
      "eval_samples_per_second": 9.003,
      "eval_steps_per_second": 0.563,
      "step": 140
    },
    {
      "epoch": 2.29,
      "learning_rate": 4e-05,
      "loss": 4.7329,
      "step": 160
    },
    {
      "epoch": 2.57,
      "learning_rate": 4.5e-05,
      "loss": 4.6472,
      "step": 180
    },
    {
      "epoch": 2.86,
      "learning_rate": 5e-05,
      "loss": 4.541,
      "step": 200
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.5495978552278821,
      "eval_loss": 4.4448018074035645,
      "eval_runtime": 101.0357,
      "eval_samples_per_second": 11.075,
      "eval_steps_per_second": 0.693,
      "step": 210
    },
    {
      "epoch": 3.14,
      "learning_rate": 4.9444444444444446e-05,
      "loss": 4.4117,
      "step": 220
    },
    {
      "epoch": 3.43,
      "learning_rate": 4.888888888888889e-05,
      "loss": 4.2454,
      "step": 240
    },
    {
      "epoch": 3.71,
      "learning_rate": 4.8333333333333334e-05,
      "loss": 4.1227,
      "step": 260
    },
    {
      "epoch": 4.0,
      "learning_rate": 4.7777777777777784e-05,
      "loss": 4.0198,
      "step": 280
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.7042001787310098,
      "eval_loss": 4.0061211585998535,
      "eval_runtime": 100.1956,
      "eval_samples_per_second": 11.168,
      "eval_steps_per_second": 0.699,
      "step": 280
    },
    {
      "epoch": 4.29,
      "learning_rate": 4.722222222222222e-05,
      "loss": 3.84,
      "step": 300
    },
    {
      "epoch": 4.57,
      "learning_rate": 4.666666666666667e-05,
      "loss": 3.757,
      "step": 320
    },
    {
      "epoch": 4.86,
      "learning_rate": 4.6111111111111115e-05,
      "loss": 3.6626,
      "step": 340
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.7605004468275246,
      "eval_loss": 3.630556106567383,
      "eval_runtime": 100.0509,
      "eval_samples_per_second": 11.184,
      "eval_steps_per_second": 0.7,
      "step": 350
    },
    {
      "epoch": 5.14,
      "learning_rate": 4.555555555555556e-05,
      "loss": 3.5477,
      "step": 360
    },
    {
      "epoch": 5.43,
      "learning_rate": 4.5e-05,
      "loss": 3.3914,
      "step": 380
    },
    {
      "epoch": 5.71,
      "learning_rate": 4.4444444444444447e-05,
      "loss": 3.3164,
      "step": 400
    },
    {
      "epoch": 6.0,
      "learning_rate": 4.388888888888889e-05,
      "loss": 3.2654,
      "step": 420
    },
    {
      "epoch": 6.0,
      "eval_accuracy": 0.7971403038427167,
      "eval_loss": 3.3061511516571045,
      "eval_runtime": 99.8013,
      "eval_samples_per_second": 11.212,
      "eval_steps_per_second": 0.701,
      "step": 420
    },
    {
      "epoch": 6.29,
      "learning_rate": 4.3333333333333334e-05,
      "loss": 3.1041,
      "step": 440
    },
    {
      "epoch": 6.57,
      "learning_rate": 4.277777777777778e-05,
      "loss": 3.0193,
      "step": 460
    },
    {
      "epoch": 6.86,
      "learning_rate": 4.222222222222222e-05,
      "loss": 2.9314,
      "step": 480
    },
    {
      "epoch": 7.0,
      "eval_accuracy": 0.8310991957104558,
      "eval_loss": 2.994609832763672,
      "eval_runtime": 106.5638,
      "eval_samples_per_second": 10.501,
      "eval_steps_per_second": 0.657,
      "step": 490
    },
    {
      "epoch": 7.14,
      "learning_rate": 4.166666666666667e-05,
      "loss": 2.871,
      "step": 500
    },
    {
      "epoch": 7.43,
      "learning_rate": 4.111111111111111e-05,
      "loss": 2.7418,
      "step": 520
    },
    {
      "epoch": 7.71,
      "learning_rate": 4.055555555555556e-05,
      "loss": 2.6542,
      "step": 540
    },
    {
      "epoch": 8.0,
      "learning_rate": 4e-05,
      "loss": 2.5893,
      "step": 560
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.8507596067917784,
      "eval_loss": 2.7318336963653564,
      "eval_runtime": 125.4233,
      "eval_samples_per_second": 8.922,
      "eval_steps_per_second": 0.558,
      "step": 560
    },
    {
      "epoch": 8.29,
      "learning_rate": 3.944444444444445e-05,
      "loss": 2.5106,
      "step": 580
    },
    {
      "epoch": 8.57,
      "learning_rate": 3.888888888888889e-05,
      "loss": 2.4358,
      "step": 600
    },
    {
      "epoch": 8.86,
      "learning_rate": 3.8333333333333334e-05,
      "loss": 2.3645,
      "step": 620
    },
    {
      "epoch": 9.0,
      "eval_accuracy": 0.8579088471849866,
      "eval_loss": 2.4826338291168213,
      "eval_runtime": 121.4568,
      "eval_samples_per_second": 9.213,
      "eval_steps_per_second": 0.576,
      "step": 630
    },
    {
      "epoch": 9.14,
      "learning_rate": 3.777777777777778e-05,
      "loss": 2.2831,
      "step": 640
    },
    {
      "epoch": 9.43,
      "learning_rate": 3.722222222222222e-05,
      "loss": 2.2297,
      "step": 660
    },
    {
      "epoch": 9.71,
      "learning_rate": 3.6666666666666666e-05,
      "loss": 2.1367,
      "step": 680
    },
    {
      "epoch": 10.0,
      "learning_rate": 3.611111111111111e-05,
      "loss": 2.0793,
      "step": 700
    },
    {
      "epoch": 10.0,
      "eval_accuracy": 0.871313672922252,
      "eval_loss": 2.245124578475952,
      "eval_runtime": 122.6079,
      "eval_samples_per_second": 9.127,
      "eval_steps_per_second": 0.571,
      "step": 700
    },
    {
      "epoch": 10.29,
      "learning_rate": 3.555555555555556e-05,
      "loss": 1.9796,
      "step": 720
    },
    {
      "epoch": 10.57,
      "learning_rate": 3.5e-05,
      "loss": 1.9471,
      "step": 740
    },
    {
      "epoch": 10.86,
      "learning_rate": 3.444444444444445e-05,
      "loss": 1.8754,
      "step": 760
    },
    {
      "epoch": 11.0,
      "eval_accuracy": 0.871313672922252,
      "eval_loss": 2.060222625732422,
      "eval_runtime": 122.2722,
      "eval_samples_per_second": 9.152,
      "eval_steps_per_second": 0.572,
      "step": 770
    },
    {
      "epoch": 11.14,
      "learning_rate": 3.388888888888889e-05,
      "loss": 1.8259,
      "step": 780
    },
    {
      "epoch": 11.43,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 1.7872,
      "step": 800
    },
    {
      "epoch": 11.71,
      "learning_rate": 3.277777777777778e-05,
      "loss": 1.6884,
      "step": 820
    },
    {
      "epoch": 12.0,
      "learning_rate": 3.222222222222223e-05,
      "loss": 1.6703,
      "step": 840
    },
    {
      "epoch": 12.0,
      "eval_accuracy": 0.8811438784629133,
      "eval_loss": 1.872039556503296,
      "eval_runtime": 98.0421,
      "eval_samples_per_second": 11.413,
      "eval_steps_per_second": 0.714,
      "step": 840
    },
    {
      "epoch": 12.29,
      "learning_rate": 3.1666666666666666e-05,
      "loss": 1.6003,
      "step": 860
    },
    {
      "epoch": 12.57,
      "learning_rate": 3.111111111111111e-05,
      "loss": 1.5433,
      "step": 880
    },
    {
      "epoch": 12.86,
      "learning_rate": 3.055555555555556e-05,
      "loss": 1.5198,
      "step": 900
    },
    {
      "epoch": 13.0,
      "eval_accuracy": 0.8900804289544236,
      "eval_loss": 1.7361352443695068,
      "eval_runtime": 97.0673,
      "eval_samples_per_second": 11.528,
      "eval_steps_per_second": 0.721,
      "step": 910
    },
    {
      "epoch": 13.14,
      "learning_rate": 3e-05,
      "loss": 1.4742,
      "step": 920
    },
    {
      "epoch": 13.43,
      "learning_rate": 2.9444444444444448e-05,
      "loss": 1.3876,
      "step": 940
    },
    {
      "epoch": 13.71,
      "learning_rate": 2.8888888888888888e-05,
      "loss": 1.3603,
      "step": 960
    },
    {
      "epoch": 14.0,
      "learning_rate": 2.8333333333333335e-05,
      "loss": 1.329,
      "step": 980
    },
    {
      "epoch": 14.0,
      "eval_accuracy": 0.900804289544236,
      "eval_loss": 1.563855528831482,
      "eval_runtime": 97.4399,
      "eval_samples_per_second": 11.484,
      "eval_steps_per_second": 0.718,
      "step": 980
    },
    {
      "epoch": 14.29,
      "learning_rate": 2.777777777777778e-05,
      "loss": 1.2523,
      "step": 1000
    },
    {
      "epoch": 14.57,
      "learning_rate": 2.7222222222222223e-05,
      "loss": 1.2747,
      "step": 1020
    },
    {
      "epoch": 14.86,
      "learning_rate": 2.6666666666666667e-05,
      "loss": 1.203,
      "step": 1040
    },
    {
      "epoch": 15.0,
      "eval_accuracy": 0.8927613941018767,
      "eval_loss": 1.4685680866241455,
      "eval_runtime": 96.9819,
      "eval_samples_per_second": 11.538,
      "eval_steps_per_second": 0.722,
      "step": 1050
    },
    {
      "epoch": 15.14,
      "learning_rate": 2.6111111111111114e-05,
      "loss": 1.1697,
      "step": 1060
    },
    {
      "epoch": 15.43,
      "learning_rate": 2.5555555555555554e-05,
      "loss": 1.0943,
      "step": 1080
    },
    {
      "epoch": 15.71,
      "learning_rate": 2.5e-05,
      "loss": 1.0947,
      "step": 1100
    },
    {
      "epoch": 16.0,
      "learning_rate": 2.4444444444444445e-05,
      "loss": 1.104,
      "step": 1120
    },
    {
      "epoch": 16.0,
      "eval_accuracy": 0.8981233243967829,
      "eval_loss": 1.3596620559692383,
      "eval_runtime": 97.1177,
      "eval_samples_per_second": 11.522,
      "eval_steps_per_second": 0.721,
      "step": 1120
    },
    {
      "epoch": 16.29,
      "learning_rate": 2.3888888888888892e-05,
      "loss": 1.0113,
      "step": 1140
    },
    {
      "epoch": 16.57,
      "learning_rate": 2.3333333333333336e-05,
      "loss": 1.0285,
      "step": 1160
    },
    {
      "epoch": 16.86,
      "learning_rate": 2.277777777777778e-05,
      "loss": 0.9682,
      "step": 1180
    },
    {
      "epoch": 17.0,
      "eval_accuracy": 0.8990169794459338,
      "eval_loss": 1.2199994325637817,
      "eval_runtime": 486.7671,
      "eval_samples_per_second": 2.299,
      "eval_steps_per_second": 0.144,
      "step": 1190
    },
    {
      "epoch": 17.14,
      "learning_rate": 2.2222222222222223e-05,
      "loss": 0.9578,
      "step": 1200
    },
    {
      "epoch": 17.43,
      "learning_rate": 2.1666666666666667e-05,
      "loss": 0.9403,
      "step": 1220
    },
    {
      "epoch": 17.71,
      "learning_rate": 2.111111111111111e-05,
      "loss": 0.8924,
      "step": 1240
    },
    {
      "epoch": 18.0,
      "learning_rate": 2.0555555555555555e-05,
      "loss": 0.872,
      "step": 1260
    },
    {
      "epoch": 18.0,
      "eval_accuracy": 0.903485254691689,
      "eval_loss": 1.1389293670654297,
      "eval_runtime": 110.8112,
      "eval_samples_per_second": 10.098,
      "eval_steps_per_second": 0.632,
      "step": 1260
    },
    {
      "epoch": 18.29,
      "learning_rate": 2e-05,
      "loss": 0.8312,
      "step": 1280
    },
    {
      "epoch": 18.57,
      "learning_rate": 1.9444444444444445e-05,
      "loss": 0.8201,
      "step": 1300
    },
    {
      "epoch": 18.86,
      "learning_rate": 1.888888888888889e-05,
      "loss": 0.844,
      "step": 1320
    },
    {
      "epoch": 19.0,
      "eval_accuracy": 0.9124218051831993,
      "eval_loss": 1.0643764734268188,
      "eval_runtime": 109.2391,
      "eval_samples_per_second": 10.244,
      "eval_steps_per_second": 0.641,
      "step": 1330
    },
    {
      "epoch": 19.14,
      "learning_rate": 1.8333333333333333e-05,
      "loss": 0.8116,
      "step": 1340
    },
    {
      "epoch": 19.43,
      "learning_rate": 1.777777777777778e-05,
      "loss": 0.7649,
      "step": 1360
    },
    {
      "epoch": 19.71,
      "learning_rate": 1.7222222222222224e-05,
      "loss": 0.7402,
      "step": 1380
    },
    {
      "epoch": 20.0,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 0.7605,
      "step": 1400
    },
    {
      "epoch": 20.0,
      "eval_accuracy": 0.9088471849865952,
      "eval_loss": 1.0364218950271606,
      "eval_runtime": 108.8495,
      "eval_samples_per_second": 10.28,
      "eval_steps_per_second": 0.643,
      "step": 1400
    },
    {
      "epoch": 20.29,
      "learning_rate": 1.6111111111111115e-05,
      "loss": 0.7156,
      "step": 1420
    },
    {
      "epoch": 20.57,
      "learning_rate": 1.5555555555555555e-05,
      "loss": 0.7109,
      "step": 1440
    },
    {
      "epoch": 20.86,
      "learning_rate": 1.5e-05,
      "loss": 0.7244,
      "step": 1460
    },
    {
      "epoch": 21.0,
      "eval_accuracy": 0.902591599642538,
      "eval_loss": 0.9655722379684448,
      "eval_runtime": 106.989,
      "eval_samples_per_second": 10.459,
      "eval_steps_per_second": 0.654,
      "step": 1470
    },
    {
      "epoch": 21.14,
      "learning_rate": 1.4444444444444444e-05,
      "loss": 0.6925,
      "step": 1480
    },
    {
      "epoch": 21.43,
      "learning_rate": 1.388888888888889e-05,
      "loss": 0.6687,
      "step": 1500
    },
    {
      "epoch": 21.71,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 0.658,
      "step": 1520
    },
    {
      "epoch": 22.0,
      "learning_rate": 1.2777777777777777e-05,
      "loss": 0.6595,
      "step": 1540
    },
    {
      "epoch": 22.0,
      "eval_accuracy": 0.9133154602323503,
      "eval_loss": 0.9125866889953613,
      "eval_runtime": 106.9609,
      "eval_samples_per_second": 10.462,
      "eval_steps_per_second": 0.654,
      "step": 1540
    },
    {
      "epoch": 22.29,
      "learning_rate": 1.2222222222222222e-05,
      "loss": 0.6489,
      "step": 1560
    },
    {
      "epoch": 22.57,
      "learning_rate": 1.1666666666666668e-05,
      "loss": 0.6666,
      "step": 1580
    },
    {
      "epoch": 22.86,
      "learning_rate": 1.1111111111111112e-05,
      "loss": 0.6188,
      "step": 1600
    },
    {
      "epoch": 23.0,
      "eval_accuracy": 0.9142091152815014,
      "eval_loss": 0.8716733455657959,
      "eval_runtime": 107.8489,
      "eval_samples_per_second": 10.376,
      "eval_steps_per_second": 0.649,
      "step": 1610
    }
  ],
  "logging_steps": 20,
  "max_steps": 2000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 29,
  "save_steps": 500,
  "total_flos": 7.982873471516332e+18,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}