| { |
| "best_metric": 0.9899, |
| "best_model_checkpoint": "../../checkpoint/cifar10/vit-base/checkpoint-18981", |
| "epoch": 100.0, |
| "eval_steps": 500, |
| "global_step": 33300, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 1.0, |
| "eval_accuracy": 0.9599, |
| "eval_loss": 0.7045394778251648, |
| "eval_runtime": 32.9965, |
| "eval_samples_per_second": 303.063, |
| "eval_steps_per_second": 1.212, |
| "step": 333 |
| }, |
| { |
| "epoch": 1.5, |
| "grad_norm": 2.1130874156951904, |
| "learning_rate": 9.849849849849851e-06, |
| "loss": 1.2275, |
| "step": 500 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_accuracy": 0.9721, |
| "eval_loss": 0.38615667819976807, |
| "eval_runtime": 27.3127, |
| "eval_samples_per_second": 366.13, |
| "eval_steps_per_second": 1.465, |
| "step": 666 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_accuracy": 0.9771, |
| "eval_loss": 0.273359090089798, |
| "eval_runtime": 27.1217, |
| "eval_samples_per_second": 368.709, |
| "eval_steps_per_second": 1.475, |
| "step": 999 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 2.09121036529541, |
| "learning_rate": 9.699699699699701e-06, |
| "loss": 0.4176, |
| "step": 1000 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_accuracy": 0.9794, |
| "eval_loss": 0.21269012987613678, |
| "eval_runtime": 33.0791, |
| "eval_samples_per_second": 302.305, |
| "eval_steps_per_second": 1.209, |
| "step": 1332 |
| }, |
| { |
| "epoch": 4.5, |
| "grad_norm": 2.6250178813934326, |
| "learning_rate": 9.54954954954955e-06, |
| "loss": 0.3859, |
| "step": 1500 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_accuracy": 0.9822, |
| "eval_loss": 0.17196297645568848, |
| "eval_runtime": 27.2002, |
| "eval_samples_per_second": 367.645, |
| "eval_steps_per_second": 1.471, |
| "step": 1665 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_accuracy": 0.9834, |
| "eval_loss": 0.1458989977836609, |
| "eval_runtime": 26.7049, |
| "eval_samples_per_second": 374.463, |
| "eval_steps_per_second": 1.498, |
| "step": 1998 |
| }, |
| { |
| "epoch": 6.01, |
| "grad_norm": 3.6171772480010986, |
| "learning_rate": 9.3993993993994e-06, |
| "loss": 0.32, |
| "step": 2000 |
| }, |
| { |
| "epoch": 7.0, |
| "eval_accuracy": 0.9843, |
| "eval_loss": 0.12406200915575027, |
| "eval_runtime": 26.8675, |
| "eval_samples_per_second": 372.197, |
| "eval_steps_per_second": 1.489, |
| "step": 2331 |
| }, |
| { |
| "epoch": 7.51, |
| "grad_norm": 2.88914155960083, |
| "learning_rate": 9.24924924924925e-06, |
| "loss": 0.2795, |
| "step": 2500 |
| }, |
| { |
| "epoch": 8.0, |
| "eval_accuracy": 0.9846, |
| "eval_loss": 0.11055979877710342, |
| "eval_runtime": 26.4102, |
| "eval_samples_per_second": 378.641, |
| "eval_steps_per_second": 1.515, |
| "step": 2664 |
| }, |
| { |
| "epoch": 9.0, |
| "eval_accuracy": 0.9861, |
| "eval_loss": 0.09506094455718994, |
| "eval_runtime": 26.6993, |
| "eval_samples_per_second": 374.542, |
| "eval_steps_per_second": 1.498, |
| "step": 2997 |
| }, |
| { |
| "epoch": 9.01, |
| "grad_norm": 3.4046921730041504, |
| "learning_rate": 9.0990990990991e-06, |
| "loss": 0.2489, |
| "step": 3000 |
| }, |
| { |
| "epoch": 10.0, |
| "eval_accuracy": 0.9856, |
| "eval_loss": 0.08772800117731094, |
| "eval_runtime": 26.376, |
| "eval_samples_per_second": 379.132, |
| "eval_steps_per_second": 1.517, |
| "step": 3330 |
| }, |
| { |
| "epoch": 10.51, |
| "grad_norm": 1.4524540901184082, |
| "learning_rate": 8.94894894894895e-06, |
| "loss": 0.2284, |
| "step": 3500 |
| }, |
| { |
| "epoch": 11.0, |
| "eval_accuracy": 0.987, |
| "eval_loss": 0.07828080654144287, |
| "eval_runtime": 26.6208, |
| "eval_samples_per_second": 375.646, |
| "eval_steps_per_second": 1.503, |
| "step": 3663 |
| }, |
| { |
| "epoch": 12.0, |
| "eval_accuracy": 0.9861, |
| "eval_loss": 0.07426337152719498, |
| "eval_runtime": 27.2182, |
| "eval_samples_per_second": 367.401, |
| "eval_steps_per_second": 1.47, |
| "step": 3996 |
| }, |
| { |
| "epoch": 12.01, |
| "grad_norm": 1.878126859664917, |
| "learning_rate": 8.798798798798799e-06, |
| "loss": 0.2139, |
| "step": 4000 |
| }, |
| { |
| "epoch": 13.0, |
| "eval_accuracy": 0.9883, |
| "eval_loss": 0.06659487634897232, |
| "eval_runtime": 27.3633, |
| "eval_samples_per_second": 365.453, |
| "eval_steps_per_second": 1.462, |
| "step": 4329 |
| }, |
| { |
| "epoch": 13.51, |
| "grad_norm": 2.9144155979156494, |
| "learning_rate": 8.64864864864865e-06, |
| "loss": 0.2019, |
| "step": 4500 |
| }, |
| { |
| "epoch": 14.0, |
| "eval_accuracy": 0.9862, |
| "eval_loss": 0.06538616865873337, |
| "eval_runtime": 26.5973, |
| "eval_samples_per_second": 375.978, |
| "eval_steps_per_second": 1.504, |
| "step": 4662 |
| }, |
| { |
| "epoch": 15.0, |
| "eval_accuracy": 0.9875, |
| "eval_loss": 0.060847021639347076, |
| "eval_runtime": 27.2607, |
| "eval_samples_per_second": 366.829, |
| "eval_steps_per_second": 1.467, |
| "step": 4995 |
| }, |
| { |
| "epoch": 15.02, |
| "grad_norm": 2.323160409927368, |
| "learning_rate": 8.4984984984985e-06, |
| "loss": 0.1882, |
| "step": 5000 |
| }, |
| { |
| "epoch": 16.0, |
| "eval_accuracy": 0.9875, |
| "eval_loss": 0.05937081202864647, |
| "eval_runtime": 27.7458, |
| "eval_samples_per_second": 360.415, |
| "eval_steps_per_second": 1.442, |
| "step": 5328 |
| }, |
| { |
| "epoch": 16.52, |
| "grad_norm": 1.7828189134597778, |
| "learning_rate": 8.348348348348348e-06, |
| "loss": 0.1845, |
| "step": 5500 |
| }, |
| { |
| "epoch": 17.0, |
| "eval_accuracy": 0.9878, |
| "eval_loss": 0.054522428661584854, |
| "eval_runtime": 26.6197, |
| "eval_samples_per_second": 375.662, |
| "eval_steps_per_second": 1.503, |
| "step": 5661 |
| }, |
| { |
| "epoch": 18.0, |
| "eval_accuracy": 0.9885, |
| "eval_loss": 0.05343218520283699, |
| "eval_runtime": 26.6787, |
| "eval_samples_per_second": 374.831, |
| "eval_steps_per_second": 1.499, |
| "step": 5994 |
| }, |
| { |
| "epoch": 18.02, |
| "grad_norm": 2.5986483097076416, |
| "learning_rate": 8.198198198198198e-06, |
| "loss": 0.1762, |
| "step": 6000 |
| }, |
| { |
| "epoch": 19.0, |
| "eval_accuracy": 0.9876, |
| "eval_loss": 0.05616134777665138, |
| "eval_runtime": 27.0562, |
| "eval_samples_per_second": 369.601, |
| "eval_steps_per_second": 1.478, |
| "step": 6327 |
| }, |
| { |
| "epoch": 19.52, |
| "grad_norm": 2.639050006866455, |
| "learning_rate": 8.048048048048048e-06, |
| "loss": 0.1629, |
| "step": 6500 |
| }, |
| { |
| "epoch": 20.0, |
| "eval_accuracy": 0.9879, |
| "eval_loss": 0.05099354684352875, |
| "eval_runtime": 27.7176, |
| "eval_samples_per_second": 360.781, |
| "eval_steps_per_second": 1.443, |
| "step": 6660 |
| }, |
| { |
| "epoch": 21.0, |
| "eval_accuracy": 0.9889, |
| "eval_loss": 0.04877820238471031, |
| "eval_runtime": 26.5357, |
| "eval_samples_per_second": 376.85, |
| "eval_steps_per_second": 1.507, |
| "step": 6993 |
| }, |
| { |
| "epoch": 21.02, |
| "grad_norm": 3.4336190223693848, |
| "learning_rate": 7.897897897897899e-06, |
| "loss": 0.1622, |
| "step": 7000 |
| }, |
| { |
| "epoch": 22.0, |
| "eval_accuracy": 0.9879, |
| "eval_loss": 0.04886335879564285, |
| "eval_runtime": 27.6701, |
| "eval_samples_per_second": 361.401, |
| "eval_steps_per_second": 1.446, |
| "step": 7326 |
| }, |
| { |
| "epoch": 22.52, |
| "grad_norm": 3.954786777496338, |
| "learning_rate": 7.747747747747749e-06, |
| "loss": 0.1621, |
| "step": 7500 |
| }, |
| { |
| "epoch": 23.0, |
| "eval_accuracy": 0.9881, |
| "eval_loss": 0.04821654036641121, |
| "eval_runtime": 26.7986, |
| "eval_samples_per_second": 373.153, |
| "eval_steps_per_second": 1.493, |
| "step": 7659 |
| }, |
| { |
| "epoch": 24.0, |
| "eval_accuracy": 0.9886, |
| "eval_loss": 0.04642421752214432, |
| "eval_runtime": 27.0748, |
| "eval_samples_per_second": 369.347, |
| "eval_steps_per_second": 1.477, |
| "step": 7992 |
| }, |
| { |
| "epoch": 24.02, |
| "grad_norm": 2.539337396621704, |
| "learning_rate": 7.597597597597598e-06, |
| "loss": 0.1518, |
| "step": 8000 |
| }, |
| { |
| "epoch": 25.0, |
| "eval_accuracy": 0.9887, |
| "eval_loss": 0.04640224575996399, |
| "eval_runtime": 26.7195, |
| "eval_samples_per_second": 374.258, |
| "eval_steps_per_second": 1.497, |
| "step": 8325 |
| }, |
| { |
| "epoch": 25.53, |
| "grad_norm": 1.5434321165084839, |
| "learning_rate": 7.447447447447448e-06, |
| "loss": 0.151, |
| "step": 8500 |
| }, |
| { |
| "epoch": 26.0, |
| "eval_accuracy": 0.9884, |
| "eval_loss": 0.04765336588025093, |
| "eval_runtime": 26.3496, |
| "eval_samples_per_second": 379.513, |
| "eval_steps_per_second": 1.518, |
| "step": 8658 |
| }, |
| { |
| "epoch": 27.0, |
| "eval_accuracy": 0.9886, |
| "eval_loss": 0.04709744080901146, |
| "eval_runtime": 27.5627, |
| "eval_samples_per_second": 362.809, |
| "eval_steps_per_second": 1.451, |
| "step": 8991 |
| }, |
| { |
| "epoch": 27.03, |
| "grad_norm": 1.4213284254074097, |
| "learning_rate": 7.297297297297298e-06, |
| "loss": 0.1486, |
| "step": 9000 |
| }, |
| { |
| "epoch": 28.0, |
| "eval_accuracy": 0.9882, |
| "eval_loss": 0.048934612423181534, |
| "eval_runtime": 26.5141, |
| "eval_samples_per_second": 377.157, |
| "eval_steps_per_second": 1.509, |
| "step": 9324 |
| }, |
| { |
| "epoch": 28.53, |
| "grad_norm": 3.0046286582946777, |
| "learning_rate": 7.147147147147148e-06, |
| "loss": 0.147, |
| "step": 9500 |
| }, |
| { |
| "epoch": 29.0, |
| "eval_accuracy": 0.9884, |
| "eval_loss": 0.04772612452507019, |
| "eval_runtime": 27.447, |
| "eval_samples_per_second": 364.339, |
| "eval_steps_per_second": 1.457, |
| "step": 9657 |
| }, |
| { |
| "epoch": 30.0, |
| "eval_accuracy": 0.9883, |
| "eval_loss": 0.049376897513866425, |
| "eval_runtime": 32.9017, |
| "eval_samples_per_second": 303.936, |
| "eval_steps_per_second": 1.216, |
| "step": 9990 |
| }, |
| { |
| "epoch": 30.03, |
| "grad_norm": 2.6233084201812744, |
| "learning_rate": 6.996996996996997e-06, |
| "loss": 0.1412, |
| "step": 10000 |
| }, |
| { |
| "epoch": 31.0, |
| "eval_accuracy": 0.9881, |
| "eval_loss": 0.04674990102648735, |
| "eval_runtime": 27.7335, |
| "eval_samples_per_second": 360.575, |
| "eval_steps_per_second": 1.442, |
| "step": 10323 |
| }, |
| { |
| "epoch": 31.53, |
| "grad_norm": 2.2710893154144287, |
| "learning_rate": 6.846846846846848e-06, |
| "loss": 0.1403, |
| "step": 10500 |
| }, |
| { |
| "epoch": 32.0, |
| "eval_accuracy": 0.9888, |
| "eval_loss": 0.04444491118192673, |
| "eval_runtime": 26.7278, |
| "eval_samples_per_second": 374.143, |
| "eval_steps_per_second": 1.497, |
| "step": 10656 |
| }, |
| { |
| "epoch": 33.0, |
| "eval_accuracy": 0.9888, |
| "eval_loss": 0.04506918787956238, |
| "eval_runtime": 26.7164, |
| "eval_samples_per_second": 374.302, |
| "eval_steps_per_second": 1.497, |
| "step": 10989 |
| }, |
| { |
| "epoch": 33.03, |
| "grad_norm": 1.3235912322998047, |
| "learning_rate": 6.696696696696697e-06, |
| "loss": 0.1373, |
| "step": 11000 |
| }, |
| { |
| "epoch": 34.0, |
| "eval_accuracy": 0.9887, |
| "eval_loss": 0.046430543065071106, |
| "eval_runtime": 27.3757, |
| "eval_samples_per_second": 365.288, |
| "eval_steps_per_second": 1.461, |
| "step": 11322 |
| }, |
| { |
| "epoch": 34.53, |
| "grad_norm": 3.1201539039611816, |
| "learning_rate": 6.546546546546547e-06, |
| "loss": 0.1379, |
| "step": 11500 |
| }, |
| { |
| "epoch": 35.0, |
| "eval_accuracy": 0.9896, |
| "eval_loss": 0.04377752169966698, |
| "eval_runtime": 26.9572, |
| "eval_samples_per_second": 370.958, |
| "eval_steps_per_second": 1.484, |
| "step": 11655 |
| }, |
| { |
| "epoch": 36.0, |
| "eval_accuracy": 0.9887, |
| "eval_loss": 0.044030845165252686, |
| "eval_runtime": 27.4449, |
| "eval_samples_per_second": 364.366, |
| "eval_steps_per_second": 1.457, |
| "step": 11988 |
| }, |
| { |
| "epoch": 36.04, |
| "grad_norm": 1.388404130935669, |
| "learning_rate": 6.396396396396397e-06, |
| "loss": 0.1375, |
| "step": 12000 |
| }, |
| { |
| "epoch": 37.0, |
| "eval_accuracy": 0.9881, |
| "eval_loss": 0.046024248003959656, |
| "eval_runtime": 27.3415, |
| "eval_samples_per_second": 365.744, |
| "eval_steps_per_second": 1.463, |
| "step": 12321 |
| }, |
| { |
| "epoch": 37.54, |
| "grad_norm": 4.725052833557129, |
| "learning_rate": 6.246246246246247e-06, |
| "loss": 0.1377, |
| "step": 12500 |
| }, |
| { |
| "epoch": 38.0, |
| "eval_accuracy": 0.9896, |
| "eval_loss": 0.04353851079940796, |
| "eval_runtime": 26.7809, |
| "eval_samples_per_second": 373.4, |
| "eval_steps_per_second": 1.494, |
| "step": 12654 |
| }, |
| { |
| "epoch": 39.0, |
| "eval_accuracy": 0.989, |
| "eval_loss": 0.046141043305397034, |
| "eval_runtime": 26.7652, |
| "eval_samples_per_second": 373.619, |
| "eval_steps_per_second": 1.494, |
| "step": 12987 |
| }, |
| { |
| "epoch": 39.04, |
| "grad_norm": 1.4163159132003784, |
| "learning_rate": 6.096096096096097e-06, |
| "loss": 0.1332, |
| "step": 13000 |
| }, |
| { |
| "epoch": 40.0, |
| "eval_accuracy": 0.9897, |
| "eval_loss": 0.04419806972146034, |
| "eval_runtime": 26.8089, |
| "eval_samples_per_second": 373.011, |
| "eval_steps_per_second": 1.492, |
| "step": 13320 |
| }, |
| { |
| "epoch": 40.54, |
| "grad_norm": 2.823390007019043, |
| "learning_rate": 5.945945945945947e-06, |
| "loss": 0.1306, |
| "step": 13500 |
| }, |
| { |
| "epoch": 41.0, |
| "eval_accuracy": 0.9894, |
| "eval_loss": 0.04626644402742386, |
| "eval_runtime": 27.1555, |
| "eval_samples_per_second": 368.25, |
| "eval_steps_per_second": 1.473, |
| "step": 13653 |
| }, |
| { |
| "epoch": 42.0, |
| "eval_accuracy": 0.9892, |
| "eval_loss": 0.04492880031466484, |
| "eval_runtime": 27.5197, |
| "eval_samples_per_second": 363.376, |
| "eval_steps_per_second": 1.454, |
| "step": 13986 |
| }, |
| { |
| "epoch": 42.04, |
| "grad_norm": 2.2915682792663574, |
| "learning_rate": 5.7957957957957965e-06, |
| "loss": 0.1289, |
| "step": 14000 |
| }, |
| { |
| "epoch": 43.0, |
| "eval_accuracy": 0.989, |
| "eval_loss": 0.045636676251888275, |
| "eval_runtime": 27.1073, |
| "eval_samples_per_second": 368.904, |
| "eval_steps_per_second": 1.476, |
| "step": 14319 |
| }, |
| { |
| "epoch": 43.54, |
| "grad_norm": 2.901336669921875, |
| "learning_rate": 5.645645645645647e-06, |
| "loss": 0.128, |
| "step": 14500 |
| }, |
| { |
| "epoch": 44.0, |
| "eval_accuracy": 0.9892, |
| "eval_loss": 0.04512866213917732, |
| "eval_runtime": 26.2641, |
| "eval_samples_per_second": 380.748, |
| "eval_steps_per_second": 1.523, |
| "step": 14652 |
| }, |
| { |
| "epoch": 45.0, |
| "eval_accuracy": 0.9889, |
| "eval_loss": 0.045423876494169235, |
| "eval_runtime": 26.7396, |
| "eval_samples_per_second": 373.977, |
| "eval_steps_per_second": 1.496, |
| "step": 14985 |
| }, |
| { |
| "epoch": 45.05, |
| "grad_norm": 2.4207634925842285, |
| "learning_rate": 5.495495495495496e-06, |
| "loss": 0.1321, |
| "step": 15000 |
| }, |
| { |
| "epoch": 46.0, |
| "eval_accuracy": 0.9895, |
| "eval_loss": 0.04446360096335411, |
| "eval_runtime": 27.9958, |
| "eval_samples_per_second": 357.197, |
| "eval_steps_per_second": 1.429, |
| "step": 15318 |
| }, |
| { |
| "epoch": 46.55, |
| "grad_norm": 1.811728596687317, |
| "learning_rate": 5.345345345345346e-06, |
| "loss": 0.1222, |
| "step": 15500 |
| }, |
| { |
| "epoch": 47.0, |
| "eval_accuracy": 0.9893, |
| "eval_loss": 0.046666089445352554, |
| "eval_runtime": 27.8191, |
| "eval_samples_per_second": 359.465, |
| "eval_steps_per_second": 1.438, |
| "step": 15651 |
| }, |
| { |
| "epoch": 48.0, |
| "eval_accuracy": 0.9897, |
| "eval_loss": 0.046483393758535385, |
| "eval_runtime": 27.8293, |
| "eval_samples_per_second": 359.334, |
| "eval_steps_per_second": 1.437, |
| "step": 15984 |
| }, |
| { |
| "epoch": 48.05, |
| "grad_norm": 2.140018939971924, |
| "learning_rate": 5.195195195195195e-06, |
| "loss": 0.122, |
| "step": 16000 |
| }, |
| { |
| "epoch": 49.0, |
| "eval_accuracy": 0.9896, |
| "eval_loss": 0.045183293521404266, |
| "eval_runtime": 27.5683, |
| "eval_samples_per_second": 362.736, |
| "eval_steps_per_second": 1.451, |
| "step": 16317 |
| }, |
| { |
| "epoch": 49.55, |
| "grad_norm": 2.951556444168091, |
| "learning_rate": 5.045045045045045e-06, |
| "loss": 0.123, |
| "step": 16500 |
| }, |
| { |
| "epoch": 50.0, |
| "eval_accuracy": 0.9894, |
| "eval_loss": 0.047760359942913055, |
| "eval_runtime": 26.9033, |
| "eval_samples_per_second": 371.702, |
| "eval_steps_per_second": 1.487, |
| "step": 16650 |
| }, |
| { |
| "epoch": 51.0, |
| "eval_accuracy": 0.9892, |
| "eval_loss": 0.0464739091694355, |
| "eval_runtime": 26.782, |
| "eval_samples_per_second": 373.385, |
| "eval_steps_per_second": 1.494, |
| "step": 16983 |
| }, |
| { |
| "epoch": 51.05, |
| "grad_norm": 1.8529798984527588, |
| "learning_rate": 4.894894894894895e-06, |
| "loss": 0.1194, |
| "step": 17000 |
| }, |
| { |
| "epoch": 52.0, |
| "eval_accuracy": 0.9887, |
| "eval_loss": 0.048829443752765656, |
| "eval_runtime": 27.3149, |
| "eval_samples_per_second": 366.101, |
| "eval_steps_per_second": 1.464, |
| "step": 17316 |
| }, |
| { |
| "epoch": 52.55, |
| "grad_norm": 2.048177719116211, |
| "learning_rate": 4.7447447447447454e-06, |
| "loss": 0.1209, |
| "step": 17500 |
| }, |
| { |
| "epoch": 53.0, |
| "eval_accuracy": 0.9892, |
| "eval_loss": 0.047239311039447784, |
| "eval_runtime": 27.9195, |
| "eval_samples_per_second": 358.172, |
| "eval_steps_per_second": 1.433, |
| "step": 17649 |
| }, |
| { |
| "epoch": 54.0, |
| "eval_accuracy": 0.9897, |
| "eval_loss": 0.04561839625239372, |
| "eval_runtime": 27.8439, |
| "eval_samples_per_second": 359.145, |
| "eval_steps_per_second": 1.437, |
| "step": 17982 |
| }, |
| { |
| "epoch": 54.05, |
| "grad_norm": 2.6934187412261963, |
| "learning_rate": 4.594594594594596e-06, |
| "loss": 0.1212, |
| "step": 18000 |
| }, |
| { |
| "epoch": 55.0, |
| "eval_accuracy": 0.9893, |
| "eval_loss": 0.04664906859397888, |
| "eval_runtime": 27.5803, |
| "eval_samples_per_second": 362.577, |
| "eval_steps_per_second": 1.45, |
| "step": 18315 |
| }, |
| { |
| "epoch": 55.56, |
| "grad_norm": 1.0602542161941528, |
| "learning_rate": 4.444444444444444e-06, |
| "loss": 0.1187, |
| "step": 18500 |
| }, |
| { |
| "epoch": 56.0, |
| "eval_accuracy": 0.9894, |
| "eval_loss": 0.04580499231815338, |
| "eval_runtime": 27.9221, |
| "eval_samples_per_second": 358.139, |
| "eval_steps_per_second": 1.433, |
| "step": 18648 |
| }, |
| { |
| "epoch": 57.0, |
| "eval_accuracy": 0.9899, |
| "eval_loss": 0.04467911645770073, |
| "eval_runtime": 26.7649, |
| "eval_samples_per_second": 373.624, |
| "eval_steps_per_second": 1.494, |
| "step": 18981 |
| }, |
| { |
| "epoch": 57.06, |
| "grad_norm": 2.04836368560791, |
| "learning_rate": 4.294294294294294e-06, |
| "loss": 0.1193, |
| "step": 19000 |
| }, |
| { |
| "epoch": 58.0, |
| "eval_accuracy": 0.9892, |
| "eval_loss": 0.04191720113158226, |
| "eval_runtime": 26.7715, |
| "eval_samples_per_second": 373.531, |
| "eval_steps_per_second": 1.494, |
| "step": 19314 |
| }, |
| { |
| "epoch": 58.56, |
| "grad_norm": 2.186086654663086, |
| "learning_rate": 4.1441441441441446e-06, |
| "loss": 0.119, |
| "step": 19500 |
| }, |
| { |
| "epoch": 59.0, |
| "eval_accuracy": 0.9897, |
| "eval_loss": 0.04308394715189934, |
| "eval_runtime": 27.2975, |
| "eval_samples_per_second": 366.334, |
| "eval_steps_per_second": 1.465, |
| "step": 19647 |
| }, |
| { |
| "epoch": 60.0, |
| "eval_accuracy": 0.9894, |
| "eval_loss": 0.0437154695391655, |
| "eval_runtime": 27.418, |
| "eval_samples_per_second": 364.724, |
| "eval_steps_per_second": 1.459, |
| "step": 19980 |
| }, |
| { |
| "epoch": 60.06, |
| "grad_norm": 1.6354001760482788, |
| "learning_rate": 3.993993993993994e-06, |
| "loss": 0.1165, |
| "step": 20000 |
| }, |
| { |
| "epoch": 61.0, |
| "eval_accuracy": 0.9889, |
| "eval_loss": 0.04698378965258598, |
| "eval_runtime": 26.8506, |
| "eval_samples_per_second": 372.431, |
| "eval_steps_per_second": 1.49, |
| "step": 20313 |
| }, |
| { |
| "epoch": 61.56, |
| "grad_norm": 1.5250356197357178, |
| "learning_rate": 3.843843843843844e-06, |
| "loss": 0.1146, |
| "step": 20500 |
| }, |
| { |
| "epoch": 62.0, |
| "eval_accuracy": 0.989, |
| "eval_loss": 0.04715728014707565, |
| "eval_runtime": 26.5635, |
| "eval_samples_per_second": 376.456, |
| "eval_steps_per_second": 1.506, |
| "step": 20646 |
| }, |
| { |
| "epoch": 63.0, |
| "eval_accuracy": 0.9894, |
| "eval_loss": 0.044496119022369385, |
| "eval_runtime": 28.0733, |
| "eval_samples_per_second": 356.211, |
| "eval_steps_per_second": 1.425, |
| "step": 20979 |
| }, |
| { |
| "epoch": 63.06, |
| "grad_norm": 2.1367080211639404, |
| "learning_rate": 3.693693693693694e-06, |
| "loss": 0.1147, |
| "step": 21000 |
| }, |
| { |
| "epoch": 64.0, |
| "eval_accuracy": 0.9894, |
| "eval_loss": 0.04541689530014992, |
| "eval_runtime": 26.2197, |
| "eval_samples_per_second": 381.393, |
| "eval_steps_per_second": 1.526, |
| "step": 21312 |
| }, |
| { |
| "epoch": 64.56, |
| "grad_norm": 1.8817474842071533, |
| "learning_rate": 3.5435435435435437e-06, |
| "loss": 0.1117, |
| "step": 21500 |
| }, |
| { |
| "epoch": 65.0, |
| "eval_accuracy": 0.9899, |
| "eval_loss": 0.04460064694285393, |
| "eval_runtime": 27.2042, |
| "eval_samples_per_second": 367.59, |
| "eval_steps_per_second": 1.47, |
| "step": 21645 |
| }, |
| { |
| "epoch": 66.0, |
| "eval_accuracy": 0.989, |
| "eval_loss": 0.04820993170142174, |
| "eval_runtime": 27.2216, |
| "eval_samples_per_second": 367.355, |
| "eval_steps_per_second": 1.469, |
| "step": 21978 |
| }, |
| { |
| "epoch": 66.07, |
| "grad_norm": 4.140334129333496, |
| "learning_rate": 3.393393393393394e-06, |
| "loss": 0.1137, |
| "step": 22000 |
| }, |
| { |
| "epoch": 67.0, |
| "eval_accuracy": 0.9895, |
| "eval_loss": 0.04575618728995323, |
| "eval_runtime": 27.7597, |
| "eval_samples_per_second": 360.234, |
| "eval_steps_per_second": 1.441, |
| "step": 22311 |
| }, |
| { |
| "epoch": 67.57, |
| "grad_norm": 1.3289170265197754, |
| "learning_rate": 3.2432432432432437e-06, |
| "loss": 0.1145, |
| "step": 22500 |
| }, |
| { |
| "epoch": 68.0, |
| "eval_accuracy": 0.989, |
| "eval_loss": 0.046150561422109604, |
| "eval_runtime": 27.6364, |
| "eval_samples_per_second": 361.842, |
| "eval_steps_per_second": 1.447, |
| "step": 22644 |
| }, |
| { |
| "epoch": 69.0, |
| "eval_accuracy": 0.9894, |
| "eval_loss": 0.04607143998146057, |
| "eval_runtime": 27.3526, |
| "eval_samples_per_second": 365.596, |
| "eval_steps_per_second": 1.462, |
| "step": 22977 |
| }, |
| { |
| "epoch": 69.07, |
| "grad_norm": 3.2696564197540283, |
| "learning_rate": 3.0930930930930935e-06, |
| "loss": 0.1136, |
| "step": 23000 |
| }, |
| { |
| "epoch": 70.0, |
| "eval_accuracy": 0.9894, |
| "eval_loss": 0.04546944424510002, |
| "eval_runtime": 26.8405, |
| "eval_samples_per_second": 372.571, |
| "eval_steps_per_second": 1.49, |
| "step": 23310 |
| }, |
| { |
| "epoch": 70.57, |
| "grad_norm": 1.5183466672897339, |
| "learning_rate": 2.942942942942943e-06, |
| "loss": 0.1144, |
| "step": 23500 |
| }, |
| { |
| "epoch": 71.0, |
| "eval_accuracy": 0.9896, |
| "eval_loss": 0.04547298699617386, |
| "eval_runtime": 28.012, |
| "eval_samples_per_second": 356.99, |
| "eval_steps_per_second": 1.428, |
| "step": 23643 |
| }, |
| { |
| "epoch": 72.0, |
| "eval_accuracy": 0.9891, |
| "eval_loss": 0.045761577785015106, |
| "eval_runtime": 26.7415, |
| "eval_samples_per_second": 373.951, |
| "eval_steps_per_second": 1.496, |
| "step": 23976 |
| }, |
| { |
| "epoch": 72.07, |
| "grad_norm": 2.621833324432373, |
| "learning_rate": 2.7927927927927926e-06, |
| "loss": 0.1126, |
| "step": 24000 |
| }, |
| { |
| "epoch": 73.0, |
| "eval_accuracy": 0.989, |
| "eval_loss": 0.04621967673301697, |
| "eval_runtime": 26.6469, |
| "eval_samples_per_second": 375.278, |
| "eval_steps_per_second": 1.501, |
| "step": 24309 |
| }, |
| { |
| "epoch": 73.57, |
| "grad_norm": 1.9195489883422852, |
| "learning_rate": 2.642642642642643e-06, |
| "loss": 0.1065, |
| "step": 24500 |
| }, |
| { |
| "epoch": 74.0, |
| "eval_accuracy": 0.9894, |
| "eval_loss": 0.04627339914441109, |
| "eval_runtime": 26.8321, |
| "eval_samples_per_second": 372.687, |
| "eval_steps_per_second": 1.491, |
| "step": 24642 |
| }, |
| { |
| "epoch": 75.0, |
| "eval_accuracy": 0.9895, |
| "eval_loss": 0.04610699415206909, |
| "eval_runtime": 26.787, |
| "eval_samples_per_second": 373.316, |
| "eval_steps_per_second": 1.493, |
| "step": 24975 |
| }, |
| { |
| "epoch": 75.08, |
| "grad_norm": 2.639979839324951, |
| "learning_rate": 2.4924924924924926e-06, |
| "loss": 0.1136, |
| "step": 25000 |
| }, |
| { |
| "epoch": 76.0, |
| "eval_accuracy": 0.9893, |
| "eval_loss": 0.04624738171696663, |
| "eval_runtime": 27.255, |
| "eval_samples_per_second": 366.905, |
| "eval_steps_per_second": 1.468, |
| "step": 25308 |
| }, |
| { |
| "epoch": 76.58, |
| "grad_norm": 1.8884766101837158, |
| "learning_rate": 2.3423423423423424e-06, |
| "loss": 0.1117, |
| "step": 25500 |
| }, |
| { |
| "epoch": 77.0, |
| "eval_accuracy": 0.9886, |
| "eval_loss": 0.04540451616048813, |
| "eval_runtime": 26.8153, |
| "eval_samples_per_second": 372.921, |
| "eval_steps_per_second": 1.492, |
| "step": 25641 |
| }, |
| { |
| "epoch": 78.0, |
| "eval_accuracy": 0.9889, |
| "eval_loss": 0.045627232640981674, |
| "eval_runtime": 27.2138, |
| "eval_samples_per_second": 367.46, |
| "eval_steps_per_second": 1.47, |
| "step": 25974 |
| }, |
| { |
| "epoch": 78.08, |
| "grad_norm": 2.558985948562622, |
| "learning_rate": 2.192192192192192e-06, |
| "loss": 0.1106, |
| "step": 26000 |
| }, |
| { |
| "epoch": 79.0, |
| "eval_accuracy": 0.9887, |
| "eval_loss": 0.0453827828168869, |
| "eval_runtime": 26.7871, |
| "eval_samples_per_second": 373.313, |
| "eval_steps_per_second": 1.493, |
| "step": 26307 |
| }, |
| { |
| "epoch": 79.58, |
| "grad_norm": 1.8785403966903687, |
| "learning_rate": 2.0420420420420424e-06, |
| "loss": 0.1085, |
| "step": 26500 |
| }, |
| { |
| "epoch": 80.0, |
| "eval_accuracy": 0.9893, |
| "eval_loss": 0.04575396701693535, |
| "eval_runtime": 26.9022, |
| "eval_samples_per_second": 371.716, |
| "eval_steps_per_second": 1.487, |
| "step": 26640 |
| }, |
| { |
| "epoch": 81.0, |
| "eval_accuracy": 0.9892, |
| "eval_loss": 0.045756805688142776, |
| "eval_runtime": 27.2761, |
| "eval_samples_per_second": 366.621, |
| "eval_steps_per_second": 1.466, |
| "step": 26973 |
| }, |
| { |
| "epoch": 81.08, |
| "grad_norm": 1.840050458908081, |
| "learning_rate": 1.8918918918918922e-06, |
| "loss": 0.107, |
| "step": 27000 |
| }, |
| { |
| "epoch": 82.0, |
| "eval_accuracy": 0.9894, |
| "eval_loss": 0.04503399878740311, |
| "eval_runtime": 26.7515, |
| "eval_samples_per_second": 373.811, |
| "eval_steps_per_second": 1.495, |
| "step": 27306 |
| }, |
| { |
| "epoch": 82.58, |
| "grad_norm": 0.8984606862068176, |
| "learning_rate": 1.7417417417417418e-06, |
| "loss": 0.1112, |
| "step": 27500 |
| }, |
| { |
| "epoch": 83.0, |
| "eval_accuracy": 0.9896, |
| "eval_loss": 0.043780211359262466, |
| "eval_runtime": 27.2885, |
| "eval_samples_per_second": 366.454, |
| "eval_steps_per_second": 1.466, |
| "step": 27639 |
| }, |
| { |
| "epoch": 84.0, |
| "eval_accuracy": 0.9891, |
| "eval_loss": 0.045303359627723694, |
| "eval_runtime": 27.2638, |
| "eval_samples_per_second": 366.786, |
| "eval_steps_per_second": 1.467, |
| "step": 27972 |
| }, |
| { |
| "epoch": 84.08, |
| "grad_norm": 2.417297124862671, |
| "learning_rate": 1.5915915915915916e-06, |
| "loss": 0.1073, |
| "step": 28000 |
| }, |
| { |
| "epoch": 85.0, |
| "eval_accuracy": 0.9893, |
| "eval_loss": 0.04447495564818382, |
| "eval_runtime": 27.5559, |
| "eval_samples_per_second": 362.898, |
| "eval_steps_per_second": 1.452, |
| "step": 28305 |
| }, |
| { |
| "epoch": 85.59, |
| "grad_norm": 2.735337734222412, |
| "learning_rate": 1.4414414414414416e-06, |
| "loss": 0.1103, |
| "step": 28500 |
| }, |
| { |
| "epoch": 86.0, |
| "eval_accuracy": 0.9892, |
| "eval_loss": 0.044364869594573975, |
| "eval_runtime": 26.7863, |
| "eval_samples_per_second": 373.326, |
| "eval_steps_per_second": 1.493, |
| "step": 28638 |
| }, |
| { |
| "epoch": 87.0, |
| "eval_accuracy": 0.9891, |
| "eval_loss": 0.044344834983348846, |
| "eval_runtime": 26.6701, |
| "eval_samples_per_second": 374.952, |
| "eval_steps_per_second": 1.5, |
| "step": 28971 |
| }, |
| { |
| "epoch": 87.09, |
| "grad_norm": 2.378945827484131, |
| "learning_rate": 1.2912912912912913e-06, |
| "loss": 0.1074, |
| "step": 29000 |
| }, |
| { |
| "epoch": 88.0, |
| "eval_accuracy": 0.9893, |
| "eval_loss": 0.04596693813800812, |
| "eval_runtime": 27.7741, |
| "eval_samples_per_second": 360.047, |
| "eval_steps_per_second": 1.44, |
| "step": 29304 |
| }, |
| { |
| "epoch": 88.59, |
| "grad_norm": 1.8432105779647827, |
| "learning_rate": 1.1411411411411411e-06, |
| "loss": 0.1041, |
| "step": 29500 |
| }, |
| { |
| "epoch": 89.0, |
| "eval_accuracy": 0.9891, |
| "eval_loss": 0.045480720698833466, |
| "eval_runtime": 27.1037, |
| "eval_samples_per_second": 368.953, |
| "eval_steps_per_second": 1.476, |
| "step": 29637 |
| }, |
| { |
| "epoch": 90.0, |
| "eval_accuracy": 0.9894, |
| "eval_loss": 0.04399260878562927, |
| "eval_runtime": 26.7306, |
| "eval_samples_per_second": 374.102, |
| "eval_steps_per_second": 1.496, |
| "step": 29970 |
| }, |
| { |
| "epoch": 90.09, |
| "grad_norm": 2.23136305809021, |
| "learning_rate": 9.909909909909911e-07, |
| "loss": 0.1054, |
| "step": 30000 |
| }, |
| { |
| "epoch": 91.0, |
| "eval_accuracy": 0.9894, |
| "eval_loss": 0.04531684145331383, |
| "eval_runtime": 26.671, |
| "eval_samples_per_second": 374.939, |
| "eval_steps_per_second": 1.5, |
| "step": 30303 |
| }, |
| { |
| "epoch": 91.59, |
| "grad_norm": 2.097313642501831, |
| "learning_rate": 8.40840840840841e-07, |
| "loss": 0.1069, |
| "step": 30500 |
| }, |
| { |
| "epoch": 92.0, |
| "eval_accuracy": 0.989, |
| "eval_loss": 0.04511631652712822, |
| "eval_runtime": 27.1707, |
| "eval_samples_per_second": 368.043, |
| "eval_steps_per_second": 1.472, |
| "step": 30636 |
| }, |
| { |
| "epoch": 93.0, |
| "eval_accuracy": 0.9894, |
| "eval_loss": 0.044860344380140305, |
| "eval_runtime": 26.5648, |
| "eval_samples_per_second": 376.438, |
| "eval_steps_per_second": 1.506, |
| "step": 30969 |
| }, |
| { |
| "epoch": 93.09, |
| "grad_norm": 2.0008368492126465, |
| "learning_rate": 6.906906906906907e-07, |
| "loss": 0.1056, |
| "step": 31000 |
| }, |
| { |
| "epoch": 94.0, |
| "eval_accuracy": 0.9892, |
| "eval_loss": 0.04571190103888512, |
| "eval_runtime": 27.2069, |
| "eval_samples_per_second": 367.554, |
| "eval_steps_per_second": 1.47, |
| "step": 31302 |
| }, |
| { |
| "epoch": 94.59, |
| "grad_norm": 2.618077039718628, |
| "learning_rate": 5.405405405405406e-07, |
| "loss": 0.1069, |
| "step": 31500 |
| }, |
| { |
| "epoch": 95.0, |
| "eval_accuracy": 0.9892, |
| "eval_loss": 0.04491310566663742, |
| "eval_runtime": 26.6385, |
| "eval_samples_per_second": 375.396, |
| "eval_steps_per_second": 1.502, |
| "step": 31635 |
| }, |
| { |
| "epoch": 96.0, |
| "eval_accuracy": 0.9892, |
| "eval_loss": 0.0449623242020607, |
| "eval_runtime": 26.4041, |
| "eval_samples_per_second": 378.729, |
| "eval_steps_per_second": 1.515, |
| "step": 31968 |
| }, |
| { |
| "epoch": 96.1, |
| "grad_norm": 1.9157476425170898, |
| "learning_rate": 3.903903903903904e-07, |
| "loss": 0.1053, |
| "step": 32000 |
| }, |
| { |
| "epoch": 97.0, |
| "eval_accuracy": 0.9896, |
| "eval_loss": 0.044889744371175766, |
| "eval_runtime": 27.0991, |
| "eval_samples_per_second": 369.016, |
| "eval_steps_per_second": 1.476, |
| "step": 32301 |
| }, |
| { |
| "epoch": 97.6, |
| "grad_norm": 2.6996421813964844, |
| "learning_rate": 2.4024024024024026e-07, |
| "loss": 0.1068, |
| "step": 32500 |
| }, |
| { |
| "epoch": 98.0, |
| "eval_accuracy": 0.9893, |
| "eval_loss": 0.04525148868560791, |
| "eval_runtime": 27.1528, |
| "eval_samples_per_second": 368.286, |
| "eval_steps_per_second": 1.473, |
| "step": 32634 |
| }, |
| { |
| "epoch": 99.0, |
| "eval_accuracy": 0.9891, |
| "eval_loss": 0.045253388583660126, |
| "eval_runtime": 27.5963, |
| "eval_samples_per_second": 362.368, |
| "eval_steps_per_second": 1.449, |
| "step": 32967 |
| }, |
| { |
| "epoch": 99.1, |
| "grad_norm": 2.057509422302246, |
| "learning_rate": 9.00900900900901e-08, |
| "loss": 0.1059, |
| "step": 33000 |
| }, |
| { |
| "epoch": 100.0, |
| "eval_accuracy": 0.9892, |
| "eval_loss": 0.045226842164993286, |
| "eval_runtime": 26.356, |
| "eval_samples_per_second": 379.421, |
| "eval_steps_per_second": 1.518, |
| "step": 33300 |
| }, |
| { |
| "epoch": 100.0, |
| "step": 33300, |
| "total_flos": 3.293645700925431e+20, |
| "train_loss": 0.08339859163438952, |
| "train_runtime": 26101.948, |
| "train_samples_per_second": 162.823, |
| "train_steps_per_second": 1.276 |
| } |
| ], |
| "logging_steps": 500, |
| "max_steps": 33300, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 100, |
| "save_steps": 500, |
| "total_flos": 3.293645700925431e+20, |
| "train_batch_size": 128, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|