{
  "best_metric": 0.9818,
  "best_model_checkpoint": "../../checkpoint/cifar10/swin-tiny/checkpoint-38295",
  "epoch": 300.0,
  "eval_steps": 500,
  "global_step": 99900,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1.0,
      "eval_accuracy": 0.9223,
      "eval_loss": 0.2584497928619385,
      "eval_runtime": 21.3622,
      "eval_samples_per_second": 468.117,
      "eval_steps_per_second": 1.872,
      "step": 333
    },
    {
      "epoch": 1.5,
      "grad_norm": 14.220479011535645,
      "learning_rate": 9.949949949949951e-06,
      "loss": 0.9076,
      "step": 500
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.945,
      "eval_loss": 0.1637052595615387,
      "eval_runtime": 13.6699,
      "eval_samples_per_second": 731.533,
      "eval_steps_per_second": 2.926,
      "step": 666
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.9553,
      "eval_loss": 0.1344435065984726,
      "eval_runtime": 13.1721,
      "eval_samples_per_second": 759.181,
      "eval_steps_per_second": 3.037,
      "step": 999
    },
    {
      "epoch": 3.0,
      "grad_norm": 9.328938484191895,
      "learning_rate": 9.899899899899901e-06,
      "loss": 0.4797,
      "step": 1000
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.9604,
      "eval_loss": 0.1205841451883316,
      "eval_runtime": 12.6584,
      "eval_samples_per_second": 789.992,
      "eval_steps_per_second": 3.16,
      "step": 1332
    },
    {
      "epoch": 4.5,
      "grad_norm": 14.11563777923584,
      "learning_rate": 9.849849849849851e-06,
      "loss": 0.4193,
      "step": 1500
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.9635,
      "eval_loss": 0.11088060587644577,
      "eval_runtime": 12.7891,
      "eval_samples_per_second": 781.918,
      "eval_steps_per_second": 3.128,
      "step": 1665
    },
    {
      "epoch": 6.0,
      "eval_accuracy": 0.9661,
      "eval_loss": 0.10564317554235458,
      "eval_runtime": 12.9686,
      "eval_samples_per_second": 771.094,
      "eval_steps_per_second": 3.084,
      "step": 1998
    },
    {
      "epoch": 6.01,
      "grad_norm": 12.565740585327148,
      "learning_rate": 9.799799799799801e-06,
      "loss": 0.3846,
      "step": 2000
    },
    {
      "epoch": 7.0,
      "eval_accuracy": 0.9688,
      "eval_loss": 0.09508195519447327,
      "eval_runtime": 13.2698,
      "eval_samples_per_second": 753.592,
      "eval_steps_per_second": 3.014,
      "step": 2331
    },
    {
      "epoch": 7.51,
      "grad_norm": 9.896069526672363,
      "learning_rate": 9.749749749749751e-06,
      "loss": 0.3572,
      "step": 2500
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.9689,
      "eval_loss": 0.09568808227777481,
      "eval_runtime": 13.6448,
      "eval_samples_per_second": 732.879,
      "eval_steps_per_second": 2.932,
      "step": 2664
    },
    {
      "epoch": 9.0,
      "eval_accuracy": 0.9693,
      "eval_loss": 0.09088099747896194,
      "eval_runtime": 13.9779,
      "eval_samples_per_second": 715.417,
      "eval_steps_per_second": 2.862,
      "step": 2997
    },
    {
      "epoch": 9.01,
      "grad_norm": 9.739038467407227,
      "learning_rate": 9.699699699699701e-06,
      "loss": 0.3409,
      "step": 3000
    },
    {
      "epoch": 10.0,
      "eval_accuracy": 0.971,
      "eval_loss": 0.0861617922782898,
      "eval_runtime": 13.5874,
      "eval_samples_per_second": 735.979,
      "eval_steps_per_second": 2.944,
      "step": 3330
    },
    {
      "epoch": 10.51,
      "grad_norm": 7.383803367614746,
      "learning_rate": 9.649649649649651e-06,
      "loss": 0.3319,
      "step": 3500
    },
    {
      "epoch": 11.0,
      "eval_accuracy": 0.9721,
      "eval_loss": 0.08562646806240082,
      "eval_runtime": 13.5289,
      "eval_samples_per_second": 739.158,
      "eval_steps_per_second": 2.957,
      "step": 3663
    },
    {
      "epoch": 12.0,
      "eval_accuracy": 0.972,
      "eval_loss": 0.08723447471857071,
      "eval_runtime": 13.3531,
      "eval_samples_per_second": 748.887,
      "eval_steps_per_second": 2.996,
      "step": 3996
    },
    {
      "epoch": 12.01,
      "grad_norm": 11.866540908813477,
      "learning_rate": 9.5995995995996e-06,
      "loss": 0.3253,
      "step": 4000
    },
    {
      "epoch": 13.0,
      "eval_accuracy": 0.973,
      "eval_loss": 0.08058160543441772,
      "eval_runtime": 14.1547,
      "eval_samples_per_second": 706.479,
      "eval_steps_per_second": 2.826,
      "step": 4329
    },
    {
      "epoch": 13.51,
      "grad_norm": 7.938398361206055,
      "learning_rate": 9.54954954954955e-06,
      "loss": 0.3084,
      "step": 4500
    },
    {
      "epoch": 14.0,
      "eval_accuracy": 0.9738,
      "eval_loss": 0.08162784576416016,
      "eval_runtime": 14.1065,
      "eval_samples_per_second": 708.895,
      "eval_steps_per_second": 2.836,
      "step": 4662
    },
    {
      "epoch": 15.0,
      "eval_accuracy": 0.9742,
      "eval_loss": 0.07894858717918396,
      "eval_runtime": 13.886,
      "eval_samples_per_second": 720.149,
      "eval_steps_per_second": 2.881,
      "step": 4995
    },
    {
      "epoch": 15.02,
      "grad_norm": 16.568248748779297,
      "learning_rate": 9.4994994994995e-06,
      "loss": 0.3022,
      "step": 5000
    },
    {
      "epoch": 16.0,
      "eval_accuracy": 0.9746,
      "eval_loss": 0.07670588046312332,
      "eval_runtime": 13.5929,
      "eval_samples_per_second": 735.676,
      "eval_steps_per_second": 2.943,
      "step": 5328
    },
    {
      "epoch": 16.52,
      "grad_norm": 13.009441375732422,
      "learning_rate": 9.44944944944945e-06,
      "loss": 0.2894,
      "step": 5500
    },
    {
      "epoch": 17.0,
      "eval_accuracy": 0.9725,
      "eval_loss": 0.0805484876036644,
      "eval_runtime": 13.3932,
      "eval_samples_per_second": 746.649,
      "eval_steps_per_second": 2.987,
      "step": 5661
    },
    {
      "epoch": 18.0,
      "eval_accuracy": 0.9759,
      "eval_loss": 0.0759720578789711,
      "eval_runtime": 13.5457,
      "eval_samples_per_second": 738.24,
      "eval_steps_per_second": 2.953,
      "step": 5994
    },
    {
      "epoch": 18.02,
      "grad_norm": 13.468392372131348,
      "learning_rate": 9.3993993993994e-06,
      "loss": 0.2842,
      "step": 6000
    },
    {
      "epoch": 19.0,
      "eval_accuracy": 0.9744,
      "eval_loss": 0.07423894852399826,
      "eval_runtime": 13.6253,
      "eval_samples_per_second": 733.929,
      "eval_steps_per_second": 2.936,
      "step": 6327
    },
    {
      "epoch": 19.52,
      "grad_norm": 12.263895988464355,
      "learning_rate": 9.34934934934935e-06,
      "loss": 0.2712,
      "step": 6500
    },
    {
      "epoch": 20.0,
      "eval_accuracy": 0.9738,
      "eval_loss": 0.07846847176551819,
      "eval_runtime": 12.9608,
      "eval_samples_per_second": 771.556,
      "eval_steps_per_second": 3.086,
      "step": 6660
    },
    {
      "epoch": 21.0,
      "eval_accuracy": 0.9735,
      "eval_loss": 0.07904864102602005,
      "eval_runtime": 12.8724,
      "eval_samples_per_second": 776.858,
      "eval_steps_per_second": 3.107,
      "step": 6993
    },
    {
      "epoch": 21.02,
      "grad_norm": 10.845202445983887,
      "learning_rate": 9.2992992992993e-06,
      "loss": 0.2729,
      "step": 7000
    },
    {
      "epoch": 22.0,
      "eval_accuracy": 0.9759,
      "eval_loss": 0.07514221966266632,
      "eval_runtime": 14.1339,
      "eval_samples_per_second": 707.52,
      "eval_steps_per_second": 2.83,
      "step": 7326
    },
    {
      "epoch": 22.52,
      "grad_norm": 11.12897777557373,
      "learning_rate": 9.24924924924925e-06,
      "loss": 0.2634,
      "step": 7500
    },
    {
      "epoch": 23.0,
      "eval_accuracy": 0.9737,
      "eval_loss": 0.07959982007741928,
      "eval_runtime": 13.0065,
      "eval_samples_per_second": 768.844,
      "eval_steps_per_second": 3.075,
      "step": 7659
    },
    {
      "epoch": 24.0,
      "eval_accuracy": 0.9752,
      "eval_loss": 0.07558540254831314,
      "eval_runtime": 13.805,
      "eval_samples_per_second": 724.375,
      "eval_steps_per_second": 2.897,
      "step": 7992
    },
    {
      "epoch": 24.02,
      "grad_norm": 10.100821495056152,
      "learning_rate": 9.1991991991992e-06,
      "loss": 0.2591,
      "step": 8000
    },
    {
      "epoch": 25.0,
      "eval_accuracy": 0.9759,
      "eval_loss": 0.07549387961626053,
      "eval_runtime": 13.4677,
      "eval_samples_per_second": 742.518,
      "eval_steps_per_second": 2.97,
      "step": 8325
    },
    {
      "epoch": 25.53,
      "grad_norm": 9.881790161132812,
      "learning_rate": 9.14914914914915e-06,
      "loss": 0.253,
      "step": 8500
    },
    {
      "epoch": 26.0,
      "eval_accuracy": 0.9746,
      "eval_loss": 0.07933681458234787,
      "eval_runtime": 13.2517,
      "eval_samples_per_second": 754.619,
      "eval_steps_per_second": 3.018,
      "step": 8658
    },
    {
      "epoch": 27.0,
      "eval_accuracy": 0.9765,
      "eval_loss": 0.07278025895357132,
      "eval_runtime": 13.5258,
      "eval_samples_per_second": 739.327,
      "eval_steps_per_second": 2.957,
      "step": 8991
    },
    {
      "epoch": 27.03,
      "grad_norm": 7.72860860824585,
      "learning_rate": 9.0990990990991e-06,
      "loss": 0.2518,
      "step": 9000
    },
    {
      "epoch": 28.0,
      "eval_accuracy": 0.9748,
      "eval_loss": 0.07914856821298599,
      "eval_runtime": 13.7348,
      "eval_samples_per_second": 728.079,
      "eval_steps_per_second": 2.912,
      "step": 9324
    },
    {
      "epoch": 28.53,
      "grad_norm": 8.068327903747559,
      "learning_rate": 9.04904904904905e-06,
      "loss": 0.2482,
      "step": 9500
    },
    {
      "epoch": 29.0,
      "eval_accuracy": 0.9756,
      "eval_loss": 0.07918867468833923,
      "eval_runtime": 13.3633,
      "eval_samples_per_second": 748.316,
      "eval_steps_per_second": 2.993,
      "step": 9657
    },
    {
      "epoch": 30.0,
      "eval_accuracy": 0.9764,
      "eval_loss": 0.07418718934059143,
      "eval_runtime": 12.9493,
      "eval_samples_per_second": 772.24,
      "eval_steps_per_second": 3.089,
      "step": 9990
    },
    {
      "epoch": 30.03,
      "grad_norm": 8.977522850036621,
      "learning_rate": 8.998998998999e-06,
      "loss": 0.2429,
      "step": 10000
    },
    {
      "epoch": 31.0,
      "eval_accuracy": 0.9757,
      "eval_loss": 0.07399851083755493,
      "eval_runtime": 13.4787,
      "eval_samples_per_second": 741.913,
      "eval_steps_per_second": 2.968,
      "step": 10323
    },
    {
      "epoch": 31.53,
      "grad_norm": 11.080597877502441,
      "learning_rate": 8.94894894894895e-06,
      "loss": 0.2405,
      "step": 10500
    },
    {
      "epoch": 32.0,
      "eval_accuracy": 0.9757,
      "eval_loss": 0.07426943629980087,
      "eval_runtime": 12.8343,
      "eval_samples_per_second": 779.16,
      "eval_steps_per_second": 3.117,
      "step": 10656
    },
    {
      "epoch": 33.0,
      "eval_accuracy": 0.9757,
      "eval_loss": 0.07429418712854385,
      "eval_runtime": 12.9825,
      "eval_samples_per_second": 770.266,
      "eval_steps_per_second": 3.081,
      "step": 10989
    },
    {
      "epoch": 33.03,
      "grad_norm": 7.3039140701293945,
      "learning_rate": 8.8988988988989e-06,
      "loss": 0.234,
      "step": 11000
    },
    {
      "epoch": 34.0,
      "eval_accuracy": 0.9769,
      "eval_loss": 0.07486932724714279,
      "eval_runtime": 12.96,
      "eval_samples_per_second": 771.606,
      "eval_steps_per_second": 3.086,
      "step": 11322
    },
    {
      "epoch": 34.53,
      "grad_norm": 8.610194206237793,
      "learning_rate": 8.84884884884885e-06,
      "loss": 0.2353,
      "step": 11500
    },
    {
      "epoch": 35.0,
      "eval_accuracy": 0.975,
      "eval_loss": 0.0768030509352684,
      "eval_runtime": 13.519,
      "eval_samples_per_second": 739.698,
      "eval_steps_per_second": 2.959,
      "step": 11655
    },
    {
      "epoch": 36.0,
      "eval_accuracy": 0.9771,
      "eval_loss": 0.07342812418937683,
      "eval_runtime": 14.3472,
      "eval_samples_per_second": 697.001,
      "eval_steps_per_second": 2.788,
      "step": 11988
    },
    {
      "epoch": 36.04,
      "grad_norm": 7.767194747924805,
      "learning_rate": 8.798798798798799e-06,
      "loss": 0.2329,
      "step": 12000
    },
    {
      "epoch": 37.0,
      "eval_accuracy": 0.9755,
      "eval_loss": 0.07778933644294739,
      "eval_runtime": 13.5633,
      "eval_samples_per_second": 737.284,
      "eval_steps_per_second": 2.949,
      "step": 12321
    },
    {
      "epoch": 37.54,
      "grad_norm": 11.39279842376709,
      "learning_rate": 8.74874874874875e-06,
      "loss": 0.2289,
      "step": 12500
    },
    {
      "epoch": 38.0,
      "eval_accuracy": 0.9771,
      "eval_loss": 0.07622923702001572,
      "eval_runtime": 13.5603,
      "eval_samples_per_second": 737.447,
      "eval_steps_per_second": 2.95,
      "step": 12654
    },
    {
      "epoch": 39.0,
      "eval_accuracy": 0.9761,
      "eval_loss": 0.07648137956857681,
      "eval_runtime": 13.4622,
      "eval_samples_per_second": 742.82,
      "eval_steps_per_second": 2.971,
      "step": 12987
    },
    {
      "epoch": 39.04,
      "grad_norm": 8.879070281982422,
      "learning_rate": 8.6986986986987e-06,
      "loss": 0.227,
      "step": 13000
    },
    {
      "epoch": 40.0,
      "eval_accuracy": 0.9768,
      "eval_loss": 0.07394447922706604,
      "eval_runtime": 13.4641,
      "eval_samples_per_second": 742.715,
      "eval_steps_per_second": 2.971,
      "step": 13320
    },
    {
      "epoch": 40.54,
      "grad_norm": 10.858572006225586,
      "learning_rate": 8.64864864864865e-06,
      "loss": 0.2213,
      "step": 13500
    },
    {
      "epoch": 41.0,
      "eval_accuracy": 0.9773,
      "eval_loss": 0.07473840564489365,
      "eval_runtime": 12.9211,
      "eval_samples_per_second": 773.93,
      "eval_steps_per_second": 3.096,
      "step": 13653
    },
    {
      "epoch": 42.0,
      "eval_accuracy": 0.9786,
      "eval_loss": 0.07195272296667099,
      "eval_runtime": 13.3716,
      "eval_samples_per_second": 747.852,
      "eval_steps_per_second": 2.991,
      "step": 13986
    },
    {
      "epoch": 42.04,
      "grad_norm": 9.299273490905762,
      "learning_rate": 8.5985985985986e-06,
      "loss": 0.217,
      "step": 14000
    },
    {
      "epoch": 43.0,
      "eval_accuracy": 0.9771,
      "eval_loss": 0.07661354541778564,
      "eval_runtime": 13.4888,
      "eval_samples_per_second": 741.354,
      "eval_steps_per_second": 2.965,
      "step": 14319
    },
    {
      "epoch": 43.54,
      "grad_norm": 9.49695873260498,
      "learning_rate": 8.54854854854855e-06,
      "loss": 0.22,
      "step": 14500
    },
    {
      "epoch": 44.0,
      "eval_accuracy": 0.9767,
      "eval_loss": 0.07640816271305084,
      "eval_runtime": 14.0377,
      "eval_samples_per_second": 712.365,
      "eval_steps_per_second": 2.849,
      "step": 14652
    },
    {
      "epoch": 45.0,
      "eval_accuracy": 0.9779,
      "eval_loss": 0.07278802245855331,
      "eval_runtime": 13.4886,
      "eval_samples_per_second": 741.366,
      "eval_steps_per_second": 2.965,
      "step": 14985
    },
    {
      "epoch": 45.05,
      "grad_norm": 12.065461158752441,
      "learning_rate": 8.4984984984985e-06,
      "loss": 0.2179,
      "step": 15000
    },
    {
      "epoch": 46.0,
      "eval_accuracy": 0.9785,
      "eval_loss": 0.0740213543176651,
      "eval_runtime": 14.112,
      "eval_samples_per_second": 708.617,
      "eval_steps_per_second": 2.834,
      "step": 15318
    },
    {
      "epoch": 46.55,
      "grad_norm": 9.281307220458984,
      "learning_rate": 8.44844844844845e-06,
      "loss": 0.2074,
      "step": 15500
    },
    {
      "epoch": 47.0,
      "eval_accuracy": 0.9793,
      "eval_loss": 0.0712471604347229,
      "eval_runtime": 13.5017,
      "eval_samples_per_second": 740.647,
      "eval_steps_per_second": 2.963,
      "step": 15651
    },
    {
      "epoch": 48.0,
      "eval_accuracy": 0.9783,
      "eval_loss": 0.0759299248456955,
      "eval_runtime": 13.3849,
      "eval_samples_per_second": 747.113,
      "eval_steps_per_second": 2.988,
      "step": 15984
    },
    {
      "epoch": 48.05,
      "grad_norm": 6.8984503746032715,
      "learning_rate": 8.398398398398398e-06,
      "loss": 0.2096,
      "step": 16000
    },
    {
      "epoch": 49.0,
      "eval_accuracy": 0.9791,
      "eval_loss": 0.07268951088190079,
      "eval_runtime": 13.5376,
      "eval_samples_per_second": 738.686,
      "eval_steps_per_second": 2.955,
      "step": 16317
    },
    {
      "epoch": 49.55,
      "grad_norm": 8.968807220458984,
      "learning_rate": 8.348348348348348e-06,
      "loss": 0.2097,
      "step": 16500
    },
    {
      "epoch": 50.0,
      "eval_accuracy": 0.9792,
      "eval_loss": 0.07472656667232513,
      "eval_runtime": 13.5262,
      "eval_samples_per_second": 739.304,
      "eval_steps_per_second": 2.957,
      "step": 16650
    },
    {
      "epoch": 51.0,
      "eval_accuracy": 0.9795,
      "eval_loss": 0.0754549577832222,
      "eval_runtime": 13.1606,
      "eval_samples_per_second": 759.845,
      "eval_steps_per_second": 3.039,
      "step": 16983
    },
    {
      "epoch": 51.05,
      "grad_norm": 8.540103912353516,
      "learning_rate": 8.298298298298298e-06,
      "loss": 0.2063,
      "step": 17000
    },
    {
      "epoch": 52.0,
      "eval_accuracy": 0.9788,
      "eval_loss": 0.0741283968091011,
      "eval_runtime": 13.8466,
      "eval_samples_per_second": 722.201,
      "eval_steps_per_second": 2.889,
      "step": 17316
    },
    {
      "epoch": 52.55,
      "grad_norm": 7.042116165161133,
      "learning_rate": 8.248248248248248e-06,
      "loss": 0.2054,
      "step": 17500
    },
    {
      "epoch": 53.0,
      "eval_accuracy": 0.9784,
      "eval_loss": 0.0738772302865982,
      "eval_runtime": 13.021,
      "eval_samples_per_second": 767.992,
      "eval_steps_per_second": 3.072,
      "step": 17649
    },
    {
      "epoch": 54.0,
      "eval_accuracy": 0.9779,
      "eval_loss": 0.07553113251924515,
      "eval_runtime": 12.8958,
      "eval_samples_per_second": 775.444,
      "eval_steps_per_second": 3.102,
      "step": 17982
    },
    {
      "epoch": 54.05,
      "grad_norm": 9.23681640625,
      "learning_rate": 8.198198198198198e-06,
      "loss": 0.2003,
      "step": 18000
    },
    {
      "epoch": 55.0,
      "eval_accuracy": 0.9784,
      "eval_loss": 0.07760650664567947,
      "eval_runtime": 12.6634,
      "eval_samples_per_second": 789.678,
      "eval_steps_per_second": 3.159,
      "step": 18315
    },
    {
      "epoch": 55.56,
      "grad_norm": 5.839297771453857,
      "learning_rate": 8.148148148148148e-06,
      "loss": 0.2009,
      "step": 18500
    },
    {
      "epoch": 56.0,
      "eval_accuracy": 0.9786,
      "eval_loss": 0.07352690398693085,
      "eval_runtime": 13.1656,
      "eval_samples_per_second": 759.554,
      "eval_steps_per_second": 3.038,
      "step": 18648
    },
    {
      "epoch": 57.0,
      "eval_accuracy": 0.9769,
      "eval_loss": 0.07721856981515884,
      "eval_runtime": 12.8626,
      "eval_samples_per_second": 777.447,
      "eval_steps_per_second": 3.11,
      "step": 18981
    },
    {
      "epoch": 57.06,
      "grad_norm": 10.131054878234863,
      "learning_rate": 8.098098098098098e-06,
      "loss": 0.1999,
      "step": 19000
    },
    {
      "epoch": 58.0,
      "eval_accuracy": 0.9789,
      "eval_loss": 0.07691636681556702,
      "eval_runtime": 12.8042,
      "eval_samples_per_second": 780.991,
      "eval_steps_per_second": 3.124,
      "step": 19314
    },
    {
      "epoch": 58.56,
      "grad_norm": 7.643968105316162,
      "learning_rate": 8.048048048048048e-06,
      "loss": 0.1973,
      "step": 19500
    },
    {
      "epoch": 59.0,
      "eval_accuracy": 0.9793,
      "eval_loss": 0.07336228340864182,
      "eval_runtime": 13.6825,
      "eval_samples_per_second": 730.861,
      "eval_steps_per_second": 2.923,
      "step": 19647
    },
    {
      "epoch": 60.0,
      "eval_accuracy": 0.9787,
      "eval_loss": 0.07408491522073746,
      "eval_runtime": 13.4079,
      "eval_samples_per_second": 745.831,
      "eval_steps_per_second": 2.983,
      "step": 19980
    },
    {
      "epoch": 60.06,
      "grad_norm": 9.443299293518066,
      "learning_rate": 7.997997997997999e-06,
      "loss": 0.1953,
      "step": 20000
    },
    {
      "epoch": 61.0,
      "eval_accuracy": 0.978,
      "eval_loss": 0.07513260841369629,
      "eval_runtime": 13.4048,
      "eval_samples_per_second": 745.999,
      "eval_steps_per_second": 2.984,
      "step": 20313
    },
    {
      "epoch": 61.56,
      "grad_norm": 16.85797119140625,
      "learning_rate": 7.947947947947949e-06,
      "loss": 0.1937,
      "step": 20500
    },
    {
      "epoch": 62.0,
      "eval_accuracy": 0.9786,
      "eval_loss": 0.07370081543922424,
      "eval_runtime": 13.3055,
      "eval_samples_per_second": 751.568,
      "eval_steps_per_second": 3.006,
      "step": 20646
    },
    {
      "epoch": 63.0,
      "eval_accuracy": 0.9786,
      "eval_loss": 0.07323586940765381,
      "eval_runtime": 12.7695,
      "eval_samples_per_second": 783.119,
      "eval_steps_per_second": 3.132,
      "step": 20979
    },
    {
      "epoch": 63.06,
      "grad_norm": 8.4561128616333,
      "learning_rate": 7.897897897897899e-06,
      "loss": 0.1946,
      "step": 21000
    },
    {
      "epoch": 64.0,
      "eval_accuracy": 0.9789,
      "eval_loss": 0.07585693150758743,
      "eval_runtime": 13.6921,
      "eval_samples_per_second": 730.349,
      "eval_steps_per_second": 2.921,
      "step": 21312
    },
    {
      "epoch": 64.56,
      "grad_norm": 11.68150806427002,
      "learning_rate": 7.847847847847849e-06,
      "loss": 0.1909,
      "step": 21500
    },
    {
      "epoch": 65.0,
      "eval_accuracy": 0.9798,
      "eval_loss": 0.07349375635385513,
      "eval_runtime": 12.8445,
      "eval_samples_per_second": 778.544,
      "eval_steps_per_second": 3.114,
      "step": 21645
    },
    {
      "epoch": 66.0,
      "eval_accuracy": 0.9788,
      "eval_loss": 0.07336971163749695,
      "eval_runtime": 12.8882,
      "eval_samples_per_second": 775.905,
      "eval_steps_per_second": 3.104,
      "step": 21978
    },
    {
      "epoch": 66.07,
      "grad_norm": 8.738271713256836,
      "learning_rate": 7.797797797797799e-06,
      "loss": 0.1935,
      "step": 22000
    },
    {
      "epoch": 67.0,
      "eval_accuracy": 0.9793,
      "eval_loss": 0.07337453961372375,
      "eval_runtime": 12.8166,
      "eval_samples_per_second": 780.24,
      "eval_steps_per_second": 3.121,
      "step": 22311
    },
    {
      "epoch": 67.57,
      "grad_norm": 6.386814117431641,
      "learning_rate": 7.747747747747749e-06,
      "loss": 0.1936,
      "step": 22500
    },
    {
      "epoch": 68.0,
      "eval_accuracy": 0.9795,
      "eval_loss": 0.07239189743995667,
      "eval_runtime": 12.831,
      "eval_samples_per_second": 779.362,
      "eval_steps_per_second": 3.117,
      "step": 22644
    },
    {
      "epoch": 69.0,
      "eval_accuracy": 0.9785,
      "eval_loss": 0.07570048421621323,
      "eval_runtime": 12.8964,
      "eval_samples_per_second": 775.412,
      "eval_steps_per_second": 3.102,
      "step": 22977
    },
    {
      "epoch": 69.07,
      "grad_norm": 9.476435661315918,
      "learning_rate": 7.697697697697697e-06,
      "loss": 0.1858,
      "step": 23000
    },
    {
      "epoch": 70.0,
      "eval_accuracy": 0.9801,
      "eval_loss": 0.07100442796945572,
      "eval_runtime": 13.0999,
      "eval_samples_per_second": 763.367,
      "eval_steps_per_second": 3.053,
      "step": 23310
    },
    {
      "epoch": 70.57,
      "grad_norm": 9.190871238708496,
      "learning_rate": 7.647647647647647e-06,
      "loss": 0.1871,
      "step": 23500
    },
    {
      "epoch": 71.0,
      "eval_accuracy": 0.9799,
      "eval_loss": 0.07596922665834427,
      "eval_runtime": 13.423,
      "eval_samples_per_second": 744.991,
      "eval_steps_per_second": 2.98,
      "step": 23643
    },
    {
      "epoch": 72.0,
      "eval_accuracy": 0.9801,
      "eval_loss": 0.07650475203990936,
      "eval_runtime": 13.055,
      "eval_samples_per_second": 765.99,
      "eval_steps_per_second": 3.064,
      "step": 23976
    },
    {
      "epoch": 72.07,
      "grad_norm": 10.302529335021973,
      "learning_rate": 7.597597597597598e-06,
      "loss": 0.1836,
      "step": 24000
    },
    {
      "epoch": 73.0,
      "eval_accuracy": 0.9787,
      "eval_loss": 0.07714686542749405,
      "eval_runtime": 13.4272,
      "eval_samples_per_second": 744.757,
      "eval_steps_per_second": 2.979,
      "step": 24309
    },
    {
      "epoch": 73.57,
      "grad_norm": 7.050232410430908,
      "learning_rate": 7.547547547547548e-06,
      "loss": 0.1827,
      "step": 24500
    },
    {
      "epoch": 74.0,
      "eval_accuracy": 0.9782,
      "eval_loss": 0.07620517909526825,
      "eval_runtime": 12.8858,
      "eval_samples_per_second": 776.045,
      "eval_steps_per_second": 3.104,
      "step": 24642
    },
    {
      "epoch": 75.0,
      "eval_accuracy": 0.9781,
      "eval_loss": 0.0778127908706665,
      "eval_runtime": 13.234,
      "eval_samples_per_second": 755.629,
      "eval_steps_per_second": 3.023,
      "step": 24975
    },
    {
      "epoch": 75.08,
      "grad_norm": 8.824182510375977,
      "learning_rate": 7.4974974974974975e-06,
      "loss": 0.1847,
      "step": 25000
    },
    {
      "epoch": 76.0,
      "eval_accuracy": 0.9781,
      "eval_loss": 0.08140425384044647,
      "eval_runtime": 13.9137,
      "eval_samples_per_second": 718.714,
      "eval_steps_per_second": 2.875,
      "step": 25308
    },
    {
      "epoch": 76.58,
      "grad_norm": 8.920430183410645,
      "learning_rate": 7.447447447447448e-06,
      "loss": 0.1815,
      "step": 25500
    },
    {
      "epoch": 77.0,
      "eval_accuracy": 0.9788,
      "eval_loss": 0.07689312100410461,
      "eval_runtime": 13.1404,
      "eval_samples_per_second": 761.014,
      "eval_steps_per_second": 3.044,
      "step": 25641
    },
    {
      "epoch": 78.0,
      "eval_accuracy": 0.9801,
      "eval_loss": 0.07370501756668091,
      "eval_runtime": 13.7683,
      "eval_samples_per_second": 726.307,
      "eval_steps_per_second": 2.905,
      "step": 25974
    },
    {
      "epoch": 78.08,
      "grad_norm": 9.352115631103516,
      "learning_rate": 7.397397397397398e-06,
      "loss": 0.1786,
      "step": 26000
    },
    {
      "epoch": 79.0,
      "eval_accuracy": 0.9795,
      "eval_loss": 0.07396883517503738,
      "eval_runtime": 13.0019,
      "eval_samples_per_second": 769.121,
      "eval_steps_per_second": 3.076,
      "step": 26307
    },
    {
      "epoch": 79.58,
      "grad_norm": 14.500313758850098,
      "learning_rate": 7.347347347347348e-06,
      "loss": 0.1819,
      "step": 26500
    },
    {
      "epoch": 80.0,
      "eval_accuracy": 0.9807,
      "eval_loss": 0.07097125053405762,
      "eval_runtime": 13.6192,
      "eval_samples_per_second": 734.256,
      "eval_steps_per_second": 2.937,
      "step": 26640
    },
    {
      "epoch": 81.0,
      "eval_accuracy": 0.9799,
      "eval_loss": 0.07538946717977524,
      "eval_runtime": 13.1675,
      "eval_samples_per_second": 759.445,
      "eval_steps_per_second": 3.038,
      "step": 26973
    },
    {
      "epoch": 81.08,
      "grad_norm": 6.939184188842773,
      "learning_rate": 7.297297297297298e-06,
      "loss": 0.1767,
      "step": 27000
    },
    {
      "epoch": 82.0,
      "eval_accuracy": 0.9789,
      "eval_loss": 0.07721950113773346,
      "eval_runtime": 13.1312,
      "eval_samples_per_second": 761.542,
      "eval_steps_per_second": 3.046,
      "step": 27306
    },
    {
      "epoch": 82.58,
      "grad_norm": 6.59556770324707,
      "learning_rate": 7.247247247247248e-06,
      "loss": 0.1792,
      "step": 27500
    },
    {
      "epoch": 83.0,
      "eval_accuracy": 0.9799,
      "eval_loss": 0.0765281617641449,
      "eval_runtime": 13.0707,
      "eval_samples_per_second": 765.07,
      "eval_steps_per_second": 3.06,
      "step": 27639
    },
    {
      "epoch": 84.0,
      "eval_accuracy": 0.9799,
      "eval_loss": 0.07408629357814789,
      "eval_runtime": 12.4754,
      "eval_samples_per_second": 801.579,
      "eval_steps_per_second": 3.206,
      "step": 27972
    },
    {
      "epoch": 84.08,
      "grad_norm": 7.376372814178467,
      "learning_rate": 7.197197197197198e-06,
      "loss": 0.1752,
      "step": 28000
    },
    {
      "epoch": 85.0,
      "eval_accuracy": 0.9795,
      "eval_loss": 0.0741427093744278,
      "eval_runtime": 12.8952,
      "eval_samples_per_second": 775.481,
      "eval_steps_per_second": 3.102,
      "step": 28305
    },
    {
      "epoch": 85.59,
      "grad_norm": 11.074542045593262,
      "learning_rate": 7.147147147147148e-06,
      "loss": 0.1789,
      "step": 28500
    },
    {
      "epoch": 86.0,
      "eval_accuracy": 0.9802,
      "eval_loss": 0.07748846709728241,
      "eval_runtime": 12.5701,
      "eval_samples_per_second": 795.538,
      "eval_steps_per_second": 3.182,
      "step": 28638
    },
    {
      "epoch": 87.0,
      "eval_accuracy": 0.9803,
      "eval_loss": 0.07314252853393555,
      "eval_runtime": 13.4921,
      "eval_samples_per_second": 741.174,
      "eval_steps_per_second": 2.965,
      "step": 28971
    },
    {
      "epoch": 87.09,
      "grad_norm": 8.502799987792969,
      "learning_rate": 7.097097097097097e-06,
      "loss": 0.1755,
      "step": 29000
    },
    {
      "epoch": 88.0,
      "eval_accuracy": 0.9806,
      "eval_loss": 0.07246743142604828,
      "eval_runtime": 14.1164,
      "eval_samples_per_second": 708.396,
      "eval_steps_per_second": 2.834,
      "step": 29304
    },
    {
      "epoch": 88.59,
      "grad_norm": 10.004383087158203,
      "learning_rate": 7.047047047047047e-06,
      "loss": 0.1694,
      "step": 29500
    },
    {
      "epoch": 89.0,
      "eval_accuracy": 0.9795,
      "eval_loss": 0.07495511323213577,
      "eval_runtime": 13.0203,
      "eval_samples_per_second": 768.031,
      "eval_steps_per_second": 3.072,
      "step": 29637
    },
    {
      "epoch": 90.0,
      "eval_accuracy": 0.9815,
      "eval_loss": 0.07112333923578262,
      "eval_runtime": 12.8831,
      "eval_samples_per_second": 776.209,
      "eval_steps_per_second": 3.105,
      "step": 29970
    },
    {
      "epoch": 90.09,
      "grad_norm": 10.05745792388916,
      "learning_rate": 6.996996996996997e-06,
      "loss": 0.1739,
      "step": 30000
    },
    {
      "epoch": 91.0,
      "eval_accuracy": 0.98,
      "eval_loss": 0.07672711461782455,
      "eval_runtime": 13.4897,
      "eval_samples_per_second": 741.306,
      "eval_steps_per_second": 2.965,
      "step": 30303
    },
    {
      "epoch": 91.59,
      "grad_norm": 7.928704738616943,
      "learning_rate": 6.9469469469469474e-06,
      "loss": 0.1726,
      "step": 30500
    },
    {
      "epoch": 92.0,
      "eval_accuracy": 0.9801,
      "eval_loss": 0.0770508348941803,
      "eval_runtime": 13.4469,
      "eval_samples_per_second": 743.667,
      "eval_steps_per_second": 2.975,
      "step": 30636
    },
    {
      "epoch": 93.0,
      "eval_accuracy": 0.9786,
      "eval_loss": 0.0784955620765686,
      "eval_runtime": 13.8288,
      "eval_samples_per_second": 723.131,
      "eval_steps_per_second": 2.893,
      "step": 30969
    },
    {
      "epoch": 93.09,
      "grad_norm": 9.178421974182129,
      "learning_rate": 6.8968968968968975e-06,
      "loss": 0.1696,
      "step": 31000
    },
    {
      "epoch": 94.0,
      "eval_accuracy": 0.9787,
      "eval_loss": 0.07988455891609192,
      "eval_runtime": 13.675,
      "eval_samples_per_second": 731.261,
      "eval_steps_per_second": 2.925,
      "step": 31302
    },
    {
      "epoch": 94.59,
      "grad_norm": 7.237130165100098,
      "learning_rate": 6.846846846846848e-06,
      "loss": 0.1723,
      "step": 31500
    },
    {
      "epoch": 95.0,
      "eval_accuracy": 0.979,
      "eval_loss": 0.07755716890096664,
      "eval_runtime": 13.4765,
      "eval_samples_per_second": 742.035,
      "eval_steps_per_second": 2.968,
      "step": 31635
    },
    {
      "epoch": 96.0,
      "eval_accuracy": 0.9796,
      "eval_loss": 0.07740277796983719,
      "eval_runtime": 13.7122,
      "eval_samples_per_second": 729.276,
      "eval_steps_per_second": 2.917,
      "step": 31968
    },
    {
      "epoch": 96.1,
      "grad_norm": 6.332306385040283,
      "learning_rate": 6.796796796796798e-06,
      "loss": 0.1692,
      "step": 32000
    },
    {
      "epoch": 97.0,
      "eval_accuracy": 0.9797,
      "eval_loss": 0.08065084367990494,
      "eval_runtime": 12.8364,
      "eval_samples_per_second": 779.036,
      "eval_steps_per_second": 3.116,
      "step": 32301
    },
    {
      "epoch": 97.6,
| "grad_norm": 6.978306770324707, | |
| "learning_rate": 6.746746746746748e-06, | |
| "loss": 0.17, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 98.0, | |
| "eval_accuracy": 0.9798, | |
| "eval_loss": 0.07497260719537735, | |
| "eval_runtime": 12.9795, | |
| "eval_samples_per_second": 770.446, | |
| "eval_steps_per_second": 3.082, | |
| "step": 32634 | |
| }, | |
| { | |
| "epoch": 99.0, | |
| "eval_accuracy": 0.9805, | |
| "eval_loss": 0.07648865878582001, | |
| "eval_runtime": 13.1724, | |
| "eval_samples_per_second": 759.163, | |
| "eval_steps_per_second": 3.037, | |
| "step": 32967 | |
| }, | |
| { | |
| "epoch": 99.1, | |
| "grad_norm": 9.569737434387207, | |
| "learning_rate": 6.696696696696697e-06, | |
| "loss": 0.1691, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 100.0, | |
| "eval_accuracy": 0.9798, | |
| "eval_loss": 0.07629863917827606, | |
| "eval_runtime": 13.7613, | |
| "eval_samples_per_second": 726.674, | |
| "eval_steps_per_second": 2.907, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 100.6, | |
| "grad_norm": 9.273295402526855, | |
| "learning_rate": 6.646646646646647e-06, | |
| "loss": 0.165, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 101.0, | |
| "eval_accuracy": 0.9794, | |
| "eval_loss": 0.07651650160551071, | |
| "eval_runtime": 12.8929, | |
| "eval_samples_per_second": 775.622, | |
| "eval_steps_per_second": 3.102, | |
| "step": 33633 | |
| }, | |
| { | |
| "epoch": 102.0, | |
| "eval_accuracy": 0.9806, | |
| "eval_loss": 0.07412749528884888, | |
| "eval_runtime": 13.1273, | |
| "eval_samples_per_second": 761.772, | |
| "eval_steps_per_second": 3.047, | |
| "step": 33966 | |
| }, | |
| { | |
| "epoch": 102.1, | |
| "grad_norm": 5.686313152313232, | |
| "learning_rate": 6.596596596596597e-06, | |
| "loss": 0.1678, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 103.0, | |
| "eval_accuracy": 0.9805, | |
| "eval_loss": 0.07281830161809921, | |
| "eval_runtime": 14.1227, | |
| "eval_samples_per_second": 708.079, | |
| "eval_steps_per_second": 2.832, | |
| "step": 34299 | |
| }, | |
| { | |
| "epoch": 103.6, | |
| "grad_norm": 13.40892505645752, | |
| "learning_rate": 6.546546546546547e-06, | |
| "loss": 0.1663, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 104.0, | |
| "eval_accuracy": 0.9803, | |
| "eval_loss": 0.07456088066101074, | |
| "eval_runtime": 12.9479, | |
| "eval_samples_per_second": 772.329, | |
| "eval_steps_per_second": 3.089, | |
| "step": 34632 | |
| }, | |
| { | |
| "epoch": 105.0, | |
| "eval_accuracy": 0.9796, | |
| "eval_loss": 0.07469187676906586, | |
| "eval_runtime": 13.464, | |
| "eval_samples_per_second": 742.723, | |
| "eval_steps_per_second": 2.971, | |
| "step": 34965 | |
| }, | |
| { | |
| "epoch": 105.11, | |
| "grad_norm": 3.3622846603393555, | |
| "learning_rate": 6.496496496496497e-06, | |
| "loss": 0.1697, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 106.0, | |
| "eval_accuracy": 0.98, | |
| "eval_loss": 0.07429709285497665, | |
| "eval_runtime": 12.5016, | |
| "eval_samples_per_second": 799.901, | |
| "eval_steps_per_second": 3.2, | |
| "step": 35298 | |
| }, | |
| { | |
| "epoch": 106.61, | |
| "grad_norm": 13.544451713562012, | |
| "learning_rate": 6.446446446446447e-06, | |
| "loss": 0.1637, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 107.0, | |
| "eval_accuracy": 0.9796, | |
| "eval_loss": 0.07689350843429565, | |
| "eval_runtime": 13.0156, | |
| "eval_samples_per_second": 768.306, | |
| "eval_steps_per_second": 3.073, | |
| "step": 35631 | |
| }, | |
| { | |
| "epoch": 108.0, | |
| "eval_accuracy": 0.9802, | |
| "eval_loss": 0.07509542256593704, | |
| "eval_runtime": 13.0921, | |
| "eval_samples_per_second": 763.817, | |
| "eval_steps_per_second": 3.055, | |
| "step": 35964 | |
| }, | |
| { | |
| "epoch": 108.11, | |
| "grad_norm": 11.040998458862305, | |
| "learning_rate": 6.396396396396397e-06, | |
| "loss": 0.1678, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 109.0, | |
| "eval_accuracy": 0.9807, | |
| "eval_loss": 0.0769224464893341, | |
| "eval_runtime": 13.4563, | |
| "eval_samples_per_second": 743.145, | |
| "eval_steps_per_second": 2.973, | |
| "step": 36297 | |
| }, | |
| { | |
| "epoch": 109.61, | |
| "grad_norm": 7.243069171905518, | |
| "learning_rate": 6.3463463463463474e-06, | |
| "loss": 0.1674, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 110.0, | |
| "eval_accuracy": 0.9808, | |
| "eval_loss": 0.07392393797636032, | |
| "eval_runtime": 12.9386, | |
| "eval_samples_per_second": 772.879, | |
| "eval_steps_per_second": 3.092, | |
| "step": 36630 | |
| }, | |
| { | |
| "epoch": 111.0, | |
| "eval_accuracy": 0.9795, | |
| "eval_loss": 0.0809590220451355, | |
| "eval_runtime": 13.2637, | |
| "eval_samples_per_second": 753.935, | |
| "eval_steps_per_second": 3.016, | |
| "step": 36963 | |
| }, | |
| { | |
| "epoch": 111.11, | |
| "grad_norm": 8.149242401123047, | |
| "learning_rate": 6.296296296296297e-06, | |
| "loss": 0.1604, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 112.0, | |
| "eval_accuracy": 0.9806, | |
| "eval_loss": 0.07439053803682327, | |
| "eval_runtime": 12.9959, | |
| "eval_samples_per_second": 769.471, | |
| "eval_steps_per_second": 3.078, | |
| "step": 37296 | |
| }, | |
| { | |
| "epoch": 112.61, | |
| "grad_norm": 6.591969966888428, | |
| "learning_rate": 6.246246246246247e-06, | |
| "loss": 0.1583, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 113.0, | |
| "eval_accuracy": 0.9816, | |
| "eval_loss": 0.07411955296993256, | |
| "eval_runtime": 12.6355, | |
| "eval_samples_per_second": 791.421, | |
| "eval_steps_per_second": 3.166, | |
| "step": 37629 | |
| }, | |
| { | |
| "epoch": 114.0, | |
| "eval_accuracy": 0.98, | |
| "eval_loss": 0.07842327654361725, | |
| "eval_runtime": 14.6497, | |
| "eval_samples_per_second": 682.608, | |
| "eval_steps_per_second": 2.73, | |
| "step": 37962 | |
| }, | |
| { | |
| "epoch": 114.11, | |
| "grad_norm": 8.455940246582031, | |
| "learning_rate": 6.196196196196197e-06, | |
| "loss": 0.1592, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 115.0, | |
| "eval_accuracy": 0.9818, | |
| "eval_loss": 0.07287651300430298, | |
| "eval_runtime": 13.5165, | |
| "eval_samples_per_second": 739.838, | |
| "eval_steps_per_second": 2.959, | |
| "step": 38295 | |
| }, | |
| { | |
| "epoch": 115.62, | |
| "grad_norm": 6.092105388641357, | |
| "learning_rate": 6.146146146146147e-06, | |
| "loss": 0.1607, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 116.0, | |
| "eval_accuracy": 0.9818, | |
| "eval_loss": 0.07438412308692932, | |
| "eval_runtime": 13.6093, | |
| "eval_samples_per_second": 734.794, | |
| "eval_steps_per_second": 2.939, | |
| "step": 38628 | |
| }, | |
| { | |
| "epoch": 117.0, | |
| "eval_accuracy": 0.9817, | |
| "eval_loss": 0.07355909794569016, | |
| "eval_runtime": 12.6698, | |
| "eval_samples_per_second": 789.279, | |
| "eval_steps_per_second": 3.157, | |
| "step": 38961 | |
| }, | |
| { | |
| "epoch": 117.12, | |
| "grad_norm": 7.972623348236084, | |
| "learning_rate": 6.096096096096097e-06, | |
| "loss": 0.1657, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 118.0, | |
| "eval_accuracy": 0.9805, | |
| "eval_loss": 0.0769243985414505, | |
| "eval_runtime": 13.3442, | |
| "eval_samples_per_second": 749.391, | |
| "eval_steps_per_second": 2.998, | |
| "step": 39294 | |
| }, | |
| { | |
| "epoch": 118.62, | |
| "grad_norm": 7.559940338134766, | |
| "learning_rate": 6.046046046046047e-06, | |
| "loss": 0.1605, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 119.0, | |
| "eval_accuracy": 0.9812, | |
| "eval_loss": 0.0768662765622139, | |
| "eval_runtime": 13.0344, | |
| "eval_samples_per_second": 767.2, | |
| "eval_steps_per_second": 3.069, | |
| "step": 39627 | |
| }, | |
| { | |
| "epoch": 120.0, | |
| "eval_accuracy": 0.9808, | |
| "eval_loss": 0.07865633815526962, | |
| "eval_runtime": 13.8055, | |
| "eval_samples_per_second": 724.347, | |
| "eval_steps_per_second": 2.897, | |
| "step": 39960 | |
| }, | |
| { | |
| "epoch": 120.12, | |
| "grad_norm": 7.175966739654541, | |
| "learning_rate": 5.995995995995997e-06, | |
| "loss": 0.1554, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 121.0, | |
| "eval_accuracy": 0.9801, | |
| "eval_loss": 0.07854399085044861, | |
| "eval_runtime": 12.8799, | |
| "eval_samples_per_second": 776.402, | |
| "eval_steps_per_second": 3.106, | |
| "step": 40293 | |
| }, | |
| { | |
| "epoch": 121.62, | |
| "grad_norm": 12.97214126586914, | |
| "learning_rate": 5.945945945945947e-06, | |
| "loss": 0.157, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 122.0, | |
| "eval_accuracy": 0.9796, | |
| "eval_loss": 0.0760401040315628, | |
| "eval_runtime": 12.9319, | |
| "eval_samples_per_second": 773.283, | |
| "eval_steps_per_second": 3.093, | |
| "step": 40626 | |
| }, | |
| { | |
| "epoch": 123.0, | |
| "eval_accuracy": 0.9805, | |
| "eval_loss": 0.07537718862295151, | |
| "eval_runtime": 12.9913, | |
| "eval_samples_per_second": 769.749, | |
| "eval_steps_per_second": 3.079, | |
| "step": 40959 | |
| }, | |
| { | |
| "epoch": 123.12, | |
| "grad_norm": 7.540937423706055, | |
| "learning_rate": 5.895895895895896e-06, | |
| "loss": 0.1549, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 124.0, | |
| "eval_accuracy": 0.9802, | |
| "eval_loss": 0.07550998032093048, | |
| "eval_runtime": 14.417, | |
| "eval_samples_per_second": 693.624, | |
| "eval_steps_per_second": 2.774, | |
| "step": 41292 | |
| }, | |
| { | |
| "epoch": 124.62, | |
| "grad_norm": 6.355432987213135, | |
| "learning_rate": 5.8458458458458464e-06, | |
| "loss": 0.1578, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 125.0, | |
| "eval_accuracy": 0.9792, | |
| "eval_loss": 0.07649920880794525, | |
| "eval_runtime": 13.4926, | |
| "eval_samples_per_second": 741.15, | |
| "eval_steps_per_second": 2.965, | |
| "step": 41625 | |
| }, | |
| { | |
| "epoch": 126.0, | |
| "eval_accuracy": 0.98, | |
| "eval_loss": 0.07526528090238571, | |
| "eval_runtime": 12.9079, | |
| "eval_samples_per_second": 774.72, | |
| "eval_steps_per_second": 3.099, | |
| "step": 41958 | |
| }, | |
| { | |
| "epoch": 126.13, | |
| "grad_norm": 6.478011131286621, | |
| "learning_rate": 5.7957957957957965e-06, | |
| "loss": 0.1531, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 127.0, | |
| "eval_accuracy": 0.98, | |
| "eval_loss": 0.07793418318033218, | |
| "eval_runtime": 13.471, | |
| "eval_samples_per_second": 742.337, | |
| "eval_steps_per_second": 2.969, | |
| "step": 42291 | |
| }, | |
| { | |
| "epoch": 127.63, | |
| "grad_norm": 7.928163051605225, | |
| "learning_rate": 5.7457457457457466e-06, | |
| "loss": 0.1572, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 128.0, | |
| "eval_accuracy": 0.98, | |
| "eval_loss": 0.07834824174642563, | |
| "eval_runtime": 13.8772, | |
| "eval_samples_per_second": 720.605, | |
| "eval_steps_per_second": 2.882, | |
| "step": 42624 | |
| }, | |
| { | |
| "epoch": 129.0, | |
| "eval_accuracy": 0.9796, | |
| "eval_loss": 0.0785522609949112, | |
| "eval_runtime": 12.947, | |
| "eval_samples_per_second": 772.377, | |
| "eval_steps_per_second": 3.09, | |
| "step": 42957 | |
| }, | |
| { | |
| "epoch": 129.13, | |
| "grad_norm": 19.900619506835938, | |
| "learning_rate": 5.695695695695697e-06, | |
| "loss": 0.1558, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 130.0, | |
| "eval_accuracy": 0.9814, | |
| "eval_loss": 0.0741908997297287, | |
| "eval_runtime": 12.8882, | |
| "eval_samples_per_second": 775.906, | |
| "eval_steps_per_second": 3.104, | |
| "step": 43290 | |
| }, | |
| { | |
| "epoch": 130.63, | |
| "grad_norm": 12.561553001403809, | |
| "learning_rate": 5.645645645645647e-06, | |
| "loss": 0.1515, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 131.0, | |
| "eval_accuracy": 0.9798, | |
| "eval_loss": 0.07759422063827515, | |
| "eval_runtime": 14.2426, | |
| "eval_samples_per_second": 702.121, | |
| "eval_steps_per_second": 2.808, | |
| "step": 43623 | |
| }, | |
| { | |
| "epoch": 132.0, | |
| "eval_accuracy": 0.9793, | |
| "eval_loss": 0.08000089973211288, | |
| "eval_runtime": 13.0308, | |
| "eval_samples_per_second": 767.413, | |
| "eval_steps_per_second": 3.07, | |
| "step": 43956 | |
| }, | |
| { | |
| "epoch": 132.13, | |
| "grad_norm": 10.955676078796387, | |
| "learning_rate": 5.595595595595597e-06, | |
| "loss": 0.1526, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 133.0, | |
| "eval_accuracy": 0.9806, | |
| "eval_loss": 0.07563788443803787, | |
| "eval_runtime": 12.9183, | |
| "eval_samples_per_second": 774.093, | |
| "eval_steps_per_second": 3.096, | |
| "step": 44289 | |
| }, | |
| { | |
| "epoch": 133.63, | |
| "grad_norm": 9.621336936950684, | |
| "learning_rate": 5.545545545545547e-06, | |
| "loss": 0.1523, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 134.0, | |
| "eval_accuracy": 0.9797, | |
| "eval_loss": 0.07889340072870255, | |
| "eval_runtime": 13.5904, | |
| "eval_samples_per_second": 735.813, | |
| "eval_steps_per_second": 2.943, | |
| "step": 44622 | |
| }, | |
| { | |
| "epoch": 135.0, | |
| "eval_accuracy": 0.9801, | |
| "eval_loss": 0.07651440799236298, | |
| "eval_runtime": 13.0261, | |
| "eval_samples_per_second": 767.689, | |
| "eval_steps_per_second": 3.071, | |
| "step": 44955 | |
| }, | |
| { | |
| "epoch": 135.14, | |
| "grad_norm": 9.40494155883789, | |
| "learning_rate": 5.495495495495496e-06, | |
| "loss": 0.1519, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 136.0, | |
| "eval_accuracy": 0.9798, | |
| "eval_loss": 0.07700727880001068, | |
| "eval_runtime": 14.2776, | |
| "eval_samples_per_second": 700.397, | |
| "eval_steps_per_second": 2.802, | |
| "step": 45288 | |
| }, | |
| { | |
| "epoch": 136.64, | |
| "grad_norm": 7.778809070587158, | |
| "learning_rate": 5.445445445445446e-06, | |
| "loss": 0.1491, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 137.0, | |
| "eval_accuracy": 0.98, | |
| "eval_loss": 0.07937881350517273, | |
| "eval_runtime": 13.7045, | |
| "eval_samples_per_second": 729.689, | |
| "eval_steps_per_second": 2.919, | |
| "step": 45621 | |
| }, | |
| { | |
| "epoch": 138.0, | |
| "eval_accuracy": 0.9796, | |
| "eval_loss": 0.07901179045438766, | |
| "eval_runtime": 12.8776, | |
| "eval_samples_per_second": 776.54, | |
| "eval_steps_per_second": 3.106, | |
| "step": 45954 | |
| }, | |
| { | |
| "epoch": 138.14, | |
| "grad_norm": 12.694830894470215, | |
| "learning_rate": 5.395395395395396e-06, | |
| "loss": 0.1488, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 139.0, | |
| "eval_accuracy": 0.9796, | |
| "eval_loss": 0.07827717065811157, | |
| "eval_runtime": 13.01, | |
| "eval_samples_per_second": 768.642, | |
| "eval_steps_per_second": 3.075, | |
| "step": 46287 | |
| }, | |
| { | |
| "epoch": 139.64, | |
| "grad_norm": 5.728260517120361, | |
| "learning_rate": 5.345345345345346e-06, | |
| "loss": 0.1511, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 140.0, | |
| "eval_accuracy": 0.98, | |
| "eval_loss": 0.07687978446483612, | |
| "eval_runtime": 13.4169, | |
| "eval_samples_per_second": 745.331, | |
| "eval_steps_per_second": 2.981, | |
| "step": 46620 | |
| }, | |
| { | |
| "epoch": 141.0, | |
| "eval_accuracy": 0.9797, | |
| "eval_loss": 0.0826837420463562, | |
| "eval_runtime": 13.6768, | |
| "eval_samples_per_second": 731.166, | |
| "eval_steps_per_second": 2.925, | |
| "step": 46953 | |
| }, | |
| { | |
| "epoch": 141.14, | |
| "grad_norm": 8.749393463134766, | |
| "learning_rate": 5.2952952952952955e-06, | |
| "loss": 0.1475, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 142.0, | |
| "eval_accuracy": 0.98, | |
| "eval_loss": 0.07702562212944031, | |
| "eval_runtime": 13.4888, | |
| "eval_samples_per_second": 741.356, | |
| "eval_steps_per_second": 2.965, | |
| "step": 47286 | |
| }, | |
| { | |
| "epoch": 142.64, | |
| "grad_norm": 8.479342460632324, | |
| "learning_rate": 5.245245245245245e-06, | |
| "loss": 0.1449, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 143.0, | |
| "eval_accuracy": 0.98, | |
| "eval_loss": 0.07797821611166, | |
| "eval_runtime": 13.0058, | |
| "eval_samples_per_second": 768.886, | |
| "eval_steps_per_second": 3.076, | |
| "step": 47619 | |
| }, | |
| { | |
| "epoch": 144.0, | |
| "eval_accuracy": 0.9795, | |
| "eval_loss": 0.07707054167985916, | |
| "eval_runtime": 12.9845, | |
| "eval_samples_per_second": 770.15, | |
| "eval_steps_per_second": 3.081, | |
| "step": 47952 | |
| }, | |
| { | |
| "epoch": 144.14, | |
| "grad_norm": 10.80911636352539, | |
| "learning_rate": 5.195195195195195e-06, | |
| "loss": 0.146, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 145.0, | |
| "eval_accuracy": 0.9809, | |
| "eval_loss": 0.0750807523727417, | |
| "eval_runtime": 14.0436, | |
| "eval_samples_per_second": 712.069, | |
| "eval_steps_per_second": 2.848, | |
| "step": 48285 | |
| }, | |
| { | |
| "epoch": 145.65, | |
| "grad_norm": 5.568371295928955, | |
| "learning_rate": 5.145145145145145e-06, | |
| "loss": 0.1473, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 146.0, | |
| "eval_accuracy": 0.9797, | |
| "eval_loss": 0.07933703809976578, | |
| "eval_runtime": 13.1022, | |
| "eval_samples_per_second": 763.232, | |
| "eval_steps_per_second": 3.053, | |
| "step": 48618 | |
| }, | |
| { | |
| "epoch": 147.0, | |
| "eval_accuracy": 0.9812, | |
| "eval_loss": 0.07590621709823608, | |
| "eval_runtime": 13.1387, | |
| "eval_samples_per_second": 761.108, | |
| "eval_steps_per_second": 3.044, | |
| "step": 48951 | |
| }, | |
| { | |
| "epoch": 147.15, | |
| "grad_norm": 8.234355926513672, | |
| "learning_rate": 5.095095095095095e-06, | |
| "loss": 0.1466, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 148.0, | |
| "eval_accuracy": 0.9787, | |
| "eval_loss": 0.08211437612771988, | |
| "eval_runtime": 13.453, | |
| "eval_samples_per_second": 743.33, | |
| "eval_steps_per_second": 2.973, | |
| "step": 49284 | |
| }, | |
| { | |
| "epoch": 148.65, | |
| "grad_norm": 9.734493255615234, | |
| "learning_rate": 5.045045045045045e-06, | |
| "loss": 0.1472, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 149.0, | |
| "eval_accuracy": 0.9813, | |
| "eval_loss": 0.07566899061203003, | |
| "eval_runtime": 13.5127, | |
| "eval_samples_per_second": 740.042, | |
| "eval_steps_per_second": 2.96, | |
| "step": 49617 | |
| }, | |
| { | |
| "epoch": 150.0, | |
| "eval_accuracy": 0.9804, | |
| "eval_loss": 0.07641930133104324, | |
| "eval_runtime": 13.729, | |
| "eval_samples_per_second": 728.384, | |
| "eval_steps_per_second": 2.914, | |
| "step": 49950 | |
| }, | |
| { | |
| "epoch": 150.15, | |
| "grad_norm": 9.083195686340332, | |
| "learning_rate": 4.994994994994996e-06, | |
| "loss": 0.1437, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 151.0, | |
| "eval_accuracy": 0.9799, | |
| "eval_loss": 0.0816345363855362, | |
| "eval_runtime": 13.6081, | |
| "eval_samples_per_second": 734.856, | |
| "eval_steps_per_second": 2.939, | |
| "step": 50283 | |
| }, | |
| { | |
| "epoch": 151.65, | |
| "grad_norm": 16.20008087158203, | |
| "learning_rate": 4.944944944944945e-06, | |
| "loss": 0.1487, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 152.0, | |
| "eval_accuracy": 0.9818, | |
| "eval_loss": 0.07768727838993073, | |
| "eval_runtime": 12.9061, | |
| "eval_samples_per_second": 774.83, | |
| "eval_steps_per_second": 3.099, | |
| "step": 50616 | |
| }, | |
| { | |
| "epoch": 153.0, | |
| "eval_accuracy": 0.9811, | |
| "eval_loss": 0.07950293272733688, | |
| "eval_runtime": 13.0523, | |
| "eval_samples_per_second": 766.151, | |
| "eval_steps_per_second": 3.065, | |
| "step": 50949 | |
| }, | |
| { | |
| "epoch": 153.15, | |
| "grad_norm": 6.783934593200684, | |
| "learning_rate": 4.894894894894895e-06, | |
| "loss": 0.1455, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 154.0, | |
| "eval_accuracy": 0.9811, | |
| "eval_loss": 0.07836713641881943, | |
| "eval_runtime": 13.4341, | |
| "eval_samples_per_second": 744.377, | |
| "eval_steps_per_second": 2.978, | |
| "step": 51282 | |
| }, | |
| { | |
| "epoch": 154.65, | |
| "grad_norm": 7.791309833526611, | |
| "learning_rate": 4.844844844844845e-06, | |
| "loss": 0.1463, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 155.0, | |
| "eval_accuracy": 0.9801, | |
| "eval_loss": 0.07995989918708801, | |
| "eval_runtime": 13.7204, | |
| "eval_samples_per_second": 728.844, | |
| "eval_steps_per_second": 2.915, | |
| "step": 51615 | |
| }, | |
| { | |
| "epoch": 156.0, | |
| "eval_accuracy": 0.9809, | |
| "eval_loss": 0.07914280891418457, | |
| "eval_runtime": 13.045, | |
| "eval_samples_per_second": 766.58, | |
| "eval_steps_per_second": 3.066, | |
| "step": 51948 | |
| }, | |
| { | |
| "epoch": 156.16, | |
| "grad_norm": 7.225980281829834, | |
| "learning_rate": 4.794794794794795e-06, | |
| "loss": 0.1449, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 157.0, | |
| "eval_accuracy": 0.9815, | |
| "eval_loss": 0.0777197852730751, | |
| "eval_runtime": 12.8795, | |
| "eval_samples_per_second": 776.43, | |
| "eval_steps_per_second": 3.106, | |
| "step": 52281 | |
| }, | |
| { | |
| "epoch": 157.66, | |
| "grad_norm": 7.848995208740234, | |
| "learning_rate": 4.7447447447447454e-06, | |
| "loss": 0.1413, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 158.0, | |
| "eval_accuracy": 0.9802, | |
| "eval_loss": 0.07978815585374832, | |
| "eval_runtime": 13.0849, | |
| "eval_samples_per_second": 764.238, | |
| "eval_steps_per_second": 3.057, | |
| "step": 52614 | |
| }, | |
| { | |
| "epoch": 159.0, | |
| "eval_accuracy": 0.9798, | |
| "eval_loss": 0.08010842651128769, | |
| "eval_runtime": 12.9948, | |
| "eval_samples_per_second": 769.539, | |
| "eval_steps_per_second": 3.078, | |
| "step": 52947 | |
| }, | |
| { | |
| "epoch": 159.16, | |
| "grad_norm": 10.857318878173828, | |
| "learning_rate": 4.6946946946946955e-06, | |
| "loss": 0.143, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 160.0, | |
| "eval_accuracy": 0.9803, | |
| "eval_loss": 0.07897085696458817, | |
| "eval_runtime": 13.4824, | |
| "eval_samples_per_second": 741.707, | |
| "eval_steps_per_second": 2.967, | |
| "step": 53280 | |
| }, | |
| { | |
| "epoch": 160.66, | |
| "grad_norm": 8.192683219909668, | |
| "learning_rate": 4.6446446446446456e-06, | |
| "loss": 0.1462, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 161.0, | |
| "eval_accuracy": 0.9794, | |
| "eval_loss": 0.07847656309604645, | |
| "eval_runtime": 13.3614, | |
| "eval_samples_per_second": 748.422, | |
| "eval_steps_per_second": 2.994, | |
| "step": 53613 | |
| }, | |
| { | |
| "epoch": 162.0, | |
| "eval_accuracy": 0.9799, | |
| "eval_loss": 0.07839296758174896, | |
| "eval_runtime": 13.4943, | |
| "eval_samples_per_second": 741.054, | |
| "eval_steps_per_second": 2.964, | |
| "step": 53946 | |
| }, | |
| { | |
| "epoch": 162.16, | |
| "grad_norm": 5.753213882446289, | |
| "learning_rate": 4.594594594594596e-06, | |
| "loss": 0.1454, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 163.0, | |
| "eval_accuracy": 0.9814, | |
| "eval_loss": 0.07774946093559265, | |
| "eval_runtime": 13.5161, | |
| "eval_samples_per_second": 739.858, | |
| "eval_steps_per_second": 2.959, | |
| "step": 54279 | |
| }, | |
| { | |
| "epoch": 163.66, | |
| "grad_norm": 23.634429931640625, | |
| "learning_rate": 4.544544544544545e-06, | |
| "loss": 0.1404, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 164.0, | |
| "eval_accuracy": 0.9817, | |
| "eval_loss": 0.07676123827695847, | |
| "eval_runtime": 13.7785, | |
| "eval_samples_per_second": 725.767, | |
| "eval_steps_per_second": 2.903, | |
| "step": 54612 | |
| }, | |
| { | |
| "epoch": 165.0, | |
| "eval_accuracy": 0.9795, | |
| "eval_loss": 0.07868321239948273, | |
| "eval_runtime": 13.3337, | |
| "eval_samples_per_second": 749.978, | |
| "eval_steps_per_second": 3.0, | |
| "step": 54945 | |
| }, | |
| { | |
| "epoch": 165.17, | |
| "grad_norm": 14.497030258178711, | |
| "learning_rate": 4.494494494494495e-06, | |
| "loss": 0.1404, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 166.0, | |
| "eval_accuracy": 0.9806, | |
| "eval_loss": 0.08142885565757751, | |
| "eval_runtime": 13.0878, | |
| "eval_samples_per_second": 764.068, | |
| "eval_steps_per_second": 3.056, | |
| "step": 55278 | |
| }, | |
| { | |
| "epoch": 166.67, | |
| "grad_norm": 5.504241943359375, | |
| "learning_rate": 4.444444444444444e-06, | |
| "loss": 0.1438, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 167.0, | |
| "eval_accuracy": 0.9802, | |
| "eval_loss": 0.08015668392181396, | |
| "eval_runtime": 13.3375, | |
| "eval_samples_per_second": 749.766, | |
| "eval_steps_per_second": 2.999, | |
| "step": 55611 | |
| }, | |
| { | |
| "epoch": 168.0, | |
| "eval_accuracy": 0.9807, | |
| "eval_loss": 0.0773804783821106, | |
| "eval_runtime": 13.1562, | |
| "eval_samples_per_second": 760.1, | |
| "eval_steps_per_second": 3.04, | |
| "step": 55944 | |
| }, | |
| { | |
| "epoch": 168.17, | |
| "grad_norm": 10.65889835357666, | |
| "learning_rate": 4.394394394394394e-06, | |
| "loss": 0.1405, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 169.0, | |
| "eval_accuracy": 0.9793, | |
| "eval_loss": 0.07769276201725006, | |
| "eval_runtime": 13.37, | |
| "eval_samples_per_second": 747.945, | |
| "eval_steps_per_second": 2.992, | |
| "step": 56277 | |
| }, | |
| { | |
| "epoch": 169.67, | |
| "grad_norm": 9.663138389587402, | |
| "learning_rate": 4.344344344344344e-06, | |
| "loss": 0.1465, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 170.0, | |
| "eval_accuracy": 0.9804, | |
| "eval_loss": 0.07831669598817825, | |
| "eval_runtime": 13.9555, | |
| "eval_samples_per_second": 716.565, | |
| "eval_steps_per_second": 2.866, | |
| "step": 56610 | |
| }, | |
| { | |
| "epoch": 171.0, | |
| "eval_accuracy": 0.9799, | |
| "eval_loss": 0.08174577355384827, | |
| "eval_runtime": 13.3581, | |
| "eval_samples_per_second": 748.612, | |
| "eval_steps_per_second": 2.994, | |
| "step": 56943 | |
| }, | |
| { | |
| "epoch": 171.17, | |
| "grad_norm": 11.15052604675293, | |
| "learning_rate": 4.294294294294294e-06, | |
| "loss": 0.1404, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 172.0, | |
| "eval_accuracy": 0.9806, | |
| "eval_loss": 0.0780324712395668, | |
| "eval_runtime": 12.9812, | |
| "eval_samples_per_second": 770.346, | |
| "eval_steps_per_second": 3.081, | |
| "step": 57276 | |
| }, | |
| { | |
| "epoch": 172.67, | |
| "grad_norm": 10.398097038269043, | |
| "learning_rate": 4.2442442442442444e-06, | |
| "loss": 0.1367, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 173.0, | |
| "eval_accuracy": 0.9806, | |
| "eval_loss": 0.07895645499229431, | |
| "eval_runtime": 12.8468, | |
| "eval_samples_per_second": 778.402, | |
| "eval_steps_per_second": 3.114, | |
| "step": 57609 | |
| }, | |
| { | |
| "epoch": 174.0, | |
| "eval_accuracy": 0.9816, | |
| "eval_loss": 0.07868947833776474, | |
| "eval_runtime": 13.3373, | |
| "eval_samples_per_second": 749.779, | |
| "eval_steps_per_second": 2.999, | |
| "step": 57942 | |
| }, | |
| { | |
| "epoch": 174.17, | |
| "grad_norm": 8.292234420776367, | |
| "learning_rate": 4.1941941941941945e-06, | |
| "loss": 0.1399, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 175.0, | |
| "eval_accuracy": 0.9801, | |
| "eval_loss": 0.08106452971696854, | |
| "eval_runtime": 12.854, | |
| "eval_samples_per_second": 777.968, | |
| "eval_steps_per_second": 3.112, | |
| "step": 58275 | |
| }, | |
| { | |
| "epoch": 175.68, | |
| "grad_norm": 12.446533203125, | |
| "learning_rate": 4.1441441441441446e-06, | |
| "loss": 0.1418, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 176.0, | |
| "eval_accuracy": 0.9809, | |
| "eval_loss": 0.08040361106395721, | |
| "eval_runtime": 13.1526, | |
| "eval_samples_per_second": 760.308, | |
| "eval_steps_per_second": 3.041, | |
| "step": 58608 | |
| }, | |
| { | |
| "epoch": 177.0, | |
| "eval_accuracy": 0.9806, | |
| "eval_loss": 0.07995961606502533, | |
| "eval_runtime": 13.0397, | |
| "eval_samples_per_second": 766.888, | |
| "eval_steps_per_second": 3.068, | |
| "step": 58941 | |
| }, | |
| { | |
| "epoch": 177.18, | |
| "grad_norm": 9.551538467407227, | |
| "learning_rate": 4.094094094094095e-06, | |
| "loss": 0.1381, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 178.0, | |
| "eval_accuracy": 0.9814, | |
| "eval_loss": 0.07857974618673325, | |
| "eval_runtime": 13.3646, | |
| "eval_samples_per_second": 748.245, | |
| "eval_steps_per_second": 2.993, | |
| "step": 59274 | |
| }, | |
| { | |
| "epoch": 178.68, | |
| "grad_norm": 7.961233615875244, | |
| "learning_rate": 4.044044044044044e-06, | |
| "loss": 0.1357, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 179.0, | |
| "eval_accuracy": 0.9805, | |
| "eval_loss": 0.0797557458281517, | |
| "eval_runtime": 13.833, | |
| "eval_samples_per_second": 722.907, | |
| "eval_steps_per_second": 2.892, | |
| "step": 59607 | |
| }, | |
| { | |
| "epoch": 180.0, | |
| "eval_accuracy": 0.9813, | |
| "eval_loss": 0.07922037690877914, | |
| "eval_runtime": 13.1611, | |
| "eval_samples_per_second": 759.818, | |
| "eval_steps_per_second": 3.039, | |
| "step": 59940 | |
| }, | |
| { | |
| "epoch": 180.18, | |
| "grad_norm": 8.392486572265625, | |
| "learning_rate": 3.993993993993994e-06, | |
| "loss": 0.1465, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 181.0, | |
| "eval_accuracy": 0.9809, | |
| "eval_loss": 0.08021984249353409, | |
| "eval_runtime": 12.7756, | |
| "eval_samples_per_second": 782.741, | |
| "eval_steps_per_second": 3.131, | |
| "step": 60273 | |
| }, | |
| { | |
| "epoch": 181.68, | |
| "grad_norm": 5.668210506439209, | |
| "learning_rate": 3.943943943943944e-06, | |
| "loss": 0.1366, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 182.0, | |
| "eval_accuracy": 0.9804, | |
| "eval_loss": 0.07884296774864197, | |
| "eval_runtime": 12.9767, | |
| "eval_samples_per_second": 770.61, | |
| "eval_steps_per_second": 3.082, | |
| "step": 60606 | |
| }, | |
| { | |
| "epoch": 183.0, | |
| "eval_accuracy": 0.979, | |
| "eval_loss": 0.0805293619632721, | |
| "eval_runtime": 12.9332, | |
| "eval_samples_per_second": 773.205, | |
| "eval_steps_per_second": 3.093, | |
| "step": 60939 | |
| }, | |
| { | |
| "epoch": 183.18, | |
| "grad_norm": 9.771552085876465, | |
| "learning_rate": 3.893893893893894e-06, | |
| "loss": 0.139, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 184.0, | |
| "eval_accuracy": 0.9794, | |
| "eval_loss": 0.0822456106543541, | |
| "eval_runtime": 13.3118, | |
| "eval_samples_per_second": 751.211, | |
| "eval_steps_per_second": 3.005, | |
| "step": 61272 | |
| }, | |
| { | |
| "epoch": 184.68, | |
| "grad_norm": 10.898391723632812, | |
| "learning_rate": 3.843843843843844e-06, | |
| "loss": 0.1381, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 185.0, | |
| "eval_accuracy": 0.9807, | |
| "eval_loss": 0.08079157024621964, | |
| "eval_runtime": 12.8717, | |
| "eval_samples_per_second": 776.899, | |
| "eval_steps_per_second": 3.108, | |
| "step": 61605 | |
| }, | |
| { | |
| "epoch": 186.0, | |
| "eval_accuracy": 0.9802, | |
| "eval_loss": 0.08059785515069962, | |
| "eval_runtime": 12.418, | |
| "eval_samples_per_second": 805.284, | |
| "eval_steps_per_second": 3.221, | |
| "step": 61938 | |
| }, | |
| { | |
| "epoch": 186.19, | |
| "grad_norm": 6.1758246421813965, | |
| "learning_rate": 3.793793793793794e-06, | |
| "loss": 0.1367, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 187.0, | |
| "eval_accuracy": 0.9803, | |
| "eval_loss": 0.07853790372610092, | |
| "eval_runtime": 12.9215, | |
| "eval_samples_per_second": 773.902, | |
| "eval_steps_per_second": 3.096, | |
| "step": 62271 | |
| }, | |
| { | |
| "epoch": 187.69, | |
| "grad_norm": 9.155027389526367, | |
| "learning_rate": 3.743743743743744e-06, | |
| "loss": 0.1354, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 188.0, | |
| "eval_accuracy": 0.9803, | |
| "eval_loss": 0.0803978368639946, | |
| "eval_runtime": 13.5157, | |
| "eval_samples_per_second": 739.883, | |
| "eval_steps_per_second": 2.96, | |
| "step": 62604 | |
| }, | |
| { | |
| "epoch": 189.0, | |
| "eval_accuracy": 0.98, | |
| "eval_loss": 0.07950347661972046, | |
| "eval_runtime": 13.0185, | |
| "eval_samples_per_second": 768.138, | |
| "eval_steps_per_second": 3.073, | |
| "step": 62937 | |
| }, | |
| { | |
| "epoch": 189.19, | |
| "grad_norm": 9.88645076751709, | |
| "learning_rate": 3.693693693693694e-06, | |
| "loss": 0.137, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 190.0, | |
| "eval_accuracy": 0.9805, | |
| "eval_loss": 0.07970842719078064, | |
| "eval_runtime": 13.0486, | |
| "eval_samples_per_second": 766.367, | |
| "eval_steps_per_second": 3.065, | |
| "step": 63270 | |
| }, | |
| { | |
| "epoch": 190.69, | |
| "grad_norm": 10.085098266601562, | |
| "learning_rate": 3.643643643643644e-06, | |
| "loss": 0.1351, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 191.0, | |
| "eval_accuracy": 0.9803, | |
| "eval_loss": 0.07862575352191925, | |
| "eval_runtime": 13.7359, | |
| "eval_samples_per_second": 728.019, | |
| "eval_steps_per_second": 2.912, | |
| "step": 63603 | |
| }, | |
| { | |
| "epoch": 192.0, | |
| "eval_accuracy": 0.9807, | |
| "eval_loss": 0.07779725641012192, | |
| "eval_runtime": 14.1749, | |
| "eval_samples_per_second": 705.473, | |
| "eval_steps_per_second": 2.822, | |
| "step": 63936 | |
| }, | |
| { | |
| "epoch": 192.19, | |
| "grad_norm": 7.259002685546875, | |
| "learning_rate": 3.593593593593594e-06, | |
| "loss": 0.1345, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 193.0, | |
| "eval_accuracy": 0.9812, | |
| "eval_loss": 0.07995971292257309, | |
| "eval_runtime": 13.3268, | |
| "eval_samples_per_second": 750.366, | |
| "eval_steps_per_second": 3.001, | |
| "step": 64269 | |
| }, | |
| { | |
| "epoch": 193.69, | |
| "grad_norm": 6.42719030380249, | |
| "learning_rate": 3.5435435435435437e-06, | |
| "loss": 0.1377, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 194.0, | |
| "eval_accuracy": 0.9799, | |
| "eval_loss": 0.07895601540803909, | |
| "eval_runtime": 12.9129, | |
| "eval_samples_per_second": 774.417, | |
| "eval_steps_per_second": 3.098, | |
| "step": 64602 | |
| }, | |
| { | |
| "epoch": 195.0, | |
| "eval_accuracy": 0.98, | |
| "eval_loss": 0.08155795186758041, | |
| "eval_runtime": 13.7447, | |
| "eval_samples_per_second": 727.555, | |
| "eval_steps_per_second": 2.91, | |
| "step": 64935 | |
| }, | |
| { | |
| "epoch": 195.2, | |
| "grad_norm": 7.303466320037842, | |
| "learning_rate": 3.4934934934934938e-06, | |
| "loss": 0.1339, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 196.0, | |
| "eval_accuracy": 0.9811, | |
| "eval_loss": 0.08134587854146957, | |
| "eval_runtime": 12.87, | |
| "eval_samples_per_second": 777.004, | |
| "eval_steps_per_second": 3.108, | |
| "step": 65268 | |
| }, | |
| { | |
| "epoch": 196.7, | |
| "grad_norm": 10.115856170654297, | |
| "learning_rate": 3.443443443443444e-06, | |
| "loss": 0.1338, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 197.0, | |
| "eval_accuracy": 0.981, | |
| "eval_loss": 0.07863133400678635, | |
| "eval_runtime": 13.1588, | |
| "eval_samples_per_second": 759.949, | |
| "eval_steps_per_second": 3.04, | |
| "step": 65601 | |
| }, | |
| { | |
| "epoch": 198.0, | |
| "eval_accuracy": 0.9805, | |
| "eval_loss": 0.08128491789102554, | |
| "eval_runtime": 12.3451, | |
| "eval_samples_per_second": 810.038, | |
| "eval_steps_per_second": 3.24, | |
| "step": 65934 | |
| }, | |
| { | |
| "epoch": 198.2, | |
| "grad_norm": 9.01919174194336, | |
| "learning_rate": 3.393393393393394e-06, | |
| "loss": 0.1371, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 199.0, | |
| "eval_accuracy": 0.9808, | |
| "eval_loss": 0.08089832216501236, | |
| "eval_runtime": 13.1128, | |
| "eval_samples_per_second": 762.612, | |
| "eval_steps_per_second": 3.05, | |
| "step": 66267 | |
| }, | |
| { | |
| "epoch": 199.7, | |
| "grad_norm": 9.190634727478027, | |
| "learning_rate": 3.3433433433433436e-06, | |
| "loss": 0.1339, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 200.0, | |
| "eval_accuracy": 0.9807, | |
| "eval_loss": 0.07968232780694962, | |
| "eval_runtime": 12.8919, | |
| "eval_samples_per_second": 775.68, | |
| "eval_steps_per_second": 3.103, | |
| "step": 66600 | |
| }, | |
| { | |
| "epoch": 201.0, | |
| "eval_accuracy": 0.9808, | |
| "eval_loss": 0.08057761192321777, | |
| "eval_runtime": 12.9886, | |
| "eval_samples_per_second": 769.904, | |
| "eval_steps_per_second": 3.08, | |
| "step": 66933 | |
| }, | |
| { | |
| "epoch": 201.2, | |
| "grad_norm": 9.490571022033691, | |
| "learning_rate": 3.2932932932932936e-06, | |
| "loss": 0.131, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 202.0, | |
| "eval_accuracy": 0.98, | |
| "eval_loss": 0.08165069669485092, | |
| "eval_runtime": 13.9588, | |
| "eval_samples_per_second": 716.394, | |
| "eval_steps_per_second": 2.866, | |
| "step": 67266 | |
| }, | |
| { | |
| "epoch": 202.7, | |
| "grad_norm": 8.564950942993164, | |
| "learning_rate": 3.2432432432432437e-06, | |
| "loss": 0.1365, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 203.0, | |
| "eval_accuracy": 0.9801, | |
| "eval_loss": 0.08228688687086105, | |
| "eval_runtime": 12.9615, | |
| "eval_samples_per_second": 771.513, | |
| "eval_steps_per_second": 3.086, | |
| "step": 67599 | |
| }, | |
| { | |
| "epoch": 204.0, | |
| "eval_accuracy": 0.9798, | |
| "eval_loss": 0.08267272263765335, | |
| "eval_runtime": 12.8976, | |
| "eval_samples_per_second": 775.339, | |
| "eval_steps_per_second": 3.101, | |
| "step": 67932 | |
| }, | |
| { | |
| "epoch": 204.2, | |
| "grad_norm": 9.844771385192871, | |
| "learning_rate": 3.1931931931931938e-06, | |
| "loss": 0.1358, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 205.0, | |
| "eval_accuracy": 0.9816, | |
| "eval_loss": 0.0804433524608612, | |
| "eval_runtime": 12.8434, | |
| "eval_samples_per_second": 778.613, | |
| "eval_steps_per_second": 3.114, | |
| "step": 68265 | |
| }, | |
| { | |
| "epoch": 205.71, | |
| "grad_norm": 9.6033935546875, | |
| "learning_rate": 3.1431431431431434e-06, | |
| "loss": 0.132, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 206.0, | |
| "eval_accuracy": 0.9802, | |
| "eval_loss": 0.08253764361143112, | |
| "eval_runtime": 13.4062, | |
| "eval_samples_per_second": 745.922, | |
| "eval_steps_per_second": 2.984, | |
| "step": 68598 | |
| }, | |
| { | |
| "epoch": 207.0, | |
| "eval_accuracy": 0.981, | |
| "eval_loss": 0.07984968274831772, | |
| "eval_runtime": 13.6899, | |
| "eval_samples_per_second": 730.467, | |
| "eval_steps_per_second": 2.922, | |
| "step": 68931 | |
| }, | |
| { | |
| "epoch": 207.21, | |
| "grad_norm": 7.0395355224609375, | |
| "learning_rate": 3.0930930930930935e-06, | |
| "loss": 0.1396, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 208.0, | |
| "eval_accuracy": 0.9813, | |
| "eval_loss": 0.08085375279188156, | |
| "eval_runtime": 12.8706, | |
| "eval_samples_per_second": 776.962, | |
| "eval_steps_per_second": 3.108, | |
| "step": 69264 | |
| }, | |
| { | |
| "epoch": 208.71, | |
| "grad_norm": 12.84909725189209, | |
| "learning_rate": 3.0430430430430436e-06, | |
| "loss": 0.1324, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 209.0, | |
| "eval_accuracy": 0.9815, | |
| "eval_loss": 0.07963848859071732, | |
| "eval_runtime": 12.9764, | |
| "eval_samples_per_second": 770.628, | |
| "eval_steps_per_second": 3.083, | |
| "step": 69597 | |
| }, | |
| { | |
| "epoch": 210.0, | |
| "eval_accuracy": 0.9807, | |
| "eval_loss": 0.08001097291707993, | |
| "eval_runtime": 13.4375, | |
| "eval_samples_per_second": 744.185, | |
| "eval_steps_per_second": 2.977, | |
| "step": 69930 | |
| }, | |
| { | |
| "epoch": 210.21, | |
| "grad_norm": 8.406508445739746, | |
| "learning_rate": 2.9929929929929936e-06, | |
| "loss": 0.1324, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 211.0, | |
| "eval_accuracy": 0.9809, | |
| "eval_loss": 0.08123359829187393, | |
| "eval_runtime": 13.1971, | |
| "eval_samples_per_second": 757.742, | |
| "eval_steps_per_second": 3.031, | |
| "step": 70263 | |
| }, | |
| { | |
| "epoch": 211.71, | |
| "grad_norm": 4.204705715179443, | |
| "learning_rate": 2.942942942942943e-06, | |
| "loss": 0.1343, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 212.0, | |
| "eval_accuracy": 0.9811, | |
| "eval_loss": 0.08246932923793793, | |
| "eval_runtime": 13.3417, | |
| "eval_samples_per_second": 749.532, | |
| "eval_steps_per_second": 2.998, | |
| "step": 70596 | |
| }, | |
| { | |
| "epoch": 213.0, | |
| "eval_accuracy": 0.9811, | |
| "eval_loss": 0.08172763139009476, | |
| "eval_runtime": 12.9861, | |
| "eval_samples_per_second": 770.053, | |
| "eval_steps_per_second": 3.08, | |
| "step": 70929 | |
| }, | |
| { | |
| "epoch": 213.21, | |
| "grad_norm": 8.177204132080078, | |
| "learning_rate": 2.892892892892893e-06, | |
| "loss": 0.1322, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 214.0, | |
| "eval_accuracy": 0.9811, | |
| "eval_loss": 0.08131828904151917, | |
| "eval_runtime": 14.0986, | |
| "eval_samples_per_second": 709.289, | |
| "eval_steps_per_second": 2.837, | |
| "step": 71262 | |
| }, | |
| { | |
| "epoch": 214.71, | |
| "grad_norm": 8.844195365905762, | |
| "learning_rate": 2.842842842842843e-06, | |
| "loss": 0.133, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 215.0, | |
| "eval_accuracy": 0.9807, | |
| "eval_loss": 0.0824679508805275, | |
| "eval_runtime": 12.94, | |
| "eval_samples_per_second": 772.8, | |
| "eval_steps_per_second": 3.091, | |
| "step": 71595 | |
| }, | |
| { | |
| "epoch": 216.0, | |
| "eval_accuracy": 0.9809, | |
| "eval_loss": 0.0828867107629776, | |
| "eval_runtime": 12.9965, | |
| "eval_samples_per_second": 769.439, | |
| "eval_steps_per_second": 3.078, | |
| "step": 71928 | |
| }, | |
| { | |
| "epoch": 216.22, | |
| "grad_norm": 11.01076889038086, | |
| "learning_rate": 2.7927927927927926e-06, | |
| "loss": 0.1336, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 217.0, | |
| "eval_accuracy": 0.9802, | |
| "eval_loss": 0.08191470056772232, | |
| "eval_runtime": 12.6388, | |
| "eval_samples_per_second": 791.211, | |
| "eval_steps_per_second": 3.165, | |
| "step": 72261 | |
| }, | |
| { | |
| "epoch": 217.72, | |
| "grad_norm": 8.309555053710938, | |
| "learning_rate": 2.7427427427427427e-06, | |
| "loss": 0.1287, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 218.0, | |
| "eval_accuracy": 0.9803, | |
| "eval_loss": 0.08172294497489929, | |
| "eval_runtime": 12.869, | |
| "eval_samples_per_second": 777.063, | |
| "eval_steps_per_second": 3.108, | |
| "step": 72594 | |
| }, | |
| { | |
| "epoch": 219.0, | |
| "eval_accuracy": 0.9804, | |
| "eval_loss": 0.08100100606679916, | |
| "eval_runtime": 13.9577, | |
| "eval_samples_per_second": 716.449, | |
| "eval_steps_per_second": 2.866, | |
| "step": 72927 | |
| }, | |
| { | |
| "epoch": 219.22, | |
| "grad_norm": 10.596402168273926, | |
| "learning_rate": 2.6926926926926928e-06, | |
| "loss": 0.1322, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 220.0, | |
| "eval_accuracy": 0.98, | |
| "eval_loss": 0.08346739411354065, | |
| "eval_runtime": 12.8881, | |
| "eval_samples_per_second": 775.91, | |
| "eval_steps_per_second": 3.104, | |
| "step": 73260 | |
| }, | |
| { | |
| "epoch": 220.72, | |
| "grad_norm": 8.293975830078125, | |
| "learning_rate": 2.642642642642643e-06, | |
| "loss": 0.1287, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 221.0, | |
| "eval_accuracy": 0.9798, | |
| "eval_loss": 0.08478812873363495, | |
| "eval_runtime": 12.482, | |
| "eval_samples_per_second": 801.151, | |
| "eval_steps_per_second": 3.205, | |
| "step": 73593 | |
| }, | |
| { | |
| "epoch": 222.0, | |
| "eval_accuracy": 0.9803, | |
| "eval_loss": 0.08156371861696243, | |
| "eval_runtime": 12.9596, | |
| "eval_samples_per_second": 771.628, | |
| "eval_steps_per_second": 3.087, | |
| "step": 73926 | |
| }, | |
| { | |
| "epoch": 222.22, | |
| "grad_norm": 9.707475662231445, | |
| "learning_rate": 2.5925925925925925e-06, | |
| "loss": 0.1317, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 223.0, | |
| "eval_accuracy": 0.9803, | |
| "eval_loss": 0.08239776641130447, | |
| "eval_runtime": 13.8203, | |
| "eval_samples_per_second": 723.571, | |
| "eval_steps_per_second": 2.894, | |
| "step": 74259 | |
| }, | |
| { | |
| "epoch": 223.72, | |
| "grad_norm": 5.2577996253967285, | |
| "learning_rate": 2.5425425425425426e-06, | |
| "loss": 0.1308, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 224.0, | |
| "eval_accuracy": 0.9811, | |
| "eval_loss": 0.08223745971918106, | |
| "eval_runtime": 13.4783, | |
| "eval_samples_per_second": 741.934, | |
| "eval_steps_per_second": 2.968, | |
| "step": 74592 | |
| }, | |
| { | |
| "epoch": 225.0, | |
| "eval_accuracy": 0.9807, | |
| "eval_loss": 0.0822429209947586, | |
| "eval_runtime": 13.2583, | |
| "eval_samples_per_second": 754.244, | |
| "eval_steps_per_second": 3.017, | |
| "step": 74925 | |
| }, | |
| { | |
| "epoch": 225.23, | |
| "grad_norm": 6.952250957489014, | |
| "learning_rate": 2.4924924924924926e-06, | |
| "loss": 0.1247, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 226.0, | |
| "eval_accuracy": 0.9806, | |
| "eval_loss": 0.08117574453353882, | |
| "eval_runtime": 13.5159, | |
| "eval_samples_per_second": 739.872, | |
| "eval_steps_per_second": 2.959, | |
| "step": 75258 | |
| }, | |
| { | |
| "epoch": 226.73, | |
| "grad_norm": 17.568580627441406, | |
| "learning_rate": 2.4424424424424427e-06, | |
| "loss": 0.129, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 227.0, | |
| "eval_accuracy": 0.9805, | |
| "eval_loss": 0.08187758177518845, | |
| "eval_runtime": 12.7892, | |
| "eval_samples_per_second": 781.912, | |
| "eval_steps_per_second": 3.128, | |
| "step": 75591 | |
| }, | |
| { | |
| "epoch": 228.0, | |
| "eval_accuracy": 0.981, | |
| "eval_loss": 0.08235606551170349, | |
| "eval_runtime": 12.9107, | |
| "eval_samples_per_second": 774.55, | |
| "eval_steps_per_second": 3.098, | |
| "step": 75924 | |
| }, | |
| { | |
| "epoch": 228.23, | |
| "grad_norm": 13.310216903686523, | |
| "learning_rate": 2.3923923923923923e-06, | |
| "loss": 0.1315, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 229.0, | |
| "eval_accuracy": 0.9803, | |
| "eval_loss": 0.08291840553283691, | |
| "eval_runtime": 13.4267, | |
| "eval_samples_per_second": 744.787, | |
| "eval_steps_per_second": 2.979, | |
| "step": 76257 | |
| }, | |
| { | |
| "epoch": 229.73, | |
| "grad_norm": 7.18035888671875, | |
| "learning_rate": 2.3423423423423424e-06, | |
| "loss": 0.1243, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 230.0, | |
| "eval_accuracy": 0.9808, | |
| "eval_loss": 0.08134060353040695, | |
| "eval_runtime": 12.9054, | |
| "eval_samples_per_second": 774.871, | |
| "eval_steps_per_second": 3.099, | |
| "step": 76590 | |
| }, | |
| { | |
| "epoch": 231.0, | |
| "eval_accuracy": 0.9808, | |
| "eval_loss": 0.08125565946102142, | |
| "eval_runtime": 13.8266, | |
| "eval_samples_per_second": 723.246, | |
| "eval_steps_per_second": 2.893, | |
| "step": 76923 | |
| }, | |
| { | |
| "epoch": 231.23, | |
| "grad_norm": 11.132826805114746, | |
| "learning_rate": 2.2922922922922925e-06, | |
| "loss": 0.1244, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 232.0, | |
| "eval_accuracy": 0.981, | |
| "eval_loss": 0.08288297057151794, | |
| "eval_runtime": 13.8545, | |
| "eval_samples_per_second": 721.786, | |
| "eval_steps_per_second": 2.887, | |
| "step": 77256 | |
| }, | |
| { | |
| "epoch": 232.73, | |
| "grad_norm": 7.415234565734863, | |
| "learning_rate": 2.2422422422422426e-06, | |
| "loss": 0.1286, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 233.0, | |
| "eval_accuracy": 0.9801, | |
| "eval_loss": 0.083954356610775, | |
| "eval_runtime": 13.1117, | |
| "eval_samples_per_second": 762.679, | |
| "eval_steps_per_second": 3.051, | |
| "step": 77589 | |
| }, | |
| { | |
| "epoch": 234.0, | |
| "eval_accuracy": 0.9805, | |
| "eval_loss": 0.08230035752058029, | |
| "eval_runtime": 13.3702, | |
| "eval_samples_per_second": 747.932, | |
| "eval_steps_per_second": 2.992, | |
| "step": 77922 | |
| }, | |
| { | |
| "epoch": 234.23, | |
| "grad_norm": 7.36590576171875, | |
| "learning_rate": 2.192192192192192e-06, | |
| "loss": 0.1261, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 235.0, | |
| "eval_accuracy": 0.9811, | |
| "eval_loss": 0.08295118808746338, | |
| "eval_runtime": 13.748, | |
| "eval_samples_per_second": 727.381, | |
| "eval_steps_per_second": 2.91, | |
| "step": 78255 | |
| }, | |
| { | |
| "epoch": 235.74, | |
| "grad_norm": 10.516325950622559, | |
| "learning_rate": 2.1421421421421423e-06, | |
| "loss": 0.1238, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 236.0, | |
| "eval_accuracy": 0.9812, | |
| "eval_loss": 0.08197174966335297, | |
| "eval_runtime": 12.9286, | |
| "eval_samples_per_second": 773.481, | |
| "eval_steps_per_second": 3.094, | |
| "step": 78588 | |
| }, | |
| { | |
| "epoch": 237.0, | |
| "eval_accuracy": 0.9807, | |
| "eval_loss": 0.08315034210681915, | |
| "eval_runtime": 13.634, | |
| "eval_samples_per_second": 733.458, | |
| "eval_steps_per_second": 2.934, | |
| "step": 78921 | |
| }, | |
| { | |
| "epoch": 237.24, | |
| "grad_norm": 5.020528316497803, | |
| "learning_rate": 2.0920920920920923e-06, | |
| "loss": 0.1296, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 238.0, | |
| "eval_accuracy": 0.9809, | |
| "eval_loss": 0.08168121427297592, | |
| "eval_runtime": 14.4842, | |
| "eval_samples_per_second": 690.406, | |
| "eval_steps_per_second": 2.762, | |
| "step": 79254 | |
| }, | |
| { | |
| "epoch": 238.74, | |
| "grad_norm": 11.957234382629395, | |
| "learning_rate": 2.0420420420420424e-06, | |
| "loss": 0.1278, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 239.0, | |
| "eval_accuracy": 0.981, | |
| "eval_loss": 0.08146882057189941, | |
| "eval_runtime": 14.503, | |
| "eval_samples_per_second": 689.511, | |
| "eval_steps_per_second": 2.758, | |
| "step": 79587 | |
| }, | |
| { | |
| "epoch": 240.0, | |
| "eval_accuracy": 0.9802, | |
| "eval_loss": 0.08267929404973984, | |
| "eval_runtime": 12.9081, | |
| "eval_samples_per_second": 774.71, | |
| "eval_steps_per_second": 3.099, | |
| "step": 79920 | |
| }, | |
| { | |
| "epoch": 240.24, | |
| "grad_norm": 10.550077438354492, | |
| "learning_rate": 1.991991991991992e-06, | |
| "loss": 0.1246, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 241.0, | |
| "eval_accuracy": 0.9805, | |
| "eval_loss": 0.08258900791406631, | |
| "eval_runtime": 13.3618, | |
| "eval_samples_per_second": 748.404, | |
| "eval_steps_per_second": 2.994, | |
| "step": 80253 | |
| }, | |
| { | |
| "epoch": 241.74, | |
| "grad_norm": 14.927352905273438, | |
| "learning_rate": 1.941941941941942e-06, | |
| "loss": 0.128, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 242.0, | |
| "eval_accuracy": 0.9797, | |
| "eval_loss": 0.08207195997238159, | |
| "eval_runtime": 13.4168, | |
| "eval_samples_per_second": 745.333, | |
| "eval_steps_per_second": 2.981, | |
| "step": 80586 | |
| }, | |
| { | |
| "epoch": 243.0, | |
| "eval_accuracy": 0.981, | |
| "eval_loss": 0.08075813204050064, | |
| "eval_runtime": 12.9166, | |
| "eval_samples_per_second": 774.198, | |
| "eval_steps_per_second": 3.097, | |
| "step": 80919 | |
| }, | |
| { | |
| "epoch": 243.24, | |
| "grad_norm": 10.435842514038086, | |
| "learning_rate": 1.8918918918918922e-06, | |
| "loss": 0.1274, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 244.0, | |
| "eval_accuracy": 0.9806, | |
| "eval_loss": 0.0817038044333458, | |
| "eval_runtime": 12.9068, | |
| "eval_samples_per_second": 774.784, | |
| "eval_steps_per_second": 3.099, | |
| "step": 81252 | |
| }, | |
| { | |
| "epoch": 244.74, | |
| "grad_norm": 6.686298370361328, | |
| "learning_rate": 1.841841841841842e-06, | |
| "loss": 0.1232, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 245.0, | |
| "eval_accuracy": 0.9805, | |
| "eval_loss": 0.0812101811170578, | |
| "eval_runtime": 13.3736, | |
| "eval_samples_per_second": 747.741, | |
| "eval_steps_per_second": 2.991, | |
| "step": 81585 | |
| }, | |
| { | |
| "epoch": 246.0, | |
| "eval_accuracy": 0.9809, | |
| "eval_loss": 0.08127359300851822, | |
| "eval_runtime": 13.8034, | |
| "eval_samples_per_second": 724.46, | |
| "eval_steps_per_second": 2.898, | |
| "step": 81918 | |
| }, | |
| { | |
| "epoch": 246.25, | |
| "grad_norm": 9.9036865234375, | |
| "learning_rate": 1.7917917917917917e-06, | |
| "loss": 0.1281, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 247.0, | |
| "eval_accuracy": 0.9801, | |
| "eval_loss": 0.0803731232881546, | |
| "eval_runtime": 13.6609, | |
| "eval_samples_per_second": 732.019, | |
| "eval_steps_per_second": 2.928, | |
| "step": 82251 | |
| }, | |
| { | |
| "epoch": 247.75, | |
| "grad_norm": 9.58124828338623, | |
| "learning_rate": 1.7417417417417418e-06, | |
| "loss": 0.1236, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 248.0, | |
| "eval_accuracy": 0.9807, | |
| "eval_loss": 0.08054234832525253, | |
| "eval_runtime": 12.9985, | |
| "eval_samples_per_second": 769.319, | |
| "eval_steps_per_second": 3.077, | |
| "step": 82584 | |
| }, | |
| { | |
| "epoch": 249.0, | |
| "eval_accuracy": 0.9807, | |
| "eval_loss": 0.08253397792577744, | |
| "eval_runtime": 12.7246, | |
| "eval_samples_per_second": 785.882, | |
| "eval_steps_per_second": 3.144, | |
| "step": 82917 | |
| }, | |
| { | |
| "epoch": 249.25, | |
| "grad_norm": 11.608097076416016, | |
| "learning_rate": 1.6916916916916916e-06, | |
| "loss": 0.1223, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 250.0, | |
| "eval_accuracy": 0.9804, | |
| "eval_loss": 0.08115767687559128, | |
| "eval_runtime": 13.6973, | |
| "eval_samples_per_second": 730.07, | |
| "eval_steps_per_second": 2.92, | |
| "step": 83250 | |
| }, | |
| { | |
| "epoch": 250.75, | |
| "grad_norm": 7.931227207183838, | |
| "learning_rate": 1.6416416416416417e-06, | |
| "loss": 0.1278, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 251.0, | |
| "eval_accuracy": 0.9802, | |
| "eval_loss": 0.08087089657783508, | |
| "eval_runtime": 12.8657, | |
| "eval_samples_per_second": 777.263, | |
| "eval_steps_per_second": 3.109, | |
| "step": 83583 | |
| }, | |
| { | |
| "epoch": 252.0, | |
| "eval_accuracy": 0.9818, | |
| "eval_loss": 0.07839205116033554, | |
| "eval_runtime": 12.9826, | |
| "eval_samples_per_second": 770.262, | |
| "eval_steps_per_second": 3.081, | |
| "step": 83916 | |
| }, | |
| { | |
| "epoch": 252.25, | |
| "grad_norm": 6.618145942687988, | |
| "learning_rate": 1.5915915915915916e-06, | |
| "loss": 0.1238, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 253.0, | |
| "eval_accuracy": 0.9808, | |
| "eval_loss": 0.07928815484046936, | |
| "eval_runtime": 13.4795, | |
| "eval_samples_per_second": 741.867, | |
| "eval_steps_per_second": 2.967, | |
| "step": 84249 | |
| }, | |
| { | |
| "epoch": 253.75, | |
| "grad_norm": 6.5788397789001465, | |
| "learning_rate": 1.5415415415415416e-06, | |
| "loss": 0.1259, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 254.0, | |
| "eval_accuracy": 0.9814, | |
| "eval_loss": 0.08129309117794037, | |
| "eval_runtime": 12.8787, | |
| "eval_samples_per_second": 776.478, | |
| "eval_steps_per_second": 3.106, | |
| "step": 84582 | |
| }, | |
| { | |
| "epoch": 255.0, | |
| "eval_accuracy": 0.981, | |
| "eval_loss": 0.08033791929483414, | |
| "eval_runtime": 12.7033, | |
| "eval_samples_per_second": 787.197, | |
| "eval_steps_per_second": 3.149, | |
| "step": 84915 | |
| }, | |
| { | |
| "epoch": 255.26, | |
| "grad_norm": 8.367218017578125, | |
| "learning_rate": 1.4914914914914915e-06, | |
| "loss": 0.1261, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 256.0, | |
| "eval_accuracy": 0.981, | |
| "eval_loss": 0.08045142143964767, | |
| "eval_runtime": 12.9511, | |
| "eval_samples_per_second": 772.133, | |
| "eval_steps_per_second": 3.089, | |
| "step": 85248 | |
| }, | |
| { | |
| "epoch": 256.76, | |
| "grad_norm": 17.39365005493164, | |
| "learning_rate": 1.4414414414414416e-06, | |
| "loss": 0.1312, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 257.0, | |
| "eval_accuracy": 0.9805, | |
| "eval_loss": 0.08164441585540771, | |
| "eval_runtime": 13.3735, | |
| "eval_samples_per_second": 747.745, | |
| "eval_steps_per_second": 2.991, | |
| "step": 85581 | |
| }, | |
| { | |
| "epoch": 258.0, | |
| "eval_accuracy": 0.9807, | |
| "eval_loss": 0.08030729740858078, | |
| "eval_runtime": 12.9143, | |
| "eval_samples_per_second": 774.336, | |
| "eval_steps_per_second": 3.097, | |
| "step": 85914 | |
| }, | |
| { | |
| "epoch": 258.26, | |
| "grad_norm": 12.668910026550293, | |
| "learning_rate": 1.3913913913913914e-06, | |
| "loss": 0.1237, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 259.0, | |
| "eval_accuracy": 0.9804, | |
| "eval_loss": 0.07897236198186874, | |
| "eval_runtime": 13.3964, | |
| "eval_samples_per_second": 746.469, | |
| "eval_steps_per_second": 2.986, | |
| "step": 86247 | |
| }, | |
| { | |
| "epoch": 259.76, | |
| "grad_norm": 3.696176767349243, | |
| "learning_rate": 1.3413413413413415e-06, | |
| "loss": 0.1234, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 260.0, | |
| "eval_accuracy": 0.9803, | |
| "eval_loss": 0.07928313314914703, | |
| "eval_runtime": 13.8621, | |
| "eval_samples_per_second": 721.391, | |
| "eval_steps_per_second": 2.886, | |
| "step": 86580 | |
| }, | |
| { | |
| "epoch": 261.0, | |
| "eval_accuracy": 0.9806, | |
| "eval_loss": 0.07920601218938828, | |
| "eval_runtime": 12.907, | |
| "eval_samples_per_second": 774.775, | |
| "eval_steps_per_second": 3.099, | |
| "step": 86913 | |
| }, | |
| { | |
| "epoch": 261.26, | |
| "grad_norm": 11.28502082824707, | |
| "learning_rate": 1.2912912912912913e-06, | |
| "loss": 0.1237, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 262.0, | |
| "eval_accuracy": 0.9806, | |
| "eval_loss": 0.08003947883844376, | |
| "eval_runtime": 13.8177, | |
| "eval_samples_per_second": 723.709, | |
| "eval_steps_per_second": 2.895, | |
| "step": 87246 | |
| }, | |
| { | |
| "epoch": 262.76, | |
| "grad_norm": 13.543560981750488, | |
| "learning_rate": 1.2412412412412414e-06, | |
| "loss": 0.1257, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 263.0, | |
| "eval_accuracy": 0.9802, | |
| "eval_loss": 0.08235891908407211, | |
| "eval_runtime": 13.4574, | |
| "eval_samples_per_second": 743.088, | |
| "eval_steps_per_second": 2.972, | |
| "step": 87579 | |
| }, | |
| { | |
| "epoch": 264.0, | |
| "eval_accuracy": 0.9807, | |
| "eval_loss": 0.08182436227798462, | |
| "eval_runtime": 12.9778, | |
| "eval_samples_per_second": 770.546, | |
| "eval_steps_per_second": 3.082, | |
| "step": 87912 | |
| }, | |
| { | |
| "epoch": 264.26, | |
| "grad_norm": 11.065189361572266, | |
| "learning_rate": 1.1911911911911913e-06, | |
| "loss": 0.1219, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 265.0, | |
| "eval_accuracy": 0.9808, | |
| "eval_loss": 0.08205542713403702, | |
| "eval_runtime": 13.6001, | |
| "eval_samples_per_second": 735.288, | |
| "eval_steps_per_second": 2.941, | |
| "step": 88245 | |
| }, | |
| { | |
| "epoch": 265.77, | |
| "grad_norm": 9.291784286499023, | |
| "learning_rate": 1.1411411411411411e-06, | |
| "loss": 0.1298, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 266.0, | |
| "eval_accuracy": 0.9805, | |
| "eval_loss": 0.08165726810693741, | |
| "eval_runtime": 13.3896, | |
| "eval_samples_per_second": 746.85, | |
| "eval_steps_per_second": 2.987, | |
| "step": 88578 | |
| }, | |
| { | |
| "epoch": 267.0, | |
| "eval_accuracy": 0.9805, | |
| "eval_loss": 0.08162441104650497, | |
| "eval_runtime": 14.0305, | |
| "eval_samples_per_second": 712.733, | |
| "eval_steps_per_second": 2.851, | |
| "step": 88911 | |
| }, | |
| { | |
| "epoch": 267.27, | |
| "grad_norm": 17.33576202392578, | |
| "learning_rate": 1.0910910910910912e-06, | |
| "loss": 0.1222, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 268.0, | |
| "eval_accuracy": 0.9806, | |
| "eval_loss": 0.08136063069105148, | |
| "eval_runtime": 12.8095, | |
| "eval_samples_per_second": 780.671, | |
| "eval_steps_per_second": 3.123, | |
| "step": 89244 | |
| }, | |
| { | |
| "epoch": 268.77, | |
| "grad_norm": 11.170260429382324, | |
| "learning_rate": 1.041041041041041e-06, | |
| "loss": 0.1268, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 269.0, | |
| "eval_accuracy": 0.9803, | |
| "eval_loss": 0.08162767440080643, | |
| "eval_runtime": 13.5821, | |
| "eval_samples_per_second": 736.263, | |
| "eval_steps_per_second": 2.945, | |
| "step": 89577 | |
| }, | |
| { | |
| "epoch": 270.0, | |
| "eval_accuracy": 0.981, | |
| "eval_loss": 0.08254320919513702, | |
| "eval_runtime": 12.9419, | |
| "eval_samples_per_second": 772.681, | |
| "eval_steps_per_second": 3.091, | |
| "step": 89910 | |
| }, | |
| { | |
| "epoch": 270.27, | |
| "grad_norm": 10.08292007446289, | |
| "learning_rate": 9.909909909909911e-07, | |
| "loss": 0.1239, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 271.0, | |
| "eval_accuracy": 0.9802, | |
| "eval_loss": 0.08088234812021255, | |
| "eval_runtime": 12.9857, | |
| "eval_samples_per_second": 770.075, | |
| "eval_steps_per_second": 3.08, | |
| "step": 90243 | |
| }, | |
| { | |
| "epoch": 271.77, | |
| "grad_norm": 7.639751434326172, | |
| "learning_rate": 9.409409409409411e-07, | |
| "loss": 0.1277, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 272.0, | |
| "eval_accuracy": 0.9804, | |
| "eval_loss": 0.0805734246969223, | |
| "eval_runtime": 12.6096, | |
| "eval_samples_per_second": 793.045, | |
| "eval_steps_per_second": 3.172, | |
| "step": 90576 | |
| }, | |
| { | |
| "epoch": 273.0, | |
| "eval_accuracy": 0.98, | |
| "eval_loss": 0.08124550431966782, | |
| "eval_runtime": 13.1278, | |
| "eval_samples_per_second": 761.741, | |
| "eval_steps_per_second": 3.047, | |
| "step": 90909 | |
| }, | |
| { | |
| "epoch": 273.27, | |
| "grad_norm": 7.800063133239746, | |
| "learning_rate": 8.90890890890891e-07, | |
| "loss": 0.1235, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 274.0, | |
| "eval_accuracy": 0.9807, | |
| "eval_loss": 0.08137263357639313, | |
| "eval_runtime": 13.3744, | |
| "eval_samples_per_second": 747.7, | |
| "eval_steps_per_second": 2.991, | |
| "step": 91242 | |
| }, | |
| { | |
| "epoch": 274.77, | |
| "grad_norm": 13.224382400512695, | |
| "learning_rate": 8.40840840840841e-07, | |
| "loss": 0.1261, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 275.0, | |
| "eval_accuracy": 0.9801, | |
| "eval_loss": 0.08086758852005005, | |
| "eval_runtime": 12.9048, | |
| "eval_samples_per_second": 774.907, | |
| "eval_steps_per_second": 3.1, | |
| "step": 91575 | |
| }, | |
| { | |
| "epoch": 276.0, | |
| "eval_accuracy": 0.9806, | |
| "eval_loss": 0.080258309841156, | |
| "eval_runtime": 14.2222, | |
| "eval_samples_per_second": 703.127, | |
| "eval_steps_per_second": 2.813, | |
| "step": 91908 | |
| }, | |
| { | |
| "epoch": 276.28, | |
| "grad_norm": 8.529864311218262, | |
| "learning_rate": 7.907907907907908e-07, | |
| "loss": 0.1219, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 277.0, | |
| "eval_accuracy": 0.9803, | |
| "eval_loss": 0.08069344609975815, | |
| "eval_runtime": 13.3763, | |
| "eval_samples_per_second": 747.589, | |
| "eval_steps_per_second": 2.99, | |
| "step": 92241 | |
| }, | |
| { | |
| "epoch": 277.78, | |
| "grad_norm": 5.7626051902771, | |
| "learning_rate": 7.407407407407407e-07, | |
| "loss": 0.1235, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 278.0, | |
| "eval_accuracy": 0.9812, | |
| "eval_loss": 0.0805598720908165, | |
| "eval_runtime": 13.2417, | |
| "eval_samples_per_second": 755.192, | |
| "eval_steps_per_second": 3.021, | |
| "step": 92574 | |
| }, | |
| { | |
| "epoch": 279.0, | |
| "eval_accuracy": 0.9807, | |
| "eval_loss": 0.07991771399974823, | |
| "eval_runtime": 12.9989, | |
| "eval_samples_per_second": 769.296, | |
| "eval_steps_per_second": 3.077, | |
| "step": 92907 | |
| }, | |
| { | |
| "epoch": 279.28, | |
| "grad_norm": 12.886475563049316, | |
| "learning_rate": 6.906906906906907e-07, | |
| "loss": 0.1232, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 280.0, | |
| "eval_accuracy": 0.9805, | |
| "eval_loss": 0.08009103685617447, | |
| "eval_runtime": 14.1951, | |
| "eval_samples_per_second": 704.469, | |
| "eval_steps_per_second": 2.818, | |
| "step": 93240 | |
| }, | |
| { | |
| "epoch": 280.78, | |
| "grad_norm": 13.245797157287598, | |
| "learning_rate": 6.406406406406407e-07, | |
| "loss": 0.1236, | |
| "step": 93500 | |
| }, | |
| { | |
| "epoch": 281.0, | |
| "eval_accuracy": 0.9812, | |
| "eval_loss": 0.08077774941921234, | |
| "eval_runtime": 13.9349, | |
| "eval_samples_per_second": 717.624, | |
| "eval_steps_per_second": 2.87, | |
| "step": 93573 | |
| }, | |
| { | |
| "epoch": 282.0, | |
| "eval_accuracy": 0.9807, | |
| "eval_loss": 0.08111685514450073, | |
| "eval_runtime": 13.0495, | |
| "eval_samples_per_second": 766.313, | |
| "eval_steps_per_second": 3.065, | |
| "step": 93906 | |
| }, | |
| { | |
| "epoch": 282.28, | |
| "grad_norm": 6.8997673988342285, | |
| "learning_rate": 5.905905905905906e-07, | |
| "loss": 0.1195, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 283.0, | |
| "eval_accuracy": 0.9804, | |
| "eval_loss": 0.08137265592813492, | |
| "eval_runtime": 13.163, | |
| "eval_samples_per_second": 759.705, | |
| "eval_steps_per_second": 3.039, | |
| "step": 94239 | |
| }, | |
| { | |
| "epoch": 283.78, | |
| "grad_norm": 12.197209358215332, | |
| "learning_rate": 5.405405405405406e-07, | |
| "loss": 0.1191, | |
| "step": 94500 | |
| }, | |
| { | |
| "epoch": 284.0, | |
| "eval_accuracy": 0.9804, | |
| "eval_loss": 0.08120004087686539, | |
| "eval_runtime": 12.9217, | |
| "eval_samples_per_second": 773.893, | |
| "eval_steps_per_second": 3.096, | |
| "step": 94572 | |
| }, | |
| { | |
| "epoch": 285.0, | |
| "eval_accuracy": 0.9805, | |
| "eval_loss": 0.08181598037481308, | |
| "eval_runtime": 12.828, | |
| "eval_samples_per_second": 779.547, | |
| "eval_steps_per_second": 3.118, | |
| "step": 94905 | |
| }, | |
| { | |
| "epoch": 285.29, | |
| "grad_norm": 6.001578330993652, | |
| "learning_rate": 4.904904904904905e-07, | |
| "loss": 0.1205, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 286.0, | |
| "eval_accuracy": 0.9807, | |
| "eval_loss": 0.08141326904296875, | |
| "eval_runtime": 13.7647, | |
| "eval_samples_per_second": 726.495, | |
| "eval_steps_per_second": 2.906, | |
| "step": 95238 | |
| }, | |
| { | |
| "epoch": 286.79, | |
| "grad_norm": 9.633207321166992, | |
| "learning_rate": 4.4044044044044046e-07, | |
| "loss": 0.1203, | |
| "step": 95500 | |
| }, | |
| { | |
| "epoch": 287.0, | |
| "eval_accuracy": 0.9808, | |
| "eval_loss": 0.08182702958583832, | |
| "eval_runtime": 14.1767, | |
| "eval_samples_per_second": 705.381, | |
| "eval_steps_per_second": 2.822, | |
| "step": 95571 | |
| }, | |
| { | |
| "epoch": 288.0, | |
| "eval_accuracy": 0.9806, | |
| "eval_loss": 0.08031768351793289, | |
| "eval_runtime": 14.019, | |
| "eval_samples_per_second": 713.316, | |
| "eval_steps_per_second": 2.853, | |
| "step": 95904 | |
| }, | |
| { | |
| "epoch": 288.29, | |
| "grad_norm": 9.451753616333008, | |
| "learning_rate": 3.903903903903904e-07, | |
| "loss": 0.1197, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 289.0, | |
| "eval_accuracy": 0.9812, | |
| "eval_loss": 0.0809708833694458, | |
| "eval_runtime": 13.7936, | |
| "eval_samples_per_second": 724.975, | |
| "eval_steps_per_second": 2.9, | |
| "step": 96237 | |
| }, | |
| { | |
| "epoch": 289.79, | |
| "grad_norm": 10.313632011413574, | |
| "learning_rate": 3.403403403403404e-07, | |
| "loss": 0.1233, | |
| "step": 96500 | |
| }, | |
| { | |
| "epoch": 290.0, | |
| "eval_accuracy": 0.9811, | |
| "eval_loss": 0.08130063861608505, | |
| "eval_runtime": 13.4821, | |
| "eval_samples_per_second": 741.722, | |
| "eval_steps_per_second": 2.967, | |
| "step": 96570 | |
| }, | |
| { | |
| "epoch": 291.0, | |
| "eval_accuracy": 0.9813, | |
| "eval_loss": 0.08096129447221756, | |
| "eval_runtime": 13.9986, | |
| "eval_samples_per_second": 714.359, | |
| "eval_steps_per_second": 2.857, | |
| "step": 96903 | |
| }, | |
| { | |
| "epoch": 291.29, | |
| "grad_norm": 6.7220892906188965, | |
| "learning_rate": 2.9029029029029035e-07, | |
| "loss": 0.12, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 292.0, | |
| "eval_accuracy": 0.9813, | |
| "eval_loss": 0.08056668192148209, | |
| "eval_runtime": 13.2921, | |
| "eval_samples_per_second": 752.329, | |
| "eval_steps_per_second": 3.009, | |
| "step": 97236 | |
| }, | |
| { | |
| "epoch": 292.79, | |
| "grad_norm": 7.212859630584717, | |
| "learning_rate": 2.4024024024024026e-07, | |
| "loss": 0.1219, | |
| "step": 97500 | |
| }, | |
| { | |
| "epoch": 293.0, | |
| "eval_accuracy": 0.9816, | |
| "eval_loss": 0.08098697662353516, | |
| "eval_runtime": 13.5812, | |
| "eval_samples_per_second": 736.31, | |
| "eval_steps_per_second": 2.945, | |
| "step": 97569 | |
| }, | |
| { | |
| "epoch": 294.0, | |
| "eval_accuracy": 0.9815, | |
| "eval_loss": 0.08067005127668381, | |
| "eval_runtime": 12.9034, | |
| "eval_samples_per_second": 774.988, | |
| "eval_steps_per_second": 3.1, | |
| "step": 97902 | |
| }, | |
| { | |
| "epoch": 294.29, | |
| "grad_norm": 7.5087409019470215, | |
| "learning_rate": 1.9019019019019022e-07, | |
| "loss": 0.1202, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 295.0, | |
| "eval_accuracy": 0.9813, | |
| "eval_loss": 0.08077917248010635, | |
| "eval_runtime": 13.4699, | |
| "eval_samples_per_second": 742.397, | |
| "eval_steps_per_second": 2.97, | |
| "step": 98235 | |
| }, | |
| { | |
| "epoch": 295.8, | |
| "grad_norm": 7.660182952880859, | |
| "learning_rate": 1.4014014014014016e-07, | |
| "loss": 0.1228, | |
| "step": 98500 | |
| }, | |
| { | |
| "epoch": 296.0, | |
| "eval_accuracy": 0.9815, | |
| "eval_loss": 0.0807722955942154, | |
| "eval_runtime": 13.0168, | |
| "eval_samples_per_second": 768.237, | |
| "eval_steps_per_second": 3.073, | |
| "step": 98568 | |
| }, | |
| { | |
| "epoch": 297.0, | |
| "eval_accuracy": 0.9813, | |
| "eval_loss": 0.08067157864570618, | |
| "eval_runtime": 13.4303, | |
| "eval_samples_per_second": 744.586, | |
| "eval_steps_per_second": 2.978, | |
| "step": 98901 | |
| }, | |
| { | |
| "epoch": 297.3, | |
| "grad_norm": 10.4266357421875, | |
| "learning_rate": 9.00900900900901e-08, | |
| "loss": 0.1212, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 298.0, | |
| "eval_accuracy": 0.9812, | |
| "eval_loss": 0.08074088394641876, | |
| "eval_runtime": 12.8481, | |
| "eval_samples_per_second": 778.327, | |
| "eval_steps_per_second": 3.113, | |
| "step": 99234 | |
| }, | |
| { | |
| "epoch": 298.8, | |
| "grad_norm": 10.557640075683594, | |
| "learning_rate": 4.004004004004004e-08, | |
| "loss": 0.1214, | |
| "step": 99500 | |
| }, | |
| { | |
| "epoch": 299.0, | |
| "eval_accuracy": 0.9812, | |
| "eval_loss": 0.0807051733136177, | |
| "eval_runtime": 13.1178, | |
| "eval_samples_per_second": 762.323, | |
| "eval_steps_per_second": 3.049, | |
| "step": 99567 | |
| }, | |
| { | |
| "epoch": 300.0, | |
| "eval_accuracy": 0.981, | |
| "eval_loss": 0.08068788051605225, | |
| "eval_runtime": 12.9887, | |
| "eval_samples_per_second": 769.902, | |
| "eval_steps_per_second": 3.08, | |
| "step": 99900 | |
| }, | |
| { | |
| "epoch": 300.0, | |
| "step": 99900, | |
| "total_flos": 3.1698470226124734e+20, | |
| "train_loss": 0.17093151241451413, | |
| "train_runtime": 47820.897, | |
| "train_samples_per_second": 266.62, | |
| "train_steps_per_second": 2.089 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 99900, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 300, | |
| "save_steps": 500, | |
| "total_flos": 3.1698470226124734e+20, | |
| "train_batch_size": 128, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |