diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4123 @@ +{ + "best_metric": 0.9818, + "best_model_checkpoint": "../../checkpoint/cifar10/swin-tiny/checkpoint-38295", + "epoch": 300.0, + "eval_steps": 500, + "global_step": 99900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.0, + "eval_accuracy": 0.9223, + "eval_loss": 0.2584497928619385, + "eval_runtime": 21.3622, + "eval_samples_per_second": 468.117, + "eval_steps_per_second": 1.872, + "step": 333 + }, + { + "epoch": 1.5, + "grad_norm": 14.220479011535645, + "learning_rate": 9.949949949949951e-06, + "loss": 0.9076, + "step": 500 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.945, + "eval_loss": 0.1637052595615387, + "eval_runtime": 13.6699, + "eval_samples_per_second": 731.533, + "eval_steps_per_second": 2.926, + "step": 666 + }, + { + "epoch": 3.0, + "eval_accuracy": 0.9553, + "eval_loss": 0.1344435065984726, + "eval_runtime": 13.1721, + "eval_samples_per_second": 759.181, + "eval_steps_per_second": 3.037, + "step": 999 + }, + { + "epoch": 3.0, + "grad_norm": 9.328938484191895, + "learning_rate": 9.899899899899901e-06, + "loss": 0.4797, + "step": 1000 + }, + { + "epoch": 4.0, + "eval_accuracy": 0.9604, + "eval_loss": 0.1205841451883316, + "eval_runtime": 12.6584, + "eval_samples_per_second": 789.992, + "eval_steps_per_second": 3.16, + "step": 1332 + }, + { + "epoch": 4.5, + "grad_norm": 14.11563777923584, + "learning_rate": 9.849849849849851e-06, + "loss": 0.4193, + "step": 1500 + }, + { + "epoch": 5.0, + "eval_accuracy": 0.9635, + "eval_loss": 0.11088060587644577, + "eval_runtime": 12.7891, + "eval_samples_per_second": 781.918, + "eval_steps_per_second": 3.128, + "step": 1665 + }, + { + "epoch": 6.0, + "eval_accuracy": 0.9661, + "eval_loss": 0.10564317554235458, + "eval_runtime": 12.9686, + "eval_samples_per_second": 771.094, + "eval_steps_per_second": 3.084, + "step": 1998 + }, + { + "epoch": 6.01, + "grad_norm": 12.565740585327148, + "learning_rate": 9.799799799799801e-06, + "loss": 0.3846, + "step": 2000 + }, + { + "epoch": 7.0, + "eval_accuracy": 0.9688, + "eval_loss": 0.09508195519447327, + "eval_runtime": 13.2698, + "eval_samples_per_second": 753.592, + "eval_steps_per_second": 3.014, + "step": 2331 + }, + { + "epoch": 7.51, + "grad_norm": 9.896069526672363, + "learning_rate": 9.749749749749751e-06, + "loss": 0.3572, + "step": 2500 + }, + { + "epoch": 8.0, + "eval_accuracy": 0.9689, + "eval_loss": 0.09568808227777481, + "eval_runtime": 13.6448, + "eval_samples_per_second": 732.879, + "eval_steps_per_second": 2.932, + "step": 2664 + }, + { + "epoch": 9.0, + "eval_accuracy": 0.9693, + "eval_loss": 0.09088099747896194, + "eval_runtime": 13.9779, + "eval_samples_per_second": 715.417, + "eval_steps_per_second": 2.862, + "step": 2997 + }, + { + "epoch": 9.01, + "grad_norm": 9.739038467407227, + "learning_rate": 9.699699699699701e-06, + "loss": 0.3409, + "step": 3000 + }, + { + "epoch": 10.0, + "eval_accuracy": 0.971, + "eval_loss": 0.0861617922782898, + "eval_runtime": 13.5874, + "eval_samples_per_second": 735.979, + "eval_steps_per_second": 2.944, + "step": 3330 + }, + { + "epoch": 10.51, + "grad_norm": 7.383803367614746, + "learning_rate": 9.649649649649651e-06, + "loss": 0.3319, + "step": 3500 + }, + { + "epoch": 11.0, + "eval_accuracy": 0.9721, + "eval_loss": 0.08562646806240082, + "eval_runtime": 13.5289, + "eval_samples_per_second": 739.158, + "eval_steps_per_second": 2.957, + "step": 3663 + }, + { + "epoch": 12.0, + "eval_accuracy": 0.972, + "eval_loss": 0.08723447471857071, + "eval_runtime": 13.3531, + "eval_samples_per_second": 748.887, + "eval_steps_per_second": 2.996, + "step": 3996 + }, + { + "epoch": 12.01, + "grad_norm": 11.866540908813477, + "learning_rate": 9.5995995995996e-06, + "loss": 0.3253, + "step": 4000 + }, + { + "epoch": 13.0, + "eval_accuracy": 0.973, + "eval_loss": 0.08058160543441772, + "eval_runtime": 14.1547, + "eval_samples_per_second": 706.479, + "eval_steps_per_second": 2.826, + "step": 4329 + }, + { + "epoch": 13.51, + "grad_norm": 7.938398361206055, + "learning_rate": 9.54954954954955e-06, + "loss": 0.3084, + "step": 4500 + }, + { + "epoch": 14.0, + "eval_accuracy": 0.9738, + "eval_loss": 0.08162784576416016, + "eval_runtime": 14.1065, + "eval_samples_per_second": 708.895, + "eval_steps_per_second": 2.836, + "step": 4662 + }, + { + "epoch": 15.0, + "eval_accuracy": 0.9742, + "eval_loss": 0.07894858717918396, + "eval_runtime": 13.886, + "eval_samples_per_second": 720.149, + "eval_steps_per_second": 2.881, + "step": 4995 + }, + { + "epoch": 15.02, + "grad_norm": 16.568248748779297, + "learning_rate": 9.4994994994995e-06, + "loss": 0.3022, + "step": 5000 + }, + { + "epoch": 16.0, + "eval_accuracy": 0.9746, + "eval_loss": 0.07670588046312332, + "eval_runtime": 13.5929, + "eval_samples_per_second": 735.676, + "eval_steps_per_second": 2.943, + "step": 5328 + }, + { + "epoch": 16.52, + "grad_norm": 13.009441375732422, + "learning_rate": 9.44944944944945e-06, + "loss": 0.2894, + "step": 5500 + }, + { + "epoch": 17.0, + "eval_accuracy": 0.9725, + "eval_loss": 0.0805484876036644, + "eval_runtime": 13.3932, + "eval_samples_per_second": 746.649, + "eval_steps_per_second": 2.987, + "step": 5661 + }, + { + "epoch": 18.0, + "eval_accuracy": 0.9759, + "eval_loss": 0.0759720578789711, + "eval_runtime": 13.5457, + "eval_samples_per_second": 738.24, + "eval_steps_per_second": 2.953, + "step": 5994 + }, + { + "epoch": 18.02, + "grad_norm": 13.468392372131348, + "learning_rate": 9.3993993993994e-06, + "loss": 0.2842, + "step": 6000 + }, + { + "epoch": 19.0, + "eval_accuracy": 0.9744, + "eval_loss": 0.07423894852399826, + "eval_runtime": 13.6253, + "eval_samples_per_second": 733.929, + "eval_steps_per_second": 2.936, + "step": 6327 + }, + { + "epoch": 19.52, + "grad_norm": 12.263895988464355, + "learning_rate": 9.34934934934935e-06, + "loss": 0.2712, + "step": 6500 + }, + { + "epoch": 20.0, + "eval_accuracy": 0.9738, + "eval_loss": 0.07846847176551819, + "eval_runtime": 12.9608, + "eval_samples_per_second": 771.556, + "eval_steps_per_second": 3.086, + "step": 6660 + }, + { + "epoch": 21.0, + "eval_accuracy": 0.9735, + "eval_loss": 0.07904864102602005, + "eval_runtime": 12.8724, + "eval_samples_per_second": 776.858, + "eval_steps_per_second": 3.107, + "step": 6993 + }, + { + "epoch": 21.02, + "grad_norm": 10.845202445983887, + "learning_rate": 9.2992992992993e-06, + "loss": 0.2729, + "step": 7000 + }, + { + "epoch": 22.0, + "eval_accuracy": 0.9759, + "eval_loss": 0.07514221966266632, + "eval_runtime": 14.1339, + "eval_samples_per_second": 707.52, + "eval_steps_per_second": 2.83, + "step": 7326 + }, + { + "epoch": 22.52, + "grad_norm": 11.12897777557373, + "learning_rate": 9.24924924924925e-06, + "loss": 0.2634, + "step": 7500 + }, + { + "epoch": 23.0, + "eval_accuracy": 0.9737, + "eval_loss": 0.07959982007741928, + "eval_runtime": 13.0065, + "eval_samples_per_second": 768.844, + "eval_steps_per_second": 3.075, + "step": 7659 + }, + { + "epoch": 24.0, + "eval_accuracy": 0.9752, + "eval_loss": 0.07558540254831314, + "eval_runtime": 13.805, + "eval_samples_per_second": 724.375, + "eval_steps_per_second": 2.897, + "step": 7992 + }, + { + "epoch": 24.02, + "grad_norm": 10.100821495056152, + "learning_rate": 9.1991991991992e-06, + "loss": 0.2591, + "step": 8000 + }, + { + "epoch": 25.0, + "eval_accuracy": 0.9759, + "eval_loss": 0.07549387961626053, + "eval_runtime": 13.4677, + "eval_samples_per_second": 742.518, + "eval_steps_per_second": 2.97, + "step": 8325 + }, + { + "epoch": 25.53, + "grad_norm": 9.881790161132812, + "learning_rate": 9.14914914914915e-06, + "loss": 0.253, + "step": 8500 + }, + { + "epoch": 26.0, + "eval_accuracy": 0.9746, + "eval_loss": 0.07933681458234787, + "eval_runtime": 13.2517, + "eval_samples_per_second": 754.619, + "eval_steps_per_second": 3.018, + "step": 8658 + }, + { + "epoch": 27.0, + "eval_accuracy": 0.9765, + "eval_loss": 0.07278025895357132, + "eval_runtime": 13.5258, + "eval_samples_per_second": 739.327, + "eval_steps_per_second": 2.957, + "step": 8991 + }, + { + "epoch": 27.03, + "grad_norm": 7.72860860824585, + "learning_rate": 9.0990990990991e-06, + "loss": 0.2518, + "step": 9000 + }, + { + "epoch": 28.0, + "eval_accuracy": 0.9748, + "eval_loss": 0.07914856821298599, + "eval_runtime": 13.7348, + "eval_samples_per_second": 728.079, + "eval_steps_per_second": 2.912, + "step": 9324 + }, + { + "epoch": 28.53, + "grad_norm": 8.068327903747559, + "learning_rate": 9.04904904904905e-06, + "loss": 0.2482, + "step": 9500 + }, + { + "epoch": 29.0, + "eval_accuracy": 0.9756, + "eval_loss": 0.07918867468833923, + "eval_runtime": 13.3633, + "eval_samples_per_second": 748.316, + "eval_steps_per_second": 2.993, + "step": 9657 + }, + { + "epoch": 30.0, + "eval_accuracy": 0.9764, + "eval_loss": 0.07418718934059143, + "eval_runtime": 12.9493, + "eval_samples_per_second": 772.24, + "eval_steps_per_second": 3.089, + "step": 9990 + }, + { + "epoch": 30.03, + "grad_norm": 8.977522850036621, + "learning_rate": 8.998998998999e-06, + "loss": 0.2429, + "step": 10000 + }, + { + "epoch": 31.0, + "eval_accuracy": 0.9757, + "eval_loss": 0.07399851083755493, + "eval_runtime": 13.4787, + "eval_samples_per_second": 741.913, + "eval_steps_per_second": 2.968, + "step": 10323 + }, + { + "epoch": 31.53, + "grad_norm": 11.080597877502441, + "learning_rate": 8.94894894894895e-06, + "loss": 0.2405, + "step": 10500 + }, + { + "epoch": 32.0, + "eval_accuracy": 0.9757, + "eval_loss": 0.07426943629980087, + "eval_runtime": 12.8343, + "eval_samples_per_second": 779.16, + "eval_steps_per_second": 3.117, + "step": 10656 + }, + { + "epoch": 33.0, + "eval_accuracy": 0.9757, + "eval_loss": 0.07429418712854385, + "eval_runtime": 12.9825, + "eval_samples_per_second": 770.266, + "eval_steps_per_second": 3.081, + "step": 10989 + }, + { + "epoch": 33.03, + "grad_norm": 7.3039140701293945, + "learning_rate": 8.8988988988989e-06, + "loss": 0.234, + "step": 11000 + }, + { + "epoch": 34.0, + "eval_accuracy": 0.9769, + "eval_loss": 0.07486932724714279, + "eval_runtime": 12.96, + "eval_samples_per_second": 771.606, + "eval_steps_per_second": 3.086, + "step": 11322 + }, + { + "epoch": 34.53, + "grad_norm": 8.610194206237793, + "learning_rate": 8.84884884884885e-06, + "loss": 0.2353, + "step": 11500 + }, + { + "epoch": 35.0, + "eval_accuracy": 0.975, + "eval_loss": 0.0768030509352684, + "eval_runtime": 13.519, + "eval_samples_per_second": 739.698, + "eval_steps_per_second": 2.959, + "step": 11655 + }, + { + "epoch": 36.0, + "eval_accuracy": 0.9771, + "eval_loss": 0.07342812418937683, + "eval_runtime": 14.3472, + "eval_samples_per_second": 697.001, + "eval_steps_per_second": 2.788, + "step": 11988 + }, + { + "epoch": 36.04, + "grad_norm": 7.767194747924805, + "learning_rate": 8.798798798798799e-06, + "loss": 0.2329, + "step": 12000 + }, + { + "epoch": 37.0, + "eval_accuracy": 0.9755, + "eval_loss": 0.07778933644294739, + "eval_runtime": 13.5633, + "eval_samples_per_second": 737.284, + "eval_steps_per_second": 2.949, + "step": 12321 + }, + { + "epoch": 37.54, + "grad_norm": 11.39279842376709, + "learning_rate": 8.74874874874875e-06, + "loss": 0.2289, + "step": 12500 + }, + { + "epoch": 38.0, + "eval_accuracy": 0.9771, + "eval_loss": 0.07622923702001572, + "eval_runtime": 13.5603, + "eval_samples_per_second": 737.447, + "eval_steps_per_second": 2.95, + "step": 12654 + }, + { + "epoch": 39.0, + "eval_accuracy": 0.9761, + "eval_loss": 0.07648137956857681, + "eval_runtime": 13.4622, + "eval_samples_per_second": 742.82, + "eval_steps_per_second": 2.971, + "step": 12987 + }, + { + "epoch": 39.04, + "grad_norm": 8.879070281982422, + "learning_rate": 8.6986986986987e-06, + "loss": 0.227, + "step": 13000 + }, + { + "epoch": 40.0, + "eval_accuracy": 0.9768, + "eval_loss": 0.07394447922706604, + "eval_runtime": 13.4641, + "eval_samples_per_second": 742.715, + "eval_steps_per_second": 2.971, + "step": 13320 + }, + { + "epoch": 40.54, + "grad_norm": 10.858572006225586, + "learning_rate": 8.64864864864865e-06, + "loss": 0.2213, + "step": 13500 + }, + { + "epoch": 41.0, + "eval_accuracy": 0.9773, + "eval_loss": 0.07473840564489365, + "eval_runtime": 12.9211, + "eval_samples_per_second": 773.93, + "eval_steps_per_second": 3.096, + "step": 13653 + }, + { + "epoch": 42.0, + "eval_accuracy": 0.9786, + "eval_loss": 0.07195272296667099, + "eval_runtime": 13.3716, + "eval_samples_per_second": 747.852, + "eval_steps_per_second": 2.991, + "step": 13986 + }, + { + "epoch": 42.04, + "grad_norm": 9.299273490905762, + "learning_rate": 8.5985985985986e-06, + "loss": 0.217, + "step": 14000 + }, + { + "epoch": 43.0, + "eval_accuracy": 0.9771, + "eval_loss": 0.07661354541778564, + "eval_runtime": 13.4888, + "eval_samples_per_second": 741.354, + "eval_steps_per_second": 2.965, + "step": 14319 + }, + { + "epoch": 43.54, + "grad_norm": 9.49695873260498, + "learning_rate": 8.54854854854855e-06, + "loss": 0.22, + "step": 14500 + }, + { + "epoch": 44.0, + "eval_accuracy": 0.9767, + "eval_loss": 0.07640816271305084, + "eval_runtime": 14.0377, + "eval_samples_per_second": 712.365, + "eval_steps_per_second": 2.849, + "step": 14652 + }, + { + "epoch": 45.0, + "eval_accuracy": 0.9779, + "eval_loss": 0.07278802245855331, + "eval_runtime": 13.4886, + "eval_samples_per_second": 741.366, + "eval_steps_per_second": 2.965, + "step": 14985 + }, + { + "epoch": 45.05, + "grad_norm": 12.065461158752441, + "learning_rate": 8.4984984984985e-06, + "loss": 0.2179, + "step": 15000 + }, + { + "epoch": 46.0, + "eval_accuracy": 0.9785, + "eval_loss": 0.0740213543176651, + "eval_runtime": 14.112, + "eval_samples_per_second": 708.617, + "eval_steps_per_second": 2.834, + "step": 15318 + }, + { + "epoch": 46.55, + "grad_norm": 9.281307220458984, + "learning_rate": 8.44844844844845e-06, + "loss": 0.2074, + "step": 15500 + }, + { + "epoch": 47.0, + "eval_accuracy": 0.9793, + "eval_loss": 0.0712471604347229, + "eval_runtime": 13.5017, + "eval_samples_per_second": 740.647, + "eval_steps_per_second": 2.963, + "step": 15651 + }, + { + "epoch": 48.0, + "eval_accuracy": 0.9783, + "eval_loss": 0.0759299248456955, + "eval_runtime": 13.3849, + "eval_samples_per_second": 747.113, + "eval_steps_per_second": 2.988, + "step": 15984 + }, + { + "epoch": 48.05, + "grad_norm": 6.8984503746032715, + "learning_rate": 8.398398398398398e-06, + "loss": 0.2096, + "step": 16000 + }, + { + "epoch": 49.0, + "eval_accuracy": 0.9791, + "eval_loss": 0.07268951088190079, + "eval_runtime": 13.5376, + "eval_samples_per_second": 738.686, + "eval_steps_per_second": 2.955, + "step": 16317 + }, + { + "epoch": 49.55, + "grad_norm": 8.968807220458984, + "learning_rate": 8.348348348348348e-06, + "loss": 0.2097, + "step": 16500 + }, + { + "epoch": 50.0, + "eval_accuracy": 0.9792, + "eval_loss": 0.07472656667232513, + "eval_runtime": 13.5262, + "eval_samples_per_second": 739.304, + "eval_steps_per_second": 2.957, + "step": 16650 + }, + { + "epoch": 51.0, + "eval_accuracy": 0.9795, + "eval_loss": 0.0754549577832222, + "eval_runtime": 13.1606, + "eval_samples_per_second": 759.845, + "eval_steps_per_second": 3.039, + "step": 16983 + }, + { + "epoch": 51.05, + "grad_norm": 8.540103912353516, + "learning_rate": 8.298298298298298e-06, + "loss": 0.2063, + "step": 17000 + }, + { + "epoch": 52.0, + "eval_accuracy": 0.9788, + "eval_loss": 0.0741283968091011, + "eval_runtime": 13.8466, + "eval_samples_per_second": 722.201, + "eval_steps_per_second": 2.889, + "step": 17316 + }, + { + "epoch": 52.55, + "grad_norm": 7.042116165161133, + "learning_rate": 8.248248248248248e-06, + "loss": 0.2054, + "step": 17500 + }, + { + "epoch": 53.0, + "eval_accuracy": 0.9784, + "eval_loss": 0.0738772302865982, + "eval_runtime": 13.021, + "eval_samples_per_second": 767.992, + "eval_steps_per_second": 3.072, + "step": 17649 + }, + { + "epoch": 54.0, + "eval_accuracy": 0.9779, + "eval_loss": 0.07553113251924515, + "eval_runtime": 12.8958, + "eval_samples_per_second": 775.444, + "eval_steps_per_second": 3.102, + "step": 17982 + }, + { + "epoch": 54.05, + "grad_norm": 9.23681640625, + "learning_rate": 8.198198198198198e-06, + "loss": 0.2003, + "step": 18000 + }, + { + "epoch": 55.0, + "eval_accuracy": 0.9784, + "eval_loss": 0.07760650664567947, + "eval_runtime": 12.6634, + "eval_samples_per_second": 789.678, + "eval_steps_per_second": 3.159, + "step": 18315 + }, + { + "epoch": 55.56, + "grad_norm": 5.839297771453857, + "learning_rate": 8.148148148148148e-06, + "loss": 0.2009, + "step": 18500 + }, + { + "epoch": 56.0, + "eval_accuracy": 0.9786, + "eval_loss": 0.07352690398693085, + "eval_runtime": 13.1656, + "eval_samples_per_second": 759.554, + "eval_steps_per_second": 3.038, + "step": 18648 + }, + { + "epoch": 57.0, + "eval_accuracy": 0.9769, + "eval_loss": 0.07721856981515884, + "eval_runtime": 12.8626, + "eval_samples_per_second": 777.447, + "eval_steps_per_second": 3.11, + "step": 18981 + }, + { + "epoch": 57.06, + "grad_norm": 10.131054878234863, + "learning_rate": 8.098098098098098e-06, + "loss": 0.1999, + "step": 19000 + }, + { + "epoch": 58.0, + "eval_accuracy": 0.9789, + "eval_loss": 0.07691636681556702, + "eval_runtime": 12.8042, + "eval_samples_per_second": 780.991, + "eval_steps_per_second": 3.124, + "step": 19314 + }, + { + "epoch": 58.56, + "grad_norm": 7.643968105316162, + "learning_rate": 8.048048048048048e-06, + "loss": 0.1973, + "step": 19500 + }, + { + "epoch": 59.0, + "eval_accuracy": 0.9793, + "eval_loss": 0.07336228340864182, + "eval_runtime": 13.6825, + "eval_samples_per_second": 730.861, + "eval_steps_per_second": 2.923, + "step": 19647 + }, + { + "epoch": 60.0, + "eval_accuracy": 0.9787, + "eval_loss": 0.07408491522073746, + "eval_runtime": 13.4079, + "eval_samples_per_second": 745.831, + "eval_steps_per_second": 2.983, + "step": 19980 + }, + { + "epoch": 60.06, + "grad_norm": 9.443299293518066, + "learning_rate": 7.997997997997999e-06, + "loss": 0.1953, + "step": 20000 + }, + { + "epoch": 61.0, + "eval_accuracy": 0.978, + "eval_loss": 0.07513260841369629, + "eval_runtime": 13.4048, + "eval_samples_per_second": 745.999, + "eval_steps_per_second": 2.984, + "step": 20313 + }, + { + "epoch": 61.56, + "grad_norm": 16.85797119140625, + "learning_rate": 7.947947947947949e-06, + "loss": 0.1937, + "step": 20500 + }, + { + "epoch": 62.0, + "eval_accuracy": 0.9786, + "eval_loss": 0.07370081543922424, + "eval_runtime": 13.3055, + "eval_samples_per_second": 751.568, + "eval_steps_per_second": 3.006, + "step": 20646 + }, + { + "epoch": 63.0, + "eval_accuracy": 0.9786, + "eval_loss": 0.07323586940765381, + "eval_runtime": 12.7695, + "eval_samples_per_second": 783.119, + "eval_steps_per_second": 3.132, + "step": 20979 + }, + { + "epoch": 63.06, + "grad_norm": 8.4561128616333, + "learning_rate": 7.897897897897899e-06, + "loss": 0.1946, + "step": 21000 + }, + { + "epoch": 64.0, + "eval_accuracy": 0.9789, + "eval_loss": 0.07585693150758743, + "eval_runtime": 13.6921, + "eval_samples_per_second": 730.349, + "eval_steps_per_second": 2.921, + "step": 21312 + }, + { + "epoch": 64.56, + "grad_norm": 11.68150806427002, + "learning_rate": 7.847847847847849e-06, + "loss": 0.1909, + "step": 21500 + }, + { + "epoch": 65.0, + "eval_accuracy": 0.9798, + "eval_loss": 0.07349375635385513, + "eval_runtime": 12.8445, + "eval_samples_per_second": 778.544, + "eval_steps_per_second": 3.114, + "step": 21645 + }, + { + "epoch": 66.0, + "eval_accuracy": 0.9788, + "eval_loss": 0.07336971163749695, + "eval_runtime": 12.8882, + "eval_samples_per_second": 775.905, + "eval_steps_per_second": 3.104, + "step": 21978 + }, + { + "epoch": 66.07, + "grad_norm": 8.738271713256836, + "learning_rate": 7.797797797797799e-06, + "loss": 0.1935, + "step": 22000 + }, + { + "epoch": 67.0, + "eval_accuracy": 0.9793, + "eval_loss": 0.07337453961372375, + "eval_runtime": 12.8166, + "eval_samples_per_second": 780.24, + "eval_steps_per_second": 3.121, + "step": 22311 + }, + { + "epoch": 67.57, + "grad_norm": 6.386814117431641, + "learning_rate": 7.747747747747749e-06, + "loss": 0.1936, + "step": 22500 + }, + { + "epoch": 68.0, + "eval_accuracy": 0.9795, + "eval_loss": 0.07239189743995667, + "eval_runtime": 12.831, + "eval_samples_per_second": 779.362, + "eval_steps_per_second": 3.117, + "step": 22644 + }, + { + "epoch": 69.0, + "eval_accuracy": 0.9785, + "eval_loss": 0.07570048421621323, + "eval_runtime": 12.8964, + "eval_samples_per_second": 775.412, + "eval_steps_per_second": 3.102, + "step": 22977 + }, + { + "epoch": 69.07, + "grad_norm": 9.476435661315918, + "learning_rate": 7.697697697697697e-06, + "loss": 0.1858, + "step": 23000 + }, + { + "epoch": 70.0, + "eval_accuracy": 0.9801, + "eval_loss": 0.07100442796945572, + "eval_runtime": 13.0999, + "eval_samples_per_second": 763.367, + "eval_steps_per_second": 3.053, + "step": 23310 + }, + { + "epoch": 70.57, + "grad_norm": 9.190871238708496, + "learning_rate": 7.647647647647647e-06, + "loss": 0.1871, + "step": 23500 + }, + { + "epoch": 71.0, + "eval_accuracy": 0.9799, + "eval_loss": 0.07596922665834427, + "eval_runtime": 13.423, + "eval_samples_per_second": 744.991, + "eval_steps_per_second": 2.98, + "step": 23643 + }, + { + "epoch": 72.0, + "eval_accuracy": 0.9801, + "eval_loss": 0.07650475203990936, + "eval_runtime": 13.055, + "eval_samples_per_second": 765.99, + "eval_steps_per_second": 3.064, + "step": 23976 + }, + { + "epoch": 72.07, + "grad_norm": 10.302529335021973, + "learning_rate": 7.597597597597598e-06, + "loss": 0.1836, + "step": 24000 + }, + { + "epoch": 73.0, + "eval_accuracy": 0.9787, + "eval_loss": 0.07714686542749405, + "eval_runtime": 13.4272, + "eval_samples_per_second": 744.757, + "eval_steps_per_second": 2.979, + "step": 24309 + }, + { + "epoch": 73.57, + "grad_norm": 7.050232410430908, + "learning_rate": 7.547547547547548e-06, + "loss": 0.1827, + "step": 24500 + }, + { + "epoch": 74.0, + "eval_accuracy": 0.9782, + "eval_loss": 0.07620517909526825, + "eval_runtime": 12.8858, + "eval_samples_per_second": 776.045, + "eval_steps_per_second": 3.104, + "step": 24642 + }, + { + "epoch": 75.0, + "eval_accuracy": 0.9781, + "eval_loss": 0.0778127908706665, + "eval_runtime": 13.234, + "eval_samples_per_second": 755.629, + "eval_steps_per_second": 3.023, + "step": 24975 + }, + { + "epoch": 75.08, + "grad_norm": 8.824182510375977, + "learning_rate": 7.4974974974974975e-06, + "loss": 0.1847, + "step": 25000 + }, + { + "epoch": 76.0, + "eval_accuracy": 0.9781, + "eval_loss": 0.08140425384044647, + "eval_runtime": 13.9137, + "eval_samples_per_second": 718.714, + "eval_steps_per_second": 2.875, + "step": 25308 + }, + { + "epoch": 76.58, + "grad_norm": 8.920430183410645, + "learning_rate": 7.447447447447448e-06, + "loss": 0.1815, + "step": 25500 + }, + { + "epoch": 77.0, + "eval_accuracy": 0.9788, + "eval_loss": 0.07689312100410461, + "eval_runtime": 13.1404, + "eval_samples_per_second": 761.014, + "eval_steps_per_second": 3.044, + "step": 25641 + }, + { + "epoch": 78.0, + "eval_accuracy": 0.9801, + "eval_loss": 0.07370501756668091, + "eval_runtime": 13.7683, + "eval_samples_per_second": 726.307, + "eval_steps_per_second": 2.905, + "step": 25974 + }, + { + "epoch": 78.08, + "grad_norm": 9.352115631103516, + "learning_rate": 7.397397397397398e-06, + "loss": 0.1786, + "step": 26000 + }, + { + "epoch": 79.0, + "eval_accuracy": 0.9795, + "eval_loss": 0.07396883517503738, + "eval_runtime": 13.0019, + "eval_samples_per_second": 769.121, + "eval_steps_per_second": 3.076, + "step": 26307 + }, + { + "epoch": 79.58, + "grad_norm": 14.500313758850098, + "learning_rate": 7.347347347347348e-06, + "loss": 0.1819, + "step": 26500 + }, + { + "epoch": 80.0, + "eval_accuracy": 0.9807, + "eval_loss": 0.07097125053405762, + "eval_runtime": 13.6192, + "eval_samples_per_second": 734.256, + "eval_steps_per_second": 2.937, + "step": 26640 + }, + { + "epoch": 81.0, + "eval_accuracy": 0.9799, + "eval_loss": 0.07538946717977524, + "eval_runtime": 13.1675, + "eval_samples_per_second": 759.445, + "eval_steps_per_second": 3.038, + "step": 26973 + }, + { + "epoch": 81.08, + "grad_norm": 6.939184188842773, + "learning_rate": 7.297297297297298e-06, + "loss": 0.1767, + "step": 27000 + }, + { + "epoch": 82.0, + "eval_accuracy": 0.9789, + "eval_loss": 0.07721950113773346, + "eval_runtime": 13.1312, + "eval_samples_per_second": 761.542, + "eval_steps_per_second": 3.046, + "step": 27306 + }, + { + "epoch": 82.58, + "grad_norm": 6.59556770324707, + "learning_rate": 7.247247247247248e-06, + "loss": 0.1792, + "step": 27500 + }, + { + "epoch": 83.0, + "eval_accuracy": 0.9799, + "eval_loss": 0.0765281617641449, + "eval_runtime": 13.0707, + "eval_samples_per_second": 765.07, + "eval_steps_per_second": 3.06, + "step": 27639 + }, + { + "epoch": 84.0, + "eval_accuracy": 0.9799, + "eval_loss": 0.07408629357814789, + "eval_runtime": 12.4754, + "eval_samples_per_second": 801.579, + "eval_steps_per_second": 3.206, + "step": 27972 + }, + { + "epoch": 84.08, + "grad_norm": 7.376372814178467, + "learning_rate": 7.197197197197198e-06, + "loss": 0.1752, + "step": 28000 + }, + { + "epoch": 85.0, + "eval_accuracy": 0.9795, + "eval_loss": 0.0741427093744278, + "eval_runtime": 12.8952, + "eval_samples_per_second": 775.481, + "eval_steps_per_second": 3.102, + "step": 28305 + }, + { + "epoch": 85.59, + "grad_norm": 11.074542045593262, + "learning_rate": 7.147147147147148e-06, + "loss": 0.1789, + "step": 28500 + }, + { + "epoch": 86.0, + "eval_accuracy": 0.9802, + "eval_loss": 0.07748846709728241, + "eval_runtime": 12.5701, + "eval_samples_per_second": 795.538, + "eval_steps_per_second": 3.182, + "step": 28638 + }, + { + "epoch": 87.0, + "eval_accuracy": 0.9803, + "eval_loss": 0.07314252853393555, + "eval_runtime": 13.4921, + "eval_samples_per_second": 741.174, + "eval_steps_per_second": 2.965, + "step": 28971 + }, + { + "epoch": 87.09, + "grad_norm": 8.502799987792969, + "learning_rate": 7.097097097097097e-06, + "loss": 0.1755, + "step": 29000 + }, + { + "epoch": 88.0, + "eval_accuracy": 0.9806, + "eval_loss": 0.07246743142604828, + "eval_runtime": 14.1164, + "eval_samples_per_second": 708.396, + "eval_steps_per_second": 2.834, + "step": 29304 + }, + { + "epoch": 88.59, + "grad_norm": 10.004383087158203, + "learning_rate": 7.047047047047047e-06, + "loss": 0.1694, + "step": 29500 + }, + { + "epoch": 89.0, + "eval_accuracy": 0.9795, + "eval_loss": 0.07495511323213577, + "eval_runtime": 13.0203, + "eval_samples_per_second": 768.031, + "eval_steps_per_second": 3.072, + "step": 29637 + }, + { + "epoch": 90.0, + "eval_accuracy": 0.9815, + "eval_loss": 0.07112333923578262, + "eval_runtime": 12.8831, + "eval_samples_per_second": 776.209, + "eval_steps_per_second": 3.105, + "step": 29970 + }, + { + "epoch": 90.09, + "grad_norm": 10.05745792388916, + "learning_rate": 6.996996996996997e-06, + "loss": 0.1739, + "step": 30000 + }, + { + "epoch": 91.0, + "eval_accuracy": 0.98, + "eval_loss": 0.07672711461782455, + "eval_runtime": 13.4897, + "eval_samples_per_second": 741.306, + "eval_steps_per_second": 2.965, + "step": 30303 + }, + { + "epoch": 91.59, + "grad_norm": 7.928704738616943, + "learning_rate": 6.9469469469469474e-06, + "loss": 0.1726, + "step": 30500 + }, + { + "epoch": 92.0, + "eval_accuracy": 0.9801, + "eval_loss": 0.0770508348941803, + "eval_runtime": 13.4469, + "eval_samples_per_second": 743.667, + "eval_steps_per_second": 2.975, + "step": 30636 + }, + { + "epoch": 93.0, + "eval_accuracy": 0.9786, + "eval_loss": 0.0784955620765686, + "eval_runtime": 13.8288, + "eval_samples_per_second": 723.131, + "eval_steps_per_second": 2.893, + "step": 30969 + }, + { + "epoch": 93.09, + "grad_norm": 9.178421974182129, + "learning_rate": 6.8968968968968975e-06, + "loss": 0.1696, + "step": 31000 + }, + { + "epoch": 94.0, + "eval_accuracy": 0.9787, + "eval_loss": 0.07988455891609192, + "eval_runtime": 13.675, + "eval_samples_per_second": 731.261, + "eval_steps_per_second": 2.925, + "step": 31302 + }, + { + "epoch": 94.59, + "grad_norm": 7.237130165100098, + "learning_rate": 6.846846846846848e-06, + "loss": 0.1723, + "step": 31500 + }, + { + "epoch": 95.0, + "eval_accuracy": 0.979, + "eval_loss": 0.07755716890096664, + "eval_runtime": 13.4765, + "eval_samples_per_second": 742.035, + "eval_steps_per_second": 2.968, + "step": 31635 + }, + { + "epoch": 96.0, + "eval_accuracy": 0.9796, + "eval_loss": 0.07740277796983719, + "eval_runtime": 13.7122, + "eval_samples_per_second": 729.276, + "eval_steps_per_second": 2.917, + "step": 31968 + }, + { + "epoch": 96.1, + "grad_norm": 6.332306385040283, + "learning_rate": 6.796796796796798e-06, + "loss": 0.1692, + "step": 32000 + }, + { + "epoch": 97.0, + "eval_accuracy": 0.9797, + "eval_loss": 0.08065084367990494, + "eval_runtime": 12.8364, + "eval_samples_per_second": 779.036, + "eval_steps_per_second": 3.116, + "step": 32301 + }, + { + "epoch": 97.6, + "grad_norm": 6.978306770324707, + "learning_rate": 6.746746746746748e-06, + "loss": 0.17, + "step": 32500 + }, + { + "epoch": 98.0, + "eval_accuracy": 0.9798, + "eval_loss": 0.07497260719537735, + "eval_runtime": 12.9795, + "eval_samples_per_second": 770.446, + "eval_steps_per_second": 3.082, + "step": 32634 + }, + { + "epoch": 99.0, + "eval_accuracy": 0.9805, + "eval_loss": 0.07648865878582001, + "eval_runtime": 13.1724, + "eval_samples_per_second": 759.163, + "eval_steps_per_second": 3.037, + "step": 32967 + }, + { + "epoch": 99.1, + "grad_norm": 9.569737434387207, + "learning_rate": 6.696696696696697e-06, + "loss": 0.1691, + "step": 33000 + }, + { + "epoch": 100.0, + "eval_accuracy": 0.9798, + "eval_loss": 0.07629863917827606, + "eval_runtime": 13.7613, + "eval_samples_per_second": 726.674, + "eval_steps_per_second": 2.907, + "step": 33300 + }, + { + "epoch": 100.6, + "grad_norm": 9.273295402526855, + "learning_rate": 6.646646646646647e-06, + "loss": 0.165, + "step": 33500 + }, + { + "epoch": 101.0, + "eval_accuracy": 0.9794, + "eval_loss": 0.07651650160551071, + "eval_runtime": 12.8929, + "eval_samples_per_second": 775.622, + "eval_steps_per_second": 3.102, + "step": 33633 + }, + { + "epoch": 102.0, + "eval_accuracy": 0.9806, + "eval_loss": 0.07412749528884888, + "eval_runtime": 13.1273, + "eval_samples_per_second": 761.772, + "eval_steps_per_second": 3.047, + "step": 33966 + }, + { + "epoch": 102.1, + "grad_norm": 5.686313152313232, + "learning_rate": 6.596596596596597e-06, + "loss": 0.1678, + "step": 34000 + }, + { + "epoch": 103.0, + "eval_accuracy": 0.9805, + "eval_loss": 0.07281830161809921, + "eval_runtime": 14.1227, + "eval_samples_per_second": 708.079, + "eval_steps_per_second": 2.832, + "step": 34299 + }, + { + "epoch": 103.6, + "grad_norm": 13.40892505645752, + "learning_rate": 6.546546546546547e-06, + "loss": 0.1663, + "step": 34500 + }, + { + "epoch": 104.0, + "eval_accuracy": 0.9803, + "eval_loss": 0.07456088066101074, + "eval_runtime": 12.9479, + "eval_samples_per_second": 772.329, + "eval_steps_per_second": 3.089, + "step": 34632 + }, + { + "epoch": 105.0, + "eval_accuracy": 0.9796, + "eval_loss": 0.07469187676906586, + "eval_runtime": 13.464, + "eval_samples_per_second": 742.723, + "eval_steps_per_second": 2.971, + "step": 34965 + }, + { + "epoch": 105.11, + "grad_norm": 3.3622846603393555, + "learning_rate": 6.496496496496497e-06, + "loss": 0.1697, + "step": 35000 + }, + { + "epoch": 106.0, + "eval_accuracy": 0.98, + "eval_loss": 0.07429709285497665, + "eval_runtime": 12.5016, + "eval_samples_per_second": 799.901, + "eval_steps_per_second": 3.2, + "step": 35298 + }, + { + "epoch": 106.61, + "grad_norm": 13.544451713562012, + "learning_rate": 6.446446446446447e-06, + "loss": 0.1637, + "step": 35500 + }, + { + "epoch": 107.0, + "eval_accuracy": 0.9796, + "eval_loss": 0.07689350843429565, + "eval_runtime": 13.0156, + "eval_samples_per_second": 768.306, + "eval_steps_per_second": 3.073, + "step": 35631 + }, + { + "epoch": 108.0, + "eval_accuracy": 0.9802, + "eval_loss": 0.07509542256593704, + "eval_runtime": 13.0921, + "eval_samples_per_second": 763.817, + "eval_steps_per_second": 3.055, + "step": 35964 + }, + { + "epoch": 108.11, + "grad_norm": 11.040998458862305, + "learning_rate": 6.396396396396397e-06, + "loss": 0.1678, + "step": 36000 + }, + { + "epoch": 109.0, + "eval_accuracy": 0.9807, + "eval_loss": 0.0769224464893341, + "eval_runtime": 13.4563, + "eval_samples_per_second": 743.145, + "eval_steps_per_second": 2.973, + "step": 36297 + }, + { + "epoch": 109.61, + "grad_norm": 7.243069171905518, + "learning_rate": 6.3463463463463474e-06, + "loss": 0.1674, + "step": 36500 + }, + { + "epoch": 110.0, + "eval_accuracy": 0.9808, + "eval_loss": 0.07392393797636032, + "eval_runtime": 12.9386, + "eval_samples_per_second": 772.879, + "eval_steps_per_second": 3.092, + "step": 36630 + }, + { + "epoch": 111.0, + "eval_accuracy": 0.9795, + "eval_loss": 0.0809590220451355, + "eval_runtime": 13.2637, + "eval_samples_per_second": 753.935, + "eval_steps_per_second": 3.016, + "step": 36963 + }, + { + "epoch": 111.11, + "grad_norm": 8.149242401123047, + "learning_rate": 6.296296296296297e-06, + "loss": 0.1604, + "step": 37000 + }, + { + "epoch": 112.0, + "eval_accuracy": 0.9806, + "eval_loss": 0.07439053803682327, + "eval_runtime": 12.9959, + "eval_samples_per_second": 769.471, + "eval_steps_per_second": 3.078, + "step": 37296 + }, + { + "epoch": 112.61, + "grad_norm": 6.591969966888428, + "learning_rate": 6.246246246246247e-06, + "loss": 0.1583, + "step": 37500 + }, + { + "epoch": 113.0, + "eval_accuracy": 0.9816, + "eval_loss": 0.07411955296993256, + "eval_runtime": 12.6355, + "eval_samples_per_second": 791.421, + "eval_steps_per_second": 3.166, + "step": 37629 + }, + { + "epoch": 114.0, + "eval_accuracy": 0.98, + "eval_loss": 0.07842327654361725, + "eval_runtime": 14.6497, + "eval_samples_per_second": 682.608, + "eval_steps_per_second": 2.73, + "step": 37962 + }, + { + "epoch": 114.11, + "grad_norm": 8.455940246582031, + "learning_rate": 6.196196196196197e-06, + "loss": 0.1592, + "step": 38000 + }, + { + "epoch": 115.0, + "eval_accuracy": 0.9818, + "eval_loss": 0.07287651300430298, + "eval_runtime": 13.5165, + "eval_samples_per_second": 739.838, + "eval_steps_per_second": 2.959, + "step": 38295 + }, + { + "epoch": 115.62, + "grad_norm": 6.092105388641357, + "learning_rate": 6.146146146146147e-06, + "loss": 0.1607, + "step": 38500 + }, + { + "epoch": 116.0, + "eval_accuracy": 0.9818, + "eval_loss": 0.07438412308692932, + "eval_runtime": 13.6093, + "eval_samples_per_second": 734.794, + "eval_steps_per_second": 2.939, + "step": 38628 + }, + { + "epoch": 117.0, + "eval_accuracy": 0.9817, + "eval_loss": 0.07355909794569016, + "eval_runtime": 12.6698, + "eval_samples_per_second": 789.279, + "eval_steps_per_second": 3.157, + "step": 38961 + }, + { + "epoch": 117.12, + "grad_norm": 7.972623348236084, + "learning_rate": 6.096096096096097e-06, + "loss": 0.1657, + "step": 39000 + }, + { + "epoch": 118.0, + "eval_accuracy": 0.9805, + "eval_loss": 0.0769243985414505, + "eval_runtime": 13.3442, + "eval_samples_per_second": 749.391, + "eval_steps_per_second": 2.998, + "step": 39294 + }, + { + "epoch": 118.62, + "grad_norm": 7.559940338134766, + "learning_rate": 6.046046046046047e-06, + "loss": 0.1605, + "step": 39500 + }, + { + "epoch": 119.0, + "eval_accuracy": 0.9812, + "eval_loss": 0.0768662765622139, + "eval_runtime": 13.0344, + "eval_samples_per_second": 767.2, + "eval_steps_per_second": 3.069, + "step": 39627 + }, + { + "epoch": 120.0, + "eval_accuracy": 0.9808, + "eval_loss": 0.07865633815526962, + "eval_runtime": 13.8055, + "eval_samples_per_second": 724.347, + "eval_steps_per_second": 2.897, + "step": 39960 + }, + { + "epoch": 120.12, + "grad_norm": 7.175966739654541, + "learning_rate": 5.995995995995997e-06, + "loss": 0.1554, + "step": 40000 + }, + { + "epoch": 121.0, + "eval_accuracy": 0.9801, + "eval_loss": 0.07854399085044861, + "eval_runtime": 12.8799, + "eval_samples_per_second": 776.402, + "eval_steps_per_second": 3.106, + "step": 40293 + }, + { + "epoch": 121.62, + "grad_norm": 12.97214126586914, + "learning_rate": 5.945945945945947e-06, + "loss": 0.157, + "step": 40500 + }, + { + "epoch": 122.0, + "eval_accuracy": 0.9796, + "eval_loss": 0.0760401040315628, + "eval_runtime": 12.9319, + "eval_samples_per_second": 773.283, + "eval_steps_per_second": 3.093, + "step": 40626 + }, + { + "epoch": 123.0, + "eval_accuracy": 0.9805, + "eval_loss": 0.07537718862295151, + "eval_runtime": 12.9913, + "eval_samples_per_second": 769.749, + "eval_steps_per_second": 3.079, + "step": 40959 + }, + { + "epoch": 123.12, + "grad_norm": 7.540937423706055, + "learning_rate": 5.895895895895896e-06, + "loss": 0.1549, + "step": 41000 + }, + { + "epoch": 124.0, + "eval_accuracy": 0.9802, + "eval_loss": 0.07550998032093048, + "eval_runtime": 14.417, + "eval_samples_per_second": 693.624, + "eval_steps_per_second": 2.774, + "step": 41292 + }, + { + "epoch": 124.62, + "grad_norm": 6.355432987213135, + "learning_rate": 5.8458458458458464e-06, + "loss": 0.1578, + "step": 41500 + }, + { + "epoch": 125.0, + "eval_accuracy": 0.9792, + "eval_loss": 0.07649920880794525, + "eval_runtime": 13.4926, + "eval_samples_per_second": 741.15, + "eval_steps_per_second": 2.965, + "step": 41625 + }, + { + "epoch": 126.0, + "eval_accuracy": 0.98, + "eval_loss": 0.07526528090238571, + "eval_runtime": 12.9079, + "eval_samples_per_second": 774.72, + "eval_steps_per_second": 3.099, + "step": 41958 + }, + { + "epoch": 126.13, + "grad_norm": 6.478011131286621, + "learning_rate": 5.7957957957957965e-06, + "loss": 0.1531, + "step": 42000 + }, + { + "epoch": 127.0, + "eval_accuracy": 0.98, + "eval_loss": 0.07793418318033218, + "eval_runtime": 13.471, + "eval_samples_per_second": 742.337, + "eval_steps_per_second": 2.969, + "step": 42291 + }, + { + "epoch": 127.63, + "grad_norm": 7.928163051605225, + "learning_rate": 5.7457457457457466e-06, + "loss": 0.1572, + "step": 42500 + }, + { + "epoch": 128.0, + "eval_accuracy": 0.98, + "eval_loss": 0.07834824174642563, + "eval_runtime": 13.8772, + "eval_samples_per_second": 720.605, + "eval_steps_per_second": 2.882, + "step": 42624 + }, + { + "epoch": 129.0, + "eval_accuracy": 0.9796, + "eval_loss": 0.0785522609949112, + "eval_runtime": 12.947, + "eval_samples_per_second": 772.377, + "eval_steps_per_second": 3.09, + "step": 42957 + }, + { + "epoch": 129.13, + "grad_norm": 19.900619506835938, + "learning_rate": 5.695695695695697e-06, + "loss": 0.1558, + "step": 43000 + }, + { + "epoch": 130.0, + "eval_accuracy": 0.9814, + "eval_loss": 0.0741908997297287, + "eval_runtime": 12.8882, + "eval_samples_per_second": 775.906, + "eval_steps_per_second": 3.104, + "step": 43290 + }, + { + "epoch": 130.63, + "grad_norm": 12.561553001403809, + "learning_rate": 5.645645645645647e-06, + "loss": 0.1515, + "step": 43500 + }, + { + "epoch": 131.0, + "eval_accuracy": 0.9798, + "eval_loss": 0.07759422063827515, + "eval_runtime": 14.2426, + "eval_samples_per_second": 702.121, + "eval_steps_per_second": 2.808, + "step": 43623 + }, + { + "epoch": 132.0, + "eval_accuracy": 0.9793, + "eval_loss": 0.08000089973211288, + "eval_runtime": 13.0308, + "eval_samples_per_second": 767.413, + "eval_steps_per_second": 3.07, + "step": 43956 + }, + { + "epoch": 132.13, + "grad_norm": 10.955676078796387, + "learning_rate": 5.595595595595597e-06, + "loss": 0.1526, + "step": 44000 + }, + { + "epoch": 133.0, + "eval_accuracy": 0.9806, + "eval_loss": 0.07563788443803787, + "eval_runtime": 12.9183, + "eval_samples_per_second": 774.093, + "eval_steps_per_second": 3.096, + "step": 44289 + }, + { + "epoch": 133.63, + "grad_norm": 9.621336936950684, + "learning_rate": 5.545545545545547e-06, + "loss": 0.1523, + "step": 44500 + }, + { + "epoch": 134.0, + "eval_accuracy": 0.9797, + "eval_loss": 0.07889340072870255, + "eval_runtime": 13.5904, + "eval_samples_per_second": 735.813, + "eval_steps_per_second": 2.943, + "step": 44622 + }, + { + "epoch": 135.0, + "eval_accuracy": 0.9801, + "eval_loss": 0.07651440799236298, + "eval_runtime": 13.0261, + "eval_samples_per_second": 767.689, + "eval_steps_per_second": 3.071, + "step": 44955 + }, + { + "epoch": 135.14, + "grad_norm": 9.40494155883789, + "learning_rate": 5.495495495495496e-06, + "loss": 0.1519, + "step": 45000 + }, + { + "epoch": 136.0, + "eval_accuracy": 0.9798, + "eval_loss": 0.07700727880001068, + "eval_runtime": 14.2776, + "eval_samples_per_second": 700.397, + "eval_steps_per_second": 2.802, + "step": 45288 + }, + { + "epoch": 136.64, + "grad_norm": 7.778809070587158, + "learning_rate": 5.445445445445446e-06, + "loss": 0.1491, + "step": 45500 + }, + { + "epoch": 137.0, + "eval_accuracy": 0.98, + "eval_loss": 0.07937881350517273, + "eval_runtime": 13.7045, + "eval_samples_per_second": 729.689, + "eval_steps_per_second": 2.919, + "step": 45621 + }, + { + "epoch": 138.0, + "eval_accuracy": 0.9796, + "eval_loss": 0.07901179045438766, + "eval_runtime": 12.8776, + "eval_samples_per_second": 776.54, + "eval_steps_per_second": 3.106, + "step": 45954 + }, + { + "epoch": 138.14, + "grad_norm": 12.694830894470215, + "learning_rate": 5.395395395395396e-06, + "loss": 0.1488, + "step": 46000 + }, + { + "epoch": 139.0, + "eval_accuracy": 0.9796, + "eval_loss": 0.07827717065811157, + "eval_runtime": 13.01, + "eval_samples_per_second": 768.642, + "eval_steps_per_second": 3.075, + "step": 46287 + }, + { + "epoch": 139.64, + "grad_norm": 5.728260517120361, + "learning_rate": 5.345345345345346e-06, + "loss": 0.1511, + "step": 46500 + }, + { + "epoch": 140.0, + "eval_accuracy": 0.98, + "eval_loss": 0.07687978446483612, + "eval_runtime": 13.4169, + "eval_samples_per_second": 745.331, + "eval_steps_per_second": 2.981, + "step": 46620 + }, + { + "epoch": 141.0, + "eval_accuracy": 0.9797, + "eval_loss": 0.0826837420463562, + "eval_runtime": 13.6768, + "eval_samples_per_second": 731.166, + "eval_steps_per_second": 2.925, + "step": 46953 + }, + { + "epoch": 141.14, + "grad_norm": 8.749393463134766, + "learning_rate": 5.2952952952952955e-06, + "loss": 0.1475, + "step": 47000 + }, + { + "epoch": 142.0, + "eval_accuracy": 0.98, + "eval_loss": 0.07702562212944031, + "eval_runtime": 13.4888, + "eval_samples_per_second": 741.356, + "eval_steps_per_second": 2.965, + "step": 47286 + }, + { + "epoch": 142.64, + "grad_norm": 8.479342460632324, + "learning_rate": 5.245245245245245e-06, + "loss": 0.1449, + "step": 47500 + }, + { + "epoch": 143.0, + "eval_accuracy": 0.98, + "eval_loss": 0.07797821611166, + "eval_runtime": 13.0058, + "eval_samples_per_second": 768.886, + "eval_steps_per_second": 3.076, + "step": 47619 + }, + { + "epoch": 144.0, + "eval_accuracy": 0.9795, + "eval_loss": 0.07707054167985916, + "eval_runtime": 12.9845, + "eval_samples_per_second": 770.15, + "eval_steps_per_second": 3.081, + "step": 47952 + }, + { + "epoch": 144.14, + "grad_norm": 10.80911636352539, + "learning_rate": 5.195195195195195e-06, + "loss": 0.146, + "step": 48000 + }, + { + "epoch": 145.0, + "eval_accuracy": 0.9809, + "eval_loss": 0.0750807523727417, + "eval_runtime": 14.0436, + "eval_samples_per_second": 712.069, + "eval_steps_per_second": 2.848, + "step": 48285 + }, + { + "epoch": 145.65, + "grad_norm": 5.568371295928955, + "learning_rate": 5.145145145145145e-06, + "loss": 0.1473, + "step": 48500 + }, + { + "epoch": 146.0, + "eval_accuracy": 0.9797, + "eval_loss": 0.07933703809976578, + "eval_runtime": 13.1022, + "eval_samples_per_second": 763.232, + "eval_steps_per_second": 3.053, + "step": 48618 + }, + { + "epoch": 147.0, + "eval_accuracy": 0.9812, + "eval_loss": 0.07590621709823608, + "eval_runtime": 13.1387, + "eval_samples_per_second": 761.108, + "eval_steps_per_second": 3.044, + "step": 48951 + }, + { + "epoch": 147.15, + "grad_norm": 8.234355926513672, + "learning_rate": 5.095095095095095e-06, + "loss": 0.1466, + "step": 49000 + }, + { + "epoch": 148.0, + "eval_accuracy": 0.9787, + "eval_loss": 0.08211437612771988, + "eval_runtime": 13.453, + "eval_samples_per_second": 743.33, + "eval_steps_per_second": 2.973, + "step": 49284 + }, + { + "epoch": 148.65, + "grad_norm": 9.734493255615234, + "learning_rate": 5.045045045045045e-06, + "loss": 0.1472, + "step": 49500 + }, + { + "epoch": 149.0, + "eval_accuracy": 0.9813, + "eval_loss": 0.07566899061203003, + "eval_runtime": 13.5127, + "eval_samples_per_second": 740.042, + "eval_steps_per_second": 2.96, + "step": 49617 + }, + { + "epoch": 150.0, + "eval_accuracy": 0.9804, + "eval_loss": 0.07641930133104324, + "eval_runtime": 13.729, + "eval_samples_per_second": 728.384, + "eval_steps_per_second": 2.914, + "step": 49950 + }, + { + "epoch": 150.15, + "grad_norm": 9.083195686340332, + "learning_rate": 4.994994994994996e-06, + "loss": 0.1437, + "step": 50000 + }, + { + "epoch": 151.0, + "eval_accuracy": 0.9799, + "eval_loss": 0.0816345363855362, + "eval_runtime": 13.6081, + "eval_samples_per_second": 734.856, + "eval_steps_per_second": 2.939, + "step": 50283 + }, + { + "epoch": 151.65, + "grad_norm": 16.20008087158203, + "learning_rate": 4.944944944944945e-06, + "loss": 0.1487, + "step": 50500 + }, + { + "epoch": 152.0, + "eval_accuracy": 0.9818, + "eval_loss": 0.07768727838993073, + "eval_runtime": 12.9061, + "eval_samples_per_second": 774.83, + "eval_steps_per_second": 3.099, + "step": 50616 + }, + { + "epoch": 153.0, + "eval_accuracy": 0.9811, + "eval_loss": 0.07950293272733688, + "eval_runtime": 13.0523, + "eval_samples_per_second": 766.151, + "eval_steps_per_second": 3.065, + "step": 50949 + }, + { + "epoch": 153.15, + "grad_norm": 6.783934593200684, + "learning_rate": 4.894894894894895e-06, + "loss": 0.1455, + "step": 51000 + }, + { + "epoch": 154.0, + "eval_accuracy": 0.9811, + "eval_loss": 0.07836713641881943, + "eval_runtime": 13.4341, + "eval_samples_per_second": 744.377, + "eval_steps_per_second": 2.978, + "step": 51282 + }, + { + "epoch": 154.65, + "grad_norm": 7.791309833526611, + "learning_rate": 4.844844844844845e-06, + "loss": 0.1463, + "step": 51500 + }, + { + "epoch": 155.0, + "eval_accuracy": 0.9801, + "eval_loss": 0.07995989918708801, + "eval_runtime": 13.7204, + "eval_samples_per_second": 728.844, + "eval_steps_per_second": 2.915, + "step": 51615 + }, + { + "epoch": 156.0, + "eval_accuracy": 0.9809, + "eval_loss": 0.07914280891418457, + "eval_runtime": 13.045, + "eval_samples_per_second": 766.58, + "eval_steps_per_second": 3.066, + "step": 51948 + }, + { + "epoch": 156.16, + "grad_norm": 7.225980281829834, + "learning_rate": 4.794794794794795e-06, + "loss": 0.1449, + "step": 52000 + }, + { + "epoch": 157.0, + "eval_accuracy": 0.9815, + "eval_loss": 0.0777197852730751, + "eval_runtime": 12.8795, + "eval_samples_per_second": 776.43, + "eval_steps_per_second": 3.106, + "step": 52281 + }, + { + "epoch": 157.66, + "grad_norm": 7.848995208740234, + "learning_rate": 4.7447447447447454e-06, + "loss": 0.1413, + "step": 52500 + }, + { + "epoch": 158.0, + "eval_accuracy": 0.9802, + "eval_loss": 0.07978815585374832, + "eval_runtime": 13.0849, + "eval_samples_per_second": 764.238, + "eval_steps_per_second": 3.057, + "step": 52614 + }, + { + "epoch": 159.0, + "eval_accuracy": 0.9798, + "eval_loss": 0.08010842651128769, + "eval_runtime": 12.9948, + "eval_samples_per_second": 769.539, + "eval_steps_per_second": 3.078, + "step": 52947 + }, + { + "epoch": 159.16, + "grad_norm": 10.857318878173828, + "learning_rate": 4.6946946946946955e-06, + "loss": 0.143, + "step": 53000 + }, + { + "epoch": 160.0, + "eval_accuracy": 0.9803, + "eval_loss": 0.07897085696458817, + "eval_runtime": 13.4824, + "eval_samples_per_second": 741.707, + "eval_steps_per_second": 2.967, + "step": 53280 + }, + { + "epoch": 160.66, + "grad_norm": 8.192683219909668, + "learning_rate": 4.6446446446446456e-06, + "loss": 0.1462, + "step": 53500 + }, + { + "epoch": 161.0, + "eval_accuracy": 0.9794, + "eval_loss": 0.07847656309604645, + "eval_runtime": 13.3614, + "eval_samples_per_second": 748.422, + "eval_steps_per_second": 2.994, + "step": 53613 + }, + { + "epoch": 162.0, + "eval_accuracy": 0.9799, + "eval_loss": 0.07839296758174896, + "eval_runtime": 13.4943, + "eval_samples_per_second": 741.054, + "eval_steps_per_second": 2.964, + "step": 53946 + }, + { + "epoch": 162.16, + "grad_norm": 5.753213882446289, + "learning_rate": 4.594594594594596e-06, + "loss": 0.1454, + "step": 54000 + }, + { + "epoch": 163.0, + "eval_accuracy": 0.9814, + "eval_loss": 0.07774946093559265, + "eval_runtime": 13.5161, + "eval_samples_per_second": 739.858, + "eval_steps_per_second": 2.959, + "step": 54279 + }, + { + "epoch": 163.66, + "grad_norm": 23.634429931640625, + "learning_rate": 4.544544544544545e-06, + "loss": 0.1404, + "step": 54500 + }, + { + "epoch": 164.0, + "eval_accuracy": 0.9817, + "eval_loss": 0.07676123827695847, + "eval_runtime": 13.7785, + "eval_samples_per_second": 725.767, + "eval_steps_per_second": 2.903, + "step": 54612 + }, + { + "epoch": 165.0, + "eval_accuracy": 0.9795, + "eval_loss": 0.07868321239948273, + "eval_runtime": 13.3337, + "eval_samples_per_second": 749.978, + "eval_steps_per_second": 3.0, + "step": 54945 + }, + { + "epoch": 165.17, + "grad_norm": 14.497030258178711, + "learning_rate": 4.494494494494495e-06, + "loss": 0.1404, + "step": 55000 + }, + { + "epoch": 166.0, + "eval_accuracy": 0.9806, + "eval_loss": 0.08142885565757751, + "eval_runtime": 13.0878, + "eval_samples_per_second": 764.068, + "eval_steps_per_second": 3.056, + "step": 55278 + }, + { + "epoch": 166.67, + "grad_norm": 5.504241943359375, + "learning_rate": 4.444444444444444e-06, + "loss": 0.1438, + "step": 55500 + }, + { + "epoch": 167.0, + "eval_accuracy": 0.9802, + "eval_loss": 0.08015668392181396, + "eval_runtime": 13.3375, + "eval_samples_per_second": 749.766, + "eval_steps_per_second": 2.999, + "step": 55611 + }, + { + "epoch": 168.0, + "eval_accuracy": 0.9807, + "eval_loss": 0.0773804783821106, + "eval_runtime": 13.1562, + "eval_samples_per_second": 760.1, + "eval_steps_per_second": 3.04, + "step": 55944 + }, + { + "epoch": 168.17, + "grad_norm": 10.65889835357666, + "learning_rate": 4.394394394394394e-06, + "loss": 0.1405, + "step": 56000 + }, + { + "epoch": 169.0, + "eval_accuracy": 0.9793, + "eval_loss": 0.07769276201725006, + "eval_runtime": 13.37, + "eval_samples_per_second": 747.945, + "eval_steps_per_second": 2.992, + "step": 56277 + }, + { + "epoch": 169.67, + "grad_norm": 9.663138389587402, + "learning_rate": 4.344344344344344e-06, + "loss": 0.1465, + "step": 56500 + }, + { + "epoch": 170.0, + "eval_accuracy": 0.9804, + "eval_loss": 0.07831669598817825, + "eval_runtime": 13.9555, + "eval_samples_per_second": 716.565, + "eval_steps_per_second": 2.866, + "step": 56610 + }, + { + "epoch": 171.0, + "eval_accuracy": 0.9799, + "eval_loss": 0.08174577355384827, + "eval_runtime": 13.3581, + "eval_samples_per_second": 748.612, + "eval_steps_per_second": 2.994, + "step": 56943 + }, + { + "epoch": 171.17, + "grad_norm": 11.15052604675293, + "learning_rate": 4.294294294294294e-06, + "loss": 0.1404, + "step": 57000 + }, + { + "epoch": 172.0, + "eval_accuracy": 0.9806, + "eval_loss": 0.0780324712395668, + "eval_runtime": 12.9812, + "eval_samples_per_second": 770.346, + "eval_steps_per_second": 3.081, + "step": 57276 + }, + { + "epoch": 172.67, + "grad_norm": 10.398097038269043, + "learning_rate": 4.2442442442442444e-06, + "loss": 0.1367, + "step": 57500 + }, + { + "epoch": 173.0, + "eval_accuracy": 0.9806, + "eval_loss": 0.07895645499229431, + "eval_runtime": 12.8468, + "eval_samples_per_second": 778.402, + "eval_steps_per_second": 3.114, + "step": 57609 + }, + { + "epoch": 174.0, + "eval_accuracy": 0.9816, + "eval_loss": 0.07868947833776474, + "eval_runtime": 13.3373, + "eval_samples_per_second": 749.779, + "eval_steps_per_second": 2.999, + "step": 57942 + }, + { + "epoch": 174.17, + "grad_norm": 8.292234420776367, + "learning_rate": 4.1941941941941945e-06, + "loss": 0.1399, + "step": 58000 + }, + { + "epoch": 175.0, + "eval_accuracy": 0.9801, + "eval_loss": 0.08106452971696854, + "eval_runtime": 12.854, + "eval_samples_per_second": 777.968, + "eval_steps_per_second": 3.112, + "step": 58275 + }, + { + "epoch": 175.68, + "grad_norm": 12.446533203125, + "learning_rate": 4.1441441441441446e-06, + "loss": 0.1418, + "step": 58500 + }, + { + "epoch": 176.0, + "eval_accuracy": 0.9809, + "eval_loss": 0.08040361106395721, + "eval_runtime": 13.1526, + "eval_samples_per_second": 760.308, + "eval_steps_per_second": 3.041, + "step": 58608 + }, + { + "epoch": 177.0, + "eval_accuracy": 0.9806, + "eval_loss": 0.07995961606502533, + "eval_runtime": 13.0397, + "eval_samples_per_second": 766.888, + "eval_steps_per_second": 3.068, + "step": 58941 + }, + { + "epoch": 177.18, + "grad_norm": 9.551538467407227, + "learning_rate": 4.094094094094095e-06, + "loss": 0.1381, + "step": 59000 + }, + { + "epoch": 178.0, + "eval_accuracy": 0.9814, + "eval_loss": 0.07857974618673325, + "eval_runtime": 13.3646, + "eval_samples_per_second": 748.245, + "eval_steps_per_second": 2.993, + "step": 59274 + }, + { + "epoch": 178.68, + "grad_norm": 7.961233615875244, + "learning_rate": 4.044044044044044e-06, + "loss": 0.1357, + "step": 59500 + }, + { + "epoch": 179.0, + "eval_accuracy": 0.9805, + "eval_loss": 0.0797557458281517, + "eval_runtime": 13.833, + "eval_samples_per_second": 722.907, + "eval_steps_per_second": 2.892, + "step": 59607 + }, + { + "epoch": 180.0, + "eval_accuracy": 0.9813, + "eval_loss": 0.07922037690877914, + "eval_runtime": 13.1611, + "eval_samples_per_second": 759.818, + "eval_steps_per_second": 3.039, + "step": 59940 + }, + { + "epoch": 180.18, + "grad_norm": 8.392486572265625, + "learning_rate": 3.993993993993994e-06, + "loss": 0.1465, + "step": 60000 + }, + { + "epoch": 181.0, + "eval_accuracy": 0.9809, + "eval_loss": 0.08021984249353409, + "eval_runtime": 12.7756, + "eval_samples_per_second": 782.741, + "eval_steps_per_second": 3.131, + "step": 60273 + }, + { + "epoch": 181.68, + "grad_norm": 5.668210506439209, + "learning_rate": 3.943943943943944e-06, + "loss": 0.1366, + "step": 60500 + }, + { + "epoch": 182.0, + "eval_accuracy": 0.9804, + "eval_loss": 0.07884296774864197, + "eval_runtime": 12.9767, + "eval_samples_per_second": 770.61, + "eval_steps_per_second": 3.082, + "step": 60606 + }, + { + "epoch": 183.0, + "eval_accuracy": 0.979, + "eval_loss": 0.0805293619632721, + "eval_runtime": 12.9332, + "eval_samples_per_second": 773.205, + "eval_steps_per_second": 3.093, + "step": 60939 + }, + { + "epoch": 183.18, + "grad_norm": 9.771552085876465, + "learning_rate": 3.893893893893894e-06, + "loss": 0.139, + "step": 61000 + }, + { + "epoch": 184.0, + "eval_accuracy": 0.9794, + "eval_loss": 0.0822456106543541, + "eval_runtime": 13.3118, + "eval_samples_per_second": 751.211, + "eval_steps_per_second": 3.005, + "step": 61272 + }, + { + "epoch": 184.68, + "grad_norm": 10.898391723632812, + "learning_rate": 3.843843843843844e-06, + "loss": 0.1381, + "step": 61500 + }, + { + "epoch": 185.0, + "eval_accuracy": 0.9807, + "eval_loss": 0.08079157024621964, + "eval_runtime": 12.8717, + "eval_samples_per_second": 776.899, + "eval_steps_per_second": 3.108, + "step": 61605 + }, + { + "epoch": 186.0, + "eval_accuracy": 0.9802, + "eval_loss": 0.08059785515069962, + "eval_runtime": 12.418, + "eval_samples_per_second": 805.284, + "eval_steps_per_second": 3.221, + "step": 61938 + }, + { + "epoch": 186.19, + "grad_norm": 6.1758246421813965, + "learning_rate": 3.793793793793794e-06, + "loss": 0.1367, + "step": 62000 + }, + { + "epoch": 187.0, + "eval_accuracy": 0.9803, + "eval_loss": 0.07853790372610092, + "eval_runtime": 12.9215, + "eval_samples_per_second": 773.902, + "eval_steps_per_second": 3.096, + "step": 62271 + }, + { + "epoch": 187.69, + "grad_norm": 9.155027389526367, + "learning_rate": 3.743743743743744e-06, + "loss": 0.1354, + "step": 62500 + }, + { + "epoch": 188.0, + "eval_accuracy": 0.9803, + "eval_loss": 0.0803978368639946, + "eval_runtime": 13.5157, + "eval_samples_per_second": 739.883, + "eval_steps_per_second": 2.96, + "step": 62604 + }, + { + "epoch": 189.0, + "eval_accuracy": 0.98, + "eval_loss": 0.07950347661972046, + "eval_runtime": 13.0185, + "eval_samples_per_second": 768.138, + "eval_steps_per_second": 3.073, + "step": 62937 + }, + { + "epoch": 189.19, + "grad_norm": 9.88645076751709, + "learning_rate": 3.693693693693694e-06, + "loss": 0.137, + "step": 63000 + }, + { + "epoch": 190.0, + "eval_accuracy": 0.9805, + "eval_loss": 0.07970842719078064, + "eval_runtime": 13.0486, + "eval_samples_per_second": 766.367, + "eval_steps_per_second": 3.065, + "step": 63270 + }, + { + "epoch": 190.69, + "grad_norm": 10.085098266601562, + "learning_rate": 3.643643643643644e-06, + "loss": 0.1351, + "step": 63500 + }, + { + "epoch": 191.0, + "eval_accuracy": 0.9803, + "eval_loss": 0.07862575352191925, + "eval_runtime": 13.7359, + "eval_samples_per_second": 728.019, + "eval_steps_per_second": 2.912, + "step": 63603 + }, + { + "epoch": 192.0, + "eval_accuracy": 0.9807, + "eval_loss": 0.07779725641012192, + "eval_runtime": 14.1749, + "eval_samples_per_second": 705.473, + "eval_steps_per_second": 2.822, + "step": 63936 + }, + { + "epoch": 192.19, + "grad_norm": 7.259002685546875, + "learning_rate": 3.593593593593594e-06, + "loss": 0.1345, + "step": 64000 + }, + { + "epoch": 193.0, + "eval_accuracy": 0.9812, + "eval_loss": 0.07995971292257309, + "eval_runtime": 13.3268, + "eval_samples_per_second": 750.366, + "eval_steps_per_second": 3.001, + "step": 64269 + }, + { + "epoch": 193.69, + "grad_norm": 6.42719030380249, + "learning_rate": 3.5435435435435437e-06, + "loss": 0.1377, + "step": 64500 + }, + { + "epoch": 194.0, + "eval_accuracy": 0.9799, + "eval_loss": 0.07895601540803909, + "eval_runtime": 12.9129, + "eval_samples_per_second": 774.417, + "eval_steps_per_second": 3.098, + "step": 64602 + }, + { + "epoch": 195.0, + "eval_accuracy": 0.98, + "eval_loss": 0.08155795186758041, + "eval_runtime": 13.7447, + "eval_samples_per_second": 727.555, + "eval_steps_per_second": 2.91, + "step": 64935 + }, + { + "epoch": 195.2, + "grad_norm": 7.303466320037842, + "learning_rate": 3.4934934934934938e-06, + "loss": 0.1339, + "step": 65000 + }, + { + "epoch": 196.0, + "eval_accuracy": 0.9811, + "eval_loss": 0.08134587854146957, + "eval_runtime": 12.87, + "eval_samples_per_second": 777.004, + "eval_steps_per_second": 3.108, + "step": 65268 + }, + { + "epoch": 196.7, + "grad_norm": 10.115856170654297, + "learning_rate": 3.443443443443444e-06, + "loss": 0.1338, + "step": 65500 + }, + { + "epoch": 197.0, + "eval_accuracy": 0.981, + "eval_loss": 0.07863133400678635, + "eval_runtime": 13.1588, + "eval_samples_per_second": 759.949, + "eval_steps_per_second": 3.04, + "step": 65601 + }, + { + "epoch": 198.0, + "eval_accuracy": 0.9805, + "eval_loss": 0.08128491789102554, + "eval_runtime": 12.3451, + "eval_samples_per_second": 810.038, + "eval_steps_per_second": 3.24, + "step": 65934 + }, + { + "epoch": 198.2, + "grad_norm": 9.01919174194336, + "learning_rate": 3.393393393393394e-06, + "loss": 0.1371, + "step": 66000 + }, + { + "epoch": 199.0, + "eval_accuracy": 0.9808, + "eval_loss": 0.08089832216501236, + "eval_runtime": 13.1128, + "eval_samples_per_second": 762.612, + "eval_steps_per_second": 3.05, + "step": 66267 + }, + { + "epoch": 199.7, + "grad_norm": 9.190634727478027, + "learning_rate": 3.3433433433433436e-06, + "loss": 0.1339, + "step": 66500 + }, + { + "epoch": 200.0, + "eval_accuracy": 0.9807, + "eval_loss": 0.07968232780694962, + "eval_runtime": 12.8919, + "eval_samples_per_second": 775.68, + "eval_steps_per_second": 3.103, + "step": 66600 + }, + { + "epoch": 201.0, + "eval_accuracy": 0.9808, + "eval_loss": 0.08057761192321777, + "eval_runtime": 12.9886, + "eval_samples_per_second": 769.904, + "eval_steps_per_second": 3.08, + "step": 66933 + }, + { + "epoch": 201.2, + "grad_norm": 9.490571022033691, + "learning_rate": 3.2932932932932936e-06, + "loss": 0.131, + "step": 67000 + }, + { + "epoch": 202.0, + "eval_accuracy": 0.98, + "eval_loss": 0.08165069669485092, + "eval_runtime": 13.9588, + "eval_samples_per_second": 716.394, + "eval_steps_per_second": 2.866, + "step": 67266 + }, + { + "epoch": 202.7, + "grad_norm": 8.564950942993164, + "learning_rate": 3.2432432432432437e-06, + "loss": 0.1365, + "step": 67500 + }, + { + "epoch": 203.0, + "eval_accuracy": 0.9801, + "eval_loss": 0.08228688687086105, + "eval_runtime": 12.9615, + "eval_samples_per_second": 771.513, + "eval_steps_per_second": 3.086, + "step": 67599 + }, + { + "epoch": 204.0, + "eval_accuracy": 0.9798, + "eval_loss": 0.08267272263765335, + "eval_runtime": 12.8976, + "eval_samples_per_second": 775.339, + "eval_steps_per_second": 3.101, + "step": 67932 + }, + { + "epoch": 204.2, + "grad_norm": 9.844771385192871, + "learning_rate": 3.1931931931931938e-06, + "loss": 0.1358, + "step": 68000 + }, + { + "epoch": 205.0, + "eval_accuracy": 0.9816, + "eval_loss": 0.0804433524608612, + "eval_runtime": 12.8434, + "eval_samples_per_second": 778.613, + "eval_steps_per_second": 3.114, + "step": 68265 + }, + { + "epoch": 205.71, + "grad_norm": 9.6033935546875, + "learning_rate": 3.1431431431431434e-06, + "loss": 0.132, + "step": 68500 + }, + { + "epoch": 206.0, + "eval_accuracy": 0.9802, + "eval_loss": 0.08253764361143112, + "eval_runtime": 13.4062, + "eval_samples_per_second": 745.922, + "eval_steps_per_second": 2.984, + "step": 68598 + }, + { + "epoch": 207.0, + "eval_accuracy": 0.981, + "eval_loss": 0.07984968274831772, + "eval_runtime": 13.6899, + "eval_samples_per_second": 730.467, + "eval_steps_per_second": 2.922, + "step": 68931 + }, + { + "epoch": 207.21, + "grad_norm": 7.0395355224609375, + "learning_rate": 3.0930930930930935e-06, + "loss": 0.1396, + "step": 69000 + }, + { + "epoch": 208.0, + "eval_accuracy": 0.9813, + "eval_loss": 0.08085375279188156, + "eval_runtime": 12.8706, + "eval_samples_per_second": 776.962, + "eval_steps_per_second": 3.108, + "step": 69264 + }, + { + "epoch": 208.71, + "grad_norm": 12.84909725189209, + "learning_rate": 3.0430430430430436e-06, + "loss": 0.1324, + "step": 69500 + }, + { + "epoch": 209.0, + "eval_accuracy": 0.9815, + "eval_loss": 0.07963848859071732, + "eval_runtime": 12.9764, + "eval_samples_per_second": 770.628, + "eval_steps_per_second": 3.083, + "step": 69597 + }, + { + "epoch": 210.0, + "eval_accuracy": 0.9807, + "eval_loss": 0.08001097291707993, + "eval_runtime": 13.4375, + "eval_samples_per_second": 744.185, + "eval_steps_per_second": 2.977, + "step": 69930 + }, + { + "epoch": 210.21, + "grad_norm": 8.406508445739746, + "learning_rate": 2.9929929929929936e-06, + "loss": 0.1324, + "step": 70000 + }, + { + "epoch": 211.0, + "eval_accuracy": 0.9809, + "eval_loss": 0.08123359829187393, + "eval_runtime": 13.1971, + "eval_samples_per_second": 757.742, + "eval_steps_per_second": 3.031, + "step": 70263 + }, + { + "epoch": 211.71, + "grad_norm": 4.204705715179443, + "learning_rate": 2.942942942942943e-06, + "loss": 0.1343, + "step": 70500 + }, + { + "epoch": 212.0, + "eval_accuracy": 0.9811, + "eval_loss": 0.08246932923793793, + "eval_runtime": 13.3417, + "eval_samples_per_second": 749.532, + "eval_steps_per_second": 2.998, + "step": 70596 + }, + { + "epoch": 213.0, + "eval_accuracy": 0.9811, + "eval_loss": 0.08172763139009476, + "eval_runtime": 12.9861, + "eval_samples_per_second": 770.053, + "eval_steps_per_second": 3.08, + "step": 70929 + }, + { + "epoch": 213.21, + "grad_norm": 8.177204132080078, + "learning_rate": 2.892892892892893e-06, + "loss": 0.1322, + "step": 71000 + }, + { + "epoch": 214.0, + "eval_accuracy": 0.9811, + "eval_loss": 0.08131828904151917, + "eval_runtime": 14.0986, + "eval_samples_per_second": 709.289, + "eval_steps_per_second": 2.837, + "step": 71262 + }, + { + "epoch": 214.71, + "grad_norm": 8.844195365905762, + "learning_rate": 2.842842842842843e-06, + "loss": 0.133, + "step": 71500 + }, + { + "epoch": 215.0, + "eval_accuracy": 0.9807, + "eval_loss": 0.0824679508805275, + "eval_runtime": 12.94, + "eval_samples_per_second": 772.8, + "eval_steps_per_second": 3.091, + "step": 71595 + }, + { + "epoch": 216.0, + "eval_accuracy": 0.9809, + "eval_loss": 0.0828867107629776, + "eval_runtime": 12.9965, + "eval_samples_per_second": 769.439, + "eval_steps_per_second": 3.078, + "step": 71928 + }, + { + "epoch": 216.22, + "grad_norm": 11.01076889038086, + "learning_rate": 2.7927927927927926e-06, + "loss": 0.1336, + "step": 72000 + }, + { + "epoch": 217.0, + "eval_accuracy": 0.9802, + "eval_loss": 0.08191470056772232, + "eval_runtime": 12.6388, + "eval_samples_per_second": 791.211, + "eval_steps_per_second": 3.165, + "step": 72261 + }, + { + "epoch": 217.72, + "grad_norm": 8.309555053710938, + "learning_rate": 2.7427427427427427e-06, + "loss": 0.1287, + "step": 72500 + }, + { + "epoch": 218.0, + "eval_accuracy": 0.9803, + "eval_loss": 0.08172294497489929, + "eval_runtime": 12.869, + "eval_samples_per_second": 777.063, + "eval_steps_per_second": 3.108, + "step": 72594 + }, + { + "epoch": 219.0, + "eval_accuracy": 0.9804, + "eval_loss": 0.08100100606679916, + "eval_runtime": 13.9577, + "eval_samples_per_second": 716.449, + "eval_steps_per_second": 2.866, + "step": 72927 + }, + { + "epoch": 219.22, + "grad_norm": 10.596402168273926, + "learning_rate": 2.6926926926926928e-06, + "loss": 0.1322, + "step": 73000 + }, + { + "epoch": 220.0, + "eval_accuracy": 0.98, + "eval_loss": 0.08346739411354065, + "eval_runtime": 12.8881, + "eval_samples_per_second": 775.91, + "eval_steps_per_second": 3.104, + "step": 73260 + }, + { + "epoch": 220.72, + "grad_norm": 8.293975830078125, + "learning_rate": 2.642642642642643e-06, + "loss": 0.1287, + "step": 73500 + }, + { + "epoch": 221.0, + "eval_accuracy": 0.9798, + "eval_loss": 0.08478812873363495, + "eval_runtime": 12.482, + "eval_samples_per_second": 801.151, + "eval_steps_per_second": 3.205, + "step": 73593 + }, + { + "epoch": 222.0, + "eval_accuracy": 0.9803, + "eval_loss": 0.08156371861696243, + "eval_runtime": 12.9596, + "eval_samples_per_second": 771.628, + "eval_steps_per_second": 3.087, + "step": 73926 + }, + { + "epoch": 222.22, + "grad_norm": 9.707475662231445, + "learning_rate": 2.5925925925925925e-06, + "loss": 0.1317, + "step": 74000 + }, + { + "epoch": 223.0, + "eval_accuracy": 0.9803, + "eval_loss": 0.08239776641130447, + "eval_runtime": 13.8203, + "eval_samples_per_second": 723.571, + "eval_steps_per_second": 2.894, + "step": 74259 + }, + { + "epoch": 223.72, + "grad_norm": 5.2577996253967285, + "learning_rate": 2.5425425425425426e-06, + "loss": 0.1308, + "step": 74500 + }, + { + "epoch": 224.0, + "eval_accuracy": 0.9811, + "eval_loss": 0.08223745971918106, + "eval_runtime": 13.4783, + "eval_samples_per_second": 741.934, + "eval_steps_per_second": 2.968, + "step": 74592 + }, + { + "epoch": 225.0, + "eval_accuracy": 0.9807, + "eval_loss": 0.0822429209947586, + "eval_runtime": 13.2583, + "eval_samples_per_second": 754.244, + "eval_steps_per_second": 3.017, + "step": 74925 + }, + { + "epoch": 225.23, + "grad_norm": 6.952250957489014, + "learning_rate": 2.4924924924924926e-06, + "loss": 0.1247, + "step": 75000 + }, + { + "epoch": 226.0, + "eval_accuracy": 0.9806, + "eval_loss": 0.08117574453353882, + "eval_runtime": 13.5159, + "eval_samples_per_second": 739.872, + "eval_steps_per_second": 2.959, + "step": 75258 + }, + { + "epoch": 226.73, + "grad_norm": 17.568580627441406, + "learning_rate": 2.4424424424424427e-06, + "loss": 0.129, + "step": 75500 + }, + { + "epoch": 227.0, + "eval_accuracy": 0.9805, + "eval_loss": 0.08187758177518845, + "eval_runtime": 12.7892, + "eval_samples_per_second": 781.912, + "eval_steps_per_second": 3.128, + "step": 75591 + }, + { + "epoch": 228.0, + "eval_accuracy": 0.981, + "eval_loss": 0.08235606551170349, + "eval_runtime": 12.9107, + "eval_samples_per_second": 774.55, + "eval_steps_per_second": 3.098, + "step": 75924 + }, + { + "epoch": 228.23, + "grad_norm": 13.310216903686523, + "learning_rate": 2.3923923923923923e-06, + "loss": 0.1315, + "step": 76000 + }, + { + "epoch": 229.0, + "eval_accuracy": 0.9803, + "eval_loss": 0.08291840553283691, + "eval_runtime": 13.4267, + "eval_samples_per_second": 744.787, + "eval_steps_per_second": 2.979, + "step": 76257 + }, + { + "epoch": 229.73, + "grad_norm": 7.18035888671875, + "learning_rate": 2.3423423423423424e-06, + "loss": 0.1243, + "step": 76500 + }, + { + "epoch": 230.0, + "eval_accuracy": 0.9808, + "eval_loss": 0.08134060353040695, + "eval_runtime": 12.9054, + "eval_samples_per_second": 774.871, + "eval_steps_per_second": 3.099, + "step": 76590 + }, + { + "epoch": 231.0, + "eval_accuracy": 0.9808, + "eval_loss": 0.08125565946102142, + "eval_runtime": 13.8266, + "eval_samples_per_second": 723.246, + "eval_steps_per_second": 2.893, + "step": 76923 + }, + { + "epoch": 231.23, + "grad_norm": 11.132826805114746, + "learning_rate": 2.2922922922922925e-06, + "loss": 0.1244, + "step": 77000 + }, + { + "epoch": 232.0, + "eval_accuracy": 0.981, + "eval_loss": 0.08288297057151794, + "eval_runtime": 13.8545, + "eval_samples_per_second": 721.786, + "eval_steps_per_second": 2.887, + "step": 77256 + }, + { + "epoch": 232.73, + "grad_norm": 7.415234565734863, + "learning_rate": 2.2422422422422426e-06, + "loss": 0.1286, + "step": 77500 + }, + { + "epoch": 233.0, + "eval_accuracy": 0.9801, + "eval_loss": 0.083954356610775, + "eval_runtime": 13.1117, + "eval_samples_per_second": 762.679, + "eval_steps_per_second": 3.051, + "step": 77589 + }, + { + "epoch": 234.0, + "eval_accuracy": 0.9805, + "eval_loss": 0.08230035752058029, + "eval_runtime": 13.3702, + "eval_samples_per_second": 747.932, + "eval_steps_per_second": 2.992, + "step": 77922 + }, + { + "epoch": 234.23, + "grad_norm": 7.36590576171875, + "learning_rate": 2.192192192192192e-06, + "loss": 0.1261, + "step": 78000 + }, + { + "epoch": 235.0, + "eval_accuracy": 0.9811, + "eval_loss": 0.08295118808746338, + "eval_runtime": 13.748, + "eval_samples_per_second": 727.381, + "eval_steps_per_second": 2.91, + "step": 78255 + }, + { + "epoch": 235.74, + "grad_norm": 10.516325950622559, + "learning_rate": 2.1421421421421423e-06, + "loss": 0.1238, + "step": 78500 + }, + { + "epoch": 236.0, + "eval_accuracy": 0.9812, + "eval_loss": 0.08197174966335297, + "eval_runtime": 12.9286, + "eval_samples_per_second": 773.481, + "eval_steps_per_second": 3.094, + "step": 78588 + }, + { + "epoch": 237.0, + "eval_accuracy": 0.9807, + "eval_loss": 0.08315034210681915, + "eval_runtime": 13.634, + "eval_samples_per_second": 733.458, + "eval_steps_per_second": 2.934, + "step": 78921 + }, + { + "epoch": 237.24, + "grad_norm": 5.020528316497803, + "learning_rate": 2.0920920920920923e-06, + "loss": 0.1296, + "step": 79000 + }, + { + "epoch": 238.0, + "eval_accuracy": 0.9809, + "eval_loss": 0.08168121427297592, + "eval_runtime": 14.4842, + "eval_samples_per_second": 690.406, + "eval_steps_per_second": 2.762, + "step": 79254 + }, + { + "epoch": 238.74, + "grad_norm": 11.957234382629395, + "learning_rate": 2.0420420420420424e-06, + "loss": 0.1278, + "step": 79500 + }, + { + "epoch": 239.0, + "eval_accuracy": 0.981, + "eval_loss": 0.08146882057189941, + "eval_runtime": 14.503, + "eval_samples_per_second": 689.511, + "eval_steps_per_second": 2.758, + "step": 79587 + }, + { + "epoch": 240.0, + "eval_accuracy": 0.9802, + "eval_loss": 0.08267929404973984, + "eval_runtime": 12.9081, + "eval_samples_per_second": 774.71, + "eval_steps_per_second": 3.099, + "step": 79920 + }, + { + "epoch": 240.24, + "grad_norm": 10.550077438354492, + "learning_rate": 1.991991991991992e-06, + "loss": 0.1246, + "step": 80000 + }, + { + "epoch": 241.0, + "eval_accuracy": 0.9805, + "eval_loss": 0.08258900791406631, + "eval_runtime": 13.3618, + "eval_samples_per_second": 748.404, + "eval_steps_per_second": 2.994, + "step": 80253 + }, + { + "epoch": 241.74, + "grad_norm": 14.927352905273438, + "learning_rate": 1.941941941941942e-06, + "loss": 0.128, + "step": 80500 + }, + { + "epoch": 242.0, + "eval_accuracy": 0.9797, + "eval_loss": 0.08207195997238159, + "eval_runtime": 13.4168, + "eval_samples_per_second": 745.333, + "eval_steps_per_second": 2.981, + "step": 80586 + }, + { + "epoch": 243.0, + "eval_accuracy": 0.981, + "eval_loss": 0.08075813204050064, + "eval_runtime": 12.9166, + "eval_samples_per_second": 774.198, + "eval_steps_per_second": 3.097, + "step": 80919 + }, + { + "epoch": 243.24, + "grad_norm": 10.435842514038086, + "learning_rate": 1.8918918918918922e-06, + "loss": 0.1274, + "step": 81000 + }, + { + "epoch": 244.0, + "eval_accuracy": 0.9806, + "eval_loss": 0.0817038044333458, + "eval_runtime": 12.9068, + "eval_samples_per_second": 774.784, + "eval_steps_per_second": 3.099, + "step": 81252 + }, + { + "epoch": 244.74, + "grad_norm": 6.686298370361328, + "learning_rate": 1.841841841841842e-06, + "loss": 0.1232, + "step": 81500 + }, + { + "epoch": 245.0, + "eval_accuracy": 0.9805, + "eval_loss": 0.0812101811170578, + "eval_runtime": 13.3736, + "eval_samples_per_second": 747.741, + "eval_steps_per_second": 2.991, + "step": 81585 + }, + { + "epoch": 246.0, + "eval_accuracy": 0.9809, + "eval_loss": 0.08127359300851822, + "eval_runtime": 13.8034, + "eval_samples_per_second": 724.46, + "eval_steps_per_second": 2.898, + "step": 81918 + }, + { + "epoch": 246.25, + "grad_norm": 9.9036865234375, + "learning_rate": 1.7917917917917917e-06, + "loss": 0.1281, + "step": 82000 + }, + { + "epoch": 247.0, + "eval_accuracy": 0.9801, + "eval_loss": 0.0803731232881546, + "eval_runtime": 13.6609, + "eval_samples_per_second": 732.019, + "eval_steps_per_second": 2.928, + "step": 82251 + }, + { + "epoch": 247.75, + "grad_norm": 9.58124828338623, + "learning_rate": 1.7417417417417418e-06, + "loss": 0.1236, + "step": 82500 + }, + { + "epoch": 248.0, + "eval_accuracy": 0.9807, + "eval_loss": 0.08054234832525253, + "eval_runtime": 12.9985, + "eval_samples_per_second": 769.319, + "eval_steps_per_second": 3.077, + "step": 82584 + }, + { + "epoch": 249.0, + "eval_accuracy": 0.9807, + "eval_loss": 0.08253397792577744, + "eval_runtime": 12.7246, + "eval_samples_per_second": 785.882, + "eval_steps_per_second": 3.144, + "step": 82917 + }, + { + "epoch": 249.25, + "grad_norm": 11.608097076416016, + "learning_rate": 1.6916916916916916e-06, + "loss": 0.1223, + "step": 83000 + }, + { + "epoch": 250.0, + "eval_accuracy": 0.9804, + "eval_loss": 0.08115767687559128, + "eval_runtime": 13.6973, + "eval_samples_per_second": 730.07, + "eval_steps_per_second": 2.92, + "step": 83250 + }, + { + "epoch": 250.75, + "grad_norm": 7.931227207183838, + "learning_rate": 1.6416416416416417e-06, + "loss": 0.1278, + "step": 83500 + }, + { + "epoch": 251.0, + "eval_accuracy": 0.9802, + "eval_loss": 0.08087089657783508, + "eval_runtime": 12.8657, + "eval_samples_per_second": 777.263, + "eval_steps_per_second": 3.109, + "step": 83583 + }, + { + "epoch": 252.0, + "eval_accuracy": 0.9818, + "eval_loss": 0.07839205116033554, + "eval_runtime": 12.9826, + "eval_samples_per_second": 770.262, + "eval_steps_per_second": 3.081, + "step": 83916 + }, + { + "epoch": 252.25, + "grad_norm": 6.618145942687988, + "learning_rate": 1.5915915915915916e-06, + "loss": 0.1238, + "step": 84000 + }, + { + "epoch": 253.0, + "eval_accuracy": 0.9808, + "eval_loss": 0.07928815484046936, + "eval_runtime": 13.4795, + "eval_samples_per_second": 741.867, + "eval_steps_per_second": 2.967, + "step": 84249 + }, + { + "epoch": 253.75, + "grad_norm": 6.5788397789001465, + "learning_rate": 1.5415415415415416e-06, + "loss": 0.1259, + "step": 84500 + }, + { + "epoch": 254.0, + "eval_accuracy": 0.9814, + "eval_loss": 0.08129309117794037, + "eval_runtime": 12.8787, + "eval_samples_per_second": 776.478, + "eval_steps_per_second": 3.106, + "step": 84582 + }, + { + "epoch": 255.0, + "eval_accuracy": 0.981, + "eval_loss": 0.08033791929483414, + "eval_runtime": 12.7033, + "eval_samples_per_second": 787.197, + "eval_steps_per_second": 3.149, + "step": 84915 + }, + { + "epoch": 255.26, + "grad_norm": 8.367218017578125, + "learning_rate": 1.4914914914914915e-06, + "loss": 0.1261, + "step": 85000 + }, + { + "epoch": 256.0, + "eval_accuracy": 0.981, + "eval_loss": 0.08045142143964767, + "eval_runtime": 12.9511, + "eval_samples_per_second": 772.133, + "eval_steps_per_second": 3.089, + "step": 85248 + }, + { + "epoch": 256.76, + "grad_norm": 17.39365005493164, + "learning_rate": 1.4414414414414416e-06, + "loss": 0.1312, + "step": 85500 + }, + { + "epoch": 257.0, + "eval_accuracy": 0.9805, + "eval_loss": 0.08164441585540771, + "eval_runtime": 13.3735, + "eval_samples_per_second": 747.745, + "eval_steps_per_second": 2.991, + "step": 85581 + }, + { + "epoch": 258.0, + "eval_accuracy": 0.9807, + "eval_loss": 0.08030729740858078, + "eval_runtime": 12.9143, + "eval_samples_per_second": 774.336, + "eval_steps_per_second": 3.097, + "step": 85914 + }, + { + "epoch": 258.26, + "grad_norm": 12.668910026550293, + "learning_rate": 1.3913913913913914e-06, + "loss": 0.1237, + "step": 86000 + }, + { + "epoch": 259.0, + "eval_accuracy": 0.9804, + "eval_loss": 0.07897236198186874, + "eval_runtime": 13.3964, + "eval_samples_per_second": 746.469, + "eval_steps_per_second": 2.986, + "step": 86247 + }, + { + "epoch": 259.76, + "grad_norm": 3.696176767349243, + "learning_rate": 1.3413413413413415e-06, + "loss": 0.1234, + "step": 86500 + }, + { + "epoch": 260.0, + "eval_accuracy": 0.9803, + "eval_loss": 0.07928313314914703, + "eval_runtime": 13.8621, + "eval_samples_per_second": 721.391, + "eval_steps_per_second": 2.886, + "step": 86580 + }, + { + "epoch": 261.0, + "eval_accuracy": 0.9806, + "eval_loss": 0.07920601218938828, + "eval_runtime": 12.907, + "eval_samples_per_second": 774.775, + "eval_steps_per_second": 3.099, + "step": 86913 + }, + { + "epoch": 261.26, + "grad_norm": 11.28502082824707, + "learning_rate": 1.2912912912912913e-06, + "loss": 0.1237, + "step": 87000 + }, + { + "epoch": 262.0, + "eval_accuracy": 0.9806, + "eval_loss": 0.08003947883844376, + "eval_runtime": 13.8177, + "eval_samples_per_second": 723.709, + "eval_steps_per_second": 2.895, + "step": 87246 + }, + { + "epoch": 262.76, + "grad_norm": 13.543560981750488, + "learning_rate": 1.2412412412412414e-06, + "loss": 0.1257, + "step": 87500 + }, + { + "epoch": 263.0, + "eval_accuracy": 0.9802, + "eval_loss": 0.08235891908407211, + "eval_runtime": 13.4574, + "eval_samples_per_second": 743.088, + "eval_steps_per_second": 2.972, + "step": 87579 + }, + { + "epoch": 264.0, + "eval_accuracy": 0.9807, + "eval_loss": 0.08182436227798462, + "eval_runtime": 12.9778, + "eval_samples_per_second": 770.546, + "eval_steps_per_second": 3.082, + "step": 87912 + }, + { + "epoch": 264.26, + "grad_norm": 11.065189361572266, + "learning_rate": 1.1911911911911913e-06, + "loss": 0.1219, + "step": 88000 + }, + { + "epoch": 265.0, + "eval_accuracy": 0.9808, + "eval_loss": 0.08205542713403702, + "eval_runtime": 13.6001, + "eval_samples_per_second": 735.288, + "eval_steps_per_second": 2.941, + "step": 88245 + }, + { + "epoch": 265.77, + "grad_norm": 9.291784286499023, + "learning_rate": 1.1411411411411411e-06, + "loss": 0.1298, + "step": 88500 + }, + { + "epoch": 266.0, + "eval_accuracy": 0.9805, + "eval_loss": 0.08165726810693741, + "eval_runtime": 13.3896, + "eval_samples_per_second": 746.85, + "eval_steps_per_second": 2.987, + "step": 88578 + }, + { + "epoch": 267.0, + "eval_accuracy": 0.9805, + "eval_loss": 0.08162441104650497, + "eval_runtime": 14.0305, + "eval_samples_per_second": 712.733, + "eval_steps_per_second": 2.851, + "step": 88911 + }, + { + "epoch": 267.27, + "grad_norm": 17.33576202392578, + "learning_rate": 1.0910910910910912e-06, + "loss": 0.1222, + "step": 89000 + }, + { + "epoch": 268.0, + "eval_accuracy": 0.9806, + "eval_loss": 0.08136063069105148, + "eval_runtime": 12.8095, + "eval_samples_per_second": 780.671, + "eval_steps_per_second": 3.123, + "step": 89244 + }, + { + "epoch": 268.77, + "grad_norm": 11.170260429382324, + "learning_rate": 1.041041041041041e-06, + "loss": 0.1268, + "step": 89500 + }, + { + "epoch": 269.0, + "eval_accuracy": 0.9803, + "eval_loss": 0.08162767440080643, + "eval_runtime": 13.5821, + "eval_samples_per_second": 736.263, + "eval_steps_per_second": 2.945, + "step": 89577 + }, + { + "epoch": 270.0, + "eval_accuracy": 0.981, + "eval_loss": 0.08254320919513702, + "eval_runtime": 12.9419, + "eval_samples_per_second": 772.681, + "eval_steps_per_second": 3.091, + "step": 89910 + }, + { + "epoch": 270.27, + "grad_norm": 10.08292007446289, + "learning_rate": 9.909909909909911e-07, + "loss": 0.1239, + "step": 90000 + }, + { + "epoch": 271.0, + "eval_accuracy": 0.9802, + "eval_loss": 0.08088234812021255, + "eval_runtime": 12.9857, + "eval_samples_per_second": 770.075, + "eval_steps_per_second": 3.08, + "step": 90243 + }, + { + "epoch": 271.77, + "grad_norm": 7.639751434326172, + "learning_rate": 9.409409409409411e-07, + "loss": 0.1277, + "step": 90500 + }, + { + "epoch": 272.0, + "eval_accuracy": 0.9804, + "eval_loss": 0.0805734246969223, + "eval_runtime": 12.6096, + "eval_samples_per_second": 793.045, + "eval_steps_per_second": 3.172, + "step": 90576 + }, + { + "epoch": 273.0, + "eval_accuracy": 0.98, + "eval_loss": 0.08124550431966782, + "eval_runtime": 13.1278, + "eval_samples_per_second": 761.741, + "eval_steps_per_second": 3.047, + "step": 90909 + }, + { + "epoch": 273.27, + "grad_norm": 7.800063133239746, + "learning_rate": 8.90890890890891e-07, + "loss": 0.1235, + "step": 91000 + }, + { + "epoch": 274.0, + "eval_accuracy": 0.9807, + "eval_loss": 0.08137263357639313, + "eval_runtime": 13.3744, + "eval_samples_per_second": 747.7, + "eval_steps_per_second": 2.991, + "step": 91242 + }, + { + "epoch": 274.77, + "grad_norm": 13.224382400512695, + "learning_rate": 8.40840840840841e-07, + "loss": 0.1261, + "step": 91500 + }, + { + "epoch": 275.0, + "eval_accuracy": 0.9801, + "eval_loss": 0.08086758852005005, + "eval_runtime": 12.9048, + "eval_samples_per_second": 774.907, + "eval_steps_per_second": 3.1, + "step": 91575 + }, + { + "epoch": 276.0, + "eval_accuracy": 0.9806, + "eval_loss": 0.080258309841156, + "eval_runtime": 14.2222, + "eval_samples_per_second": 703.127, + "eval_steps_per_second": 2.813, + "step": 91908 + }, + { + "epoch": 276.28, + "grad_norm": 8.529864311218262, + "learning_rate": 7.907907907907908e-07, + "loss": 0.1219, + "step": 92000 + }, + { + "epoch": 277.0, + "eval_accuracy": 0.9803, + "eval_loss": 0.08069344609975815, + "eval_runtime": 13.3763, + "eval_samples_per_second": 747.589, + "eval_steps_per_second": 2.99, + "step": 92241 + }, + { + "epoch": 277.78, + "grad_norm": 5.7626051902771, + "learning_rate": 7.407407407407407e-07, + "loss": 0.1235, + "step": 92500 + }, + { + "epoch": 278.0, + "eval_accuracy": 0.9812, + "eval_loss": 0.0805598720908165, + "eval_runtime": 13.2417, + "eval_samples_per_second": 755.192, + "eval_steps_per_second": 3.021, + "step": 92574 + }, + { + "epoch": 279.0, + "eval_accuracy": 0.9807, + "eval_loss": 0.07991771399974823, + "eval_runtime": 12.9989, + "eval_samples_per_second": 769.296, + "eval_steps_per_second": 3.077, + "step": 92907 + }, + { + "epoch": 279.28, + "grad_norm": 12.886475563049316, + "learning_rate": 6.906906906906907e-07, + "loss": 0.1232, + "step": 93000 + }, + { + "epoch": 280.0, + "eval_accuracy": 0.9805, + "eval_loss": 0.08009103685617447, + "eval_runtime": 14.1951, + "eval_samples_per_second": 704.469, + "eval_steps_per_second": 2.818, + "step": 93240 + }, + { + "epoch": 280.78, + "grad_norm": 13.245797157287598, + "learning_rate": 6.406406406406407e-07, + "loss": 0.1236, + "step": 93500 + }, + { + "epoch": 281.0, + "eval_accuracy": 0.9812, + "eval_loss": 0.08077774941921234, + "eval_runtime": 13.9349, + "eval_samples_per_second": 717.624, + "eval_steps_per_second": 2.87, + "step": 93573 + }, + { + "epoch": 282.0, + "eval_accuracy": 0.9807, + "eval_loss": 0.08111685514450073, + "eval_runtime": 13.0495, + "eval_samples_per_second": 766.313, + "eval_steps_per_second": 3.065, + "step": 93906 + }, + { + "epoch": 282.28, + "grad_norm": 6.8997673988342285, + "learning_rate": 5.905905905905906e-07, + "loss": 0.1195, + "step": 94000 + }, + { + "epoch": 283.0, + "eval_accuracy": 0.9804, + "eval_loss": 0.08137265592813492, + "eval_runtime": 13.163, + "eval_samples_per_second": 759.705, + "eval_steps_per_second": 3.039, + "step": 94239 + }, + { + "epoch": 283.78, + "grad_norm": 12.197209358215332, + "learning_rate": 5.405405405405406e-07, + "loss": 0.1191, + "step": 94500 + }, + { + "epoch": 284.0, + "eval_accuracy": 0.9804, + "eval_loss": 0.08120004087686539, + "eval_runtime": 12.9217, + "eval_samples_per_second": 773.893, + "eval_steps_per_second": 3.096, + "step": 94572 + }, + { + "epoch": 285.0, + "eval_accuracy": 0.9805, + "eval_loss": 0.08181598037481308, + "eval_runtime": 12.828, + "eval_samples_per_second": 779.547, + "eval_steps_per_second": 3.118, + "step": 94905 + }, + { + "epoch": 285.29, + "grad_norm": 6.001578330993652, + "learning_rate": 4.904904904904905e-07, + "loss": 0.1205, + "step": 95000 + }, + { + "epoch": 286.0, + "eval_accuracy": 0.9807, + "eval_loss": 0.08141326904296875, + "eval_runtime": 13.7647, + "eval_samples_per_second": 726.495, + "eval_steps_per_second": 2.906, + "step": 95238 + }, + { + "epoch": 286.79, + "grad_norm": 9.633207321166992, + "learning_rate": 4.4044044044044046e-07, + "loss": 0.1203, + "step": 95500 + }, + { + "epoch": 287.0, + "eval_accuracy": 0.9808, + "eval_loss": 0.08182702958583832, + "eval_runtime": 14.1767, + "eval_samples_per_second": 705.381, + "eval_steps_per_second": 2.822, + "step": 95571 + }, + { + "epoch": 288.0, + "eval_accuracy": 0.9806, + "eval_loss": 0.08031768351793289, + "eval_runtime": 14.019, + "eval_samples_per_second": 713.316, + "eval_steps_per_second": 2.853, + "step": 95904 + }, + { + "epoch": 288.29, + "grad_norm": 9.451753616333008, + "learning_rate": 3.903903903903904e-07, + "loss": 0.1197, + "step": 96000 + }, + { + "epoch": 289.0, + "eval_accuracy": 0.9812, + "eval_loss": 0.0809708833694458, + "eval_runtime": 13.7936, + "eval_samples_per_second": 724.975, + "eval_steps_per_second": 2.9, + "step": 96237 + }, + { + "epoch": 289.79, + "grad_norm": 10.313632011413574, + "learning_rate": 3.403403403403404e-07, + "loss": 0.1233, + "step": 96500 + }, + { + "epoch": 290.0, + "eval_accuracy": 0.9811, + "eval_loss": 0.08130063861608505, + "eval_runtime": 13.4821, + "eval_samples_per_second": 741.722, + "eval_steps_per_second": 2.967, + "step": 96570 + }, + { + "epoch": 291.0, + "eval_accuracy": 0.9813, + "eval_loss": 0.08096129447221756, + "eval_runtime": 13.9986, + "eval_samples_per_second": 714.359, + "eval_steps_per_second": 2.857, + "step": 96903 + }, + { + "epoch": 291.29, + "grad_norm": 6.7220892906188965, + "learning_rate": 2.9029029029029035e-07, + "loss": 0.12, + "step": 97000 + }, + { + "epoch": 292.0, + "eval_accuracy": 0.9813, + "eval_loss": 0.08056668192148209, + "eval_runtime": 13.2921, + "eval_samples_per_second": 752.329, + "eval_steps_per_second": 3.009, + "step": 97236 + }, + { + "epoch": 292.79, + "grad_norm": 7.212859630584717, + "learning_rate": 2.4024024024024026e-07, + "loss": 0.1219, + "step": 97500 + }, + { + "epoch": 293.0, + "eval_accuracy": 0.9816, + "eval_loss": 0.08098697662353516, + "eval_runtime": 13.5812, + "eval_samples_per_second": 736.31, + "eval_steps_per_second": 2.945, + "step": 97569 + }, + { + "epoch": 294.0, + "eval_accuracy": 0.9815, + "eval_loss": 0.08067005127668381, + "eval_runtime": 12.9034, + "eval_samples_per_second": 774.988, + "eval_steps_per_second": 3.1, + "step": 97902 + }, + { + "epoch": 294.29, + "grad_norm": 7.5087409019470215, + "learning_rate": 1.9019019019019022e-07, + "loss": 0.1202, + "step": 98000 + }, + { + "epoch": 295.0, + "eval_accuracy": 0.9813, + "eval_loss": 0.08077917248010635, + "eval_runtime": 13.4699, + "eval_samples_per_second": 742.397, + "eval_steps_per_second": 2.97, + "step": 98235 + }, + { + "epoch": 295.8, + "grad_norm": 7.660182952880859, + "learning_rate": 1.4014014014014016e-07, + "loss": 0.1228, + "step": 98500 + }, + { + "epoch": 296.0, + "eval_accuracy": 0.9815, + "eval_loss": 0.0807722955942154, + "eval_runtime": 13.0168, + "eval_samples_per_second": 768.237, + "eval_steps_per_second": 3.073, + "step": 98568 + }, + { + "epoch": 297.0, + "eval_accuracy": 0.9813, + "eval_loss": 0.08067157864570618, + "eval_runtime": 13.4303, + "eval_samples_per_second": 744.586, + "eval_steps_per_second": 2.978, + "step": 98901 + }, + { + "epoch": 297.3, + "grad_norm": 10.4266357421875, + "learning_rate": 9.00900900900901e-08, + "loss": 0.1212, + "step": 99000 + }, + { + "epoch": 298.0, + "eval_accuracy": 0.9812, + "eval_loss": 0.08074088394641876, + "eval_runtime": 12.8481, + "eval_samples_per_second": 778.327, + "eval_steps_per_second": 3.113, + "step": 99234 + }, + { + "epoch": 298.8, + "grad_norm": 10.557640075683594, + "learning_rate": 4.004004004004004e-08, + "loss": 0.1214, + "step": 99500 + }, + { + "epoch": 299.0, + "eval_accuracy": 0.9812, + "eval_loss": 0.0807051733136177, + "eval_runtime": 13.1178, + "eval_samples_per_second": 762.323, + "eval_steps_per_second": 3.049, + "step": 99567 + }, + { + "epoch": 300.0, + "eval_accuracy": 0.981, + "eval_loss": 0.08068788051605225, + "eval_runtime": 12.9887, + "eval_samples_per_second": 769.902, + "eval_steps_per_second": 3.08, + "step": 99900 + }, + { + "epoch": 300.0, + "step": 99900, + "total_flos": 3.1698470226124734e+20, + "train_loss": 0.17093151241451413, + "train_runtime": 47820.897, + "train_samples_per_second": 266.62, + "train_steps_per_second": 2.089 + } + ], + "logging_steps": 500, + "max_steps": 99900, + "num_input_tokens_seen": 0, + "num_train_epochs": 300, + "save_steps": 500, + "total_flos": 3.1698470226124734e+20, + "train_batch_size": 128, + "trial_name": null, + "trial_params": null +}