{ "best_metric": 0.9818, "best_model_checkpoint": "../../checkpoint/cifar10/swin-tiny/checkpoint-38295", "epoch": 300.0, "eval_steps": 500, "global_step": 99900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "eval_accuracy": 0.9223, "eval_loss": 0.2584497928619385, "eval_runtime": 21.3622, "eval_samples_per_second": 468.117, "eval_steps_per_second": 1.872, "step": 333 }, { "epoch": 1.5, "grad_norm": 14.220479011535645, "learning_rate": 9.949949949949951e-06, "loss": 0.9076, "step": 500 }, { "epoch": 2.0, "eval_accuracy": 0.945, "eval_loss": 0.1637052595615387, "eval_runtime": 13.6699, "eval_samples_per_second": 731.533, "eval_steps_per_second": 2.926, "step": 666 }, { "epoch": 3.0, "eval_accuracy": 0.9553, "eval_loss": 0.1344435065984726, "eval_runtime": 13.1721, "eval_samples_per_second": 759.181, "eval_steps_per_second": 3.037, "step": 999 }, { "epoch": 3.0, "grad_norm": 9.328938484191895, "learning_rate": 9.899899899899901e-06, "loss": 0.4797, "step": 1000 }, { "epoch": 4.0, "eval_accuracy": 0.9604, "eval_loss": 0.1205841451883316, "eval_runtime": 12.6584, "eval_samples_per_second": 789.992, "eval_steps_per_second": 3.16, "step": 1332 }, { "epoch": 4.5, "grad_norm": 14.11563777923584, "learning_rate": 9.849849849849851e-06, "loss": 0.4193, "step": 1500 }, { "epoch": 5.0, "eval_accuracy": 0.9635, "eval_loss": 0.11088060587644577, "eval_runtime": 12.7891, "eval_samples_per_second": 781.918, "eval_steps_per_second": 3.128, "step": 1665 }, { "epoch": 6.0, "eval_accuracy": 0.9661, "eval_loss": 0.10564317554235458, "eval_runtime": 12.9686, "eval_samples_per_second": 771.094, "eval_steps_per_second": 3.084, "step": 1998 }, { "epoch": 6.01, "grad_norm": 12.565740585327148, "learning_rate": 9.799799799799801e-06, "loss": 0.3846, "step": 2000 }, { "epoch": 7.0, "eval_accuracy": 0.9688, "eval_loss": 0.09508195519447327, "eval_runtime": 13.2698, "eval_samples_per_second": 753.592, "eval_steps_per_second": 3.014, "step": 2331 }, { "epoch": 7.51, "grad_norm": 9.896069526672363, "learning_rate": 9.749749749749751e-06, "loss": 0.3572, "step": 2500 }, { "epoch": 8.0, "eval_accuracy": 0.9689, "eval_loss": 0.09568808227777481, "eval_runtime": 13.6448, "eval_samples_per_second": 732.879, "eval_steps_per_second": 2.932, "step": 2664 }, { "epoch": 9.0, "eval_accuracy": 0.9693, "eval_loss": 0.09088099747896194, "eval_runtime": 13.9779, "eval_samples_per_second": 715.417, "eval_steps_per_second": 2.862, "step": 2997 }, { "epoch": 9.01, "grad_norm": 9.739038467407227, "learning_rate": 9.699699699699701e-06, "loss": 0.3409, "step": 3000 }, { "epoch": 10.0, "eval_accuracy": 0.971, "eval_loss": 0.0861617922782898, "eval_runtime": 13.5874, "eval_samples_per_second": 735.979, "eval_steps_per_second": 2.944, "step": 3330 }, { "epoch": 10.51, "grad_norm": 7.383803367614746, "learning_rate": 9.649649649649651e-06, "loss": 0.3319, "step": 3500 }, { "epoch": 11.0, "eval_accuracy": 0.9721, "eval_loss": 0.08562646806240082, "eval_runtime": 13.5289, "eval_samples_per_second": 739.158, "eval_steps_per_second": 2.957, "step": 3663 }, { "epoch": 12.0, "eval_accuracy": 0.972, "eval_loss": 0.08723447471857071, "eval_runtime": 13.3531, "eval_samples_per_second": 748.887, "eval_steps_per_second": 2.996, "step": 3996 }, { "epoch": 12.01, "grad_norm": 11.866540908813477, "learning_rate": 9.5995995995996e-06, "loss": 0.3253, "step": 4000 }, { "epoch": 13.0, "eval_accuracy": 0.973, "eval_loss": 0.08058160543441772, "eval_runtime": 14.1547, "eval_samples_per_second": 706.479, "eval_steps_per_second": 2.826, "step": 4329 }, { "epoch": 13.51, "grad_norm": 7.938398361206055, "learning_rate": 9.54954954954955e-06, "loss": 0.3084, "step": 4500 }, { "epoch": 14.0, "eval_accuracy": 0.9738, "eval_loss": 0.08162784576416016, "eval_runtime": 14.1065, "eval_samples_per_second": 708.895, "eval_steps_per_second": 2.836, "step": 4662 }, { "epoch": 15.0, "eval_accuracy": 0.9742, "eval_loss": 0.07894858717918396, "eval_runtime": 13.886, "eval_samples_per_second": 720.149, "eval_steps_per_second": 2.881, "step": 4995 }, { "epoch": 15.02, "grad_norm": 16.568248748779297, "learning_rate": 9.4994994994995e-06, "loss": 0.3022, "step": 5000 }, { "epoch": 16.0, "eval_accuracy": 0.9746, "eval_loss": 0.07670588046312332, "eval_runtime": 13.5929, "eval_samples_per_second": 735.676, "eval_steps_per_second": 2.943, "step": 5328 }, { "epoch": 16.52, "grad_norm": 13.009441375732422, "learning_rate": 9.44944944944945e-06, "loss": 0.2894, "step": 5500 }, { "epoch": 17.0, "eval_accuracy": 0.9725, "eval_loss": 0.0805484876036644, "eval_runtime": 13.3932, "eval_samples_per_second": 746.649, "eval_steps_per_second": 2.987, "step": 5661 }, { "epoch": 18.0, "eval_accuracy": 0.9759, "eval_loss": 0.0759720578789711, "eval_runtime": 13.5457, "eval_samples_per_second": 738.24, "eval_steps_per_second": 2.953, "step": 5994 }, { "epoch": 18.02, "grad_norm": 13.468392372131348, "learning_rate": 9.3993993993994e-06, "loss": 0.2842, "step": 6000 }, { "epoch": 19.0, "eval_accuracy": 0.9744, "eval_loss": 0.07423894852399826, "eval_runtime": 13.6253, "eval_samples_per_second": 733.929, "eval_steps_per_second": 2.936, "step": 6327 }, { "epoch": 19.52, "grad_norm": 12.263895988464355, "learning_rate": 9.34934934934935e-06, "loss": 0.2712, "step": 6500 }, { "epoch": 20.0, "eval_accuracy": 0.9738, "eval_loss": 0.07846847176551819, "eval_runtime": 12.9608, "eval_samples_per_second": 771.556, "eval_steps_per_second": 3.086, "step": 6660 }, { "epoch": 21.0, "eval_accuracy": 0.9735, "eval_loss": 0.07904864102602005, "eval_runtime": 12.8724, "eval_samples_per_second": 776.858, "eval_steps_per_second": 3.107, "step": 6993 }, { "epoch": 21.02, "grad_norm": 10.845202445983887, "learning_rate": 9.2992992992993e-06, "loss": 0.2729, "step": 7000 }, { "epoch": 22.0, "eval_accuracy": 0.9759, "eval_loss": 0.07514221966266632, "eval_runtime": 14.1339, "eval_samples_per_second": 707.52, "eval_steps_per_second": 2.83, "step": 7326 }, { "epoch": 22.52, "grad_norm": 11.12897777557373, "learning_rate": 9.24924924924925e-06, "loss": 0.2634, "step": 7500 }, { "epoch": 23.0, "eval_accuracy": 0.9737, "eval_loss": 0.07959982007741928, "eval_runtime": 13.0065, "eval_samples_per_second": 768.844, "eval_steps_per_second": 3.075, "step": 7659 }, { "epoch": 24.0, "eval_accuracy": 0.9752, "eval_loss": 0.07558540254831314, "eval_runtime": 13.805, "eval_samples_per_second": 724.375, "eval_steps_per_second": 2.897, "step": 7992 }, { "epoch": 24.02, "grad_norm": 10.100821495056152, "learning_rate": 9.1991991991992e-06, "loss": 0.2591, "step": 8000 }, { "epoch": 25.0, "eval_accuracy": 0.9759, "eval_loss": 0.07549387961626053, "eval_runtime": 13.4677, "eval_samples_per_second": 742.518, "eval_steps_per_second": 2.97, "step": 8325 }, { "epoch": 25.53, "grad_norm": 9.881790161132812, "learning_rate": 9.14914914914915e-06, "loss": 0.253, "step": 8500 }, { "epoch": 26.0, "eval_accuracy": 0.9746, "eval_loss": 0.07933681458234787, "eval_runtime": 13.2517, "eval_samples_per_second": 754.619, "eval_steps_per_second": 3.018, "step": 8658 }, { "epoch": 27.0, "eval_accuracy": 0.9765, "eval_loss": 0.07278025895357132, "eval_runtime": 13.5258, "eval_samples_per_second": 739.327, "eval_steps_per_second": 2.957, "step": 8991 }, { "epoch": 27.03, "grad_norm": 7.72860860824585, "learning_rate": 9.0990990990991e-06, "loss": 0.2518, "step": 9000 }, { "epoch": 28.0, "eval_accuracy": 0.9748, "eval_loss": 0.07914856821298599, "eval_runtime": 13.7348, "eval_samples_per_second": 728.079, "eval_steps_per_second": 2.912, "step": 9324 }, { "epoch": 28.53, "grad_norm": 8.068327903747559, "learning_rate": 9.04904904904905e-06, "loss": 0.2482, "step": 9500 }, { "epoch": 29.0, "eval_accuracy": 0.9756, "eval_loss": 0.07918867468833923, "eval_runtime": 13.3633, "eval_samples_per_second": 748.316, "eval_steps_per_second": 2.993, "step": 9657 }, { "epoch": 30.0, "eval_accuracy": 0.9764, "eval_loss": 0.07418718934059143, "eval_runtime": 12.9493, "eval_samples_per_second": 772.24, "eval_steps_per_second": 3.089, "step": 9990 }, { "epoch": 30.03, "grad_norm": 8.977522850036621, "learning_rate": 8.998998998999e-06, "loss": 0.2429, "step": 10000 }, { "epoch": 31.0, "eval_accuracy": 0.9757, "eval_loss": 0.07399851083755493, "eval_runtime": 13.4787, "eval_samples_per_second": 741.913, "eval_steps_per_second": 2.968, "step": 10323 }, { "epoch": 31.53, "grad_norm": 11.080597877502441, "learning_rate": 8.94894894894895e-06, "loss": 0.2405, "step": 10500 }, { "epoch": 32.0, "eval_accuracy": 0.9757, "eval_loss": 0.07426943629980087, "eval_runtime": 12.8343, "eval_samples_per_second": 779.16, "eval_steps_per_second": 3.117, "step": 10656 }, { "epoch": 33.0, "eval_accuracy": 0.9757, "eval_loss": 0.07429418712854385, "eval_runtime": 12.9825, "eval_samples_per_second": 770.266, "eval_steps_per_second": 3.081, "step": 10989 }, { "epoch": 33.03, "grad_norm": 7.3039140701293945, "learning_rate": 8.8988988988989e-06, "loss": 0.234, "step": 11000 }, { "epoch": 34.0, "eval_accuracy": 0.9769, "eval_loss": 0.07486932724714279, "eval_runtime": 12.96, "eval_samples_per_second": 771.606, "eval_steps_per_second": 3.086, "step": 11322 }, { "epoch": 34.53, "grad_norm": 8.610194206237793, "learning_rate": 8.84884884884885e-06, "loss": 0.2353, "step": 11500 }, { "epoch": 35.0, "eval_accuracy": 0.975, "eval_loss": 0.0768030509352684, "eval_runtime": 13.519, "eval_samples_per_second": 739.698, "eval_steps_per_second": 2.959, "step": 11655 }, { "epoch": 36.0, "eval_accuracy": 0.9771, "eval_loss": 0.07342812418937683, "eval_runtime": 14.3472, "eval_samples_per_second": 697.001, "eval_steps_per_second": 2.788, "step": 11988 }, { "epoch": 36.04, "grad_norm": 7.767194747924805, "learning_rate": 8.798798798798799e-06, "loss": 0.2329, "step": 12000 }, { "epoch": 37.0, "eval_accuracy": 0.9755, "eval_loss": 0.07778933644294739, "eval_runtime": 13.5633, "eval_samples_per_second": 737.284, "eval_steps_per_second": 2.949, "step": 12321 }, { "epoch": 37.54, "grad_norm": 11.39279842376709, "learning_rate": 8.74874874874875e-06, "loss": 0.2289, "step": 12500 }, { "epoch": 38.0, "eval_accuracy": 0.9771, "eval_loss": 0.07622923702001572, "eval_runtime": 13.5603, "eval_samples_per_second": 737.447, "eval_steps_per_second": 2.95, "step": 12654 }, { "epoch": 39.0, "eval_accuracy": 0.9761, "eval_loss": 0.07648137956857681, "eval_runtime": 13.4622, "eval_samples_per_second": 742.82, "eval_steps_per_second": 2.971, "step": 12987 }, { "epoch": 39.04, "grad_norm": 8.879070281982422, "learning_rate": 8.6986986986987e-06, "loss": 0.227, "step": 13000 }, { "epoch": 40.0, "eval_accuracy": 0.9768, "eval_loss": 0.07394447922706604, "eval_runtime": 13.4641, "eval_samples_per_second": 742.715, "eval_steps_per_second": 2.971, "step": 13320 }, { "epoch": 40.54, "grad_norm": 10.858572006225586, "learning_rate": 8.64864864864865e-06, "loss": 0.2213, "step": 13500 }, { "epoch": 41.0, "eval_accuracy": 0.9773, "eval_loss": 0.07473840564489365, "eval_runtime": 12.9211, "eval_samples_per_second": 773.93, "eval_steps_per_second": 3.096, "step": 13653 }, { "epoch": 42.0, "eval_accuracy": 0.9786, "eval_loss": 0.07195272296667099, "eval_runtime": 13.3716, "eval_samples_per_second": 747.852, "eval_steps_per_second": 2.991, "step": 13986 }, { "epoch": 42.04, "grad_norm": 9.299273490905762, "learning_rate": 8.5985985985986e-06, "loss": 0.217, "step": 14000 }, { "epoch": 43.0, "eval_accuracy": 0.9771, "eval_loss": 0.07661354541778564, "eval_runtime": 13.4888, "eval_samples_per_second": 741.354, "eval_steps_per_second": 2.965, "step": 14319 }, { "epoch": 43.54, "grad_norm": 9.49695873260498, "learning_rate": 8.54854854854855e-06, "loss": 0.22, "step": 14500 }, { "epoch": 44.0, "eval_accuracy": 0.9767, "eval_loss": 0.07640816271305084, "eval_runtime": 14.0377, "eval_samples_per_second": 712.365, "eval_steps_per_second": 2.849, "step": 14652 }, { "epoch": 45.0, "eval_accuracy": 0.9779, "eval_loss": 0.07278802245855331, "eval_runtime": 13.4886, "eval_samples_per_second": 741.366, "eval_steps_per_second": 2.965, "step": 14985 }, { "epoch": 45.05, "grad_norm": 12.065461158752441, "learning_rate": 8.4984984984985e-06, "loss": 0.2179, "step": 15000 }, { "epoch": 46.0, "eval_accuracy": 0.9785, "eval_loss": 0.0740213543176651, "eval_runtime": 14.112, "eval_samples_per_second": 708.617, "eval_steps_per_second": 2.834, "step": 15318 }, { "epoch": 46.55, "grad_norm": 9.281307220458984, "learning_rate": 8.44844844844845e-06, "loss": 0.2074, "step": 15500 }, { "epoch": 47.0, "eval_accuracy": 0.9793, "eval_loss": 0.0712471604347229, "eval_runtime": 13.5017, "eval_samples_per_second": 740.647, "eval_steps_per_second": 2.963, "step": 15651 }, { "epoch": 48.0, "eval_accuracy": 0.9783, "eval_loss": 0.0759299248456955, "eval_runtime": 13.3849, "eval_samples_per_second": 747.113, "eval_steps_per_second": 2.988, "step": 15984 }, { "epoch": 48.05, "grad_norm": 6.8984503746032715, "learning_rate": 8.398398398398398e-06, "loss": 0.2096, "step": 16000 }, { "epoch": 49.0, "eval_accuracy": 0.9791, "eval_loss": 0.07268951088190079, "eval_runtime": 13.5376, "eval_samples_per_second": 738.686, "eval_steps_per_second": 2.955, "step": 16317 }, { "epoch": 49.55, "grad_norm": 8.968807220458984, "learning_rate": 8.348348348348348e-06, "loss": 0.2097, "step": 16500 }, { "epoch": 50.0, "eval_accuracy": 0.9792, "eval_loss": 0.07472656667232513, "eval_runtime": 13.5262, "eval_samples_per_second": 739.304, "eval_steps_per_second": 2.957, "step": 16650 }, { "epoch": 51.0, "eval_accuracy": 0.9795, "eval_loss": 0.0754549577832222, "eval_runtime": 13.1606, "eval_samples_per_second": 759.845, "eval_steps_per_second": 3.039, "step": 16983 }, { "epoch": 51.05, "grad_norm": 8.540103912353516, "learning_rate": 8.298298298298298e-06, "loss": 0.2063, "step": 17000 }, { "epoch": 52.0, "eval_accuracy": 0.9788, "eval_loss": 0.0741283968091011, "eval_runtime": 13.8466, "eval_samples_per_second": 722.201, "eval_steps_per_second": 2.889, "step": 17316 }, { "epoch": 52.55, "grad_norm": 7.042116165161133, "learning_rate": 8.248248248248248e-06, "loss": 0.2054, "step": 17500 }, { "epoch": 53.0, "eval_accuracy": 0.9784, "eval_loss": 0.0738772302865982, "eval_runtime": 13.021, "eval_samples_per_second": 767.992, "eval_steps_per_second": 3.072, "step": 17649 }, { "epoch": 54.0, "eval_accuracy": 0.9779, "eval_loss": 0.07553113251924515, "eval_runtime": 12.8958, "eval_samples_per_second": 775.444, "eval_steps_per_second": 3.102, "step": 17982 }, { "epoch": 54.05, "grad_norm": 9.23681640625, "learning_rate": 8.198198198198198e-06, "loss": 0.2003, "step": 18000 }, { "epoch": 55.0, "eval_accuracy": 0.9784, "eval_loss": 0.07760650664567947, "eval_runtime": 12.6634, "eval_samples_per_second": 789.678, "eval_steps_per_second": 3.159, "step": 18315 }, { "epoch": 55.56, "grad_norm": 5.839297771453857, "learning_rate": 8.148148148148148e-06, "loss": 0.2009, "step": 18500 }, { "epoch": 56.0, "eval_accuracy": 0.9786, "eval_loss": 0.07352690398693085, "eval_runtime": 13.1656, "eval_samples_per_second": 759.554, "eval_steps_per_second": 3.038, "step": 18648 }, { "epoch": 57.0, "eval_accuracy": 0.9769, "eval_loss": 0.07721856981515884, "eval_runtime": 12.8626, "eval_samples_per_second": 777.447, "eval_steps_per_second": 3.11, "step": 18981 }, { "epoch": 57.06, "grad_norm": 10.131054878234863, "learning_rate": 8.098098098098098e-06, "loss": 0.1999, "step": 19000 }, { "epoch": 58.0, "eval_accuracy": 0.9789, "eval_loss": 0.07691636681556702, "eval_runtime": 12.8042, "eval_samples_per_second": 780.991, "eval_steps_per_second": 3.124, "step": 19314 }, { "epoch": 58.56, "grad_norm": 7.643968105316162, "learning_rate": 8.048048048048048e-06, "loss": 0.1973, "step": 19500 }, { "epoch": 59.0, "eval_accuracy": 0.9793, "eval_loss": 0.07336228340864182, "eval_runtime": 13.6825, "eval_samples_per_second": 730.861, "eval_steps_per_second": 2.923, "step": 19647 }, { "epoch": 60.0, "eval_accuracy": 0.9787, "eval_loss": 0.07408491522073746, "eval_runtime": 13.4079, "eval_samples_per_second": 745.831, "eval_steps_per_second": 2.983, "step": 19980 }, { "epoch": 60.06, "grad_norm": 9.443299293518066, "learning_rate": 7.997997997997999e-06, "loss": 0.1953, "step": 20000 }, { "epoch": 61.0, "eval_accuracy": 0.978, "eval_loss": 0.07513260841369629, "eval_runtime": 13.4048, "eval_samples_per_second": 745.999, "eval_steps_per_second": 2.984, "step": 20313 }, { "epoch": 61.56, "grad_norm": 16.85797119140625, "learning_rate": 7.947947947947949e-06, "loss": 0.1937, "step": 20500 }, { "epoch": 62.0, "eval_accuracy": 0.9786, "eval_loss": 0.07370081543922424, "eval_runtime": 13.3055, "eval_samples_per_second": 751.568, "eval_steps_per_second": 3.006, "step": 20646 }, { "epoch": 63.0, "eval_accuracy": 0.9786, "eval_loss": 0.07323586940765381, "eval_runtime": 12.7695, "eval_samples_per_second": 783.119, "eval_steps_per_second": 3.132, "step": 20979 }, { "epoch": 63.06, "grad_norm": 8.4561128616333, "learning_rate": 7.897897897897899e-06, "loss": 0.1946, "step": 21000 }, { "epoch": 64.0, "eval_accuracy": 0.9789, "eval_loss": 0.07585693150758743, "eval_runtime": 13.6921, "eval_samples_per_second": 730.349, "eval_steps_per_second": 2.921, "step": 21312 }, { "epoch": 64.56, "grad_norm": 11.68150806427002, "learning_rate": 7.847847847847849e-06, "loss": 0.1909, "step": 21500 }, { "epoch": 65.0, "eval_accuracy": 0.9798, "eval_loss": 0.07349375635385513, "eval_runtime": 12.8445, "eval_samples_per_second": 778.544, "eval_steps_per_second": 3.114, "step": 21645 }, { "epoch": 66.0, "eval_accuracy": 0.9788, "eval_loss": 0.07336971163749695, "eval_runtime": 12.8882, "eval_samples_per_second": 775.905, "eval_steps_per_second": 3.104, "step": 21978 }, { "epoch": 66.07, "grad_norm": 8.738271713256836, "learning_rate": 7.797797797797799e-06, "loss": 0.1935, "step": 22000 }, { "epoch": 67.0, "eval_accuracy": 0.9793, "eval_loss": 0.07337453961372375, "eval_runtime": 12.8166, "eval_samples_per_second": 780.24, "eval_steps_per_second": 3.121, "step": 22311 }, { "epoch": 67.57, "grad_norm": 6.386814117431641, "learning_rate": 7.747747747747749e-06, "loss": 0.1936, "step": 22500 }, { "epoch": 68.0, "eval_accuracy": 0.9795, "eval_loss": 0.07239189743995667, "eval_runtime": 12.831, "eval_samples_per_second": 779.362, "eval_steps_per_second": 3.117, "step": 22644 }, { "epoch": 69.0, "eval_accuracy": 0.9785, "eval_loss": 0.07570048421621323, "eval_runtime": 12.8964, "eval_samples_per_second": 775.412, "eval_steps_per_second": 3.102, "step": 22977 }, { "epoch": 69.07, "grad_norm": 9.476435661315918, "learning_rate": 7.697697697697697e-06, "loss": 0.1858, "step": 23000 }, { "epoch": 70.0, "eval_accuracy": 0.9801, "eval_loss": 0.07100442796945572, "eval_runtime": 13.0999, "eval_samples_per_second": 763.367, "eval_steps_per_second": 3.053, "step": 23310 }, { "epoch": 70.57, "grad_norm": 9.190871238708496, "learning_rate": 7.647647647647647e-06, "loss": 0.1871, "step": 23500 }, { "epoch": 71.0, "eval_accuracy": 0.9799, "eval_loss": 0.07596922665834427, "eval_runtime": 13.423, "eval_samples_per_second": 744.991, "eval_steps_per_second": 2.98, "step": 23643 }, { "epoch": 72.0, "eval_accuracy": 0.9801, "eval_loss": 0.07650475203990936, "eval_runtime": 13.055, "eval_samples_per_second": 765.99, "eval_steps_per_second": 3.064, "step": 23976 }, { "epoch": 72.07, "grad_norm": 10.302529335021973, "learning_rate": 7.597597597597598e-06, "loss": 0.1836, "step": 24000 }, { "epoch": 73.0, "eval_accuracy": 0.9787, "eval_loss": 0.07714686542749405, "eval_runtime": 13.4272, "eval_samples_per_second": 744.757, "eval_steps_per_second": 2.979, "step": 24309 }, { "epoch": 73.57, "grad_norm": 7.050232410430908, "learning_rate": 7.547547547547548e-06, "loss": 0.1827, "step": 24500 }, { "epoch": 74.0, "eval_accuracy": 0.9782, "eval_loss": 0.07620517909526825, "eval_runtime": 12.8858, "eval_samples_per_second": 776.045, "eval_steps_per_second": 3.104, "step": 24642 }, { "epoch": 75.0, "eval_accuracy": 0.9781, "eval_loss": 0.0778127908706665, "eval_runtime": 13.234, "eval_samples_per_second": 755.629, "eval_steps_per_second": 3.023, "step": 24975 }, { "epoch": 75.08, "grad_norm": 8.824182510375977, "learning_rate": 7.4974974974974975e-06, "loss": 0.1847, "step": 25000 }, { "epoch": 76.0, "eval_accuracy": 0.9781, "eval_loss": 0.08140425384044647, "eval_runtime": 13.9137, "eval_samples_per_second": 718.714, "eval_steps_per_second": 2.875, "step": 25308 }, { "epoch": 76.58, "grad_norm": 8.920430183410645, "learning_rate": 7.447447447447448e-06, "loss": 0.1815, "step": 25500 }, { "epoch": 77.0, "eval_accuracy": 0.9788, "eval_loss": 0.07689312100410461, "eval_runtime": 13.1404, "eval_samples_per_second": 761.014, "eval_steps_per_second": 3.044, "step": 25641 }, { "epoch": 78.0, "eval_accuracy": 0.9801, "eval_loss": 0.07370501756668091, "eval_runtime": 13.7683, "eval_samples_per_second": 726.307, "eval_steps_per_second": 2.905, "step": 25974 }, { "epoch": 78.08, "grad_norm": 9.352115631103516, "learning_rate": 7.397397397397398e-06, "loss": 0.1786, "step": 26000 }, { "epoch": 79.0, "eval_accuracy": 0.9795, "eval_loss": 0.07396883517503738, "eval_runtime": 13.0019, "eval_samples_per_second": 769.121, "eval_steps_per_second": 3.076, "step": 26307 }, { "epoch": 79.58, "grad_norm": 14.500313758850098, "learning_rate": 7.347347347347348e-06, "loss": 0.1819, "step": 26500 }, { "epoch": 80.0, "eval_accuracy": 0.9807, "eval_loss": 0.07097125053405762, "eval_runtime": 13.6192, "eval_samples_per_second": 734.256, "eval_steps_per_second": 2.937, "step": 26640 }, { "epoch": 81.0, "eval_accuracy": 0.9799, "eval_loss": 0.07538946717977524, "eval_runtime": 13.1675, "eval_samples_per_second": 759.445, "eval_steps_per_second": 3.038, "step": 26973 }, { "epoch": 81.08, "grad_norm": 6.939184188842773, "learning_rate": 7.297297297297298e-06, "loss": 0.1767, "step": 27000 }, { "epoch": 82.0, "eval_accuracy": 0.9789, "eval_loss": 0.07721950113773346, "eval_runtime": 13.1312, "eval_samples_per_second": 761.542, "eval_steps_per_second": 3.046, "step": 27306 }, { "epoch": 82.58, "grad_norm": 6.59556770324707, "learning_rate": 7.247247247247248e-06, "loss": 0.1792, "step": 27500 }, { "epoch": 83.0, "eval_accuracy": 0.9799, "eval_loss": 0.0765281617641449, "eval_runtime": 13.0707, "eval_samples_per_second": 765.07, "eval_steps_per_second": 3.06, "step": 27639 }, { "epoch": 84.0, "eval_accuracy": 0.9799, "eval_loss": 0.07408629357814789, "eval_runtime": 12.4754, "eval_samples_per_second": 801.579, "eval_steps_per_second": 3.206, "step": 27972 }, { "epoch": 84.08, "grad_norm": 7.376372814178467, "learning_rate": 7.197197197197198e-06, "loss": 0.1752, "step": 28000 }, { "epoch": 85.0, "eval_accuracy": 0.9795, "eval_loss": 0.0741427093744278, "eval_runtime": 12.8952, "eval_samples_per_second": 775.481, "eval_steps_per_second": 3.102, "step": 28305 }, { "epoch": 85.59, "grad_norm": 11.074542045593262, "learning_rate": 7.147147147147148e-06, "loss": 0.1789, "step": 28500 }, { "epoch": 86.0, "eval_accuracy": 0.9802, "eval_loss": 0.07748846709728241, "eval_runtime": 12.5701, "eval_samples_per_second": 795.538, "eval_steps_per_second": 3.182, "step": 28638 }, { "epoch": 87.0, "eval_accuracy": 0.9803, "eval_loss": 0.07314252853393555, "eval_runtime": 13.4921, "eval_samples_per_second": 741.174, "eval_steps_per_second": 2.965, "step": 28971 }, { "epoch": 87.09, "grad_norm": 8.502799987792969, "learning_rate": 7.097097097097097e-06, "loss": 0.1755, "step": 29000 }, { "epoch": 88.0, "eval_accuracy": 0.9806, "eval_loss": 0.07246743142604828, "eval_runtime": 14.1164, "eval_samples_per_second": 708.396, "eval_steps_per_second": 2.834, "step": 29304 }, { "epoch": 88.59, "grad_norm": 10.004383087158203, "learning_rate": 7.047047047047047e-06, "loss": 0.1694, "step": 29500 }, { "epoch": 89.0, "eval_accuracy": 0.9795, "eval_loss": 0.07495511323213577, "eval_runtime": 13.0203, "eval_samples_per_second": 768.031, "eval_steps_per_second": 3.072, "step": 29637 }, { "epoch": 90.0, "eval_accuracy": 0.9815, "eval_loss": 0.07112333923578262, "eval_runtime": 12.8831, "eval_samples_per_second": 776.209, "eval_steps_per_second": 3.105, "step": 29970 }, { "epoch": 90.09, "grad_norm": 10.05745792388916, "learning_rate": 6.996996996996997e-06, "loss": 0.1739, "step": 30000 }, { "epoch": 91.0, "eval_accuracy": 0.98, "eval_loss": 0.07672711461782455, "eval_runtime": 13.4897, "eval_samples_per_second": 741.306, "eval_steps_per_second": 2.965, "step": 30303 }, { "epoch": 91.59, "grad_norm": 7.928704738616943, "learning_rate": 6.9469469469469474e-06, "loss": 0.1726, "step": 30500 }, { "epoch": 92.0, "eval_accuracy": 0.9801, "eval_loss": 0.0770508348941803, "eval_runtime": 13.4469, "eval_samples_per_second": 743.667, "eval_steps_per_second": 2.975, "step": 30636 }, { "epoch": 93.0, "eval_accuracy": 0.9786, "eval_loss": 0.0784955620765686, "eval_runtime": 13.8288, "eval_samples_per_second": 723.131, "eval_steps_per_second": 2.893, "step": 30969 }, { "epoch": 93.09, "grad_norm": 9.178421974182129, "learning_rate": 6.8968968968968975e-06, "loss": 0.1696, "step": 31000 }, { "epoch": 94.0, "eval_accuracy": 0.9787, "eval_loss": 0.07988455891609192, "eval_runtime": 13.675, "eval_samples_per_second": 731.261, "eval_steps_per_second": 2.925, "step": 31302 }, { "epoch": 94.59, "grad_norm": 7.237130165100098, "learning_rate": 6.846846846846848e-06, "loss": 0.1723, "step": 31500 }, { "epoch": 95.0, "eval_accuracy": 0.979, "eval_loss": 0.07755716890096664, "eval_runtime": 13.4765, "eval_samples_per_second": 742.035, "eval_steps_per_second": 2.968, "step": 31635 }, { "epoch": 96.0, "eval_accuracy": 0.9796, "eval_loss": 0.07740277796983719, "eval_runtime": 13.7122, "eval_samples_per_second": 729.276, "eval_steps_per_second": 2.917, "step": 31968 }, { "epoch": 96.1, "grad_norm": 6.332306385040283, "learning_rate": 6.796796796796798e-06, "loss": 0.1692, "step": 32000 }, { "epoch": 97.0, "eval_accuracy": 0.9797, "eval_loss": 0.08065084367990494, "eval_runtime": 12.8364, "eval_samples_per_second": 779.036, "eval_steps_per_second": 3.116, "step": 32301 }, { "epoch": 97.6, "grad_norm": 6.978306770324707, "learning_rate": 6.746746746746748e-06, "loss": 0.17, "step": 32500 }, { "epoch": 98.0, "eval_accuracy": 0.9798, "eval_loss": 0.07497260719537735, "eval_runtime": 12.9795, "eval_samples_per_second": 770.446, "eval_steps_per_second": 3.082, "step": 32634 }, { "epoch": 99.0, "eval_accuracy": 0.9805, "eval_loss": 0.07648865878582001, "eval_runtime": 13.1724, "eval_samples_per_second": 759.163, "eval_steps_per_second": 3.037, "step": 32967 }, { "epoch": 99.1, "grad_norm": 9.569737434387207, "learning_rate": 6.696696696696697e-06, "loss": 0.1691, "step": 33000 }, { "epoch": 100.0, "eval_accuracy": 0.9798, "eval_loss": 0.07629863917827606, "eval_runtime": 13.7613, "eval_samples_per_second": 726.674, "eval_steps_per_second": 2.907, "step": 33300 }, { "epoch": 100.6, "grad_norm": 9.273295402526855, "learning_rate": 6.646646646646647e-06, "loss": 0.165, "step": 33500 }, { "epoch": 101.0, "eval_accuracy": 0.9794, "eval_loss": 0.07651650160551071, "eval_runtime": 12.8929, "eval_samples_per_second": 775.622, "eval_steps_per_second": 3.102, "step": 33633 }, { "epoch": 102.0, "eval_accuracy": 0.9806, "eval_loss": 0.07412749528884888, "eval_runtime": 13.1273, "eval_samples_per_second": 761.772, "eval_steps_per_second": 3.047, "step": 33966 }, { "epoch": 102.1, "grad_norm": 5.686313152313232, "learning_rate": 6.596596596596597e-06, "loss": 0.1678, "step": 34000 }, { "epoch": 103.0, "eval_accuracy": 0.9805, "eval_loss": 0.07281830161809921, "eval_runtime": 14.1227, "eval_samples_per_second": 708.079, "eval_steps_per_second": 2.832, "step": 34299 }, { "epoch": 103.6, "grad_norm": 13.40892505645752, "learning_rate": 6.546546546546547e-06, "loss": 0.1663, "step": 34500 }, { "epoch": 104.0, "eval_accuracy": 0.9803, "eval_loss": 0.07456088066101074, "eval_runtime": 12.9479, "eval_samples_per_second": 772.329, "eval_steps_per_second": 3.089, "step": 34632 }, { "epoch": 105.0, "eval_accuracy": 0.9796, "eval_loss": 0.07469187676906586, "eval_runtime": 13.464, "eval_samples_per_second": 742.723, "eval_steps_per_second": 2.971, "step": 34965 }, { "epoch": 105.11, "grad_norm": 3.3622846603393555, "learning_rate": 6.496496496496497e-06, "loss": 0.1697, "step": 35000 }, { "epoch": 106.0, "eval_accuracy": 0.98, "eval_loss": 0.07429709285497665, "eval_runtime": 12.5016, "eval_samples_per_second": 799.901, "eval_steps_per_second": 3.2, "step": 35298 }, { "epoch": 106.61, "grad_norm": 13.544451713562012, "learning_rate": 6.446446446446447e-06, "loss": 0.1637, "step": 35500 }, { "epoch": 107.0, "eval_accuracy": 0.9796, "eval_loss": 0.07689350843429565, "eval_runtime": 13.0156, "eval_samples_per_second": 768.306, "eval_steps_per_second": 3.073, "step": 35631 }, { "epoch": 108.0, "eval_accuracy": 0.9802, "eval_loss": 0.07509542256593704, "eval_runtime": 13.0921, "eval_samples_per_second": 763.817, "eval_steps_per_second": 3.055, "step": 35964 }, { "epoch": 108.11, "grad_norm": 11.040998458862305, "learning_rate": 6.396396396396397e-06, "loss": 0.1678, "step": 36000 }, { "epoch": 109.0, "eval_accuracy": 0.9807, "eval_loss": 0.0769224464893341, "eval_runtime": 13.4563, "eval_samples_per_second": 743.145, "eval_steps_per_second": 2.973, "step": 36297 }, { "epoch": 109.61, "grad_norm": 7.243069171905518, "learning_rate": 6.3463463463463474e-06, "loss": 0.1674, "step": 36500 }, { "epoch": 110.0, "eval_accuracy": 0.9808, "eval_loss": 0.07392393797636032, "eval_runtime": 12.9386, "eval_samples_per_second": 772.879, "eval_steps_per_second": 3.092, "step": 36630 }, { "epoch": 111.0, "eval_accuracy": 0.9795, "eval_loss": 0.0809590220451355, "eval_runtime": 13.2637, "eval_samples_per_second": 753.935, "eval_steps_per_second": 3.016, "step": 36963 }, { "epoch": 111.11, "grad_norm": 8.149242401123047, "learning_rate": 6.296296296296297e-06, "loss": 0.1604, "step": 37000 }, { "epoch": 112.0, "eval_accuracy": 0.9806, "eval_loss": 0.07439053803682327, "eval_runtime": 12.9959, "eval_samples_per_second": 769.471, "eval_steps_per_second": 3.078, "step": 37296 }, { "epoch": 112.61, "grad_norm": 6.591969966888428, "learning_rate": 6.246246246246247e-06, "loss": 0.1583, "step": 37500 }, { "epoch": 113.0, "eval_accuracy": 0.9816, "eval_loss": 0.07411955296993256, "eval_runtime": 12.6355, "eval_samples_per_second": 791.421, "eval_steps_per_second": 3.166, "step": 37629 }, { "epoch": 114.0, "eval_accuracy": 0.98, "eval_loss": 0.07842327654361725, "eval_runtime": 14.6497, "eval_samples_per_second": 682.608, "eval_steps_per_second": 2.73, "step": 37962 }, { "epoch": 114.11, "grad_norm": 8.455940246582031, "learning_rate": 6.196196196196197e-06, "loss": 0.1592, "step": 38000 }, { "epoch": 115.0, "eval_accuracy": 0.9818, "eval_loss": 0.07287651300430298, "eval_runtime": 13.5165, "eval_samples_per_second": 739.838, "eval_steps_per_second": 2.959, "step": 38295 }, { "epoch": 115.62, "grad_norm": 6.092105388641357, "learning_rate": 6.146146146146147e-06, "loss": 0.1607, "step": 38500 }, { "epoch": 116.0, "eval_accuracy": 0.9818, "eval_loss": 0.07438412308692932, "eval_runtime": 13.6093, "eval_samples_per_second": 734.794, "eval_steps_per_second": 2.939, "step": 38628 }, { "epoch": 117.0, "eval_accuracy": 0.9817, "eval_loss": 0.07355909794569016, "eval_runtime": 12.6698, "eval_samples_per_second": 789.279, "eval_steps_per_second": 3.157, "step": 38961 }, { "epoch": 117.12, "grad_norm": 7.972623348236084, "learning_rate": 6.096096096096097e-06, "loss": 0.1657, "step": 39000 }, { "epoch": 118.0, "eval_accuracy": 0.9805, "eval_loss": 0.0769243985414505, "eval_runtime": 13.3442, "eval_samples_per_second": 749.391, "eval_steps_per_second": 2.998, "step": 39294 }, { "epoch": 118.62, "grad_norm": 7.559940338134766, "learning_rate": 6.046046046046047e-06, "loss": 0.1605, "step": 39500 }, { "epoch": 119.0, "eval_accuracy": 0.9812, "eval_loss": 0.0768662765622139, "eval_runtime": 13.0344, "eval_samples_per_second": 767.2, "eval_steps_per_second": 3.069, "step": 39627 }, { "epoch": 120.0, "eval_accuracy": 0.9808, "eval_loss": 0.07865633815526962, "eval_runtime": 13.8055, "eval_samples_per_second": 724.347, "eval_steps_per_second": 2.897, "step": 39960 }, { "epoch": 120.12, "grad_norm": 7.175966739654541, "learning_rate": 5.995995995995997e-06, "loss": 0.1554, "step": 40000 }, { "epoch": 121.0, "eval_accuracy": 0.9801, "eval_loss": 0.07854399085044861, "eval_runtime": 12.8799, "eval_samples_per_second": 776.402, "eval_steps_per_second": 3.106, "step": 40293 }, { "epoch": 121.62, "grad_norm": 12.97214126586914, "learning_rate": 5.945945945945947e-06, "loss": 0.157, "step": 40500 }, { "epoch": 122.0, "eval_accuracy": 0.9796, "eval_loss": 0.0760401040315628, "eval_runtime": 12.9319, "eval_samples_per_second": 773.283, "eval_steps_per_second": 3.093, "step": 40626 }, { "epoch": 123.0, "eval_accuracy": 0.9805, "eval_loss": 0.07537718862295151, "eval_runtime": 12.9913, "eval_samples_per_second": 769.749, "eval_steps_per_second": 3.079, "step": 40959 }, { "epoch": 123.12, "grad_norm": 7.540937423706055, "learning_rate": 5.895895895895896e-06, "loss": 0.1549, "step": 41000 }, { "epoch": 124.0, "eval_accuracy": 0.9802, "eval_loss": 0.07550998032093048, "eval_runtime": 14.417, "eval_samples_per_second": 693.624, "eval_steps_per_second": 2.774, "step": 41292 }, { "epoch": 124.62, "grad_norm": 6.355432987213135, "learning_rate": 5.8458458458458464e-06, "loss": 0.1578, "step": 41500 }, { "epoch": 125.0, "eval_accuracy": 0.9792, "eval_loss": 0.07649920880794525, "eval_runtime": 13.4926, "eval_samples_per_second": 741.15, "eval_steps_per_second": 2.965, "step": 41625 }, { "epoch": 126.0, "eval_accuracy": 0.98, "eval_loss": 0.07526528090238571, "eval_runtime": 12.9079, "eval_samples_per_second": 774.72, "eval_steps_per_second": 3.099, "step": 41958 }, { "epoch": 126.13, "grad_norm": 6.478011131286621, "learning_rate": 5.7957957957957965e-06, "loss": 0.1531, "step": 42000 }, { "epoch": 127.0, "eval_accuracy": 0.98, "eval_loss": 0.07793418318033218, "eval_runtime": 13.471, "eval_samples_per_second": 742.337, "eval_steps_per_second": 2.969, "step": 42291 }, { "epoch": 127.63, "grad_norm": 7.928163051605225, "learning_rate": 5.7457457457457466e-06, "loss": 0.1572, "step": 42500 }, { "epoch": 128.0, "eval_accuracy": 0.98, "eval_loss": 0.07834824174642563, "eval_runtime": 13.8772, "eval_samples_per_second": 720.605, "eval_steps_per_second": 2.882, "step": 42624 }, { "epoch": 129.0, "eval_accuracy": 0.9796, "eval_loss": 0.0785522609949112, "eval_runtime": 12.947, "eval_samples_per_second": 772.377, "eval_steps_per_second": 3.09, "step": 42957 }, { "epoch": 129.13, "grad_norm": 19.900619506835938, "learning_rate": 5.695695695695697e-06, "loss": 0.1558, "step": 43000 }, { "epoch": 130.0, "eval_accuracy": 0.9814, "eval_loss": 0.0741908997297287, "eval_runtime": 12.8882, "eval_samples_per_second": 775.906, "eval_steps_per_second": 3.104, "step": 43290 }, { "epoch": 130.63, "grad_norm": 12.561553001403809, "learning_rate": 5.645645645645647e-06, "loss": 0.1515, "step": 43500 }, { "epoch": 131.0, "eval_accuracy": 0.9798, "eval_loss": 0.07759422063827515, "eval_runtime": 14.2426, "eval_samples_per_second": 702.121, "eval_steps_per_second": 2.808, "step": 43623 }, { "epoch": 132.0, "eval_accuracy": 0.9793, "eval_loss": 0.08000089973211288, "eval_runtime": 13.0308, "eval_samples_per_second": 767.413, "eval_steps_per_second": 3.07, "step": 43956 }, { "epoch": 132.13, "grad_norm": 10.955676078796387, "learning_rate": 5.595595595595597e-06, "loss": 0.1526, "step": 44000 }, { "epoch": 133.0, "eval_accuracy": 0.9806, "eval_loss": 0.07563788443803787, "eval_runtime": 12.9183, "eval_samples_per_second": 774.093, "eval_steps_per_second": 3.096, "step": 44289 }, { "epoch": 133.63, "grad_norm": 9.621336936950684, "learning_rate": 5.545545545545547e-06, "loss": 0.1523, "step": 44500 }, { "epoch": 134.0, "eval_accuracy": 0.9797, "eval_loss": 0.07889340072870255, "eval_runtime": 13.5904, "eval_samples_per_second": 735.813, "eval_steps_per_second": 2.943, "step": 44622 }, { "epoch": 135.0, "eval_accuracy": 0.9801, "eval_loss": 0.07651440799236298, "eval_runtime": 13.0261, "eval_samples_per_second": 767.689, "eval_steps_per_second": 3.071, "step": 44955 }, { "epoch": 135.14, "grad_norm": 9.40494155883789, "learning_rate": 5.495495495495496e-06, "loss": 0.1519, "step": 45000 }, { "epoch": 136.0, "eval_accuracy": 0.9798, "eval_loss": 0.07700727880001068, "eval_runtime": 14.2776, "eval_samples_per_second": 700.397, "eval_steps_per_second": 2.802, "step": 45288 }, { "epoch": 136.64, "grad_norm": 7.778809070587158, "learning_rate": 5.445445445445446e-06, "loss": 0.1491, "step": 45500 }, { "epoch": 137.0, "eval_accuracy": 0.98, "eval_loss": 0.07937881350517273, "eval_runtime": 13.7045, "eval_samples_per_second": 729.689, "eval_steps_per_second": 2.919, "step": 45621 }, { "epoch": 138.0, "eval_accuracy": 0.9796, "eval_loss": 0.07901179045438766, "eval_runtime": 12.8776, "eval_samples_per_second": 776.54, "eval_steps_per_second": 3.106, "step": 45954 }, { "epoch": 138.14, "grad_norm": 12.694830894470215, "learning_rate": 5.395395395395396e-06, "loss": 0.1488, "step": 46000 }, { "epoch": 139.0, "eval_accuracy": 0.9796, "eval_loss": 0.07827717065811157, "eval_runtime": 13.01, "eval_samples_per_second": 768.642, "eval_steps_per_second": 3.075, "step": 46287 }, { "epoch": 139.64, "grad_norm": 5.728260517120361, "learning_rate": 5.345345345345346e-06, "loss": 0.1511, "step": 46500 }, { "epoch": 140.0, "eval_accuracy": 0.98, "eval_loss": 0.07687978446483612, "eval_runtime": 13.4169, "eval_samples_per_second": 745.331, "eval_steps_per_second": 2.981, "step": 46620 }, { "epoch": 141.0, "eval_accuracy": 0.9797, "eval_loss": 0.0826837420463562, "eval_runtime": 13.6768, "eval_samples_per_second": 731.166, "eval_steps_per_second": 2.925, "step": 46953 }, { "epoch": 141.14, "grad_norm": 8.749393463134766, "learning_rate": 5.2952952952952955e-06, "loss": 0.1475, "step": 47000 }, { "epoch": 142.0, "eval_accuracy": 0.98, "eval_loss": 0.07702562212944031, "eval_runtime": 13.4888, "eval_samples_per_second": 741.356, "eval_steps_per_second": 2.965, "step": 47286 }, { "epoch": 142.64, "grad_norm": 8.479342460632324, "learning_rate": 5.245245245245245e-06, "loss": 0.1449, "step": 47500 }, { "epoch": 143.0, "eval_accuracy": 0.98, "eval_loss": 0.07797821611166, "eval_runtime": 13.0058, "eval_samples_per_second": 768.886, "eval_steps_per_second": 3.076, "step": 47619 }, { "epoch": 144.0, "eval_accuracy": 0.9795, "eval_loss": 0.07707054167985916, "eval_runtime": 12.9845, "eval_samples_per_second": 770.15, "eval_steps_per_second": 3.081, "step": 47952 }, { "epoch": 144.14, "grad_norm": 10.80911636352539, "learning_rate": 5.195195195195195e-06, "loss": 0.146, "step": 48000 }, { "epoch": 145.0, "eval_accuracy": 0.9809, "eval_loss": 0.0750807523727417, "eval_runtime": 14.0436, "eval_samples_per_second": 712.069, "eval_steps_per_second": 2.848, "step": 48285 }, { "epoch": 145.65, "grad_norm": 5.568371295928955, "learning_rate": 5.145145145145145e-06, "loss": 0.1473, "step": 48500 }, { "epoch": 146.0, "eval_accuracy": 0.9797, "eval_loss": 0.07933703809976578, "eval_runtime": 13.1022, "eval_samples_per_second": 763.232, "eval_steps_per_second": 3.053, "step": 48618 }, { "epoch": 147.0, "eval_accuracy": 0.9812, "eval_loss": 0.07590621709823608, "eval_runtime": 13.1387, "eval_samples_per_second": 761.108, "eval_steps_per_second": 3.044, "step": 48951 }, { "epoch": 147.15, "grad_norm": 8.234355926513672, "learning_rate": 5.095095095095095e-06, "loss": 0.1466, "step": 49000 }, { "epoch": 148.0, "eval_accuracy": 0.9787, "eval_loss": 0.08211437612771988, "eval_runtime": 13.453, "eval_samples_per_second": 743.33, "eval_steps_per_second": 2.973, "step": 49284 }, { "epoch": 148.65, "grad_norm": 9.734493255615234, "learning_rate": 5.045045045045045e-06, "loss": 0.1472, "step": 49500 }, { "epoch": 149.0, "eval_accuracy": 0.9813, "eval_loss": 0.07566899061203003, "eval_runtime": 13.5127, "eval_samples_per_second": 740.042, "eval_steps_per_second": 2.96, "step": 49617 }, { "epoch": 150.0, "eval_accuracy": 0.9804, "eval_loss": 0.07641930133104324, "eval_runtime": 13.729, "eval_samples_per_second": 728.384, "eval_steps_per_second": 2.914, "step": 49950 }, { "epoch": 150.15, "grad_norm": 9.083195686340332, "learning_rate": 4.994994994994996e-06, "loss": 0.1437, "step": 50000 }, { "epoch": 151.0, "eval_accuracy": 0.9799, "eval_loss": 0.0816345363855362, "eval_runtime": 13.6081, "eval_samples_per_second": 734.856, "eval_steps_per_second": 2.939, "step": 50283 }, { "epoch": 151.65, "grad_norm": 16.20008087158203, "learning_rate": 4.944944944944945e-06, "loss": 0.1487, "step": 50500 }, { "epoch": 152.0, "eval_accuracy": 0.9818, "eval_loss": 0.07768727838993073, "eval_runtime": 12.9061, "eval_samples_per_second": 774.83, "eval_steps_per_second": 3.099, "step": 50616 }, { "epoch": 153.0, "eval_accuracy": 0.9811, "eval_loss": 0.07950293272733688, "eval_runtime": 13.0523, "eval_samples_per_second": 766.151, "eval_steps_per_second": 3.065, "step": 50949 }, { "epoch": 153.15, "grad_norm": 6.783934593200684, "learning_rate": 4.894894894894895e-06, "loss": 0.1455, "step": 51000 }, { "epoch": 154.0, "eval_accuracy": 0.9811, "eval_loss": 0.07836713641881943, "eval_runtime": 13.4341, "eval_samples_per_second": 744.377, "eval_steps_per_second": 2.978, "step": 51282 }, { "epoch": 154.65, "grad_norm": 7.791309833526611, "learning_rate": 4.844844844844845e-06, "loss": 0.1463, "step": 51500 }, { "epoch": 155.0, "eval_accuracy": 0.9801, "eval_loss": 0.07995989918708801, "eval_runtime": 13.7204, "eval_samples_per_second": 728.844, "eval_steps_per_second": 2.915, "step": 51615 }, { "epoch": 156.0, "eval_accuracy": 0.9809, "eval_loss": 0.07914280891418457, "eval_runtime": 13.045, "eval_samples_per_second": 766.58, "eval_steps_per_second": 3.066, "step": 51948 }, { "epoch": 156.16, "grad_norm": 7.225980281829834, "learning_rate": 4.794794794794795e-06, "loss": 0.1449, "step": 52000 }, { "epoch": 157.0, "eval_accuracy": 0.9815, "eval_loss": 0.0777197852730751, "eval_runtime": 12.8795, "eval_samples_per_second": 776.43, "eval_steps_per_second": 3.106, "step": 52281 }, { "epoch": 157.66, "grad_norm": 7.848995208740234, "learning_rate": 4.7447447447447454e-06, "loss": 0.1413, "step": 52500 }, { "epoch": 158.0, "eval_accuracy": 0.9802, "eval_loss": 0.07978815585374832, "eval_runtime": 13.0849, "eval_samples_per_second": 764.238, "eval_steps_per_second": 3.057, "step": 52614 }, { "epoch": 159.0, "eval_accuracy": 0.9798, "eval_loss": 0.08010842651128769, "eval_runtime": 12.9948, "eval_samples_per_second": 769.539, "eval_steps_per_second": 3.078, "step": 52947 }, { "epoch": 159.16, "grad_norm": 10.857318878173828, "learning_rate": 4.6946946946946955e-06, "loss": 0.143, "step": 53000 }, { "epoch": 160.0, "eval_accuracy": 0.9803, "eval_loss": 0.07897085696458817, "eval_runtime": 13.4824, "eval_samples_per_second": 741.707, "eval_steps_per_second": 2.967, "step": 53280 }, { "epoch": 160.66, "grad_norm": 8.192683219909668, "learning_rate": 4.6446446446446456e-06, "loss": 0.1462, "step": 53500 }, { "epoch": 161.0, "eval_accuracy": 0.9794, "eval_loss": 0.07847656309604645, "eval_runtime": 13.3614, "eval_samples_per_second": 748.422, "eval_steps_per_second": 2.994, "step": 53613 }, { "epoch": 162.0, "eval_accuracy": 0.9799, "eval_loss": 0.07839296758174896, "eval_runtime": 13.4943, "eval_samples_per_second": 741.054, "eval_steps_per_second": 2.964, "step": 53946 }, { "epoch": 162.16, "grad_norm": 5.753213882446289, "learning_rate": 4.594594594594596e-06, "loss": 0.1454, "step": 54000 }, { "epoch": 163.0, "eval_accuracy": 0.9814, "eval_loss": 0.07774946093559265, "eval_runtime": 13.5161, "eval_samples_per_second": 739.858, "eval_steps_per_second": 2.959, "step": 54279 }, { "epoch": 163.66, "grad_norm": 23.634429931640625, "learning_rate": 4.544544544544545e-06, "loss": 0.1404, "step": 54500 }, { "epoch": 164.0, "eval_accuracy": 0.9817, "eval_loss": 0.07676123827695847, "eval_runtime": 13.7785, "eval_samples_per_second": 725.767, "eval_steps_per_second": 2.903, "step": 54612 }, { "epoch": 165.0, "eval_accuracy": 0.9795, "eval_loss": 0.07868321239948273, "eval_runtime": 13.3337, "eval_samples_per_second": 749.978, "eval_steps_per_second": 3.0, "step": 54945 }, { "epoch": 165.17, "grad_norm": 14.497030258178711, "learning_rate": 4.494494494494495e-06, "loss": 0.1404, "step": 55000 }, { "epoch": 166.0, "eval_accuracy": 0.9806, "eval_loss": 0.08142885565757751, "eval_runtime": 13.0878, "eval_samples_per_second": 764.068, "eval_steps_per_second": 3.056, "step": 55278 }, { "epoch": 166.67, "grad_norm": 5.504241943359375, "learning_rate": 4.444444444444444e-06, "loss": 0.1438, "step": 55500 }, { "epoch": 167.0, "eval_accuracy": 0.9802, "eval_loss": 0.08015668392181396, "eval_runtime": 13.3375, "eval_samples_per_second": 749.766, "eval_steps_per_second": 2.999, "step": 55611 }, { "epoch": 168.0, "eval_accuracy": 0.9807, "eval_loss": 0.0773804783821106, "eval_runtime": 13.1562, "eval_samples_per_second": 760.1, "eval_steps_per_second": 3.04, "step": 55944 }, { "epoch": 168.17, "grad_norm": 10.65889835357666, "learning_rate": 4.394394394394394e-06, "loss": 0.1405, "step": 56000 }, { "epoch": 169.0, "eval_accuracy": 0.9793, "eval_loss": 0.07769276201725006, "eval_runtime": 13.37, "eval_samples_per_second": 747.945, "eval_steps_per_second": 2.992, "step": 56277 }, { "epoch": 169.67, "grad_norm": 9.663138389587402, "learning_rate": 4.344344344344344e-06, "loss": 0.1465, "step": 56500 }, { "epoch": 170.0, "eval_accuracy": 0.9804, "eval_loss": 0.07831669598817825, "eval_runtime": 13.9555, "eval_samples_per_second": 716.565, "eval_steps_per_second": 2.866, "step": 56610 }, { "epoch": 171.0, "eval_accuracy": 0.9799, "eval_loss": 0.08174577355384827, "eval_runtime": 13.3581, "eval_samples_per_second": 748.612, "eval_steps_per_second": 2.994, "step": 56943 }, { "epoch": 171.17, "grad_norm": 11.15052604675293, "learning_rate": 4.294294294294294e-06, "loss": 0.1404, "step": 57000 }, { "epoch": 172.0, "eval_accuracy": 0.9806, "eval_loss": 0.0780324712395668, "eval_runtime": 12.9812, "eval_samples_per_second": 770.346, "eval_steps_per_second": 3.081, "step": 57276 }, { "epoch": 172.67, "grad_norm": 10.398097038269043, "learning_rate": 4.2442442442442444e-06, "loss": 0.1367, "step": 57500 }, { "epoch": 173.0, "eval_accuracy": 0.9806, "eval_loss": 0.07895645499229431, "eval_runtime": 12.8468, "eval_samples_per_second": 778.402, "eval_steps_per_second": 3.114, "step": 57609 }, { "epoch": 174.0, "eval_accuracy": 0.9816, "eval_loss": 0.07868947833776474, "eval_runtime": 13.3373, "eval_samples_per_second": 749.779, "eval_steps_per_second": 2.999, "step": 57942 }, { "epoch": 174.17, "grad_norm": 8.292234420776367, "learning_rate": 4.1941941941941945e-06, "loss": 0.1399, "step": 58000 }, { "epoch": 175.0, "eval_accuracy": 0.9801, "eval_loss": 0.08106452971696854, "eval_runtime": 12.854, "eval_samples_per_second": 777.968, "eval_steps_per_second": 3.112, "step": 58275 }, { "epoch": 175.68, "grad_norm": 12.446533203125, "learning_rate": 4.1441441441441446e-06, "loss": 0.1418, "step": 58500 }, { "epoch": 176.0, "eval_accuracy": 0.9809, "eval_loss": 0.08040361106395721, "eval_runtime": 13.1526, "eval_samples_per_second": 760.308, "eval_steps_per_second": 3.041, "step": 58608 }, { "epoch": 177.0, "eval_accuracy": 0.9806, "eval_loss": 0.07995961606502533, "eval_runtime": 13.0397, "eval_samples_per_second": 766.888, "eval_steps_per_second": 3.068, "step": 58941 }, { "epoch": 177.18, "grad_norm": 9.551538467407227, "learning_rate": 4.094094094094095e-06, "loss": 0.1381, "step": 59000 }, { "epoch": 178.0, "eval_accuracy": 0.9814, "eval_loss": 0.07857974618673325, "eval_runtime": 13.3646, "eval_samples_per_second": 748.245, "eval_steps_per_second": 2.993, "step": 59274 }, { "epoch": 178.68, "grad_norm": 7.961233615875244, "learning_rate": 4.044044044044044e-06, "loss": 0.1357, "step": 59500 }, { "epoch": 179.0, "eval_accuracy": 0.9805, "eval_loss": 0.0797557458281517, "eval_runtime": 13.833, "eval_samples_per_second": 722.907, "eval_steps_per_second": 2.892, "step": 59607 }, { "epoch": 180.0, "eval_accuracy": 0.9813, "eval_loss": 0.07922037690877914, "eval_runtime": 13.1611, "eval_samples_per_second": 759.818, "eval_steps_per_second": 3.039, "step": 59940 }, { "epoch": 180.18, "grad_norm": 8.392486572265625, "learning_rate": 3.993993993993994e-06, "loss": 0.1465, "step": 60000 }, { "epoch": 181.0, "eval_accuracy": 0.9809, "eval_loss": 0.08021984249353409, "eval_runtime": 12.7756, "eval_samples_per_second": 782.741, "eval_steps_per_second": 3.131, "step": 60273 }, { "epoch": 181.68, "grad_norm": 5.668210506439209, "learning_rate": 3.943943943943944e-06, "loss": 0.1366, "step": 60500 }, { "epoch": 182.0, "eval_accuracy": 0.9804, "eval_loss": 0.07884296774864197, "eval_runtime": 12.9767, "eval_samples_per_second": 770.61, "eval_steps_per_second": 3.082, "step": 60606 }, { "epoch": 183.0, "eval_accuracy": 0.979, "eval_loss": 0.0805293619632721, "eval_runtime": 12.9332, "eval_samples_per_second": 773.205, "eval_steps_per_second": 3.093, "step": 60939 }, { "epoch": 183.18, "grad_norm": 9.771552085876465, "learning_rate": 3.893893893893894e-06, "loss": 0.139, "step": 61000 }, { "epoch": 184.0, "eval_accuracy": 0.9794, "eval_loss": 0.0822456106543541, "eval_runtime": 13.3118, "eval_samples_per_second": 751.211, "eval_steps_per_second": 3.005, "step": 61272 }, { "epoch": 184.68, "grad_norm": 10.898391723632812, "learning_rate": 3.843843843843844e-06, "loss": 0.1381, "step": 61500 }, { "epoch": 185.0, "eval_accuracy": 0.9807, "eval_loss": 0.08079157024621964, "eval_runtime": 12.8717, "eval_samples_per_second": 776.899, "eval_steps_per_second": 3.108, "step": 61605 }, { "epoch": 186.0, "eval_accuracy": 0.9802, "eval_loss": 0.08059785515069962, "eval_runtime": 12.418, "eval_samples_per_second": 805.284, "eval_steps_per_second": 3.221, "step": 61938 }, { "epoch": 186.19, "grad_norm": 6.1758246421813965, "learning_rate": 3.793793793793794e-06, "loss": 0.1367, "step": 62000 }, { "epoch": 187.0, "eval_accuracy": 0.9803, "eval_loss": 0.07853790372610092, "eval_runtime": 12.9215, "eval_samples_per_second": 773.902, "eval_steps_per_second": 3.096, "step": 62271 }, { "epoch": 187.69, "grad_norm": 9.155027389526367, "learning_rate": 3.743743743743744e-06, "loss": 0.1354, "step": 62500 }, { "epoch": 188.0, "eval_accuracy": 0.9803, "eval_loss": 0.0803978368639946, "eval_runtime": 13.5157, "eval_samples_per_second": 739.883, "eval_steps_per_second": 2.96, "step": 62604 }, { "epoch": 189.0, "eval_accuracy": 0.98, "eval_loss": 0.07950347661972046, "eval_runtime": 13.0185, "eval_samples_per_second": 768.138, "eval_steps_per_second": 3.073, "step": 62937 }, { "epoch": 189.19, "grad_norm": 9.88645076751709, "learning_rate": 3.693693693693694e-06, "loss": 0.137, "step": 63000 }, { "epoch": 190.0, "eval_accuracy": 0.9805, "eval_loss": 0.07970842719078064, "eval_runtime": 13.0486, "eval_samples_per_second": 766.367, "eval_steps_per_second": 3.065, "step": 63270 }, { "epoch": 190.69, "grad_norm": 10.085098266601562, "learning_rate": 3.643643643643644e-06, "loss": 0.1351, "step": 63500 }, { "epoch": 191.0, "eval_accuracy": 0.9803, "eval_loss": 0.07862575352191925, "eval_runtime": 13.7359, "eval_samples_per_second": 728.019, "eval_steps_per_second": 2.912, "step": 63603 }, { "epoch": 192.0, "eval_accuracy": 0.9807, "eval_loss": 0.07779725641012192, "eval_runtime": 14.1749, "eval_samples_per_second": 705.473, "eval_steps_per_second": 2.822, "step": 63936 }, { "epoch": 192.19, "grad_norm": 7.259002685546875, "learning_rate": 3.593593593593594e-06, "loss": 0.1345, "step": 64000 }, { "epoch": 193.0, "eval_accuracy": 0.9812, "eval_loss": 0.07995971292257309, "eval_runtime": 13.3268, "eval_samples_per_second": 750.366, "eval_steps_per_second": 3.001, "step": 64269 }, { "epoch": 193.69, "grad_norm": 6.42719030380249, "learning_rate": 3.5435435435435437e-06, "loss": 0.1377, "step": 64500 }, { "epoch": 194.0, "eval_accuracy": 0.9799, "eval_loss": 0.07895601540803909, "eval_runtime": 12.9129, "eval_samples_per_second": 774.417, "eval_steps_per_second": 3.098, "step": 64602 }, { "epoch": 195.0, "eval_accuracy": 0.98, "eval_loss": 0.08155795186758041, "eval_runtime": 13.7447, "eval_samples_per_second": 727.555, "eval_steps_per_second": 2.91, "step": 64935 }, { "epoch": 195.2, "grad_norm": 7.303466320037842, "learning_rate": 3.4934934934934938e-06, "loss": 0.1339, "step": 65000 }, { "epoch": 196.0, "eval_accuracy": 0.9811, "eval_loss": 0.08134587854146957, "eval_runtime": 12.87, "eval_samples_per_second": 777.004, "eval_steps_per_second": 3.108, "step": 65268 }, { "epoch": 196.7, "grad_norm": 10.115856170654297, "learning_rate": 3.443443443443444e-06, "loss": 0.1338, "step": 65500 }, { "epoch": 197.0, "eval_accuracy": 0.981, "eval_loss": 0.07863133400678635, "eval_runtime": 13.1588, "eval_samples_per_second": 759.949, "eval_steps_per_second": 3.04, "step": 65601 }, { "epoch": 198.0, "eval_accuracy": 0.9805, "eval_loss": 0.08128491789102554, "eval_runtime": 12.3451, "eval_samples_per_second": 810.038, "eval_steps_per_second": 3.24, "step": 65934 }, { "epoch": 198.2, "grad_norm": 9.01919174194336, "learning_rate": 3.393393393393394e-06, "loss": 0.1371, "step": 66000 }, { "epoch": 199.0, "eval_accuracy": 0.9808, "eval_loss": 0.08089832216501236, "eval_runtime": 13.1128, "eval_samples_per_second": 762.612, "eval_steps_per_second": 3.05, "step": 66267 }, { "epoch": 199.7, "grad_norm": 9.190634727478027, "learning_rate": 3.3433433433433436e-06, "loss": 0.1339, "step": 66500 }, { "epoch": 200.0, "eval_accuracy": 0.9807, "eval_loss": 0.07968232780694962, "eval_runtime": 12.8919, "eval_samples_per_second": 775.68, "eval_steps_per_second": 3.103, "step": 66600 }, { "epoch": 201.0, "eval_accuracy": 0.9808, "eval_loss": 0.08057761192321777, "eval_runtime": 12.9886, "eval_samples_per_second": 769.904, "eval_steps_per_second": 3.08, "step": 66933 }, { "epoch": 201.2, "grad_norm": 9.490571022033691, "learning_rate": 3.2932932932932936e-06, "loss": 0.131, "step": 67000 }, { "epoch": 202.0, "eval_accuracy": 0.98, "eval_loss": 0.08165069669485092, "eval_runtime": 13.9588, "eval_samples_per_second": 716.394, "eval_steps_per_second": 2.866, "step": 67266 }, { "epoch": 202.7, "grad_norm": 8.564950942993164, "learning_rate": 3.2432432432432437e-06, "loss": 0.1365, "step": 67500 }, { "epoch": 203.0, "eval_accuracy": 0.9801, "eval_loss": 0.08228688687086105, "eval_runtime": 12.9615, "eval_samples_per_second": 771.513, "eval_steps_per_second": 3.086, "step": 67599 }, { "epoch": 204.0, "eval_accuracy": 0.9798, "eval_loss": 0.08267272263765335, "eval_runtime": 12.8976, "eval_samples_per_second": 775.339, "eval_steps_per_second": 3.101, "step": 67932 }, { "epoch": 204.2, "grad_norm": 9.844771385192871, "learning_rate": 3.1931931931931938e-06, "loss": 0.1358, "step": 68000 }, { "epoch": 205.0, "eval_accuracy": 0.9816, "eval_loss": 0.0804433524608612, "eval_runtime": 12.8434, "eval_samples_per_second": 778.613, "eval_steps_per_second": 3.114, "step": 68265 }, { "epoch": 205.71, "grad_norm": 9.6033935546875, "learning_rate": 3.1431431431431434e-06, "loss": 0.132, "step": 68500 }, { "epoch": 206.0, "eval_accuracy": 0.9802, "eval_loss": 0.08253764361143112, "eval_runtime": 13.4062, "eval_samples_per_second": 745.922, "eval_steps_per_second": 2.984, "step": 68598 }, { "epoch": 207.0, "eval_accuracy": 0.981, "eval_loss": 0.07984968274831772, "eval_runtime": 13.6899, "eval_samples_per_second": 730.467, "eval_steps_per_second": 2.922, "step": 68931 }, { "epoch": 207.21, "grad_norm": 7.0395355224609375, "learning_rate": 3.0930930930930935e-06, "loss": 0.1396, "step": 69000 }, { "epoch": 208.0, "eval_accuracy": 0.9813, "eval_loss": 0.08085375279188156, "eval_runtime": 12.8706, "eval_samples_per_second": 776.962, "eval_steps_per_second": 3.108, "step": 69264 }, { "epoch": 208.71, "grad_norm": 12.84909725189209, "learning_rate": 3.0430430430430436e-06, "loss": 0.1324, "step": 69500 }, { "epoch": 209.0, "eval_accuracy": 0.9815, "eval_loss": 0.07963848859071732, "eval_runtime": 12.9764, "eval_samples_per_second": 770.628, "eval_steps_per_second": 3.083, "step": 69597 }, { "epoch": 210.0, "eval_accuracy": 0.9807, "eval_loss": 0.08001097291707993, "eval_runtime": 13.4375, "eval_samples_per_second": 744.185, "eval_steps_per_second": 2.977, "step": 69930 }, { "epoch": 210.21, "grad_norm": 8.406508445739746, "learning_rate": 2.9929929929929936e-06, "loss": 0.1324, "step": 70000 }, { "epoch": 211.0, "eval_accuracy": 0.9809, "eval_loss": 0.08123359829187393, "eval_runtime": 13.1971, "eval_samples_per_second": 757.742, "eval_steps_per_second": 3.031, "step": 70263 }, { "epoch": 211.71, "grad_norm": 4.204705715179443, "learning_rate": 2.942942942942943e-06, "loss": 0.1343, "step": 70500 }, { "epoch": 212.0, "eval_accuracy": 0.9811, "eval_loss": 0.08246932923793793, "eval_runtime": 13.3417, "eval_samples_per_second": 749.532, "eval_steps_per_second": 2.998, "step": 70596 }, { "epoch": 213.0, "eval_accuracy": 0.9811, "eval_loss": 0.08172763139009476, "eval_runtime": 12.9861, "eval_samples_per_second": 770.053, "eval_steps_per_second": 3.08, "step": 70929 }, { "epoch": 213.21, "grad_norm": 8.177204132080078, "learning_rate": 2.892892892892893e-06, "loss": 0.1322, "step": 71000 }, { "epoch": 214.0, "eval_accuracy": 0.9811, "eval_loss": 0.08131828904151917, "eval_runtime": 14.0986, "eval_samples_per_second": 709.289, "eval_steps_per_second": 2.837, "step": 71262 }, { "epoch": 214.71, "grad_norm": 8.844195365905762, "learning_rate": 2.842842842842843e-06, "loss": 0.133, "step": 71500 }, { "epoch": 215.0, "eval_accuracy": 0.9807, "eval_loss": 0.0824679508805275, "eval_runtime": 12.94, "eval_samples_per_second": 772.8, "eval_steps_per_second": 3.091, "step": 71595 }, { "epoch": 216.0, "eval_accuracy": 0.9809, "eval_loss": 0.0828867107629776, "eval_runtime": 12.9965, "eval_samples_per_second": 769.439, "eval_steps_per_second": 3.078, "step": 71928 }, { "epoch": 216.22, "grad_norm": 11.01076889038086, "learning_rate": 2.7927927927927926e-06, "loss": 0.1336, "step": 72000 }, { "epoch": 217.0, "eval_accuracy": 0.9802, "eval_loss": 0.08191470056772232, "eval_runtime": 12.6388, "eval_samples_per_second": 791.211, "eval_steps_per_second": 3.165, "step": 72261 }, { "epoch": 217.72, "grad_norm": 8.309555053710938, "learning_rate": 2.7427427427427427e-06, "loss": 0.1287, "step": 72500 }, { "epoch": 218.0, "eval_accuracy": 0.9803, "eval_loss": 0.08172294497489929, "eval_runtime": 12.869, "eval_samples_per_second": 777.063, "eval_steps_per_second": 3.108, "step": 72594 }, { "epoch": 219.0, "eval_accuracy": 0.9804, "eval_loss": 0.08100100606679916, "eval_runtime": 13.9577, "eval_samples_per_second": 716.449, "eval_steps_per_second": 2.866, "step": 72927 }, { "epoch": 219.22, "grad_norm": 10.596402168273926, "learning_rate": 2.6926926926926928e-06, "loss": 0.1322, "step": 73000 }, { "epoch": 220.0, "eval_accuracy": 0.98, "eval_loss": 0.08346739411354065, "eval_runtime": 12.8881, "eval_samples_per_second": 775.91, "eval_steps_per_second": 3.104, "step": 73260 }, { "epoch": 220.72, "grad_norm": 8.293975830078125, "learning_rate": 2.642642642642643e-06, "loss": 0.1287, "step": 73500 }, { "epoch": 221.0, "eval_accuracy": 0.9798, "eval_loss": 0.08478812873363495, "eval_runtime": 12.482, "eval_samples_per_second": 801.151, "eval_steps_per_second": 3.205, "step": 73593 }, { "epoch": 222.0, "eval_accuracy": 0.9803, "eval_loss": 0.08156371861696243, "eval_runtime": 12.9596, "eval_samples_per_second": 771.628, "eval_steps_per_second": 3.087, "step": 73926 }, { "epoch": 222.22, "grad_norm": 9.707475662231445, "learning_rate": 2.5925925925925925e-06, "loss": 0.1317, "step": 74000 }, { "epoch": 223.0, "eval_accuracy": 0.9803, "eval_loss": 0.08239776641130447, "eval_runtime": 13.8203, "eval_samples_per_second": 723.571, "eval_steps_per_second": 2.894, "step": 74259 }, { "epoch": 223.72, "grad_norm": 5.2577996253967285, "learning_rate": 2.5425425425425426e-06, "loss": 0.1308, "step": 74500 }, { "epoch": 224.0, "eval_accuracy": 0.9811, "eval_loss": 0.08223745971918106, "eval_runtime": 13.4783, "eval_samples_per_second": 741.934, "eval_steps_per_second": 2.968, "step": 74592 }, { "epoch": 225.0, "eval_accuracy": 0.9807, "eval_loss": 0.0822429209947586, "eval_runtime": 13.2583, "eval_samples_per_second": 754.244, "eval_steps_per_second": 3.017, "step": 74925 }, { "epoch": 225.23, "grad_norm": 6.952250957489014, "learning_rate": 2.4924924924924926e-06, "loss": 0.1247, "step": 75000 }, { "epoch": 226.0, "eval_accuracy": 0.9806, "eval_loss": 0.08117574453353882, "eval_runtime": 13.5159, "eval_samples_per_second": 739.872, "eval_steps_per_second": 2.959, "step": 75258 }, { "epoch": 226.73, "grad_norm": 17.568580627441406, "learning_rate": 2.4424424424424427e-06, "loss": 0.129, "step": 75500 }, { "epoch": 227.0, "eval_accuracy": 0.9805, "eval_loss": 0.08187758177518845, "eval_runtime": 12.7892, "eval_samples_per_second": 781.912, "eval_steps_per_second": 3.128, "step": 75591 }, { "epoch": 228.0, "eval_accuracy": 0.981, "eval_loss": 0.08235606551170349, "eval_runtime": 12.9107, "eval_samples_per_second": 774.55, "eval_steps_per_second": 3.098, "step": 75924 }, { "epoch": 228.23, "grad_norm": 13.310216903686523, "learning_rate": 2.3923923923923923e-06, "loss": 0.1315, "step": 76000 }, { "epoch": 229.0, "eval_accuracy": 0.9803, "eval_loss": 0.08291840553283691, "eval_runtime": 13.4267, "eval_samples_per_second": 744.787, "eval_steps_per_second": 2.979, "step": 76257 }, { "epoch": 229.73, "grad_norm": 7.18035888671875, "learning_rate": 2.3423423423423424e-06, "loss": 0.1243, "step": 76500 }, { "epoch": 230.0, "eval_accuracy": 0.9808, "eval_loss": 0.08134060353040695, "eval_runtime": 12.9054, "eval_samples_per_second": 774.871, "eval_steps_per_second": 3.099, "step": 76590 }, { "epoch": 231.0, "eval_accuracy": 0.9808, "eval_loss": 0.08125565946102142, "eval_runtime": 13.8266, "eval_samples_per_second": 723.246, "eval_steps_per_second": 2.893, "step": 76923 }, { "epoch": 231.23, "grad_norm": 11.132826805114746, "learning_rate": 2.2922922922922925e-06, "loss": 0.1244, "step": 77000 }, { "epoch": 232.0, "eval_accuracy": 0.981, "eval_loss": 0.08288297057151794, "eval_runtime": 13.8545, "eval_samples_per_second": 721.786, "eval_steps_per_second": 2.887, "step": 77256 }, { "epoch": 232.73, "grad_norm": 7.415234565734863, "learning_rate": 2.2422422422422426e-06, "loss": 0.1286, "step": 77500 }, { "epoch": 233.0, "eval_accuracy": 0.9801, "eval_loss": 0.083954356610775, "eval_runtime": 13.1117, "eval_samples_per_second": 762.679, "eval_steps_per_second": 3.051, "step": 77589 }, { "epoch": 234.0, "eval_accuracy": 0.9805, "eval_loss": 0.08230035752058029, "eval_runtime": 13.3702, "eval_samples_per_second": 747.932, "eval_steps_per_second": 2.992, "step": 77922 }, { "epoch": 234.23, "grad_norm": 7.36590576171875, "learning_rate": 2.192192192192192e-06, "loss": 0.1261, "step": 78000 }, { "epoch": 235.0, "eval_accuracy": 0.9811, "eval_loss": 0.08295118808746338, "eval_runtime": 13.748, "eval_samples_per_second": 727.381, "eval_steps_per_second": 2.91, "step": 78255 }, { "epoch": 235.74, "grad_norm": 10.516325950622559, "learning_rate": 2.1421421421421423e-06, "loss": 0.1238, "step": 78500 }, { "epoch": 236.0, "eval_accuracy": 0.9812, "eval_loss": 0.08197174966335297, "eval_runtime": 12.9286, "eval_samples_per_second": 773.481, "eval_steps_per_second": 3.094, "step": 78588 }, { "epoch": 237.0, "eval_accuracy": 0.9807, "eval_loss": 0.08315034210681915, "eval_runtime": 13.634, "eval_samples_per_second": 733.458, "eval_steps_per_second": 2.934, "step": 78921 }, { "epoch": 237.24, "grad_norm": 5.020528316497803, "learning_rate": 2.0920920920920923e-06, "loss": 0.1296, "step": 79000 }, { "epoch": 238.0, "eval_accuracy": 0.9809, "eval_loss": 0.08168121427297592, "eval_runtime": 14.4842, "eval_samples_per_second": 690.406, "eval_steps_per_second": 2.762, "step": 79254 }, { "epoch": 238.74, "grad_norm": 11.957234382629395, "learning_rate": 2.0420420420420424e-06, "loss": 0.1278, "step": 79500 }, { "epoch": 239.0, "eval_accuracy": 0.981, "eval_loss": 0.08146882057189941, "eval_runtime": 14.503, "eval_samples_per_second": 689.511, "eval_steps_per_second": 2.758, "step": 79587 }, { "epoch": 240.0, "eval_accuracy": 0.9802, "eval_loss": 0.08267929404973984, "eval_runtime": 12.9081, "eval_samples_per_second": 774.71, "eval_steps_per_second": 3.099, "step": 79920 }, { "epoch": 240.24, "grad_norm": 10.550077438354492, "learning_rate": 1.991991991991992e-06, "loss": 0.1246, "step": 80000 }, { "epoch": 241.0, "eval_accuracy": 0.9805, "eval_loss": 0.08258900791406631, "eval_runtime": 13.3618, "eval_samples_per_second": 748.404, "eval_steps_per_second": 2.994, "step": 80253 }, { "epoch": 241.74, "grad_norm": 14.927352905273438, "learning_rate": 1.941941941941942e-06, "loss": 0.128, "step": 80500 }, { "epoch": 242.0, "eval_accuracy": 0.9797, "eval_loss": 0.08207195997238159, "eval_runtime": 13.4168, "eval_samples_per_second": 745.333, "eval_steps_per_second": 2.981, "step": 80586 }, { "epoch": 243.0, "eval_accuracy": 0.981, "eval_loss": 0.08075813204050064, "eval_runtime": 12.9166, "eval_samples_per_second": 774.198, "eval_steps_per_second": 3.097, "step": 80919 }, { "epoch": 243.24, "grad_norm": 10.435842514038086, "learning_rate": 1.8918918918918922e-06, "loss": 0.1274, "step": 81000 }, { "epoch": 244.0, "eval_accuracy": 0.9806, "eval_loss": 0.0817038044333458, "eval_runtime": 12.9068, "eval_samples_per_second": 774.784, "eval_steps_per_second": 3.099, "step": 81252 }, { "epoch": 244.74, "grad_norm": 6.686298370361328, "learning_rate": 1.841841841841842e-06, "loss": 0.1232, "step": 81500 }, { "epoch": 245.0, "eval_accuracy": 0.9805, "eval_loss": 0.0812101811170578, "eval_runtime": 13.3736, "eval_samples_per_second": 747.741, "eval_steps_per_second": 2.991, "step": 81585 }, { "epoch": 246.0, "eval_accuracy": 0.9809, "eval_loss": 0.08127359300851822, "eval_runtime": 13.8034, "eval_samples_per_second": 724.46, "eval_steps_per_second": 2.898, "step": 81918 }, { "epoch": 246.25, "grad_norm": 9.9036865234375, "learning_rate": 1.7917917917917917e-06, "loss": 0.1281, "step": 82000 }, { "epoch": 247.0, "eval_accuracy": 0.9801, "eval_loss": 0.0803731232881546, "eval_runtime": 13.6609, "eval_samples_per_second": 732.019, "eval_steps_per_second": 2.928, "step": 82251 }, { "epoch": 247.75, "grad_norm": 9.58124828338623, "learning_rate": 1.7417417417417418e-06, "loss": 0.1236, "step": 82500 }, { "epoch": 248.0, "eval_accuracy": 0.9807, "eval_loss": 0.08054234832525253, "eval_runtime": 12.9985, "eval_samples_per_second": 769.319, "eval_steps_per_second": 3.077, "step": 82584 }, { "epoch": 249.0, "eval_accuracy": 0.9807, "eval_loss": 0.08253397792577744, "eval_runtime": 12.7246, "eval_samples_per_second": 785.882, "eval_steps_per_second": 3.144, "step": 82917 }, { "epoch": 249.25, "grad_norm": 11.608097076416016, "learning_rate": 1.6916916916916916e-06, "loss": 0.1223, "step": 83000 }, { "epoch": 250.0, "eval_accuracy": 0.9804, "eval_loss": 0.08115767687559128, "eval_runtime": 13.6973, "eval_samples_per_second": 730.07, "eval_steps_per_second": 2.92, "step": 83250 }, { "epoch": 250.75, "grad_norm": 7.931227207183838, "learning_rate": 1.6416416416416417e-06, "loss": 0.1278, "step": 83500 }, { "epoch": 251.0, "eval_accuracy": 0.9802, "eval_loss": 0.08087089657783508, "eval_runtime": 12.8657, "eval_samples_per_second": 777.263, "eval_steps_per_second": 3.109, "step": 83583 }, { "epoch": 252.0, "eval_accuracy": 0.9818, "eval_loss": 0.07839205116033554, "eval_runtime": 12.9826, "eval_samples_per_second": 770.262, "eval_steps_per_second": 3.081, "step": 83916 }, { "epoch": 252.25, "grad_norm": 6.618145942687988, "learning_rate": 1.5915915915915916e-06, "loss": 0.1238, "step": 84000 }, { "epoch": 253.0, "eval_accuracy": 0.9808, "eval_loss": 0.07928815484046936, "eval_runtime": 13.4795, "eval_samples_per_second": 741.867, "eval_steps_per_second": 2.967, "step": 84249 }, { "epoch": 253.75, "grad_norm": 6.5788397789001465, "learning_rate": 1.5415415415415416e-06, "loss": 0.1259, "step": 84500 }, { "epoch": 254.0, "eval_accuracy": 0.9814, "eval_loss": 0.08129309117794037, "eval_runtime": 12.8787, "eval_samples_per_second": 776.478, "eval_steps_per_second": 3.106, "step": 84582 }, { "epoch": 255.0, "eval_accuracy": 0.981, "eval_loss": 0.08033791929483414, "eval_runtime": 12.7033, "eval_samples_per_second": 787.197, "eval_steps_per_second": 3.149, "step": 84915 }, { "epoch": 255.26, "grad_norm": 8.367218017578125, "learning_rate": 1.4914914914914915e-06, "loss": 0.1261, "step": 85000 }, { "epoch": 256.0, "eval_accuracy": 0.981, "eval_loss": 0.08045142143964767, "eval_runtime": 12.9511, "eval_samples_per_second": 772.133, "eval_steps_per_second": 3.089, "step": 85248 }, { "epoch": 256.76, "grad_norm": 17.39365005493164, "learning_rate": 1.4414414414414416e-06, "loss": 0.1312, "step": 85500 }, { "epoch": 257.0, "eval_accuracy": 0.9805, "eval_loss": 0.08164441585540771, "eval_runtime": 13.3735, "eval_samples_per_second": 747.745, "eval_steps_per_second": 2.991, "step": 85581 }, { "epoch": 258.0, "eval_accuracy": 0.9807, "eval_loss": 0.08030729740858078, "eval_runtime": 12.9143, "eval_samples_per_second": 774.336, "eval_steps_per_second": 3.097, "step": 85914 }, { "epoch": 258.26, "grad_norm": 12.668910026550293, "learning_rate": 1.3913913913913914e-06, "loss": 0.1237, "step": 86000 }, { "epoch": 259.0, "eval_accuracy": 0.9804, "eval_loss": 0.07897236198186874, "eval_runtime": 13.3964, "eval_samples_per_second": 746.469, "eval_steps_per_second": 2.986, "step": 86247 }, { "epoch": 259.76, "grad_norm": 3.696176767349243, "learning_rate": 1.3413413413413415e-06, "loss": 0.1234, "step": 86500 }, { "epoch": 260.0, "eval_accuracy": 0.9803, "eval_loss": 0.07928313314914703, "eval_runtime": 13.8621, "eval_samples_per_second": 721.391, "eval_steps_per_second": 2.886, "step": 86580 }, { "epoch": 261.0, "eval_accuracy": 0.9806, "eval_loss": 0.07920601218938828, "eval_runtime": 12.907, "eval_samples_per_second": 774.775, "eval_steps_per_second": 3.099, "step": 86913 }, { "epoch": 261.26, "grad_norm": 11.28502082824707, "learning_rate": 1.2912912912912913e-06, "loss": 0.1237, "step": 87000 }, { "epoch": 262.0, "eval_accuracy": 0.9806, "eval_loss": 0.08003947883844376, "eval_runtime": 13.8177, "eval_samples_per_second": 723.709, "eval_steps_per_second": 2.895, "step": 87246 }, { "epoch": 262.76, "grad_norm": 13.543560981750488, "learning_rate": 1.2412412412412414e-06, "loss": 0.1257, "step": 87500 }, { "epoch": 263.0, "eval_accuracy": 0.9802, "eval_loss": 0.08235891908407211, "eval_runtime": 13.4574, "eval_samples_per_second": 743.088, "eval_steps_per_second": 2.972, "step": 87579 }, { "epoch": 264.0, "eval_accuracy": 0.9807, "eval_loss": 0.08182436227798462, "eval_runtime": 12.9778, "eval_samples_per_second": 770.546, "eval_steps_per_second": 3.082, "step": 87912 }, { "epoch": 264.26, "grad_norm": 11.065189361572266, "learning_rate": 1.1911911911911913e-06, "loss": 0.1219, "step": 88000 }, { "epoch": 265.0, "eval_accuracy": 0.9808, "eval_loss": 0.08205542713403702, "eval_runtime": 13.6001, "eval_samples_per_second": 735.288, "eval_steps_per_second": 2.941, "step": 88245 }, { "epoch": 265.77, "grad_norm": 9.291784286499023, "learning_rate": 1.1411411411411411e-06, "loss": 0.1298, "step": 88500 }, { "epoch": 266.0, "eval_accuracy": 0.9805, "eval_loss": 0.08165726810693741, "eval_runtime": 13.3896, "eval_samples_per_second": 746.85, "eval_steps_per_second": 2.987, "step": 88578 }, { "epoch": 267.0, "eval_accuracy": 0.9805, "eval_loss": 0.08162441104650497, "eval_runtime": 14.0305, "eval_samples_per_second": 712.733, "eval_steps_per_second": 2.851, "step": 88911 }, { "epoch": 267.27, "grad_norm": 17.33576202392578, "learning_rate": 1.0910910910910912e-06, "loss": 0.1222, "step": 89000 }, { "epoch": 268.0, "eval_accuracy": 0.9806, "eval_loss": 0.08136063069105148, "eval_runtime": 12.8095, "eval_samples_per_second": 780.671, "eval_steps_per_second": 3.123, "step": 89244 }, { "epoch": 268.77, "grad_norm": 11.170260429382324, "learning_rate": 1.041041041041041e-06, "loss": 0.1268, "step": 89500 }, { "epoch": 269.0, "eval_accuracy": 0.9803, "eval_loss": 0.08162767440080643, "eval_runtime": 13.5821, "eval_samples_per_second": 736.263, "eval_steps_per_second": 2.945, "step": 89577 }, { "epoch": 270.0, "eval_accuracy": 0.981, "eval_loss": 0.08254320919513702, "eval_runtime": 12.9419, "eval_samples_per_second": 772.681, "eval_steps_per_second": 3.091, "step": 89910 }, { "epoch": 270.27, "grad_norm": 10.08292007446289, "learning_rate": 9.909909909909911e-07, "loss": 0.1239, "step": 90000 }, { "epoch": 271.0, "eval_accuracy": 0.9802, "eval_loss": 0.08088234812021255, "eval_runtime": 12.9857, "eval_samples_per_second": 770.075, "eval_steps_per_second": 3.08, "step": 90243 }, { "epoch": 271.77, "grad_norm": 7.639751434326172, "learning_rate": 9.409409409409411e-07, "loss": 0.1277, "step": 90500 }, { "epoch": 272.0, "eval_accuracy": 0.9804, "eval_loss": 0.0805734246969223, "eval_runtime": 12.6096, "eval_samples_per_second": 793.045, "eval_steps_per_second": 3.172, "step": 90576 }, { "epoch": 273.0, "eval_accuracy": 0.98, "eval_loss": 0.08124550431966782, "eval_runtime": 13.1278, "eval_samples_per_second": 761.741, "eval_steps_per_second": 3.047, "step": 90909 }, { "epoch": 273.27, "grad_norm": 7.800063133239746, "learning_rate": 8.90890890890891e-07, "loss": 0.1235, "step": 91000 }, { "epoch": 274.0, "eval_accuracy": 0.9807, "eval_loss": 0.08137263357639313, "eval_runtime": 13.3744, "eval_samples_per_second": 747.7, "eval_steps_per_second": 2.991, "step": 91242 }, { "epoch": 274.77, "grad_norm": 13.224382400512695, "learning_rate": 8.40840840840841e-07, "loss": 0.1261, "step": 91500 }, { "epoch": 275.0, "eval_accuracy": 0.9801, "eval_loss": 0.08086758852005005, "eval_runtime": 12.9048, "eval_samples_per_second": 774.907, "eval_steps_per_second": 3.1, "step": 91575 }, { "epoch": 276.0, "eval_accuracy": 0.9806, "eval_loss": 0.080258309841156, "eval_runtime": 14.2222, "eval_samples_per_second": 703.127, "eval_steps_per_second": 2.813, "step": 91908 }, { "epoch": 276.28, "grad_norm": 8.529864311218262, "learning_rate": 7.907907907907908e-07, "loss": 0.1219, "step": 92000 }, { "epoch": 277.0, "eval_accuracy": 0.9803, "eval_loss": 0.08069344609975815, "eval_runtime": 13.3763, "eval_samples_per_second": 747.589, "eval_steps_per_second": 2.99, "step": 92241 }, { "epoch": 277.78, "grad_norm": 5.7626051902771, "learning_rate": 7.407407407407407e-07, "loss": 0.1235, "step": 92500 }, { "epoch": 278.0, "eval_accuracy": 0.9812, "eval_loss": 0.0805598720908165, "eval_runtime": 13.2417, "eval_samples_per_second": 755.192, "eval_steps_per_second": 3.021, "step": 92574 }, { "epoch": 279.0, "eval_accuracy": 0.9807, "eval_loss": 0.07991771399974823, "eval_runtime": 12.9989, "eval_samples_per_second": 769.296, "eval_steps_per_second": 3.077, "step": 92907 }, { "epoch": 279.28, "grad_norm": 12.886475563049316, "learning_rate": 6.906906906906907e-07, "loss": 0.1232, "step": 93000 }, { "epoch": 280.0, "eval_accuracy": 0.9805, "eval_loss": 0.08009103685617447, "eval_runtime": 14.1951, "eval_samples_per_second": 704.469, "eval_steps_per_second": 2.818, "step": 93240 }, { "epoch": 280.78, "grad_norm": 13.245797157287598, "learning_rate": 6.406406406406407e-07, "loss": 0.1236, "step": 93500 }, { "epoch": 281.0, "eval_accuracy": 0.9812, "eval_loss": 0.08077774941921234, "eval_runtime": 13.9349, "eval_samples_per_second": 717.624, "eval_steps_per_second": 2.87, "step": 93573 }, { "epoch": 282.0, "eval_accuracy": 0.9807, "eval_loss": 0.08111685514450073, "eval_runtime": 13.0495, "eval_samples_per_second": 766.313, "eval_steps_per_second": 3.065, "step": 93906 }, { "epoch": 282.28, "grad_norm": 6.8997673988342285, "learning_rate": 5.905905905905906e-07, "loss": 0.1195, "step": 94000 }, { "epoch": 283.0, "eval_accuracy": 0.9804, "eval_loss": 0.08137265592813492, "eval_runtime": 13.163, "eval_samples_per_second": 759.705, "eval_steps_per_second": 3.039, "step": 94239 }, { "epoch": 283.78, "grad_norm": 12.197209358215332, "learning_rate": 5.405405405405406e-07, "loss": 0.1191, "step": 94500 }, { "epoch": 284.0, "eval_accuracy": 0.9804, "eval_loss": 0.08120004087686539, "eval_runtime": 12.9217, "eval_samples_per_second": 773.893, "eval_steps_per_second": 3.096, "step": 94572 }, { "epoch": 285.0, "eval_accuracy": 0.9805, "eval_loss": 0.08181598037481308, "eval_runtime": 12.828, "eval_samples_per_second": 779.547, "eval_steps_per_second": 3.118, "step": 94905 }, { "epoch": 285.29, "grad_norm": 6.001578330993652, "learning_rate": 4.904904904904905e-07, "loss": 0.1205, "step": 95000 }, { "epoch": 286.0, "eval_accuracy": 0.9807, "eval_loss": 0.08141326904296875, "eval_runtime": 13.7647, "eval_samples_per_second": 726.495, "eval_steps_per_second": 2.906, "step": 95238 }, { "epoch": 286.79, "grad_norm": 9.633207321166992, "learning_rate": 4.4044044044044046e-07, "loss": 0.1203, "step": 95500 }, { "epoch": 287.0, "eval_accuracy": 0.9808, "eval_loss": 0.08182702958583832, "eval_runtime": 14.1767, "eval_samples_per_second": 705.381, "eval_steps_per_second": 2.822, "step": 95571 }, { "epoch": 288.0, "eval_accuracy": 0.9806, "eval_loss": 0.08031768351793289, "eval_runtime": 14.019, "eval_samples_per_second": 713.316, "eval_steps_per_second": 2.853, "step": 95904 }, { "epoch": 288.29, "grad_norm": 9.451753616333008, "learning_rate": 3.903903903903904e-07, "loss": 0.1197, "step": 96000 }, { "epoch": 289.0, "eval_accuracy": 0.9812, "eval_loss": 0.0809708833694458, "eval_runtime": 13.7936, "eval_samples_per_second": 724.975, "eval_steps_per_second": 2.9, "step": 96237 }, { "epoch": 289.79, "grad_norm": 10.313632011413574, "learning_rate": 3.403403403403404e-07, "loss": 0.1233, "step": 96500 }, { "epoch": 290.0, "eval_accuracy": 0.9811, "eval_loss": 0.08130063861608505, "eval_runtime": 13.4821, "eval_samples_per_second": 741.722, "eval_steps_per_second": 2.967, "step": 96570 }, { "epoch": 291.0, "eval_accuracy": 0.9813, "eval_loss": 0.08096129447221756, "eval_runtime": 13.9986, "eval_samples_per_second": 714.359, "eval_steps_per_second": 2.857, "step": 96903 }, { "epoch": 291.29, "grad_norm": 6.7220892906188965, "learning_rate": 2.9029029029029035e-07, "loss": 0.12, "step": 97000 }, { "epoch": 292.0, "eval_accuracy": 0.9813, "eval_loss": 0.08056668192148209, "eval_runtime": 13.2921, "eval_samples_per_second": 752.329, "eval_steps_per_second": 3.009, "step": 97236 }, { "epoch": 292.79, "grad_norm": 7.212859630584717, "learning_rate": 2.4024024024024026e-07, "loss": 0.1219, "step": 97500 }, { "epoch": 293.0, "eval_accuracy": 0.9816, "eval_loss": 0.08098697662353516, "eval_runtime": 13.5812, "eval_samples_per_second": 736.31, "eval_steps_per_second": 2.945, "step": 97569 }, { "epoch": 294.0, "eval_accuracy": 0.9815, "eval_loss": 0.08067005127668381, "eval_runtime": 12.9034, "eval_samples_per_second": 774.988, "eval_steps_per_second": 3.1, "step": 97902 }, { "epoch": 294.29, "grad_norm": 7.5087409019470215, "learning_rate": 1.9019019019019022e-07, "loss": 0.1202, "step": 98000 }, { "epoch": 295.0, "eval_accuracy": 0.9813, "eval_loss": 0.08077917248010635, "eval_runtime": 13.4699, "eval_samples_per_second": 742.397, "eval_steps_per_second": 2.97, "step": 98235 }, { "epoch": 295.8, "grad_norm": 7.660182952880859, "learning_rate": 1.4014014014014016e-07, "loss": 0.1228, "step": 98500 }, { "epoch": 296.0, "eval_accuracy": 0.9815, "eval_loss": 0.0807722955942154, "eval_runtime": 13.0168, "eval_samples_per_second": 768.237, "eval_steps_per_second": 3.073, "step": 98568 }, { "epoch": 297.0, "eval_accuracy": 0.9813, "eval_loss": 0.08067157864570618, "eval_runtime": 13.4303, "eval_samples_per_second": 744.586, "eval_steps_per_second": 2.978, "step": 98901 }, { "epoch": 297.3, "grad_norm": 10.4266357421875, "learning_rate": 9.00900900900901e-08, "loss": 0.1212, "step": 99000 }, { "epoch": 298.0, "eval_accuracy": 0.9812, "eval_loss": 0.08074088394641876, "eval_runtime": 12.8481, "eval_samples_per_second": 778.327, "eval_steps_per_second": 3.113, "step": 99234 }, { "epoch": 298.8, "grad_norm": 10.557640075683594, "learning_rate": 4.004004004004004e-08, "loss": 0.1214, "step": 99500 }, { "epoch": 299.0, "eval_accuracy": 0.9812, "eval_loss": 0.0807051733136177, "eval_runtime": 13.1178, "eval_samples_per_second": 762.323, "eval_steps_per_second": 3.049, "step": 99567 }, { "epoch": 300.0, "eval_accuracy": 0.981, "eval_loss": 0.08068788051605225, "eval_runtime": 12.9887, "eval_samples_per_second": 769.902, "eval_steps_per_second": 3.08, "step": 99900 }, { "epoch": 300.0, "step": 99900, "total_flos": 3.1698470226124734e+20, "train_loss": 0.17093151241451413, "train_runtime": 47820.897, "train_samples_per_second": 266.62, "train_steps_per_second": 2.089 } ], "logging_steps": 500, "max_steps": 99900, "num_input_tokens_seen": 0, "num_train_epochs": 300, "save_steps": 500, "total_flos": 3.1698470226124734e+20, "train_batch_size": 128, "trial_name": null, "trial_params": null }