cifar10_swin-tiny / trainer_state.json
jialicheng's picture
Upload folder using huggingface_hub
ddcc1cd verified
{
"best_metric": 0.9818,
"best_model_checkpoint": "../../checkpoint/cifar10/swin-tiny/checkpoint-38295",
"epoch": 300.0,
"eval_steps": 500,
"global_step": 99900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.0,
"eval_accuracy": 0.9223,
"eval_loss": 0.2584497928619385,
"eval_runtime": 21.3622,
"eval_samples_per_second": 468.117,
"eval_steps_per_second": 1.872,
"step": 333
},
{
"epoch": 1.5,
"grad_norm": 14.220479011535645,
"learning_rate": 9.949949949949951e-06,
"loss": 0.9076,
"step": 500
},
{
"epoch": 2.0,
"eval_accuracy": 0.945,
"eval_loss": 0.1637052595615387,
"eval_runtime": 13.6699,
"eval_samples_per_second": 731.533,
"eval_steps_per_second": 2.926,
"step": 666
},
{
"epoch": 3.0,
"eval_accuracy": 0.9553,
"eval_loss": 0.1344435065984726,
"eval_runtime": 13.1721,
"eval_samples_per_second": 759.181,
"eval_steps_per_second": 3.037,
"step": 999
},
{
"epoch": 3.0,
"grad_norm": 9.328938484191895,
"learning_rate": 9.899899899899901e-06,
"loss": 0.4797,
"step": 1000
},
{
"epoch": 4.0,
"eval_accuracy": 0.9604,
"eval_loss": 0.1205841451883316,
"eval_runtime": 12.6584,
"eval_samples_per_second": 789.992,
"eval_steps_per_second": 3.16,
"step": 1332
},
{
"epoch": 4.5,
"grad_norm": 14.11563777923584,
"learning_rate": 9.849849849849851e-06,
"loss": 0.4193,
"step": 1500
},
{
"epoch": 5.0,
"eval_accuracy": 0.9635,
"eval_loss": 0.11088060587644577,
"eval_runtime": 12.7891,
"eval_samples_per_second": 781.918,
"eval_steps_per_second": 3.128,
"step": 1665
},
{
"epoch": 6.0,
"eval_accuracy": 0.9661,
"eval_loss": 0.10564317554235458,
"eval_runtime": 12.9686,
"eval_samples_per_second": 771.094,
"eval_steps_per_second": 3.084,
"step": 1998
},
{
"epoch": 6.01,
"grad_norm": 12.565740585327148,
"learning_rate": 9.799799799799801e-06,
"loss": 0.3846,
"step": 2000
},
{
"epoch": 7.0,
"eval_accuracy": 0.9688,
"eval_loss": 0.09508195519447327,
"eval_runtime": 13.2698,
"eval_samples_per_second": 753.592,
"eval_steps_per_second": 3.014,
"step": 2331
},
{
"epoch": 7.51,
"grad_norm": 9.896069526672363,
"learning_rate": 9.749749749749751e-06,
"loss": 0.3572,
"step": 2500
},
{
"epoch": 8.0,
"eval_accuracy": 0.9689,
"eval_loss": 0.09568808227777481,
"eval_runtime": 13.6448,
"eval_samples_per_second": 732.879,
"eval_steps_per_second": 2.932,
"step": 2664
},
{
"epoch": 9.0,
"eval_accuracy": 0.9693,
"eval_loss": 0.09088099747896194,
"eval_runtime": 13.9779,
"eval_samples_per_second": 715.417,
"eval_steps_per_second": 2.862,
"step": 2997
},
{
"epoch": 9.01,
"grad_norm": 9.739038467407227,
"learning_rate": 9.699699699699701e-06,
"loss": 0.3409,
"step": 3000
},
{
"epoch": 10.0,
"eval_accuracy": 0.971,
"eval_loss": 0.0861617922782898,
"eval_runtime": 13.5874,
"eval_samples_per_second": 735.979,
"eval_steps_per_second": 2.944,
"step": 3330
},
{
"epoch": 10.51,
"grad_norm": 7.383803367614746,
"learning_rate": 9.649649649649651e-06,
"loss": 0.3319,
"step": 3500
},
{
"epoch": 11.0,
"eval_accuracy": 0.9721,
"eval_loss": 0.08562646806240082,
"eval_runtime": 13.5289,
"eval_samples_per_second": 739.158,
"eval_steps_per_second": 2.957,
"step": 3663
},
{
"epoch": 12.0,
"eval_accuracy": 0.972,
"eval_loss": 0.08723447471857071,
"eval_runtime": 13.3531,
"eval_samples_per_second": 748.887,
"eval_steps_per_second": 2.996,
"step": 3996
},
{
"epoch": 12.01,
"grad_norm": 11.866540908813477,
"learning_rate": 9.5995995995996e-06,
"loss": 0.3253,
"step": 4000
},
{
"epoch": 13.0,
"eval_accuracy": 0.973,
"eval_loss": 0.08058160543441772,
"eval_runtime": 14.1547,
"eval_samples_per_second": 706.479,
"eval_steps_per_second": 2.826,
"step": 4329
},
{
"epoch": 13.51,
"grad_norm": 7.938398361206055,
"learning_rate": 9.54954954954955e-06,
"loss": 0.3084,
"step": 4500
},
{
"epoch": 14.0,
"eval_accuracy": 0.9738,
"eval_loss": 0.08162784576416016,
"eval_runtime": 14.1065,
"eval_samples_per_second": 708.895,
"eval_steps_per_second": 2.836,
"step": 4662
},
{
"epoch": 15.0,
"eval_accuracy": 0.9742,
"eval_loss": 0.07894858717918396,
"eval_runtime": 13.886,
"eval_samples_per_second": 720.149,
"eval_steps_per_second": 2.881,
"step": 4995
},
{
"epoch": 15.02,
"grad_norm": 16.568248748779297,
"learning_rate": 9.4994994994995e-06,
"loss": 0.3022,
"step": 5000
},
{
"epoch": 16.0,
"eval_accuracy": 0.9746,
"eval_loss": 0.07670588046312332,
"eval_runtime": 13.5929,
"eval_samples_per_second": 735.676,
"eval_steps_per_second": 2.943,
"step": 5328
},
{
"epoch": 16.52,
"grad_norm": 13.009441375732422,
"learning_rate": 9.44944944944945e-06,
"loss": 0.2894,
"step": 5500
},
{
"epoch": 17.0,
"eval_accuracy": 0.9725,
"eval_loss": 0.0805484876036644,
"eval_runtime": 13.3932,
"eval_samples_per_second": 746.649,
"eval_steps_per_second": 2.987,
"step": 5661
},
{
"epoch": 18.0,
"eval_accuracy": 0.9759,
"eval_loss": 0.0759720578789711,
"eval_runtime": 13.5457,
"eval_samples_per_second": 738.24,
"eval_steps_per_second": 2.953,
"step": 5994
},
{
"epoch": 18.02,
"grad_norm": 13.468392372131348,
"learning_rate": 9.3993993993994e-06,
"loss": 0.2842,
"step": 6000
},
{
"epoch": 19.0,
"eval_accuracy": 0.9744,
"eval_loss": 0.07423894852399826,
"eval_runtime": 13.6253,
"eval_samples_per_second": 733.929,
"eval_steps_per_second": 2.936,
"step": 6327
},
{
"epoch": 19.52,
"grad_norm": 12.263895988464355,
"learning_rate": 9.34934934934935e-06,
"loss": 0.2712,
"step": 6500
},
{
"epoch": 20.0,
"eval_accuracy": 0.9738,
"eval_loss": 0.07846847176551819,
"eval_runtime": 12.9608,
"eval_samples_per_second": 771.556,
"eval_steps_per_second": 3.086,
"step": 6660
},
{
"epoch": 21.0,
"eval_accuracy": 0.9735,
"eval_loss": 0.07904864102602005,
"eval_runtime": 12.8724,
"eval_samples_per_second": 776.858,
"eval_steps_per_second": 3.107,
"step": 6993
},
{
"epoch": 21.02,
"grad_norm": 10.845202445983887,
"learning_rate": 9.2992992992993e-06,
"loss": 0.2729,
"step": 7000
},
{
"epoch": 22.0,
"eval_accuracy": 0.9759,
"eval_loss": 0.07514221966266632,
"eval_runtime": 14.1339,
"eval_samples_per_second": 707.52,
"eval_steps_per_second": 2.83,
"step": 7326
},
{
"epoch": 22.52,
"grad_norm": 11.12897777557373,
"learning_rate": 9.24924924924925e-06,
"loss": 0.2634,
"step": 7500
},
{
"epoch": 23.0,
"eval_accuracy": 0.9737,
"eval_loss": 0.07959982007741928,
"eval_runtime": 13.0065,
"eval_samples_per_second": 768.844,
"eval_steps_per_second": 3.075,
"step": 7659
},
{
"epoch": 24.0,
"eval_accuracy": 0.9752,
"eval_loss": 0.07558540254831314,
"eval_runtime": 13.805,
"eval_samples_per_second": 724.375,
"eval_steps_per_second": 2.897,
"step": 7992
},
{
"epoch": 24.02,
"grad_norm": 10.100821495056152,
"learning_rate": 9.1991991991992e-06,
"loss": 0.2591,
"step": 8000
},
{
"epoch": 25.0,
"eval_accuracy": 0.9759,
"eval_loss": 0.07549387961626053,
"eval_runtime": 13.4677,
"eval_samples_per_second": 742.518,
"eval_steps_per_second": 2.97,
"step": 8325
},
{
"epoch": 25.53,
"grad_norm": 9.881790161132812,
"learning_rate": 9.14914914914915e-06,
"loss": 0.253,
"step": 8500
},
{
"epoch": 26.0,
"eval_accuracy": 0.9746,
"eval_loss": 0.07933681458234787,
"eval_runtime": 13.2517,
"eval_samples_per_second": 754.619,
"eval_steps_per_second": 3.018,
"step": 8658
},
{
"epoch": 27.0,
"eval_accuracy": 0.9765,
"eval_loss": 0.07278025895357132,
"eval_runtime": 13.5258,
"eval_samples_per_second": 739.327,
"eval_steps_per_second": 2.957,
"step": 8991
},
{
"epoch": 27.03,
"grad_norm": 7.72860860824585,
"learning_rate": 9.0990990990991e-06,
"loss": 0.2518,
"step": 9000
},
{
"epoch": 28.0,
"eval_accuracy": 0.9748,
"eval_loss": 0.07914856821298599,
"eval_runtime": 13.7348,
"eval_samples_per_second": 728.079,
"eval_steps_per_second": 2.912,
"step": 9324
},
{
"epoch": 28.53,
"grad_norm": 8.068327903747559,
"learning_rate": 9.04904904904905e-06,
"loss": 0.2482,
"step": 9500
},
{
"epoch": 29.0,
"eval_accuracy": 0.9756,
"eval_loss": 0.07918867468833923,
"eval_runtime": 13.3633,
"eval_samples_per_second": 748.316,
"eval_steps_per_second": 2.993,
"step": 9657
},
{
"epoch": 30.0,
"eval_accuracy": 0.9764,
"eval_loss": 0.07418718934059143,
"eval_runtime": 12.9493,
"eval_samples_per_second": 772.24,
"eval_steps_per_second": 3.089,
"step": 9990
},
{
"epoch": 30.03,
"grad_norm": 8.977522850036621,
"learning_rate": 8.998998998999e-06,
"loss": 0.2429,
"step": 10000
},
{
"epoch": 31.0,
"eval_accuracy": 0.9757,
"eval_loss": 0.07399851083755493,
"eval_runtime": 13.4787,
"eval_samples_per_second": 741.913,
"eval_steps_per_second": 2.968,
"step": 10323
},
{
"epoch": 31.53,
"grad_norm": 11.080597877502441,
"learning_rate": 8.94894894894895e-06,
"loss": 0.2405,
"step": 10500
},
{
"epoch": 32.0,
"eval_accuracy": 0.9757,
"eval_loss": 0.07426943629980087,
"eval_runtime": 12.8343,
"eval_samples_per_second": 779.16,
"eval_steps_per_second": 3.117,
"step": 10656
},
{
"epoch": 33.0,
"eval_accuracy": 0.9757,
"eval_loss": 0.07429418712854385,
"eval_runtime": 12.9825,
"eval_samples_per_second": 770.266,
"eval_steps_per_second": 3.081,
"step": 10989
},
{
"epoch": 33.03,
"grad_norm": 7.3039140701293945,
"learning_rate": 8.8988988988989e-06,
"loss": 0.234,
"step": 11000
},
{
"epoch": 34.0,
"eval_accuracy": 0.9769,
"eval_loss": 0.07486932724714279,
"eval_runtime": 12.96,
"eval_samples_per_second": 771.606,
"eval_steps_per_second": 3.086,
"step": 11322
},
{
"epoch": 34.53,
"grad_norm": 8.610194206237793,
"learning_rate": 8.84884884884885e-06,
"loss": 0.2353,
"step": 11500
},
{
"epoch": 35.0,
"eval_accuracy": 0.975,
"eval_loss": 0.0768030509352684,
"eval_runtime": 13.519,
"eval_samples_per_second": 739.698,
"eval_steps_per_second": 2.959,
"step": 11655
},
{
"epoch": 36.0,
"eval_accuracy": 0.9771,
"eval_loss": 0.07342812418937683,
"eval_runtime": 14.3472,
"eval_samples_per_second": 697.001,
"eval_steps_per_second": 2.788,
"step": 11988
},
{
"epoch": 36.04,
"grad_norm": 7.767194747924805,
"learning_rate": 8.798798798798799e-06,
"loss": 0.2329,
"step": 12000
},
{
"epoch": 37.0,
"eval_accuracy": 0.9755,
"eval_loss": 0.07778933644294739,
"eval_runtime": 13.5633,
"eval_samples_per_second": 737.284,
"eval_steps_per_second": 2.949,
"step": 12321
},
{
"epoch": 37.54,
"grad_norm": 11.39279842376709,
"learning_rate": 8.74874874874875e-06,
"loss": 0.2289,
"step": 12500
},
{
"epoch": 38.0,
"eval_accuracy": 0.9771,
"eval_loss": 0.07622923702001572,
"eval_runtime": 13.5603,
"eval_samples_per_second": 737.447,
"eval_steps_per_second": 2.95,
"step": 12654
},
{
"epoch": 39.0,
"eval_accuracy": 0.9761,
"eval_loss": 0.07648137956857681,
"eval_runtime": 13.4622,
"eval_samples_per_second": 742.82,
"eval_steps_per_second": 2.971,
"step": 12987
},
{
"epoch": 39.04,
"grad_norm": 8.879070281982422,
"learning_rate": 8.6986986986987e-06,
"loss": 0.227,
"step": 13000
},
{
"epoch": 40.0,
"eval_accuracy": 0.9768,
"eval_loss": 0.07394447922706604,
"eval_runtime": 13.4641,
"eval_samples_per_second": 742.715,
"eval_steps_per_second": 2.971,
"step": 13320
},
{
"epoch": 40.54,
"grad_norm": 10.858572006225586,
"learning_rate": 8.64864864864865e-06,
"loss": 0.2213,
"step": 13500
},
{
"epoch": 41.0,
"eval_accuracy": 0.9773,
"eval_loss": 0.07473840564489365,
"eval_runtime": 12.9211,
"eval_samples_per_second": 773.93,
"eval_steps_per_second": 3.096,
"step": 13653
},
{
"epoch": 42.0,
"eval_accuracy": 0.9786,
"eval_loss": 0.07195272296667099,
"eval_runtime": 13.3716,
"eval_samples_per_second": 747.852,
"eval_steps_per_second": 2.991,
"step": 13986
},
{
"epoch": 42.04,
"grad_norm": 9.299273490905762,
"learning_rate": 8.5985985985986e-06,
"loss": 0.217,
"step": 14000
},
{
"epoch": 43.0,
"eval_accuracy": 0.9771,
"eval_loss": 0.07661354541778564,
"eval_runtime": 13.4888,
"eval_samples_per_second": 741.354,
"eval_steps_per_second": 2.965,
"step": 14319
},
{
"epoch": 43.54,
"grad_norm": 9.49695873260498,
"learning_rate": 8.54854854854855e-06,
"loss": 0.22,
"step": 14500
},
{
"epoch": 44.0,
"eval_accuracy": 0.9767,
"eval_loss": 0.07640816271305084,
"eval_runtime": 14.0377,
"eval_samples_per_second": 712.365,
"eval_steps_per_second": 2.849,
"step": 14652
},
{
"epoch": 45.0,
"eval_accuracy": 0.9779,
"eval_loss": 0.07278802245855331,
"eval_runtime": 13.4886,
"eval_samples_per_second": 741.366,
"eval_steps_per_second": 2.965,
"step": 14985
},
{
"epoch": 45.05,
"grad_norm": 12.065461158752441,
"learning_rate": 8.4984984984985e-06,
"loss": 0.2179,
"step": 15000
},
{
"epoch": 46.0,
"eval_accuracy": 0.9785,
"eval_loss": 0.0740213543176651,
"eval_runtime": 14.112,
"eval_samples_per_second": 708.617,
"eval_steps_per_second": 2.834,
"step": 15318
},
{
"epoch": 46.55,
"grad_norm": 9.281307220458984,
"learning_rate": 8.44844844844845e-06,
"loss": 0.2074,
"step": 15500
},
{
"epoch": 47.0,
"eval_accuracy": 0.9793,
"eval_loss": 0.0712471604347229,
"eval_runtime": 13.5017,
"eval_samples_per_second": 740.647,
"eval_steps_per_second": 2.963,
"step": 15651
},
{
"epoch": 48.0,
"eval_accuracy": 0.9783,
"eval_loss": 0.0759299248456955,
"eval_runtime": 13.3849,
"eval_samples_per_second": 747.113,
"eval_steps_per_second": 2.988,
"step": 15984
},
{
"epoch": 48.05,
"grad_norm": 6.8984503746032715,
"learning_rate": 8.398398398398398e-06,
"loss": 0.2096,
"step": 16000
},
{
"epoch": 49.0,
"eval_accuracy": 0.9791,
"eval_loss": 0.07268951088190079,
"eval_runtime": 13.5376,
"eval_samples_per_second": 738.686,
"eval_steps_per_second": 2.955,
"step": 16317
},
{
"epoch": 49.55,
"grad_norm": 8.968807220458984,
"learning_rate": 8.348348348348348e-06,
"loss": 0.2097,
"step": 16500
},
{
"epoch": 50.0,
"eval_accuracy": 0.9792,
"eval_loss": 0.07472656667232513,
"eval_runtime": 13.5262,
"eval_samples_per_second": 739.304,
"eval_steps_per_second": 2.957,
"step": 16650
},
{
"epoch": 51.0,
"eval_accuracy": 0.9795,
"eval_loss": 0.0754549577832222,
"eval_runtime": 13.1606,
"eval_samples_per_second": 759.845,
"eval_steps_per_second": 3.039,
"step": 16983
},
{
"epoch": 51.05,
"grad_norm": 8.540103912353516,
"learning_rate": 8.298298298298298e-06,
"loss": 0.2063,
"step": 17000
},
{
"epoch": 52.0,
"eval_accuracy": 0.9788,
"eval_loss": 0.0741283968091011,
"eval_runtime": 13.8466,
"eval_samples_per_second": 722.201,
"eval_steps_per_second": 2.889,
"step": 17316
},
{
"epoch": 52.55,
"grad_norm": 7.042116165161133,
"learning_rate": 8.248248248248248e-06,
"loss": 0.2054,
"step": 17500
},
{
"epoch": 53.0,
"eval_accuracy": 0.9784,
"eval_loss": 0.0738772302865982,
"eval_runtime": 13.021,
"eval_samples_per_second": 767.992,
"eval_steps_per_second": 3.072,
"step": 17649
},
{
"epoch": 54.0,
"eval_accuracy": 0.9779,
"eval_loss": 0.07553113251924515,
"eval_runtime": 12.8958,
"eval_samples_per_second": 775.444,
"eval_steps_per_second": 3.102,
"step": 17982
},
{
"epoch": 54.05,
"grad_norm": 9.23681640625,
"learning_rate": 8.198198198198198e-06,
"loss": 0.2003,
"step": 18000
},
{
"epoch": 55.0,
"eval_accuracy": 0.9784,
"eval_loss": 0.07760650664567947,
"eval_runtime": 12.6634,
"eval_samples_per_second": 789.678,
"eval_steps_per_second": 3.159,
"step": 18315
},
{
"epoch": 55.56,
"grad_norm": 5.839297771453857,
"learning_rate": 8.148148148148148e-06,
"loss": 0.2009,
"step": 18500
},
{
"epoch": 56.0,
"eval_accuracy": 0.9786,
"eval_loss": 0.07352690398693085,
"eval_runtime": 13.1656,
"eval_samples_per_second": 759.554,
"eval_steps_per_second": 3.038,
"step": 18648
},
{
"epoch": 57.0,
"eval_accuracy": 0.9769,
"eval_loss": 0.07721856981515884,
"eval_runtime": 12.8626,
"eval_samples_per_second": 777.447,
"eval_steps_per_second": 3.11,
"step": 18981
},
{
"epoch": 57.06,
"grad_norm": 10.131054878234863,
"learning_rate": 8.098098098098098e-06,
"loss": 0.1999,
"step": 19000
},
{
"epoch": 58.0,
"eval_accuracy": 0.9789,
"eval_loss": 0.07691636681556702,
"eval_runtime": 12.8042,
"eval_samples_per_second": 780.991,
"eval_steps_per_second": 3.124,
"step": 19314
},
{
"epoch": 58.56,
"grad_norm": 7.643968105316162,
"learning_rate": 8.048048048048048e-06,
"loss": 0.1973,
"step": 19500
},
{
"epoch": 59.0,
"eval_accuracy": 0.9793,
"eval_loss": 0.07336228340864182,
"eval_runtime": 13.6825,
"eval_samples_per_second": 730.861,
"eval_steps_per_second": 2.923,
"step": 19647
},
{
"epoch": 60.0,
"eval_accuracy": 0.9787,
"eval_loss": 0.07408491522073746,
"eval_runtime": 13.4079,
"eval_samples_per_second": 745.831,
"eval_steps_per_second": 2.983,
"step": 19980
},
{
"epoch": 60.06,
"grad_norm": 9.443299293518066,
"learning_rate": 7.997997997997999e-06,
"loss": 0.1953,
"step": 20000
},
{
"epoch": 61.0,
"eval_accuracy": 0.978,
"eval_loss": 0.07513260841369629,
"eval_runtime": 13.4048,
"eval_samples_per_second": 745.999,
"eval_steps_per_second": 2.984,
"step": 20313
},
{
"epoch": 61.56,
"grad_norm": 16.85797119140625,
"learning_rate": 7.947947947947949e-06,
"loss": 0.1937,
"step": 20500
},
{
"epoch": 62.0,
"eval_accuracy": 0.9786,
"eval_loss": 0.07370081543922424,
"eval_runtime": 13.3055,
"eval_samples_per_second": 751.568,
"eval_steps_per_second": 3.006,
"step": 20646
},
{
"epoch": 63.0,
"eval_accuracy": 0.9786,
"eval_loss": 0.07323586940765381,
"eval_runtime": 12.7695,
"eval_samples_per_second": 783.119,
"eval_steps_per_second": 3.132,
"step": 20979
},
{
"epoch": 63.06,
"grad_norm": 8.4561128616333,
"learning_rate": 7.897897897897899e-06,
"loss": 0.1946,
"step": 21000
},
{
"epoch": 64.0,
"eval_accuracy": 0.9789,
"eval_loss": 0.07585693150758743,
"eval_runtime": 13.6921,
"eval_samples_per_second": 730.349,
"eval_steps_per_second": 2.921,
"step": 21312
},
{
"epoch": 64.56,
"grad_norm": 11.68150806427002,
"learning_rate": 7.847847847847849e-06,
"loss": 0.1909,
"step": 21500
},
{
"epoch": 65.0,
"eval_accuracy": 0.9798,
"eval_loss": 0.07349375635385513,
"eval_runtime": 12.8445,
"eval_samples_per_second": 778.544,
"eval_steps_per_second": 3.114,
"step": 21645
},
{
"epoch": 66.0,
"eval_accuracy": 0.9788,
"eval_loss": 0.07336971163749695,
"eval_runtime": 12.8882,
"eval_samples_per_second": 775.905,
"eval_steps_per_second": 3.104,
"step": 21978
},
{
"epoch": 66.07,
"grad_norm": 8.738271713256836,
"learning_rate": 7.797797797797799e-06,
"loss": 0.1935,
"step": 22000
},
{
"epoch": 67.0,
"eval_accuracy": 0.9793,
"eval_loss": 0.07337453961372375,
"eval_runtime": 12.8166,
"eval_samples_per_second": 780.24,
"eval_steps_per_second": 3.121,
"step": 22311
},
{
"epoch": 67.57,
"grad_norm": 6.386814117431641,
"learning_rate": 7.747747747747749e-06,
"loss": 0.1936,
"step": 22500
},
{
"epoch": 68.0,
"eval_accuracy": 0.9795,
"eval_loss": 0.07239189743995667,
"eval_runtime": 12.831,
"eval_samples_per_second": 779.362,
"eval_steps_per_second": 3.117,
"step": 22644
},
{
"epoch": 69.0,
"eval_accuracy": 0.9785,
"eval_loss": 0.07570048421621323,
"eval_runtime": 12.8964,
"eval_samples_per_second": 775.412,
"eval_steps_per_second": 3.102,
"step": 22977
},
{
"epoch": 69.07,
"grad_norm": 9.476435661315918,
"learning_rate": 7.697697697697697e-06,
"loss": 0.1858,
"step": 23000
},
{
"epoch": 70.0,
"eval_accuracy": 0.9801,
"eval_loss": 0.07100442796945572,
"eval_runtime": 13.0999,
"eval_samples_per_second": 763.367,
"eval_steps_per_second": 3.053,
"step": 23310
},
{
"epoch": 70.57,
"grad_norm": 9.190871238708496,
"learning_rate": 7.647647647647647e-06,
"loss": 0.1871,
"step": 23500
},
{
"epoch": 71.0,
"eval_accuracy": 0.9799,
"eval_loss": 0.07596922665834427,
"eval_runtime": 13.423,
"eval_samples_per_second": 744.991,
"eval_steps_per_second": 2.98,
"step": 23643
},
{
"epoch": 72.0,
"eval_accuracy": 0.9801,
"eval_loss": 0.07650475203990936,
"eval_runtime": 13.055,
"eval_samples_per_second": 765.99,
"eval_steps_per_second": 3.064,
"step": 23976
},
{
"epoch": 72.07,
"grad_norm": 10.302529335021973,
"learning_rate": 7.597597597597598e-06,
"loss": 0.1836,
"step": 24000
},
{
"epoch": 73.0,
"eval_accuracy": 0.9787,
"eval_loss": 0.07714686542749405,
"eval_runtime": 13.4272,
"eval_samples_per_second": 744.757,
"eval_steps_per_second": 2.979,
"step": 24309
},
{
"epoch": 73.57,
"grad_norm": 7.050232410430908,
"learning_rate": 7.547547547547548e-06,
"loss": 0.1827,
"step": 24500
},
{
"epoch": 74.0,
"eval_accuracy": 0.9782,
"eval_loss": 0.07620517909526825,
"eval_runtime": 12.8858,
"eval_samples_per_second": 776.045,
"eval_steps_per_second": 3.104,
"step": 24642
},
{
"epoch": 75.0,
"eval_accuracy": 0.9781,
"eval_loss": 0.0778127908706665,
"eval_runtime": 13.234,
"eval_samples_per_second": 755.629,
"eval_steps_per_second": 3.023,
"step": 24975
},
{
"epoch": 75.08,
"grad_norm": 8.824182510375977,
"learning_rate": 7.4974974974974975e-06,
"loss": 0.1847,
"step": 25000
},
{
"epoch": 76.0,
"eval_accuracy": 0.9781,
"eval_loss": 0.08140425384044647,
"eval_runtime": 13.9137,
"eval_samples_per_second": 718.714,
"eval_steps_per_second": 2.875,
"step": 25308
},
{
"epoch": 76.58,
"grad_norm": 8.920430183410645,
"learning_rate": 7.447447447447448e-06,
"loss": 0.1815,
"step": 25500
},
{
"epoch": 77.0,
"eval_accuracy": 0.9788,
"eval_loss": 0.07689312100410461,
"eval_runtime": 13.1404,
"eval_samples_per_second": 761.014,
"eval_steps_per_second": 3.044,
"step": 25641
},
{
"epoch": 78.0,
"eval_accuracy": 0.9801,
"eval_loss": 0.07370501756668091,
"eval_runtime": 13.7683,
"eval_samples_per_second": 726.307,
"eval_steps_per_second": 2.905,
"step": 25974
},
{
"epoch": 78.08,
"grad_norm": 9.352115631103516,
"learning_rate": 7.397397397397398e-06,
"loss": 0.1786,
"step": 26000
},
{
"epoch": 79.0,
"eval_accuracy": 0.9795,
"eval_loss": 0.07396883517503738,
"eval_runtime": 13.0019,
"eval_samples_per_second": 769.121,
"eval_steps_per_second": 3.076,
"step": 26307
},
{
"epoch": 79.58,
"grad_norm": 14.500313758850098,
"learning_rate": 7.347347347347348e-06,
"loss": 0.1819,
"step": 26500
},
{
"epoch": 80.0,
"eval_accuracy": 0.9807,
"eval_loss": 0.07097125053405762,
"eval_runtime": 13.6192,
"eval_samples_per_second": 734.256,
"eval_steps_per_second": 2.937,
"step": 26640
},
{
"epoch": 81.0,
"eval_accuracy": 0.9799,
"eval_loss": 0.07538946717977524,
"eval_runtime": 13.1675,
"eval_samples_per_second": 759.445,
"eval_steps_per_second": 3.038,
"step": 26973
},
{
"epoch": 81.08,
"grad_norm": 6.939184188842773,
"learning_rate": 7.297297297297298e-06,
"loss": 0.1767,
"step": 27000
},
{
"epoch": 82.0,
"eval_accuracy": 0.9789,
"eval_loss": 0.07721950113773346,
"eval_runtime": 13.1312,
"eval_samples_per_second": 761.542,
"eval_steps_per_second": 3.046,
"step": 27306
},
{
"epoch": 82.58,
"grad_norm": 6.59556770324707,
"learning_rate": 7.247247247247248e-06,
"loss": 0.1792,
"step": 27500
},
{
"epoch": 83.0,
"eval_accuracy": 0.9799,
"eval_loss": 0.0765281617641449,
"eval_runtime": 13.0707,
"eval_samples_per_second": 765.07,
"eval_steps_per_second": 3.06,
"step": 27639
},
{
"epoch": 84.0,
"eval_accuracy": 0.9799,
"eval_loss": 0.07408629357814789,
"eval_runtime": 12.4754,
"eval_samples_per_second": 801.579,
"eval_steps_per_second": 3.206,
"step": 27972
},
{
"epoch": 84.08,
"grad_norm": 7.376372814178467,
"learning_rate": 7.197197197197198e-06,
"loss": 0.1752,
"step": 28000
},
{
"epoch": 85.0,
"eval_accuracy": 0.9795,
"eval_loss": 0.0741427093744278,
"eval_runtime": 12.8952,
"eval_samples_per_second": 775.481,
"eval_steps_per_second": 3.102,
"step": 28305
},
{
"epoch": 85.59,
"grad_norm": 11.074542045593262,
"learning_rate": 7.147147147147148e-06,
"loss": 0.1789,
"step": 28500
},
{
"epoch": 86.0,
"eval_accuracy": 0.9802,
"eval_loss": 0.07748846709728241,
"eval_runtime": 12.5701,
"eval_samples_per_second": 795.538,
"eval_steps_per_second": 3.182,
"step": 28638
},
{
"epoch": 87.0,
"eval_accuracy": 0.9803,
"eval_loss": 0.07314252853393555,
"eval_runtime": 13.4921,
"eval_samples_per_second": 741.174,
"eval_steps_per_second": 2.965,
"step": 28971
},
{
"epoch": 87.09,
"grad_norm": 8.502799987792969,
"learning_rate": 7.097097097097097e-06,
"loss": 0.1755,
"step": 29000
},
{
"epoch": 88.0,
"eval_accuracy": 0.9806,
"eval_loss": 0.07246743142604828,
"eval_runtime": 14.1164,
"eval_samples_per_second": 708.396,
"eval_steps_per_second": 2.834,
"step": 29304
},
{
"epoch": 88.59,
"grad_norm": 10.004383087158203,
"learning_rate": 7.047047047047047e-06,
"loss": 0.1694,
"step": 29500
},
{
"epoch": 89.0,
"eval_accuracy": 0.9795,
"eval_loss": 0.07495511323213577,
"eval_runtime": 13.0203,
"eval_samples_per_second": 768.031,
"eval_steps_per_second": 3.072,
"step": 29637
},
{
"epoch": 90.0,
"eval_accuracy": 0.9815,
"eval_loss": 0.07112333923578262,
"eval_runtime": 12.8831,
"eval_samples_per_second": 776.209,
"eval_steps_per_second": 3.105,
"step": 29970
},
{
"epoch": 90.09,
"grad_norm": 10.05745792388916,
"learning_rate": 6.996996996996997e-06,
"loss": 0.1739,
"step": 30000
},
{
"epoch": 91.0,
"eval_accuracy": 0.98,
"eval_loss": 0.07672711461782455,
"eval_runtime": 13.4897,
"eval_samples_per_second": 741.306,
"eval_steps_per_second": 2.965,
"step": 30303
},
{
"epoch": 91.59,
"grad_norm": 7.928704738616943,
"learning_rate": 6.9469469469469474e-06,
"loss": 0.1726,
"step": 30500
},
{
"epoch": 92.0,
"eval_accuracy": 0.9801,
"eval_loss": 0.0770508348941803,
"eval_runtime": 13.4469,
"eval_samples_per_second": 743.667,
"eval_steps_per_second": 2.975,
"step": 30636
},
{
"epoch": 93.0,
"eval_accuracy": 0.9786,
"eval_loss": 0.0784955620765686,
"eval_runtime": 13.8288,
"eval_samples_per_second": 723.131,
"eval_steps_per_second": 2.893,
"step": 30969
},
{
"epoch": 93.09,
"grad_norm": 9.178421974182129,
"learning_rate": 6.8968968968968975e-06,
"loss": 0.1696,
"step": 31000
},
{
"epoch": 94.0,
"eval_accuracy": 0.9787,
"eval_loss": 0.07988455891609192,
"eval_runtime": 13.675,
"eval_samples_per_second": 731.261,
"eval_steps_per_second": 2.925,
"step": 31302
},
{
"epoch": 94.59,
"grad_norm": 7.237130165100098,
"learning_rate": 6.846846846846848e-06,
"loss": 0.1723,
"step": 31500
},
{
"epoch": 95.0,
"eval_accuracy": 0.979,
"eval_loss": 0.07755716890096664,
"eval_runtime": 13.4765,
"eval_samples_per_second": 742.035,
"eval_steps_per_second": 2.968,
"step": 31635
},
{
"epoch": 96.0,
"eval_accuracy": 0.9796,
"eval_loss": 0.07740277796983719,
"eval_runtime": 13.7122,
"eval_samples_per_second": 729.276,
"eval_steps_per_second": 2.917,
"step": 31968
},
{
"epoch": 96.1,
"grad_norm": 6.332306385040283,
"learning_rate": 6.796796796796798e-06,
"loss": 0.1692,
"step": 32000
},
{
"epoch": 97.0,
"eval_accuracy": 0.9797,
"eval_loss": 0.08065084367990494,
"eval_runtime": 12.8364,
"eval_samples_per_second": 779.036,
"eval_steps_per_second": 3.116,
"step": 32301
},
{
"epoch": 97.6,
"grad_norm": 6.978306770324707,
"learning_rate": 6.746746746746748e-06,
"loss": 0.17,
"step": 32500
},
{
"epoch": 98.0,
"eval_accuracy": 0.9798,
"eval_loss": 0.07497260719537735,
"eval_runtime": 12.9795,
"eval_samples_per_second": 770.446,
"eval_steps_per_second": 3.082,
"step": 32634
},
{
"epoch": 99.0,
"eval_accuracy": 0.9805,
"eval_loss": 0.07648865878582001,
"eval_runtime": 13.1724,
"eval_samples_per_second": 759.163,
"eval_steps_per_second": 3.037,
"step": 32967
},
{
"epoch": 99.1,
"grad_norm": 9.569737434387207,
"learning_rate": 6.696696696696697e-06,
"loss": 0.1691,
"step": 33000
},
{
"epoch": 100.0,
"eval_accuracy": 0.9798,
"eval_loss": 0.07629863917827606,
"eval_runtime": 13.7613,
"eval_samples_per_second": 726.674,
"eval_steps_per_second": 2.907,
"step": 33300
},
{
"epoch": 100.6,
"grad_norm": 9.273295402526855,
"learning_rate": 6.646646646646647e-06,
"loss": 0.165,
"step": 33500
},
{
"epoch": 101.0,
"eval_accuracy": 0.9794,
"eval_loss": 0.07651650160551071,
"eval_runtime": 12.8929,
"eval_samples_per_second": 775.622,
"eval_steps_per_second": 3.102,
"step": 33633
},
{
"epoch": 102.0,
"eval_accuracy": 0.9806,
"eval_loss": 0.07412749528884888,
"eval_runtime": 13.1273,
"eval_samples_per_second": 761.772,
"eval_steps_per_second": 3.047,
"step": 33966
},
{
"epoch": 102.1,
"grad_norm": 5.686313152313232,
"learning_rate": 6.596596596596597e-06,
"loss": 0.1678,
"step": 34000
},
{
"epoch": 103.0,
"eval_accuracy": 0.9805,
"eval_loss": 0.07281830161809921,
"eval_runtime": 14.1227,
"eval_samples_per_second": 708.079,
"eval_steps_per_second": 2.832,
"step": 34299
},
{
"epoch": 103.6,
"grad_norm": 13.40892505645752,
"learning_rate": 6.546546546546547e-06,
"loss": 0.1663,
"step": 34500
},
{
"epoch": 104.0,
"eval_accuracy": 0.9803,
"eval_loss": 0.07456088066101074,
"eval_runtime": 12.9479,
"eval_samples_per_second": 772.329,
"eval_steps_per_second": 3.089,
"step": 34632
},
{
"epoch": 105.0,
"eval_accuracy": 0.9796,
"eval_loss": 0.07469187676906586,
"eval_runtime": 13.464,
"eval_samples_per_second": 742.723,
"eval_steps_per_second": 2.971,
"step": 34965
},
{
"epoch": 105.11,
"grad_norm": 3.3622846603393555,
"learning_rate": 6.496496496496497e-06,
"loss": 0.1697,
"step": 35000
},
{
"epoch": 106.0,
"eval_accuracy": 0.98,
"eval_loss": 0.07429709285497665,
"eval_runtime": 12.5016,
"eval_samples_per_second": 799.901,
"eval_steps_per_second": 3.2,
"step": 35298
},
{
"epoch": 106.61,
"grad_norm": 13.544451713562012,
"learning_rate": 6.446446446446447e-06,
"loss": 0.1637,
"step": 35500
},
{
"epoch": 107.0,
"eval_accuracy": 0.9796,
"eval_loss": 0.07689350843429565,
"eval_runtime": 13.0156,
"eval_samples_per_second": 768.306,
"eval_steps_per_second": 3.073,
"step": 35631
},
{
"epoch": 108.0,
"eval_accuracy": 0.9802,
"eval_loss": 0.07509542256593704,
"eval_runtime": 13.0921,
"eval_samples_per_second": 763.817,
"eval_steps_per_second": 3.055,
"step": 35964
},
{
"epoch": 108.11,
"grad_norm": 11.040998458862305,
"learning_rate": 6.396396396396397e-06,
"loss": 0.1678,
"step": 36000
},
{
"epoch": 109.0,
"eval_accuracy": 0.9807,
"eval_loss": 0.0769224464893341,
"eval_runtime": 13.4563,
"eval_samples_per_second": 743.145,
"eval_steps_per_second": 2.973,
"step": 36297
},
{
"epoch": 109.61,
"grad_norm": 7.243069171905518,
"learning_rate": 6.3463463463463474e-06,
"loss": 0.1674,
"step": 36500
},
{
"epoch": 110.0,
"eval_accuracy": 0.9808,
"eval_loss": 0.07392393797636032,
"eval_runtime": 12.9386,
"eval_samples_per_second": 772.879,
"eval_steps_per_second": 3.092,
"step": 36630
},
{
"epoch": 111.0,
"eval_accuracy": 0.9795,
"eval_loss": 0.0809590220451355,
"eval_runtime": 13.2637,
"eval_samples_per_second": 753.935,
"eval_steps_per_second": 3.016,
"step": 36963
},
{
"epoch": 111.11,
"grad_norm": 8.149242401123047,
"learning_rate": 6.296296296296297e-06,
"loss": 0.1604,
"step": 37000
},
{
"epoch": 112.0,
"eval_accuracy": 0.9806,
"eval_loss": 0.07439053803682327,
"eval_runtime": 12.9959,
"eval_samples_per_second": 769.471,
"eval_steps_per_second": 3.078,
"step": 37296
},
{
"epoch": 112.61,
"grad_norm": 6.591969966888428,
"learning_rate": 6.246246246246247e-06,
"loss": 0.1583,
"step": 37500
},
{
"epoch": 113.0,
"eval_accuracy": 0.9816,
"eval_loss": 0.07411955296993256,
"eval_runtime": 12.6355,
"eval_samples_per_second": 791.421,
"eval_steps_per_second": 3.166,
"step": 37629
},
{
"epoch": 114.0,
"eval_accuracy": 0.98,
"eval_loss": 0.07842327654361725,
"eval_runtime": 14.6497,
"eval_samples_per_second": 682.608,
"eval_steps_per_second": 2.73,
"step": 37962
},
{
"epoch": 114.11,
"grad_norm": 8.455940246582031,
"learning_rate": 6.196196196196197e-06,
"loss": 0.1592,
"step": 38000
},
{
"epoch": 115.0,
"eval_accuracy": 0.9818,
"eval_loss": 0.07287651300430298,
"eval_runtime": 13.5165,
"eval_samples_per_second": 739.838,
"eval_steps_per_second": 2.959,
"step": 38295
},
{
"epoch": 115.62,
"grad_norm": 6.092105388641357,
"learning_rate": 6.146146146146147e-06,
"loss": 0.1607,
"step": 38500
},
{
"epoch": 116.0,
"eval_accuracy": 0.9818,
"eval_loss": 0.07438412308692932,
"eval_runtime": 13.6093,
"eval_samples_per_second": 734.794,
"eval_steps_per_second": 2.939,
"step": 38628
},
{
"epoch": 117.0,
"eval_accuracy": 0.9817,
"eval_loss": 0.07355909794569016,
"eval_runtime": 12.6698,
"eval_samples_per_second": 789.279,
"eval_steps_per_second": 3.157,
"step": 38961
},
{
"epoch": 117.12,
"grad_norm": 7.972623348236084,
"learning_rate": 6.096096096096097e-06,
"loss": 0.1657,
"step": 39000
},
{
"epoch": 118.0,
"eval_accuracy": 0.9805,
"eval_loss": 0.0769243985414505,
"eval_runtime": 13.3442,
"eval_samples_per_second": 749.391,
"eval_steps_per_second": 2.998,
"step": 39294
},
{
"epoch": 118.62,
"grad_norm": 7.559940338134766,
"learning_rate": 6.046046046046047e-06,
"loss": 0.1605,
"step": 39500
},
{
"epoch": 119.0,
"eval_accuracy": 0.9812,
"eval_loss": 0.0768662765622139,
"eval_runtime": 13.0344,
"eval_samples_per_second": 767.2,
"eval_steps_per_second": 3.069,
"step": 39627
},
{
"epoch": 120.0,
"eval_accuracy": 0.9808,
"eval_loss": 0.07865633815526962,
"eval_runtime": 13.8055,
"eval_samples_per_second": 724.347,
"eval_steps_per_second": 2.897,
"step": 39960
},
{
"epoch": 120.12,
"grad_norm": 7.175966739654541,
"learning_rate": 5.995995995995997e-06,
"loss": 0.1554,
"step": 40000
},
{
"epoch": 121.0,
"eval_accuracy": 0.9801,
"eval_loss": 0.07854399085044861,
"eval_runtime": 12.8799,
"eval_samples_per_second": 776.402,
"eval_steps_per_second": 3.106,
"step": 40293
},
{
"epoch": 121.62,
"grad_norm": 12.97214126586914,
"learning_rate": 5.945945945945947e-06,
"loss": 0.157,
"step": 40500
},
{
"epoch": 122.0,
"eval_accuracy": 0.9796,
"eval_loss": 0.0760401040315628,
"eval_runtime": 12.9319,
"eval_samples_per_second": 773.283,
"eval_steps_per_second": 3.093,
"step": 40626
},
{
"epoch": 123.0,
"eval_accuracy": 0.9805,
"eval_loss": 0.07537718862295151,
"eval_runtime": 12.9913,
"eval_samples_per_second": 769.749,
"eval_steps_per_second": 3.079,
"step": 40959
},
{
"epoch": 123.12,
"grad_norm": 7.540937423706055,
"learning_rate": 5.895895895895896e-06,
"loss": 0.1549,
"step": 41000
},
{
"epoch": 124.0,
"eval_accuracy": 0.9802,
"eval_loss": 0.07550998032093048,
"eval_runtime": 14.417,
"eval_samples_per_second": 693.624,
"eval_steps_per_second": 2.774,
"step": 41292
},
{
"epoch": 124.62,
"grad_norm": 6.355432987213135,
"learning_rate": 5.8458458458458464e-06,
"loss": 0.1578,
"step": 41500
},
{
"epoch": 125.0,
"eval_accuracy": 0.9792,
"eval_loss": 0.07649920880794525,
"eval_runtime": 13.4926,
"eval_samples_per_second": 741.15,
"eval_steps_per_second": 2.965,
"step": 41625
},
{
"epoch": 126.0,
"eval_accuracy": 0.98,
"eval_loss": 0.07526528090238571,
"eval_runtime": 12.9079,
"eval_samples_per_second": 774.72,
"eval_steps_per_second": 3.099,
"step": 41958
},
{
"epoch": 126.13,
"grad_norm": 6.478011131286621,
"learning_rate": 5.7957957957957965e-06,
"loss": 0.1531,
"step": 42000
},
{
"epoch": 127.0,
"eval_accuracy": 0.98,
"eval_loss": 0.07793418318033218,
"eval_runtime": 13.471,
"eval_samples_per_second": 742.337,
"eval_steps_per_second": 2.969,
"step": 42291
},
{
"epoch": 127.63,
"grad_norm": 7.928163051605225,
"learning_rate": 5.7457457457457466e-06,
"loss": 0.1572,
"step": 42500
},
{
"epoch": 128.0,
"eval_accuracy": 0.98,
"eval_loss": 0.07834824174642563,
"eval_runtime": 13.8772,
"eval_samples_per_second": 720.605,
"eval_steps_per_second": 2.882,
"step": 42624
},
{
"epoch": 129.0,
"eval_accuracy": 0.9796,
"eval_loss": 0.0785522609949112,
"eval_runtime": 12.947,
"eval_samples_per_second": 772.377,
"eval_steps_per_second": 3.09,
"step": 42957
},
{
"epoch": 129.13,
"grad_norm": 19.900619506835938,
"learning_rate": 5.695695695695697e-06,
"loss": 0.1558,
"step": 43000
},
{
"epoch": 130.0,
"eval_accuracy": 0.9814,
"eval_loss": 0.0741908997297287,
"eval_runtime": 12.8882,
"eval_samples_per_second": 775.906,
"eval_steps_per_second": 3.104,
"step": 43290
},
{
"epoch": 130.63,
"grad_norm": 12.561553001403809,
"learning_rate": 5.645645645645647e-06,
"loss": 0.1515,
"step": 43500
},
{
"epoch": 131.0,
"eval_accuracy": 0.9798,
"eval_loss": 0.07759422063827515,
"eval_runtime": 14.2426,
"eval_samples_per_second": 702.121,
"eval_steps_per_second": 2.808,
"step": 43623
},
{
"epoch": 132.0,
"eval_accuracy": 0.9793,
"eval_loss": 0.08000089973211288,
"eval_runtime": 13.0308,
"eval_samples_per_second": 767.413,
"eval_steps_per_second": 3.07,
"step": 43956
},
{
"epoch": 132.13,
"grad_norm": 10.955676078796387,
"learning_rate": 5.595595595595597e-06,
"loss": 0.1526,
"step": 44000
},
{
"epoch": 133.0,
"eval_accuracy": 0.9806,
"eval_loss": 0.07563788443803787,
"eval_runtime": 12.9183,
"eval_samples_per_second": 774.093,
"eval_steps_per_second": 3.096,
"step": 44289
},
{
"epoch": 133.63,
"grad_norm": 9.621336936950684,
"learning_rate": 5.545545545545547e-06,
"loss": 0.1523,
"step": 44500
},
{
"epoch": 134.0,
"eval_accuracy": 0.9797,
"eval_loss": 0.07889340072870255,
"eval_runtime": 13.5904,
"eval_samples_per_second": 735.813,
"eval_steps_per_second": 2.943,
"step": 44622
},
{
"epoch": 135.0,
"eval_accuracy": 0.9801,
"eval_loss": 0.07651440799236298,
"eval_runtime": 13.0261,
"eval_samples_per_second": 767.689,
"eval_steps_per_second": 3.071,
"step": 44955
},
{
"epoch": 135.14,
"grad_norm": 9.40494155883789,
"learning_rate": 5.495495495495496e-06,
"loss": 0.1519,
"step": 45000
},
{
"epoch": 136.0,
"eval_accuracy": 0.9798,
"eval_loss": 0.07700727880001068,
"eval_runtime": 14.2776,
"eval_samples_per_second": 700.397,
"eval_steps_per_second": 2.802,
"step": 45288
},
{
"epoch": 136.64,
"grad_norm": 7.778809070587158,
"learning_rate": 5.445445445445446e-06,
"loss": 0.1491,
"step": 45500
},
{
"epoch": 137.0,
"eval_accuracy": 0.98,
"eval_loss": 0.07937881350517273,
"eval_runtime": 13.7045,
"eval_samples_per_second": 729.689,
"eval_steps_per_second": 2.919,
"step": 45621
},
{
"epoch": 138.0,
"eval_accuracy": 0.9796,
"eval_loss": 0.07901179045438766,
"eval_runtime": 12.8776,
"eval_samples_per_second": 776.54,
"eval_steps_per_second": 3.106,
"step": 45954
},
{
"epoch": 138.14,
"grad_norm": 12.694830894470215,
"learning_rate": 5.395395395395396e-06,
"loss": 0.1488,
"step": 46000
},
{
"epoch": 139.0,
"eval_accuracy": 0.9796,
"eval_loss": 0.07827717065811157,
"eval_runtime": 13.01,
"eval_samples_per_second": 768.642,
"eval_steps_per_second": 3.075,
"step": 46287
},
{
"epoch": 139.64,
"grad_norm": 5.728260517120361,
"learning_rate": 5.345345345345346e-06,
"loss": 0.1511,
"step": 46500
},
{
"epoch": 140.0,
"eval_accuracy": 0.98,
"eval_loss": 0.07687978446483612,
"eval_runtime": 13.4169,
"eval_samples_per_second": 745.331,
"eval_steps_per_second": 2.981,
"step": 46620
},
{
"epoch": 141.0,
"eval_accuracy": 0.9797,
"eval_loss": 0.0826837420463562,
"eval_runtime": 13.6768,
"eval_samples_per_second": 731.166,
"eval_steps_per_second": 2.925,
"step": 46953
},
{
"epoch": 141.14,
"grad_norm": 8.749393463134766,
"learning_rate": 5.2952952952952955e-06,
"loss": 0.1475,
"step": 47000
},
{
"epoch": 142.0,
"eval_accuracy": 0.98,
"eval_loss": 0.07702562212944031,
"eval_runtime": 13.4888,
"eval_samples_per_second": 741.356,
"eval_steps_per_second": 2.965,
"step": 47286
},
{
"epoch": 142.64,
"grad_norm": 8.479342460632324,
"learning_rate": 5.245245245245245e-06,
"loss": 0.1449,
"step": 47500
},
{
"epoch": 143.0,
"eval_accuracy": 0.98,
"eval_loss": 0.07797821611166,
"eval_runtime": 13.0058,
"eval_samples_per_second": 768.886,
"eval_steps_per_second": 3.076,
"step": 47619
},
{
"epoch": 144.0,
"eval_accuracy": 0.9795,
"eval_loss": 0.07707054167985916,
"eval_runtime": 12.9845,
"eval_samples_per_second": 770.15,
"eval_steps_per_second": 3.081,
"step": 47952
},
{
"epoch": 144.14,
"grad_norm": 10.80911636352539,
"learning_rate": 5.195195195195195e-06,
"loss": 0.146,
"step": 48000
},
{
"epoch": 145.0,
"eval_accuracy": 0.9809,
"eval_loss": 0.0750807523727417,
"eval_runtime": 14.0436,
"eval_samples_per_second": 712.069,
"eval_steps_per_second": 2.848,
"step": 48285
},
{
"epoch": 145.65,
"grad_norm": 5.568371295928955,
"learning_rate": 5.145145145145145e-06,
"loss": 0.1473,
"step": 48500
},
{
"epoch": 146.0,
"eval_accuracy": 0.9797,
"eval_loss": 0.07933703809976578,
"eval_runtime": 13.1022,
"eval_samples_per_second": 763.232,
"eval_steps_per_second": 3.053,
"step": 48618
},
{
"epoch": 147.0,
"eval_accuracy": 0.9812,
"eval_loss": 0.07590621709823608,
"eval_runtime": 13.1387,
"eval_samples_per_second": 761.108,
"eval_steps_per_second": 3.044,
"step": 48951
},
{
"epoch": 147.15,
"grad_norm": 8.234355926513672,
"learning_rate": 5.095095095095095e-06,
"loss": 0.1466,
"step": 49000
},
{
"epoch": 148.0,
"eval_accuracy": 0.9787,
"eval_loss": 0.08211437612771988,
"eval_runtime": 13.453,
"eval_samples_per_second": 743.33,
"eval_steps_per_second": 2.973,
"step": 49284
},
{
"epoch": 148.65,
"grad_norm": 9.734493255615234,
"learning_rate": 5.045045045045045e-06,
"loss": 0.1472,
"step": 49500
},
{
"epoch": 149.0,
"eval_accuracy": 0.9813,
"eval_loss": 0.07566899061203003,
"eval_runtime": 13.5127,
"eval_samples_per_second": 740.042,
"eval_steps_per_second": 2.96,
"step": 49617
},
{
"epoch": 150.0,
"eval_accuracy": 0.9804,
"eval_loss": 0.07641930133104324,
"eval_runtime": 13.729,
"eval_samples_per_second": 728.384,
"eval_steps_per_second": 2.914,
"step": 49950
},
{
"epoch": 150.15,
"grad_norm": 9.083195686340332,
"learning_rate": 4.994994994994996e-06,
"loss": 0.1437,
"step": 50000
},
{
"epoch": 151.0,
"eval_accuracy": 0.9799,
"eval_loss": 0.0816345363855362,
"eval_runtime": 13.6081,
"eval_samples_per_second": 734.856,
"eval_steps_per_second": 2.939,
"step": 50283
},
{
"epoch": 151.65,
"grad_norm": 16.20008087158203,
"learning_rate": 4.944944944944945e-06,
"loss": 0.1487,
"step": 50500
},
{
"epoch": 152.0,
"eval_accuracy": 0.9818,
"eval_loss": 0.07768727838993073,
"eval_runtime": 12.9061,
"eval_samples_per_second": 774.83,
"eval_steps_per_second": 3.099,
"step": 50616
},
{
"epoch": 153.0,
"eval_accuracy": 0.9811,
"eval_loss": 0.07950293272733688,
"eval_runtime": 13.0523,
"eval_samples_per_second": 766.151,
"eval_steps_per_second": 3.065,
"step": 50949
},
{
"epoch": 153.15,
"grad_norm": 6.783934593200684,
"learning_rate": 4.894894894894895e-06,
"loss": 0.1455,
"step": 51000
},
{
"epoch": 154.0,
"eval_accuracy": 0.9811,
"eval_loss": 0.07836713641881943,
"eval_runtime": 13.4341,
"eval_samples_per_second": 744.377,
"eval_steps_per_second": 2.978,
"step": 51282
},
{
"epoch": 154.65,
"grad_norm": 7.791309833526611,
"learning_rate": 4.844844844844845e-06,
"loss": 0.1463,
"step": 51500
},
{
"epoch": 155.0,
"eval_accuracy": 0.9801,
"eval_loss": 0.07995989918708801,
"eval_runtime": 13.7204,
"eval_samples_per_second": 728.844,
"eval_steps_per_second": 2.915,
"step": 51615
},
{
"epoch": 156.0,
"eval_accuracy": 0.9809,
"eval_loss": 0.07914280891418457,
"eval_runtime": 13.045,
"eval_samples_per_second": 766.58,
"eval_steps_per_second": 3.066,
"step": 51948
},
{
"epoch": 156.16,
"grad_norm": 7.225980281829834,
"learning_rate": 4.794794794794795e-06,
"loss": 0.1449,
"step": 52000
},
{
"epoch": 157.0,
"eval_accuracy": 0.9815,
"eval_loss": 0.0777197852730751,
"eval_runtime": 12.8795,
"eval_samples_per_second": 776.43,
"eval_steps_per_second": 3.106,
"step": 52281
},
{
"epoch": 157.66,
"grad_norm": 7.848995208740234,
"learning_rate": 4.7447447447447454e-06,
"loss": 0.1413,
"step": 52500
},
{
"epoch": 158.0,
"eval_accuracy": 0.9802,
"eval_loss": 0.07978815585374832,
"eval_runtime": 13.0849,
"eval_samples_per_second": 764.238,
"eval_steps_per_second": 3.057,
"step": 52614
},
{
"epoch": 159.0,
"eval_accuracy": 0.9798,
"eval_loss": 0.08010842651128769,
"eval_runtime": 12.9948,
"eval_samples_per_second": 769.539,
"eval_steps_per_second": 3.078,
"step": 52947
},
{
"epoch": 159.16,
"grad_norm": 10.857318878173828,
"learning_rate": 4.6946946946946955e-06,
"loss": 0.143,
"step": 53000
},
{
"epoch": 160.0,
"eval_accuracy": 0.9803,
"eval_loss": 0.07897085696458817,
"eval_runtime": 13.4824,
"eval_samples_per_second": 741.707,
"eval_steps_per_second": 2.967,
"step": 53280
},
{
"epoch": 160.66,
"grad_norm": 8.192683219909668,
"learning_rate": 4.6446446446446456e-06,
"loss": 0.1462,
"step": 53500
},
{
"epoch": 161.0,
"eval_accuracy": 0.9794,
"eval_loss": 0.07847656309604645,
"eval_runtime": 13.3614,
"eval_samples_per_second": 748.422,
"eval_steps_per_second": 2.994,
"step": 53613
},
{
"epoch": 162.0,
"eval_accuracy": 0.9799,
"eval_loss": 0.07839296758174896,
"eval_runtime": 13.4943,
"eval_samples_per_second": 741.054,
"eval_steps_per_second": 2.964,
"step": 53946
},
{
"epoch": 162.16,
"grad_norm": 5.753213882446289,
"learning_rate": 4.594594594594596e-06,
"loss": 0.1454,
"step": 54000
},
{
"epoch": 163.0,
"eval_accuracy": 0.9814,
"eval_loss": 0.07774946093559265,
"eval_runtime": 13.5161,
"eval_samples_per_second": 739.858,
"eval_steps_per_second": 2.959,
"step": 54279
},
{
"epoch": 163.66,
"grad_norm": 23.634429931640625,
"learning_rate": 4.544544544544545e-06,
"loss": 0.1404,
"step": 54500
},
{
"epoch": 164.0,
"eval_accuracy": 0.9817,
"eval_loss": 0.07676123827695847,
"eval_runtime": 13.7785,
"eval_samples_per_second": 725.767,
"eval_steps_per_second": 2.903,
"step": 54612
},
{
"epoch": 165.0,
"eval_accuracy": 0.9795,
"eval_loss": 0.07868321239948273,
"eval_runtime": 13.3337,
"eval_samples_per_second": 749.978,
"eval_steps_per_second": 3.0,
"step": 54945
},
{
"epoch": 165.17,
"grad_norm": 14.497030258178711,
"learning_rate": 4.494494494494495e-06,
"loss": 0.1404,
"step": 55000
},
{
"epoch": 166.0,
"eval_accuracy": 0.9806,
"eval_loss": 0.08142885565757751,
"eval_runtime": 13.0878,
"eval_samples_per_second": 764.068,
"eval_steps_per_second": 3.056,
"step": 55278
},
{
"epoch": 166.67,
"grad_norm": 5.504241943359375,
"learning_rate": 4.444444444444444e-06,
"loss": 0.1438,
"step": 55500
},
{
"epoch": 167.0,
"eval_accuracy": 0.9802,
"eval_loss": 0.08015668392181396,
"eval_runtime": 13.3375,
"eval_samples_per_second": 749.766,
"eval_steps_per_second": 2.999,
"step": 55611
},
{
"epoch": 168.0,
"eval_accuracy": 0.9807,
"eval_loss": 0.0773804783821106,
"eval_runtime": 13.1562,
"eval_samples_per_second": 760.1,
"eval_steps_per_second": 3.04,
"step": 55944
},
{
"epoch": 168.17,
"grad_norm": 10.65889835357666,
"learning_rate": 4.394394394394394e-06,
"loss": 0.1405,
"step": 56000
},
{
"epoch": 169.0,
"eval_accuracy": 0.9793,
"eval_loss": 0.07769276201725006,
"eval_runtime": 13.37,
"eval_samples_per_second": 747.945,
"eval_steps_per_second": 2.992,
"step": 56277
},
{
"epoch": 169.67,
"grad_norm": 9.663138389587402,
"learning_rate": 4.344344344344344e-06,
"loss": 0.1465,
"step": 56500
},
{
"epoch": 170.0,
"eval_accuracy": 0.9804,
"eval_loss": 0.07831669598817825,
"eval_runtime": 13.9555,
"eval_samples_per_second": 716.565,
"eval_steps_per_second": 2.866,
"step": 56610
},
{
"epoch": 171.0,
"eval_accuracy": 0.9799,
"eval_loss": 0.08174577355384827,
"eval_runtime": 13.3581,
"eval_samples_per_second": 748.612,
"eval_steps_per_second": 2.994,
"step": 56943
},
{
"epoch": 171.17,
"grad_norm": 11.15052604675293,
"learning_rate": 4.294294294294294e-06,
"loss": 0.1404,
"step": 57000
},
{
"epoch": 172.0,
"eval_accuracy": 0.9806,
"eval_loss": 0.0780324712395668,
"eval_runtime": 12.9812,
"eval_samples_per_second": 770.346,
"eval_steps_per_second": 3.081,
"step": 57276
},
{
"epoch": 172.67,
"grad_norm": 10.398097038269043,
"learning_rate": 4.2442442442442444e-06,
"loss": 0.1367,
"step": 57500
},
{
"epoch": 173.0,
"eval_accuracy": 0.9806,
"eval_loss": 0.07895645499229431,
"eval_runtime": 12.8468,
"eval_samples_per_second": 778.402,
"eval_steps_per_second": 3.114,
"step": 57609
},
{
"epoch": 174.0,
"eval_accuracy": 0.9816,
"eval_loss": 0.07868947833776474,
"eval_runtime": 13.3373,
"eval_samples_per_second": 749.779,
"eval_steps_per_second": 2.999,
"step": 57942
},
{
"epoch": 174.17,
"grad_norm": 8.292234420776367,
"learning_rate": 4.1941941941941945e-06,
"loss": 0.1399,
"step": 58000
},
{
"epoch": 175.0,
"eval_accuracy": 0.9801,
"eval_loss": 0.08106452971696854,
"eval_runtime": 12.854,
"eval_samples_per_second": 777.968,
"eval_steps_per_second": 3.112,
"step": 58275
},
{
"epoch": 175.68,
"grad_norm": 12.446533203125,
"learning_rate": 4.1441441441441446e-06,
"loss": 0.1418,
"step": 58500
},
{
"epoch": 176.0,
"eval_accuracy": 0.9809,
"eval_loss": 0.08040361106395721,
"eval_runtime": 13.1526,
"eval_samples_per_second": 760.308,
"eval_steps_per_second": 3.041,
"step": 58608
},
{
"epoch": 177.0,
"eval_accuracy": 0.9806,
"eval_loss": 0.07995961606502533,
"eval_runtime": 13.0397,
"eval_samples_per_second": 766.888,
"eval_steps_per_second": 3.068,
"step": 58941
},
{
"epoch": 177.18,
"grad_norm": 9.551538467407227,
"learning_rate": 4.094094094094095e-06,
"loss": 0.1381,
"step": 59000
},
{
"epoch": 178.0,
"eval_accuracy": 0.9814,
"eval_loss": 0.07857974618673325,
"eval_runtime": 13.3646,
"eval_samples_per_second": 748.245,
"eval_steps_per_second": 2.993,
"step": 59274
},
{
"epoch": 178.68,
"grad_norm": 7.961233615875244,
"learning_rate": 4.044044044044044e-06,
"loss": 0.1357,
"step": 59500
},
{
"epoch": 179.0,
"eval_accuracy": 0.9805,
"eval_loss": 0.0797557458281517,
"eval_runtime": 13.833,
"eval_samples_per_second": 722.907,
"eval_steps_per_second": 2.892,
"step": 59607
},
{
"epoch": 180.0,
"eval_accuracy": 0.9813,
"eval_loss": 0.07922037690877914,
"eval_runtime": 13.1611,
"eval_samples_per_second": 759.818,
"eval_steps_per_second": 3.039,
"step": 59940
},
{
"epoch": 180.18,
"grad_norm": 8.392486572265625,
"learning_rate": 3.993993993993994e-06,
"loss": 0.1465,
"step": 60000
},
{
"epoch": 181.0,
"eval_accuracy": 0.9809,
"eval_loss": 0.08021984249353409,
"eval_runtime": 12.7756,
"eval_samples_per_second": 782.741,
"eval_steps_per_second": 3.131,
"step": 60273
},
{
"epoch": 181.68,
"grad_norm": 5.668210506439209,
"learning_rate": 3.943943943943944e-06,
"loss": 0.1366,
"step": 60500
},
{
"epoch": 182.0,
"eval_accuracy": 0.9804,
"eval_loss": 0.07884296774864197,
"eval_runtime": 12.9767,
"eval_samples_per_second": 770.61,
"eval_steps_per_second": 3.082,
"step": 60606
},
{
"epoch": 183.0,
"eval_accuracy": 0.979,
"eval_loss": 0.0805293619632721,
"eval_runtime": 12.9332,
"eval_samples_per_second": 773.205,
"eval_steps_per_second": 3.093,
"step": 60939
},
{
"epoch": 183.18,
"grad_norm": 9.771552085876465,
"learning_rate": 3.893893893893894e-06,
"loss": 0.139,
"step": 61000
},
{
"epoch": 184.0,
"eval_accuracy": 0.9794,
"eval_loss": 0.0822456106543541,
"eval_runtime": 13.3118,
"eval_samples_per_second": 751.211,
"eval_steps_per_second": 3.005,
"step": 61272
},
{
"epoch": 184.68,
"grad_norm": 10.898391723632812,
"learning_rate": 3.843843843843844e-06,
"loss": 0.1381,
"step": 61500
},
{
"epoch": 185.0,
"eval_accuracy": 0.9807,
"eval_loss": 0.08079157024621964,
"eval_runtime": 12.8717,
"eval_samples_per_second": 776.899,
"eval_steps_per_second": 3.108,
"step": 61605
},
{
"epoch": 186.0,
"eval_accuracy": 0.9802,
"eval_loss": 0.08059785515069962,
"eval_runtime": 12.418,
"eval_samples_per_second": 805.284,
"eval_steps_per_second": 3.221,
"step": 61938
},
{
"epoch": 186.19,
"grad_norm": 6.1758246421813965,
"learning_rate": 3.793793793793794e-06,
"loss": 0.1367,
"step": 62000
},
{
"epoch": 187.0,
"eval_accuracy": 0.9803,
"eval_loss": 0.07853790372610092,
"eval_runtime": 12.9215,
"eval_samples_per_second": 773.902,
"eval_steps_per_second": 3.096,
"step": 62271
},
{
"epoch": 187.69,
"grad_norm": 9.155027389526367,
"learning_rate": 3.743743743743744e-06,
"loss": 0.1354,
"step": 62500
},
{
"epoch": 188.0,
"eval_accuracy": 0.9803,
"eval_loss": 0.0803978368639946,
"eval_runtime": 13.5157,
"eval_samples_per_second": 739.883,
"eval_steps_per_second": 2.96,
"step": 62604
},
{
"epoch": 189.0,
"eval_accuracy": 0.98,
"eval_loss": 0.07950347661972046,
"eval_runtime": 13.0185,
"eval_samples_per_second": 768.138,
"eval_steps_per_second": 3.073,
"step": 62937
},
{
"epoch": 189.19,
"grad_norm": 9.88645076751709,
"learning_rate": 3.693693693693694e-06,
"loss": 0.137,
"step": 63000
},
{
"epoch": 190.0,
"eval_accuracy": 0.9805,
"eval_loss": 0.07970842719078064,
"eval_runtime": 13.0486,
"eval_samples_per_second": 766.367,
"eval_steps_per_second": 3.065,
"step": 63270
},
{
"epoch": 190.69,
"grad_norm": 10.085098266601562,
"learning_rate": 3.643643643643644e-06,
"loss": 0.1351,
"step": 63500
},
{
"epoch": 191.0,
"eval_accuracy": 0.9803,
"eval_loss": 0.07862575352191925,
"eval_runtime": 13.7359,
"eval_samples_per_second": 728.019,
"eval_steps_per_second": 2.912,
"step": 63603
},
{
"epoch": 192.0,
"eval_accuracy": 0.9807,
"eval_loss": 0.07779725641012192,
"eval_runtime": 14.1749,
"eval_samples_per_second": 705.473,
"eval_steps_per_second": 2.822,
"step": 63936
},
{
"epoch": 192.19,
"grad_norm": 7.259002685546875,
"learning_rate": 3.593593593593594e-06,
"loss": 0.1345,
"step": 64000
},
{
"epoch": 193.0,
"eval_accuracy": 0.9812,
"eval_loss": 0.07995971292257309,
"eval_runtime": 13.3268,
"eval_samples_per_second": 750.366,
"eval_steps_per_second": 3.001,
"step": 64269
},
{
"epoch": 193.69,
"grad_norm": 6.42719030380249,
"learning_rate": 3.5435435435435437e-06,
"loss": 0.1377,
"step": 64500
},
{
"epoch": 194.0,
"eval_accuracy": 0.9799,
"eval_loss": 0.07895601540803909,
"eval_runtime": 12.9129,
"eval_samples_per_second": 774.417,
"eval_steps_per_second": 3.098,
"step": 64602
},
{
"epoch": 195.0,
"eval_accuracy": 0.98,
"eval_loss": 0.08155795186758041,
"eval_runtime": 13.7447,
"eval_samples_per_second": 727.555,
"eval_steps_per_second": 2.91,
"step": 64935
},
{
"epoch": 195.2,
"grad_norm": 7.303466320037842,
"learning_rate": 3.4934934934934938e-06,
"loss": 0.1339,
"step": 65000
},
{
"epoch": 196.0,
"eval_accuracy": 0.9811,
"eval_loss": 0.08134587854146957,
"eval_runtime": 12.87,
"eval_samples_per_second": 777.004,
"eval_steps_per_second": 3.108,
"step": 65268
},
{
"epoch": 196.7,
"grad_norm": 10.115856170654297,
"learning_rate": 3.443443443443444e-06,
"loss": 0.1338,
"step": 65500
},
{
"epoch": 197.0,
"eval_accuracy": 0.981,
"eval_loss": 0.07863133400678635,
"eval_runtime": 13.1588,
"eval_samples_per_second": 759.949,
"eval_steps_per_second": 3.04,
"step": 65601
},
{
"epoch": 198.0,
"eval_accuracy": 0.9805,
"eval_loss": 0.08128491789102554,
"eval_runtime": 12.3451,
"eval_samples_per_second": 810.038,
"eval_steps_per_second": 3.24,
"step": 65934
},
{
"epoch": 198.2,
"grad_norm": 9.01919174194336,
"learning_rate": 3.393393393393394e-06,
"loss": 0.1371,
"step": 66000
},
{
"epoch": 199.0,
"eval_accuracy": 0.9808,
"eval_loss": 0.08089832216501236,
"eval_runtime": 13.1128,
"eval_samples_per_second": 762.612,
"eval_steps_per_second": 3.05,
"step": 66267
},
{
"epoch": 199.7,
"grad_norm": 9.190634727478027,
"learning_rate": 3.3433433433433436e-06,
"loss": 0.1339,
"step": 66500
},
{
"epoch": 200.0,
"eval_accuracy": 0.9807,
"eval_loss": 0.07968232780694962,
"eval_runtime": 12.8919,
"eval_samples_per_second": 775.68,
"eval_steps_per_second": 3.103,
"step": 66600
},
{
"epoch": 201.0,
"eval_accuracy": 0.9808,
"eval_loss": 0.08057761192321777,
"eval_runtime": 12.9886,
"eval_samples_per_second": 769.904,
"eval_steps_per_second": 3.08,
"step": 66933
},
{
"epoch": 201.2,
"grad_norm": 9.490571022033691,
"learning_rate": 3.2932932932932936e-06,
"loss": 0.131,
"step": 67000
},
{
"epoch": 202.0,
"eval_accuracy": 0.98,
"eval_loss": 0.08165069669485092,
"eval_runtime": 13.9588,
"eval_samples_per_second": 716.394,
"eval_steps_per_second": 2.866,
"step": 67266
},
{
"epoch": 202.7,
"grad_norm": 8.564950942993164,
"learning_rate": 3.2432432432432437e-06,
"loss": 0.1365,
"step": 67500
},
{
"epoch": 203.0,
"eval_accuracy": 0.9801,
"eval_loss": 0.08228688687086105,
"eval_runtime": 12.9615,
"eval_samples_per_second": 771.513,
"eval_steps_per_second": 3.086,
"step": 67599
},
{
"epoch": 204.0,
"eval_accuracy": 0.9798,
"eval_loss": 0.08267272263765335,
"eval_runtime": 12.8976,
"eval_samples_per_second": 775.339,
"eval_steps_per_second": 3.101,
"step": 67932
},
{
"epoch": 204.2,
"grad_norm": 9.844771385192871,
"learning_rate": 3.1931931931931938e-06,
"loss": 0.1358,
"step": 68000
},
{
"epoch": 205.0,
"eval_accuracy": 0.9816,
"eval_loss": 0.0804433524608612,
"eval_runtime": 12.8434,
"eval_samples_per_second": 778.613,
"eval_steps_per_second": 3.114,
"step": 68265
},
{
"epoch": 205.71,
"grad_norm": 9.6033935546875,
"learning_rate": 3.1431431431431434e-06,
"loss": 0.132,
"step": 68500
},
{
"epoch": 206.0,
"eval_accuracy": 0.9802,
"eval_loss": 0.08253764361143112,
"eval_runtime": 13.4062,
"eval_samples_per_second": 745.922,
"eval_steps_per_second": 2.984,
"step": 68598
},
{
"epoch": 207.0,
"eval_accuracy": 0.981,
"eval_loss": 0.07984968274831772,
"eval_runtime": 13.6899,
"eval_samples_per_second": 730.467,
"eval_steps_per_second": 2.922,
"step": 68931
},
{
"epoch": 207.21,
"grad_norm": 7.0395355224609375,
"learning_rate": 3.0930930930930935e-06,
"loss": 0.1396,
"step": 69000
},
{
"epoch": 208.0,
"eval_accuracy": 0.9813,
"eval_loss": 0.08085375279188156,
"eval_runtime": 12.8706,
"eval_samples_per_second": 776.962,
"eval_steps_per_second": 3.108,
"step": 69264
},
{
"epoch": 208.71,
"grad_norm": 12.84909725189209,
"learning_rate": 3.0430430430430436e-06,
"loss": 0.1324,
"step": 69500
},
{
"epoch": 209.0,
"eval_accuracy": 0.9815,
"eval_loss": 0.07963848859071732,
"eval_runtime": 12.9764,
"eval_samples_per_second": 770.628,
"eval_steps_per_second": 3.083,
"step": 69597
},
{
"epoch": 210.0,
"eval_accuracy": 0.9807,
"eval_loss": 0.08001097291707993,
"eval_runtime": 13.4375,
"eval_samples_per_second": 744.185,
"eval_steps_per_second": 2.977,
"step": 69930
},
{
"epoch": 210.21,
"grad_norm": 8.406508445739746,
"learning_rate": 2.9929929929929936e-06,
"loss": 0.1324,
"step": 70000
},
{
"epoch": 211.0,
"eval_accuracy": 0.9809,
"eval_loss": 0.08123359829187393,
"eval_runtime": 13.1971,
"eval_samples_per_second": 757.742,
"eval_steps_per_second": 3.031,
"step": 70263
},
{
"epoch": 211.71,
"grad_norm": 4.204705715179443,
"learning_rate": 2.942942942942943e-06,
"loss": 0.1343,
"step": 70500
},
{
"epoch": 212.0,
"eval_accuracy": 0.9811,
"eval_loss": 0.08246932923793793,
"eval_runtime": 13.3417,
"eval_samples_per_second": 749.532,
"eval_steps_per_second": 2.998,
"step": 70596
},
{
"epoch": 213.0,
"eval_accuracy": 0.9811,
"eval_loss": 0.08172763139009476,
"eval_runtime": 12.9861,
"eval_samples_per_second": 770.053,
"eval_steps_per_second": 3.08,
"step": 70929
},
{
"epoch": 213.21,
"grad_norm": 8.177204132080078,
"learning_rate": 2.892892892892893e-06,
"loss": 0.1322,
"step": 71000
},
{
"epoch": 214.0,
"eval_accuracy": 0.9811,
"eval_loss": 0.08131828904151917,
"eval_runtime": 14.0986,
"eval_samples_per_second": 709.289,
"eval_steps_per_second": 2.837,
"step": 71262
},
{
"epoch": 214.71,
"grad_norm": 8.844195365905762,
"learning_rate": 2.842842842842843e-06,
"loss": 0.133,
"step": 71500
},
{
"epoch": 215.0,
"eval_accuracy": 0.9807,
"eval_loss": 0.0824679508805275,
"eval_runtime": 12.94,
"eval_samples_per_second": 772.8,
"eval_steps_per_second": 3.091,
"step": 71595
},
{
"epoch": 216.0,
"eval_accuracy": 0.9809,
"eval_loss": 0.0828867107629776,
"eval_runtime": 12.9965,
"eval_samples_per_second": 769.439,
"eval_steps_per_second": 3.078,
"step": 71928
},
{
"epoch": 216.22,
"grad_norm": 11.01076889038086,
"learning_rate": 2.7927927927927926e-06,
"loss": 0.1336,
"step": 72000
},
{
"epoch": 217.0,
"eval_accuracy": 0.9802,
"eval_loss": 0.08191470056772232,
"eval_runtime": 12.6388,
"eval_samples_per_second": 791.211,
"eval_steps_per_second": 3.165,
"step": 72261
},
{
"epoch": 217.72,
"grad_norm": 8.309555053710938,
"learning_rate": 2.7427427427427427e-06,
"loss": 0.1287,
"step": 72500
},
{
"epoch": 218.0,
"eval_accuracy": 0.9803,
"eval_loss": 0.08172294497489929,
"eval_runtime": 12.869,
"eval_samples_per_second": 777.063,
"eval_steps_per_second": 3.108,
"step": 72594
},
{
"epoch": 219.0,
"eval_accuracy": 0.9804,
"eval_loss": 0.08100100606679916,
"eval_runtime": 13.9577,
"eval_samples_per_second": 716.449,
"eval_steps_per_second": 2.866,
"step": 72927
},
{
"epoch": 219.22,
"grad_norm": 10.596402168273926,
"learning_rate": 2.6926926926926928e-06,
"loss": 0.1322,
"step": 73000
},
{
"epoch": 220.0,
"eval_accuracy": 0.98,
"eval_loss": 0.08346739411354065,
"eval_runtime": 12.8881,
"eval_samples_per_second": 775.91,
"eval_steps_per_second": 3.104,
"step": 73260
},
{
"epoch": 220.72,
"grad_norm": 8.293975830078125,
"learning_rate": 2.642642642642643e-06,
"loss": 0.1287,
"step": 73500
},
{
"epoch": 221.0,
"eval_accuracy": 0.9798,
"eval_loss": 0.08478812873363495,
"eval_runtime": 12.482,
"eval_samples_per_second": 801.151,
"eval_steps_per_second": 3.205,
"step": 73593
},
{
"epoch": 222.0,
"eval_accuracy": 0.9803,
"eval_loss": 0.08156371861696243,
"eval_runtime": 12.9596,
"eval_samples_per_second": 771.628,
"eval_steps_per_second": 3.087,
"step": 73926
},
{
"epoch": 222.22,
"grad_norm": 9.707475662231445,
"learning_rate": 2.5925925925925925e-06,
"loss": 0.1317,
"step": 74000
},
{
"epoch": 223.0,
"eval_accuracy": 0.9803,
"eval_loss": 0.08239776641130447,
"eval_runtime": 13.8203,
"eval_samples_per_second": 723.571,
"eval_steps_per_second": 2.894,
"step": 74259
},
{
"epoch": 223.72,
"grad_norm": 5.2577996253967285,
"learning_rate": 2.5425425425425426e-06,
"loss": 0.1308,
"step": 74500
},
{
"epoch": 224.0,
"eval_accuracy": 0.9811,
"eval_loss": 0.08223745971918106,
"eval_runtime": 13.4783,
"eval_samples_per_second": 741.934,
"eval_steps_per_second": 2.968,
"step": 74592
},
{
"epoch": 225.0,
"eval_accuracy": 0.9807,
"eval_loss": 0.0822429209947586,
"eval_runtime": 13.2583,
"eval_samples_per_second": 754.244,
"eval_steps_per_second": 3.017,
"step": 74925
},
{
"epoch": 225.23,
"grad_norm": 6.952250957489014,
"learning_rate": 2.4924924924924926e-06,
"loss": 0.1247,
"step": 75000
},
{
"epoch": 226.0,
"eval_accuracy": 0.9806,
"eval_loss": 0.08117574453353882,
"eval_runtime": 13.5159,
"eval_samples_per_second": 739.872,
"eval_steps_per_second": 2.959,
"step": 75258
},
{
"epoch": 226.73,
"grad_norm": 17.568580627441406,
"learning_rate": 2.4424424424424427e-06,
"loss": 0.129,
"step": 75500
},
{
"epoch": 227.0,
"eval_accuracy": 0.9805,
"eval_loss": 0.08187758177518845,
"eval_runtime": 12.7892,
"eval_samples_per_second": 781.912,
"eval_steps_per_second": 3.128,
"step": 75591
},
{
"epoch": 228.0,
"eval_accuracy": 0.981,
"eval_loss": 0.08235606551170349,
"eval_runtime": 12.9107,
"eval_samples_per_second": 774.55,
"eval_steps_per_second": 3.098,
"step": 75924
},
{
"epoch": 228.23,
"grad_norm": 13.310216903686523,
"learning_rate": 2.3923923923923923e-06,
"loss": 0.1315,
"step": 76000
},
{
"epoch": 229.0,
"eval_accuracy": 0.9803,
"eval_loss": 0.08291840553283691,
"eval_runtime": 13.4267,
"eval_samples_per_second": 744.787,
"eval_steps_per_second": 2.979,
"step": 76257
},
{
"epoch": 229.73,
"grad_norm": 7.18035888671875,
"learning_rate": 2.3423423423423424e-06,
"loss": 0.1243,
"step": 76500
},
{
"epoch": 230.0,
"eval_accuracy": 0.9808,
"eval_loss": 0.08134060353040695,
"eval_runtime": 12.9054,
"eval_samples_per_second": 774.871,
"eval_steps_per_second": 3.099,
"step": 76590
},
{
"epoch": 231.0,
"eval_accuracy": 0.9808,
"eval_loss": 0.08125565946102142,
"eval_runtime": 13.8266,
"eval_samples_per_second": 723.246,
"eval_steps_per_second": 2.893,
"step": 76923
},
{
"epoch": 231.23,
"grad_norm": 11.132826805114746,
"learning_rate": 2.2922922922922925e-06,
"loss": 0.1244,
"step": 77000
},
{
"epoch": 232.0,
"eval_accuracy": 0.981,
"eval_loss": 0.08288297057151794,
"eval_runtime": 13.8545,
"eval_samples_per_second": 721.786,
"eval_steps_per_second": 2.887,
"step": 77256
},
{
"epoch": 232.73,
"grad_norm": 7.415234565734863,
"learning_rate": 2.2422422422422426e-06,
"loss": 0.1286,
"step": 77500
},
{
"epoch": 233.0,
"eval_accuracy": 0.9801,
"eval_loss": 0.083954356610775,
"eval_runtime": 13.1117,
"eval_samples_per_second": 762.679,
"eval_steps_per_second": 3.051,
"step": 77589
},
{
"epoch": 234.0,
"eval_accuracy": 0.9805,
"eval_loss": 0.08230035752058029,
"eval_runtime": 13.3702,
"eval_samples_per_second": 747.932,
"eval_steps_per_second": 2.992,
"step": 77922
},
{
"epoch": 234.23,
"grad_norm": 7.36590576171875,
"learning_rate": 2.192192192192192e-06,
"loss": 0.1261,
"step": 78000
},
{
"epoch": 235.0,
"eval_accuracy": 0.9811,
"eval_loss": 0.08295118808746338,
"eval_runtime": 13.748,
"eval_samples_per_second": 727.381,
"eval_steps_per_second": 2.91,
"step": 78255
},
{
"epoch": 235.74,
"grad_norm": 10.516325950622559,
"learning_rate": 2.1421421421421423e-06,
"loss": 0.1238,
"step": 78500
},
{
"epoch": 236.0,
"eval_accuracy": 0.9812,
"eval_loss": 0.08197174966335297,
"eval_runtime": 12.9286,
"eval_samples_per_second": 773.481,
"eval_steps_per_second": 3.094,
"step": 78588
},
{
"epoch": 237.0,
"eval_accuracy": 0.9807,
"eval_loss": 0.08315034210681915,
"eval_runtime": 13.634,
"eval_samples_per_second": 733.458,
"eval_steps_per_second": 2.934,
"step": 78921
},
{
"epoch": 237.24,
"grad_norm": 5.020528316497803,
"learning_rate": 2.0920920920920923e-06,
"loss": 0.1296,
"step": 79000
},
{
"epoch": 238.0,
"eval_accuracy": 0.9809,
"eval_loss": 0.08168121427297592,
"eval_runtime": 14.4842,
"eval_samples_per_second": 690.406,
"eval_steps_per_second": 2.762,
"step": 79254
},
{
"epoch": 238.74,
"grad_norm": 11.957234382629395,
"learning_rate": 2.0420420420420424e-06,
"loss": 0.1278,
"step": 79500
},
{
"epoch": 239.0,
"eval_accuracy": 0.981,
"eval_loss": 0.08146882057189941,
"eval_runtime": 14.503,
"eval_samples_per_second": 689.511,
"eval_steps_per_second": 2.758,
"step": 79587
},
{
"epoch": 240.0,
"eval_accuracy": 0.9802,
"eval_loss": 0.08267929404973984,
"eval_runtime": 12.9081,
"eval_samples_per_second": 774.71,
"eval_steps_per_second": 3.099,
"step": 79920
},
{
"epoch": 240.24,
"grad_norm": 10.550077438354492,
"learning_rate": 1.991991991991992e-06,
"loss": 0.1246,
"step": 80000
},
{
"epoch": 241.0,
"eval_accuracy": 0.9805,
"eval_loss": 0.08258900791406631,
"eval_runtime": 13.3618,
"eval_samples_per_second": 748.404,
"eval_steps_per_second": 2.994,
"step": 80253
},
{
"epoch": 241.74,
"grad_norm": 14.927352905273438,
"learning_rate": 1.941941941941942e-06,
"loss": 0.128,
"step": 80500
},
{
"epoch": 242.0,
"eval_accuracy": 0.9797,
"eval_loss": 0.08207195997238159,
"eval_runtime": 13.4168,
"eval_samples_per_second": 745.333,
"eval_steps_per_second": 2.981,
"step": 80586
},
{
"epoch": 243.0,
"eval_accuracy": 0.981,
"eval_loss": 0.08075813204050064,
"eval_runtime": 12.9166,
"eval_samples_per_second": 774.198,
"eval_steps_per_second": 3.097,
"step": 80919
},
{
"epoch": 243.24,
"grad_norm": 10.435842514038086,
"learning_rate": 1.8918918918918922e-06,
"loss": 0.1274,
"step": 81000
},
{
"epoch": 244.0,
"eval_accuracy": 0.9806,
"eval_loss": 0.0817038044333458,
"eval_runtime": 12.9068,
"eval_samples_per_second": 774.784,
"eval_steps_per_second": 3.099,
"step": 81252
},
{
"epoch": 244.74,
"grad_norm": 6.686298370361328,
"learning_rate": 1.841841841841842e-06,
"loss": 0.1232,
"step": 81500
},
{
"epoch": 245.0,
"eval_accuracy": 0.9805,
"eval_loss": 0.0812101811170578,
"eval_runtime": 13.3736,
"eval_samples_per_second": 747.741,
"eval_steps_per_second": 2.991,
"step": 81585
},
{
"epoch": 246.0,
"eval_accuracy": 0.9809,
"eval_loss": 0.08127359300851822,
"eval_runtime": 13.8034,
"eval_samples_per_second": 724.46,
"eval_steps_per_second": 2.898,
"step": 81918
},
{
"epoch": 246.25,
"grad_norm": 9.9036865234375,
"learning_rate": 1.7917917917917917e-06,
"loss": 0.1281,
"step": 82000
},
{
"epoch": 247.0,
"eval_accuracy": 0.9801,
"eval_loss": 0.0803731232881546,
"eval_runtime": 13.6609,
"eval_samples_per_second": 732.019,
"eval_steps_per_second": 2.928,
"step": 82251
},
{
"epoch": 247.75,
"grad_norm": 9.58124828338623,
"learning_rate": 1.7417417417417418e-06,
"loss": 0.1236,
"step": 82500
},
{
"epoch": 248.0,
"eval_accuracy": 0.9807,
"eval_loss": 0.08054234832525253,
"eval_runtime": 12.9985,
"eval_samples_per_second": 769.319,
"eval_steps_per_second": 3.077,
"step": 82584
},
{
"epoch": 249.0,
"eval_accuracy": 0.9807,
"eval_loss": 0.08253397792577744,
"eval_runtime": 12.7246,
"eval_samples_per_second": 785.882,
"eval_steps_per_second": 3.144,
"step": 82917
},
{
"epoch": 249.25,
"grad_norm": 11.608097076416016,
"learning_rate": 1.6916916916916916e-06,
"loss": 0.1223,
"step": 83000
},
{
"epoch": 250.0,
"eval_accuracy": 0.9804,
"eval_loss": 0.08115767687559128,
"eval_runtime": 13.6973,
"eval_samples_per_second": 730.07,
"eval_steps_per_second": 2.92,
"step": 83250
},
{
"epoch": 250.75,
"grad_norm": 7.931227207183838,
"learning_rate": 1.6416416416416417e-06,
"loss": 0.1278,
"step": 83500
},
{
"epoch": 251.0,
"eval_accuracy": 0.9802,
"eval_loss": 0.08087089657783508,
"eval_runtime": 12.8657,
"eval_samples_per_second": 777.263,
"eval_steps_per_second": 3.109,
"step": 83583
},
{
"epoch": 252.0,
"eval_accuracy": 0.9818,
"eval_loss": 0.07839205116033554,
"eval_runtime": 12.9826,
"eval_samples_per_second": 770.262,
"eval_steps_per_second": 3.081,
"step": 83916
},
{
"epoch": 252.25,
"grad_norm": 6.618145942687988,
"learning_rate": 1.5915915915915916e-06,
"loss": 0.1238,
"step": 84000
},
{
"epoch": 253.0,
"eval_accuracy": 0.9808,
"eval_loss": 0.07928815484046936,
"eval_runtime": 13.4795,
"eval_samples_per_second": 741.867,
"eval_steps_per_second": 2.967,
"step": 84249
},
{
"epoch": 253.75,
"grad_norm": 6.5788397789001465,
"learning_rate": 1.5415415415415416e-06,
"loss": 0.1259,
"step": 84500
},
{
"epoch": 254.0,
"eval_accuracy": 0.9814,
"eval_loss": 0.08129309117794037,
"eval_runtime": 12.8787,
"eval_samples_per_second": 776.478,
"eval_steps_per_second": 3.106,
"step": 84582
},
{
"epoch": 255.0,
"eval_accuracy": 0.981,
"eval_loss": 0.08033791929483414,
"eval_runtime": 12.7033,
"eval_samples_per_second": 787.197,
"eval_steps_per_second": 3.149,
"step": 84915
},
{
"epoch": 255.26,
"grad_norm": 8.367218017578125,
"learning_rate": 1.4914914914914915e-06,
"loss": 0.1261,
"step": 85000
},
{
"epoch": 256.0,
"eval_accuracy": 0.981,
"eval_loss": 0.08045142143964767,
"eval_runtime": 12.9511,
"eval_samples_per_second": 772.133,
"eval_steps_per_second": 3.089,
"step": 85248
},
{
"epoch": 256.76,
"grad_norm": 17.39365005493164,
"learning_rate": 1.4414414414414416e-06,
"loss": 0.1312,
"step": 85500
},
{
"epoch": 257.0,
"eval_accuracy": 0.9805,
"eval_loss": 0.08164441585540771,
"eval_runtime": 13.3735,
"eval_samples_per_second": 747.745,
"eval_steps_per_second": 2.991,
"step": 85581
},
{
"epoch": 258.0,
"eval_accuracy": 0.9807,
"eval_loss": 0.08030729740858078,
"eval_runtime": 12.9143,
"eval_samples_per_second": 774.336,
"eval_steps_per_second": 3.097,
"step": 85914
},
{
"epoch": 258.26,
"grad_norm": 12.668910026550293,
"learning_rate": 1.3913913913913914e-06,
"loss": 0.1237,
"step": 86000
},
{
"epoch": 259.0,
"eval_accuracy": 0.9804,
"eval_loss": 0.07897236198186874,
"eval_runtime": 13.3964,
"eval_samples_per_second": 746.469,
"eval_steps_per_second": 2.986,
"step": 86247
},
{
"epoch": 259.76,
"grad_norm": 3.696176767349243,
"learning_rate": 1.3413413413413415e-06,
"loss": 0.1234,
"step": 86500
},
{
"epoch": 260.0,
"eval_accuracy": 0.9803,
"eval_loss": 0.07928313314914703,
"eval_runtime": 13.8621,
"eval_samples_per_second": 721.391,
"eval_steps_per_second": 2.886,
"step": 86580
},
{
"epoch": 261.0,
"eval_accuracy": 0.9806,
"eval_loss": 0.07920601218938828,
"eval_runtime": 12.907,
"eval_samples_per_second": 774.775,
"eval_steps_per_second": 3.099,
"step": 86913
},
{
"epoch": 261.26,
"grad_norm": 11.28502082824707,
"learning_rate": 1.2912912912912913e-06,
"loss": 0.1237,
"step": 87000
},
{
"epoch": 262.0,
"eval_accuracy": 0.9806,
"eval_loss": 0.08003947883844376,
"eval_runtime": 13.8177,
"eval_samples_per_second": 723.709,
"eval_steps_per_second": 2.895,
"step": 87246
},
{
"epoch": 262.76,
"grad_norm": 13.543560981750488,
"learning_rate": 1.2412412412412414e-06,
"loss": 0.1257,
"step": 87500
},
{
"epoch": 263.0,
"eval_accuracy": 0.9802,
"eval_loss": 0.08235891908407211,
"eval_runtime": 13.4574,
"eval_samples_per_second": 743.088,
"eval_steps_per_second": 2.972,
"step": 87579
},
{
"epoch": 264.0,
"eval_accuracy": 0.9807,
"eval_loss": 0.08182436227798462,
"eval_runtime": 12.9778,
"eval_samples_per_second": 770.546,
"eval_steps_per_second": 3.082,
"step": 87912
},
{
"epoch": 264.26,
"grad_norm": 11.065189361572266,
"learning_rate": 1.1911911911911913e-06,
"loss": 0.1219,
"step": 88000
},
{
"epoch": 265.0,
"eval_accuracy": 0.9808,
"eval_loss": 0.08205542713403702,
"eval_runtime": 13.6001,
"eval_samples_per_second": 735.288,
"eval_steps_per_second": 2.941,
"step": 88245
},
{
"epoch": 265.77,
"grad_norm": 9.291784286499023,
"learning_rate": 1.1411411411411411e-06,
"loss": 0.1298,
"step": 88500
},
{
"epoch": 266.0,
"eval_accuracy": 0.9805,
"eval_loss": 0.08165726810693741,
"eval_runtime": 13.3896,
"eval_samples_per_second": 746.85,
"eval_steps_per_second": 2.987,
"step": 88578
},
{
"epoch": 267.0,
"eval_accuracy": 0.9805,
"eval_loss": 0.08162441104650497,
"eval_runtime": 14.0305,
"eval_samples_per_second": 712.733,
"eval_steps_per_second": 2.851,
"step": 88911
},
{
"epoch": 267.27,
"grad_norm": 17.33576202392578,
"learning_rate": 1.0910910910910912e-06,
"loss": 0.1222,
"step": 89000
},
{
"epoch": 268.0,
"eval_accuracy": 0.9806,
"eval_loss": 0.08136063069105148,
"eval_runtime": 12.8095,
"eval_samples_per_second": 780.671,
"eval_steps_per_second": 3.123,
"step": 89244
},
{
"epoch": 268.77,
"grad_norm": 11.170260429382324,
"learning_rate": 1.041041041041041e-06,
"loss": 0.1268,
"step": 89500
},
{
"epoch": 269.0,
"eval_accuracy": 0.9803,
"eval_loss": 0.08162767440080643,
"eval_runtime": 13.5821,
"eval_samples_per_second": 736.263,
"eval_steps_per_second": 2.945,
"step": 89577
},
{
"epoch": 270.0,
"eval_accuracy": 0.981,
"eval_loss": 0.08254320919513702,
"eval_runtime": 12.9419,
"eval_samples_per_second": 772.681,
"eval_steps_per_second": 3.091,
"step": 89910
},
{
"epoch": 270.27,
"grad_norm": 10.08292007446289,
"learning_rate": 9.909909909909911e-07,
"loss": 0.1239,
"step": 90000
},
{
"epoch": 271.0,
"eval_accuracy": 0.9802,
"eval_loss": 0.08088234812021255,
"eval_runtime": 12.9857,
"eval_samples_per_second": 770.075,
"eval_steps_per_second": 3.08,
"step": 90243
},
{
"epoch": 271.77,
"grad_norm": 7.639751434326172,
"learning_rate": 9.409409409409411e-07,
"loss": 0.1277,
"step": 90500
},
{
"epoch": 272.0,
"eval_accuracy": 0.9804,
"eval_loss": 0.0805734246969223,
"eval_runtime": 12.6096,
"eval_samples_per_second": 793.045,
"eval_steps_per_second": 3.172,
"step": 90576
},
{
"epoch": 273.0,
"eval_accuracy": 0.98,
"eval_loss": 0.08124550431966782,
"eval_runtime": 13.1278,
"eval_samples_per_second": 761.741,
"eval_steps_per_second": 3.047,
"step": 90909
},
{
"epoch": 273.27,
"grad_norm": 7.800063133239746,
"learning_rate": 8.90890890890891e-07,
"loss": 0.1235,
"step": 91000
},
{
"epoch": 274.0,
"eval_accuracy": 0.9807,
"eval_loss": 0.08137263357639313,
"eval_runtime": 13.3744,
"eval_samples_per_second": 747.7,
"eval_steps_per_second": 2.991,
"step": 91242
},
{
"epoch": 274.77,
"grad_norm": 13.224382400512695,
"learning_rate": 8.40840840840841e-07,
"loss": 0.1261,
"step": 91500
},
{
"epoch": 275.0,
"eval_accuracy": 0.9801,
"eval_loss": 0.08086758852005005,
"eval_runtime": 12.9048,
"eval_samples_per_second": 774.907,
"eval_steps_per_second": 3.1,
"step": 91575
},
{
"epoch": 276.0,
"eval_accuracy": 0.9806,
"eval_loss": 0.080258309841156,
"eval_runtime": 14.2222,
"eval_samples_per_second": 703.127,
"eval_steps_per_second": 2.813,
"step": 91908
},
{
"epoch": 276.28,
"grad_norm": 8.529864311218262,
"learning_rate": 7.907907907907908e-07,
"loss": 0.1219,
"step": 92000
},
{
"epoch": 277.0,
"eval_accuracy": 0.9803,
"eval_loss": 0.08069344609975815,
"eval_runtime": 13.3763,
"eval_samples_per_second": 747.589,
"eval_steps_per_second": 2.99,
"step": 92241
},
{
"epoch": 277.78,
"grad_norm": 5.7626051902771,
"learning_rate": 7.407407407407407e-07,
"loss": 0.1235,
"step": 92500
},
{
"epoch": 278.0,
"eval_accuracy": 0.9812,
"eval_loss": 0.0805598720908165,
"eval_runtime": 13.2417,
"eval_samples_per_second": 755.192,
"eval_steps_per_second": 3.021,
"step": 92574
},
{
"epoch": 279.0,
"eval_accuracy": 0.9807,
"eval_loss": 0.07991771399974823,
"eval_runtime": 12.9989,
"eval_samples_per_second": 769.296,
"eval_steps_per_second": 3.077,
"step": 92907
},
{
"epoch": 279.28,
"grad_norm": 12.886475563049316,
"learning_rate": 6.906906906906907e-07,
"loss": 0.1232,
"step": 93000
},
{
"epoch": 280.0,
"eval_accuracy": 0.9805,
"eval_loss": 0.08009103685617447,
"eval_runtime": 14.1951,
"eval_samples_per_second": 704.469,
"eval_steps_per_second": 2.818,
"step": 93240
},
{
"epoch": 280.78,
"grad_norm": 13.245797157287598,
"learning_rate": 6.406406406406407e-07,
"loss": 0.1236,
"step": 93500
},
{
"epoch": 281.0,
"eval_accuracy": 0.9812,
"eval_loss": 0.08077774941921234,
"eval_runtime": 13.9349,
"eval_samples_per_second": 717.624,
"eval_steps_per_second": 2.87,
"step": 93573
},
{
"epoch": 282.0,
"eval_accuracy": 0.9807,
"eval_loss": 0.08111685514450073,
"eval_runtime": 13.0495,
"eval_samples_per_second": 766.313,
"eval_steps_per_second": 3.065,
"step": 93906
},
{
"epoch": 282.28,
"grad_norm": 6.8997673988342285,
"learning_rate": 5.905905905905906e-07,
"loss": 0.1195,
"step": 94000
},
{
"epoch": 283.0,
"eval_accuracy": 0.9804,
"eval_loss": 0.08137265592813492,
"eval_runtime": 13.163,
"eval_samples_per_second": 759.705,
"eval_steps_per_second": 3.039,
"step": 94239
},
{
"epoch": 283.78,
"grad_norm": 12.197209358215332,
"learning_rate": 5.405405405405406e-07,
"loss": 0.1191,
"step": 94500
},
{
"epoch": 284.0,
"eval_accuracy": 0.9804,
"eval_loss": 0.08120004087686539,
"eval_runtime": 12.9217,
"eval_samples_per_second": 773.893,
"eval_steps_per_second": 3.096,
"step": 94572
},
{
"epoch": 285.0,
"eval_accuracy": 0.9805,
"eval_loss": 0.08181598037481308,
"eval_runtime": 12.828,
"eval_samples_per_second": 779.547,
"eval_steps_per_second": 3.118,
"step": 94905
},
{
"epoch": 285.29,
"grad_norm": 6.001578330993652,
"learning_rate": 4.904904904904905e-07,
"loss": 0.1205,
"step": 95000
},
{
"epoch": 286.0,
"eval_accuracy": 0.9807,
"eval_loss": 0.08141326904296875,
"eval_runtime": 13.7647,
"eval_samples_per_second": 726.495,
"eval_steps_per_second": 2.906,
"step": 95238
},
{
"epoch": 286.79,
"grad_norm": 9.633207321166992,
"learning_rate": 4.4044044044044046e-07,
"loss": 0.1203,
"step": 95500
},
{
"epoch": 287.0,
"eval_accuracy": 0.9808,
"eval_loss": 0.08182702958583832,
"eval_runtime": 14.1767,
"eval_samples_per_second": 705.381,
"eval_steps_per_second": 2.822,
"step": 95571
},
{
"epoch": 288.0,
"eval_accuracy": 0.9806,
"eval_loss": 0.08031768351793289,
"eval_runtime": 14.019,
"eval_samples_per_second": 713.316,
"eval_steps_per_second": 2.853,
"step": 95904
},
{
"epoch": 288.29,
"grad_norm": 9.451753616333008,
"learning_rate": 3.903903903903904e-07,
"loss": 0.1197,
"step": 96000
},
{
"epoch": 289.0,
"eval_accuracy": 0.9812,
"eval_loss": 0.0809708833694458,
"eval_runtime": 13.7936,
"eval_samples_per_second": 724.975,
"eval_steps_per_second": 2.9,
"step": 96237
},
{
"epoch": 289.79,
"grad_norm": 10.313632011413574,
"learning_rate": 3.403403403403404e-07,
"loss": 0.1233,
"step": 96500
},
{
"epoch": 290.0,
"eval_accuracy": 0.9811,
"eval_loss": 0.08130063861608505,
"eval_runtime": 13.4821,
"eval_samples_per_second": 741.722,
"eval_steps_per_second": 2.967,
"step": 96570
},
{
"epoch": 291.0,
"eval_accuracy": 0.9813,
"eval_loss": 0.08096129447221756,
"eval_runtime": 13.9986,
"eval_samples_per_second": 714.359,
"eval_steps_per_second": 2.857,
"step": 96903
},
{
"epoch": 291.29,
"grad_norm": 6.7220892906188965,
"learning_rate": 2.9029029029029035e-07,
"loss": 0.12,
"step": 97000
},
{
"epoch": 292.0,
"eval_accuracy": 0.9813,
"eval_loss": 0.08056668192148209,
"eval_runtime": 13.2921,
"eval_samples_per_second": 752.329,
"eval_steps_per_second": 3.009,
"step": 97236
},
{
"epoch": 292.79,
"grad_norm": 7.212859630584717,
"learning_rate": 2.4024024024024026e-07,
"loss": 0.1219,
"step": 97500
},
{
"epoch": 293.0,
"eval_accuracy": 0.9816,
"eval_loss": 0.08098697662353516,
"eval_runtime": 13.5812,
"eval_samples_per_second": 736.31,
"eval_steps_per_second": 2.945,
"step": 97569
},
{
"epoch": 294.0,
"eval_accuracy": 0.9815,
"eval_loss": 0.08067005127668381,
"eval_runtime": 12.9034,
"eval_samples_per_second": 774.988,
"eval_steps_per_second": 3.1,
"step": 97902
},
{
"epoch": 294.29,
"grad_norm": 7.5087409019470215,
"learning_rate": 1.9019019019019022e-07,
"loss": 0.1202,
"step": 98000
},
{
"epoch": 295.0,
"eval_accuracy": 0.9813,
"eval_loss": 0.08077917248010635,
"eval_runtime": 13.4699,
"eval_samples_per_second": 742.397,
"eval_steps_per_second": 2.97,
"step": 98235
},
{
"epoch": 295.8,
"grad_norm": 7.660182952880859,
"learning_rate": 1.4014014014014016e-07,
"loss": 0.1228,
"step": 98500
},
{
"epoch": 296.0,
"eval_accuracy": 0.9815,
"eval_loss": 0.0807722955942154,
"eval_runtime": 13.0168,
"eval_samples_per_second": 768.237,
"eval_steps_per_second": 3.073,
"step": 98568
},
{
"epoch": 297.0,
"eval_accuracy": 0.9813,
"eval_loss": 0.08067157864570618,
"eval_runtime": 13.4303,
"eval_samples_per_second": 744.586,
"eval_steps_per_second": 2.978,
"step": 98901
},
{
"epoch": 297.3,
"grad_norm": 10.4266357421875,
"learning_rate": 9.00900900900901e-08,
"loss": 0.1212,
"step": 99000
},
{
"epoch": 298.0,
"eval_accuracy": 0.9812,
"eval_loss": 0.08074088394641876,
"eval_runtime": 12.8481,
"eval_samples_per_second": 778.327,
"eval_steps_per_second": 3.113,
"step": 99234
},
{
"epoch": 298.8,
"grad_norm": 10.557640075683594,
"learning_rate": 4.004004004004004e-08,
"loss": 0.1214,
"step": 99500
},
{
"epoch": 299.0,
"eval_accuracy": 0.9812,
"eval_loss": 0.0807051733136177,
"eval_runtime": 13.1178,
"eval_samples_per_second": 762.323,
"eval_steps_per_second": 3.049,
"step": 99567
},
{
"epoch": 300.0,
"eval_accuracy": 0.981,
"eval_loss": 0.08068788051605225,
"eval_runtime": 12.9887,
"eval_samples_per_second": 769.902,
"eval_steps_per_second": 3.08,
"step": 99900
},
{
"epoch": 300.0,
"step": 99900,
"total_flos": 3.1698470226124734e+20,
"train_loss": 0.17093151241451413,
"train_runtime": 47820.897,
"train_samples_per_second": 266.62,
"train_steps_per_second": 2.089
}
],
"logging_steps": 500,
"max_steps": 99900,
"num_input_tokens_seen": 0,
"num_train_epochs": 300,
"save_steps": 500,
"total_flos": 3.1698470226124734e+20,
"train_batch_size": 128,
"trial_name": null,
"trial_params": null
}