cifar10_vit-base / trainer_state.json
jialicheng's picture
Upload folder using huggingface_hub
3af8eb8 verified
{
"best_metric": 0.9899,
"best_model_checkpoint": "../../checkpoint/cifar10/vit-base/checkpoint-18981",
"epoch": 100.0,
"eval_steps": 500,
"global_step": 33300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.0,
"eval_accuracy": 0.9599,
"eval_loss": 0.7045394778251648,
"eval_runtime": 32.9965,
"eval_samples_per_second": 303.063,
"eval_steps_per_second": 1.212,
"step": 333
},
{
"epoch": 1.5,
"grad_norm": 2.1130874156951904,
"learning_rate": 9.849849849849851e-06,
"loss": 1.2275,
"step": 500
},
{
"epoch": 2.0,
"eval_accuracy": 0.9721,
"eval_loss": 0.38615667819976807,
"eval_runtime": 27.3127,
"eval_samples_per_second": 366.13,
"eval_steps_per_second": 1.465,
"step": 666
},
{
"epoch": 3.0,
"eval_accuracy": 0.9771,
"eval_loss": 0.273359090089798,
"eval_runtime": 27.1217,
"eval_samples_per_second": 368.709,
"eval_steps_per_second": 1.475,
"step": 999
},
{
"epoch": 3.0,
"grad_norm": 2.09121036529541,
"learning_rate": 9.699699699699701e-06,
"loss": 0.4176,
"step": 1000
},
{
"epoch": 4.0,
"eval_accuracy": 0.9794,
"eval_loss": 0.21269012987613678,
"eval_runtime": 33.0791,
"eval_samples_per_second": 302.305,
"eval_steps_per_second": 1.209,
"step": 1332
},
{
"epoch": 4.5,
"grad_norm": 2.6250178813934326,
"learning_rate": 9.54954954954955e-06,
"loss": 0.3859,
"step": 1500
},
{
"epoch": 5.0,
"eval_accuracy": 0.9822,
"eval_loss": 0.17196297645568848,
"eval_runtime": 27.2002,
"eval_samples_per_second": 367.645,
"eval_steps_per_second": 1.471,
"step": 1665
},
{
"epoch": 6.0,
"eval_accuracy": 0.9834,
"eval_loss": 0.1458989977836609,
"eval_runtime": 26.7049,
"eval_samples_per_second": 374.463,
"eval_steps_per_second": 1.498,
"step": 1998
},
{
"epoch": 6.01,
"grad_norm": 3.6171772480010986,
"learning_rate": 9.3993993993994e-06,
"loss": 0.32,
"step": 2000
},
{
"epoch": 7.0,
"eval_accuracy": 0.9843,
"eval_loss": 0.12406200915575027,
"eval_runtime": 26.8675,
"eval_samples_per_second": 372.197,
"eval_steps_per_second": 1.489,
"step": 2331
},
{
"epoch": 7.51,
"grad_norm": 2.88914155960083,
"learning_rate": 9.24924924924925e-06,
"loss": 0.2795,
"step": 2500
},
{
"epoch": 8.0,
"eval_accuracy": 0.9846,
"eval_loss": 0.11055979877710342,
"eval_runtime": 26.4102,
"eval_samples_per_second": 378.641,
"eval_steps_per_second": 1.515,
"step": 2664
},
{
"epoch": 9.0,
"eval_accuracy": 0.9861,
"eval_loss": 0.09506094455718994,
"eval_runtime": 26.6993,
"eval_samples_per_second": 374.542,
"eval_steps_per_second": 1.498,
"step": 2997
},
{
"epoch": 9.01,
"grad_norm": 3.4046921730041504,
"learning_rate": 9.0990990990991e-06,
"loss": 0.2489,
"step": 3000
},
{
"epoch": 10.0,
"eval_accuracy": 0.9856,
"eval_loss": 0.08772800117731094,
"eval_runtime": 26.376,
"eval_samples_per_second": 379.132,
"eval_steps_per_second": 1.517,
"step": 3330
},
{
"epoch": 10.51,
"grad_norm": 1.4524540901184082,
"learning_rate": 8.94894894894895e-06,
"loss": 0.2284,
"step": 3500
},
{
"epoch": 11.0,
"eval_accuracy": 0.987,
"eval_loss": 0.07828080654144287,
"eval_runtime": 26.6208,
"eval_samples_per_second": 375.646,
"eval_steps_per_second": 1.503,
"step": 3663
},
{
"epoch": 12.0,
"eval_accuracy": 0.9861,
"eval_loss": 0.07426337152719498,
"eval_runtime": 27.2182,
"eval_samples_per_second": 367.401,
"eval_steps_per_second": 1.47,
"step": 3996
},
{
"epoch": 12.01,
"grad_norm": 1.878126859664917,
"learning_rate": 8.798798798798799e-06,
"loss": 0.2139,
"step": 4000
},
{
"epoch": 13.0,
"eval_accuracy": 0.9883,
"eval_loss": 0.06659487634897232,
"eval_runtime": 27.3633,
"eval_samples_per_second": 365.453,
"eval_steps_per_second": 1.462,
"step": 4329
},
{
"epoch": 13.51,
"grad_norm": 2.9144155979156494,
"learning_rate": 8.64864864864865e-06,
"loss": 0.2019,
"step": 4500
},
{
"epoch": 14.0,
"eval_accuracy": 0.9862,
"eval_loss": 0.06538616865873337,
"eval_runtime": 26.5973,
"eval_samples_per_second": 375.978,
"eval_steps_per_second": 1.504,
"step": 4662
},
{
"epoch": 15.0,
"eval_accuracy": 0.9875,
"eval_loss": 0.060847021639347076,
"eval_runtime": 27.2607,
"eval_samples_per_second": 366.829,
"eval_steps_per_second": 1.467,
"step": 4995
},
{
"epoch": 15.02,
"grad_norm": 2.323160409927368,
"learning_rate": 8.4984984984985e-06,
"loss": 0.1882,
"step": 5000
},
{
"epoch": 16.0,
"eval_accuracy": 0.9875,
"eval_loss": 0.05937081202864647,
"eval_runtime": 27.7458,
"eval_samples_per_second": 360.415,
"eval_steps_per_second": 1.442,
"step": 5328
},
{
"epoch": 16.52,
"grad_norm": 1.7828189134597778,
"learning_rate": 8.348348348348348e-06,
"loss": 0.1845,
"step": 5500
},
{
"epoch": 17.0,
"eval_accuracy": 0.9878,
"eval_loss": 0.054522428661584854,
"eval_runtime": 26.6197,
"eval_samples_per_second": 375.662,
"eval_steps_per_second": 1.503,
"step": 5661
},
{
"epoch": 18.0,
"eval_accuracy": 0.9885,
"eval_loss": 0.05343218520283699,
"eval_runtime": 26.6787,
"eval_samples_per_second": 374.831,
"eval_steps_per_second": 1.499,
"step": 5994
},
{
"epoch": 18.02,
"grad_norm": 2.5986483097076416,
"learning_rate": 8.198198198198198e-06,
"loss": 0.1762,
"step": 6000
},
{
"epoch": 19.0,
"eval_accuracy": 0.9876,
"eval_loss": 0.05616134777665138,
"eval_runtime": 27.0562,
"eval_samples_per_second": 369.601,
"eval_steps_per_second": 1.478,
"step": 6327
},
{
"epoch": 19.52,
"grad_norm": 2.639050006866455,
"learning_rate": 8.048048048048048e-06,
"loss": 0.1629,
"step": 6500
},
{
"epoch": 20.0,
"eval_accuracy": 0.9879,
"eval_loss": 0.05099354684352875,
"eval_runtime": 27.7176,
"eval_samples_per_second": 360.781,
"eval_steps_per_second": 1.443,
"step": 6660
},
{
"epoch": 21.0,
"eval_accuracy": 0.9889,
"eval_loss": 0.04877820238471031,
"eval_runtime": 26.5357,
"eval_samples_per_second": 376.85,
"eval_steps_per_second": 1.507,
"step": 6993
},
{
"epoch": 21.02,
"grad_norm": 3.4336190223693848,
"learning_rate": 7.897897897897899e-06,
"loss": 0.1622,
"step": 7000
},
{
"epoch": 22.0,
"eval_accuracy": 0.9879,
"eval_loss": 0.04886335879564285,
"eval_runtime": 27.6701,
"eval_samples_per_second": 361.401,
"eval_steps_per_second": 1.446,
"step": 7326
},
{
"epoch": 22.52,
"grad_norm": 3.954786777496338,
"learning_rate": 7.747747747747749e-06,
"loss": 0.1621,
"step": 7500
},
{
"epoch": 23.0,
"eval_accuracy": 0.9881,
"eval_loss": 0.04821654036641121,
"eval_runtime": 26.7986,
"eval_samples_per_second": 373.153,
"eval_steps_per_second": 1.493,
"step": 7659
},
{
"epoch": 24.0,
"eval_accuracy": 0.9886,
"eval_loss": 0.04642421752214432,
"eval_runtime": 27.0748,
"eval_samples_per_second": 369.347,
"eval_steps_per_second": 1.477,
"step": 7992
},
{
"epoch": 24.02,
"grad_norm": 2.539337396621704,
"learning_rate": 7.597597597597598e-06,
"loss": 0.1518,
"step": 8000
},
{
"epoch": 25.0,
"eval_accuracy": 0.9887,
"eval_loss": 0.04640224575996399,
"eval_runtime": 26.7195,
"eval_samples_per_second": 374.258,
"eval_steps_per_second": 1.497,
"step": 8325
},
{
"epoch": 25.53,
"grad_norm": 1.5434321165084839,
"learning_rate": 7.447447447447448e-06,
"loss": 0.151,
"step": 8500
},
{
"epoch": 26.0,
"eval_accuracy": 0.9884,
"eval_loss": 0.04765336588025093,
"eval_runtime": 26.3496,
"eval_samples_per_second": 379.513,
"eval_steps_per_second": 1.518,
"step": 8658
},
{
"epoch": 27.0,
"eval_accuracy": 0.9886,
"eval_loss": 0.04709744080901146,
"eval_runtime": 27.5627,
"eval_samples_per_second": 362.809,
"eval_steps_per_second": 1.451,
"step": 8991
},
{
"epoch": 27.03,
"grad_norm": 1.4213284254074097,
"learning_rate": 7.297297297297298e-06,
"loss": 0.1486,
"step": 9000
},
{
"epoch": 28.0,
"eval_accuracy": 0.9882,
"eval_loss": 0.048934612423181534,
"eval_runtime": 26.5141,
"eval_samples_per_second": 377.157,
"eval_steps_per_second": 1.509,
"step": 9324
},
{
"epoch": 28.53,
"grad_norm": 3.0046286582946777,
"learning_rate": 7.147147147147148e-06,
"loss": 0.147,
"step": 9500
},
{
"epoch": 29.0,
"eval_accuracy": 0.9884,
"eval_loss": 0.04772612452507019,
"eval_runtime": 27.447,
"eval_samples_per_second": 364.339,
"eval_steps_per_second": 1.457,
"step": 9657
},
{
"epoch": 30.0,
"eval_accuracy": 0.9883,
"eval_loss": 0.049376897513866425,
"eval_runtime": 32.9017,
"eval_samples_per_second": 303.936,
"eval_steps_per_second": 1.216,
"step": 9990
},
{
"epoch": 30.03,
"grad_norm": 2.6233084201812744,
"learning_rate": 6.996996996996997e-06,
"loss": 0.1412,
"step": 10000
},
{
"epoch": 31.0,
"eval_accuracy": 0.9881,
"eval_loss": 0.04674990102648735,
"eval_runtime": 27.7335,
"eval_samples_per_second": 360.575,
"eval_steps_per_second": 1.442,
"step": 10323
},
{
"epoch": 31.53,
"grad_norm": 2.2710893154144287,
"learning_rate": 6.846846846846848e-06,
"loss": 0.1403,
"step": 10500
},
{
"epoch": 32.0,
"eval_accuracy": 0.9888,
"eval_loss": 0.04444491118192673,
"eval_runtime": 26.7278,
"eval_samples_per_second": 374.143,
"eval_steps_per_second": 1.497,
"step": 10656
},
{
"epoch": 33.0,
"eval_accuracy": 0.9888,
"eval_loss": 0.04506918787956238,
"eval_runtime": 26.7164,
"eval_samples_per_second": 374.302,
"eval_steps_per_second": 1.497,
"step": 10989
},
{
"epoch": 33.03,
"grad_norm": 1.3235912322998047,
"learning_rate": 6.696696696696697e-06,
"loss": 0.1373,
"step": 11000
},
{
"epoch": 34.0,
"eval_accuracy": 0.9887,
"eval_loss": 0.046430543065071106,
"eval_runtime": 27.3757,
"eval_samples_per_second": 365.288,
"eval_steps_per_second": 1.461,
"step": 11322
},
{
"epoch": 34.53,
"grad_norm": 3.1201539039611816,
"learning_rate": 6.546546546546547e-06,
"loss": 0.1379,
"step": 11500
},
{
"epoch": 35.0,
"eval_accuracy": 0.9896,
"eval_loss": 0.04377752169966698,
"eval_runtime": 26.9572,
"eval_samples_per_second": 370.958,
"eval_steps_per_second": 1.484,
"step": 11655
},
{
"epoch": 36.0,
"eval_accuracy": 0.9887,
"eval_loss": 0.044030845165252686,
"eval_runtime": 27.4449,
"eval_samples_per_second": 364.366,
"eval_steps_per_second": 1.457,
"step": 11988
},
{
"epoch": 36.04,
"grad_norm": 1.388404130935669,
"learning_rate": 6.396396396396397e-06,
"loss": 0.1375,
"step": 12000
},
{
"epoch": 37.0,
"eval_accuracy": 0.9881,
"eval_loss": 0.046024248003959656,
"eval_runtime": 27.3415,
"eval_samples_per_second": 365.744,
"eval_steps_per_second": 1.463,
"step": 12321
},
{
"epoch": 37.54,
"grad_norm": 4.725052833557129,
"learning_rate": 6.246246246246247e-06,
"loss": 0.1377,
"step": 12500
},
{
"epoch": 38.0,
"eval_accuracy": 0.9896,
"eval_loss": 0.04353851079940796,
"eval_runtime": 26.7809,
"eval_samples_per_second": 373.4,
"eval_steps_per_second": 1.494,
"step": 12654
},
{
"epoch": 39.0,
"eval_accuracy": 0.989,
"eval_loss": 0.046141043305397034,
"eval_runtime": 26.7652,
"eval_samples_per_second": 373.619,
"eval_steps_per_second": 1.494,
"step": 12987
},
{
"epoch": 39.04,
"grad_norm": 1.4163159132003784,
"learning_rate": 6.096096096096097e-06,
"loss": 0.1332,
"step": 13000
},
{
"epoch": 40.0,
"eval_accuracy": 0.9897,
"eval_loss": 0.04419806972146034,
"eval_runtime": 26.8089,
"eval_samples_per_second": 373.011,
"eval_steps_per_second": 1.492,
"step": 13320
},
{
"epoch": 40.54,
"grad_norm": 2.823390007019043,
"learning_rate": 5.945945945945947e-06,
"loss": 0.1306,
"step": 13500
},
{
"epoch": 41.0,
"eval_accuracy": 0.9894,
"eval_loss": 0.04626644402742386,
"eval_runtime": 27.1555,
"eval_samples_per_second": 368.25,
"eval_steps_per_second": 1.473,
"step": 13653
},
{
"epoch": 42.0,
"eval_accuracy": 0.9892,
"eval_loss": 0.04492880031466484,
"eval_runtime": 27.5197,
"eval_samples_per_second": 363.376,
"eval_steps_per_second": 1.454,
"step": 13986
},
{
"epoch": 42.04,
"grad_norm": 2.2915682792663574,
"learning_rate": 5.7957957957957965e-06,
"loss": 0.1289,
"step": 14000
},
{
"epoch": 43.0,
"eval_accuracy": 0.989,
"eval_loss": 0.045636676251888275,
"eval_runtime": 27.1073,
"eval_samples_per_second": 368.904,
"eval_steps_per_second": 1.476,
"step": 14319
},
{
"epoch": 43.54,
"grad_norm": 2.901336669921875,
"learning_rate": 5.645645645645647e-06,
"loss": 0.128,
"step": 14500
},
{
"epoch": 44.0,
"eval_accuracy": 0.9892,
"eval_loss": 0.04512866213917732,
"eval_runtime": 26.2641,
"eval_samples_per_second": 380.748,
"eval_steps_per_second": 1.523,
"step": 14652
},
{
"epoch": 45.0,
"eval_accuracy": 0.9889,
"eval_loss": 0.045423876494169235,
"eval_runtime": 26.7396,
"eval_samples_per_second": 373.977,
"eval_steps_per_second": 1.496,
"step": 14985
},
{
"epoch": 45.05,
"grad_norm": 2.4207634925842285,
"learning_rate": 5.495495495495496e-06,
"loss": 0.1321,
"step": 15000
},
{
"epoch": 46.0,
"eval_accuracy": 0.9895,
"eval_loss": 0.04446360096335411,
"eval_runtime": 27.9958,
"eval_samples_per_second": 357.197,
"eval_steps_per_second": 1.429,
"step": 15318
},
{
"epoch": 46.55,
"grad_norm": 1.811728596687317,
"learning_rate": 5.345345345345346e-06,
"loss": 0.1222,
"step": 15500
},
{
"epoch": 47.0,
"eval_accuracy": 0.9893,
"eval_loss": 0.046666089445352554,
"eval_runtime": 27.8191,
"eval_samples_per_second": 359.465,
"eval_steps_per_second": 1.438,
"step": 15651
},
{
"epoch": 48.0,
"eval_accuracy": 0.9897,
"eval_loss": 0.046483393758535385,
"eval_runtime": 27.8293,
"eval_samples_per_second": 359.334,
"eval_steps_per_second": 1.437,
"step": 15984
},
{
"epoch": 48.05,
"grad_norm": 2.140018939971924,
"learning_rate": 5.195195195195195e-06,
"loss": 0.122,
"step": 16000
},
{
"epoch": 49.0,
"eval_accuracy": 0.9896,
"eval_loss": 0.045183293521404266,
"eval_runtime": 27.5683,
"eval_samples_per_second": 362.736,
"eval_steps_per_second": 1.451,
"step": 16317
},
{
"epoch": 49.55,
"grad_norm": 2.951556444168091,
"learning_rate": 5.045045045045045e-06,
"loss": 0.123,
"step": 16500
},
{
"epoch": 50.0,
"eval_accuracy": 0.9894,
"eval_loss": 0.047760359942913055,
"eval_runtime": 26.9033,
"eval_samples_per_second": 371.702,
"eval_steps_per_second": 1.487,
"step": 16650
},
{
"epoch": 51.0,
"eval_accuracy": 0.9892,
"eval_loss": 0.0464739091694355,
"eval_runtime": 26.782,
"eval_samples_per_second": 373.385,
"eval_steps_per_second": 1.494,
"step": 16983
},
{
"epoch": 51.05,
"grad_norm": 1.8529798984527588,
"learning_rate": 4.894894894894895e-06,
"loss": 0.1194,
"step": 17000
},
{
"epoch": 52.0,
"eval_accuracy": 0.9887,
"eval_loss": 0.048829443752765656,
"eval_runtime": 27.3149,
"eval_samples_per_second": 366.101,
"eval_steps_per_second": 1.464,
"step": 17316
},
{
"epoch": 52.55,
"grad_norm": 2.048177719116211,
"learning_rate": 4.7447447447447454e-06,
"loss": 0.1209,
"step": 17500
},
{
"epoch": 53.0,
"eval_accuracy": 0.9892,
"eval_loss": 0.047239311039447784,
"eval_runtime": 27.9195,
"eval_samples_per_second": 358.172,
"eval_steps_per_second": 1.433,
"step": 17649
},
{
"epoch": 54.0,
"eval_accuracy": 0.9897,
"eval_loss": 0.04561839625239372,
"eval_runtime": 27.8439,
"eval_samples_per_second": 359.145,
"eval_steps_per_second": 1.437,
"step": 17982
},
{
"epoch": 54.05,
"grad_norm": 2.6934187412261963,
"learning_rate": 4.594594594594596e-06,
"loss": 0.1212,
"step": 18000
},
{
"epoch": 55.0,
"eval_accuracy": 0.9893,
"eval_loss": 0.04664906859397888,
"eval_runtime": 27.5803,
"eval_samples_per_second": 362.577,
"eval_steps_per_second": 1.45,
"step": 18315
},
{
"epoch": 55.56,
"grad_norm": 1.0602542161941528,
"learning_rate": 4.444444444444444e-06,
"loss": 0.1187,
"step": 18500
},
{
"epoch": 56.0,
"eval_accuracy": 0.9894,
"eval_loss": 0.04580499231815338,
"eval_runtime": 27.9221,
"eval_samples_per_second": 358.139,
"eval_steps_per_second": 1.433,
"step": 18648
},
{
"epoch": 57.0,
"eval_accuracy": 0.9899,
"eval_loss": 0.04467911645770073,
"eval_runtime": 26.7649,
"eval_samples_per_second": 373.624,
"eval_steps_per_second": 1.494,
"step": 18981
},
{
"epoch": 57.06,
"grad_norm": 2.04836368560791,
"learning_rate": 4.294294294294294e-06,
"loss": 0.1193,
"step": 19000
},
{
"epoch": 58.0,
"eval_accuracy": 0.9892,
"eval_loss": 0.04191720113158226,
"eval_runtime": 26.7715,
"eval_samples_per_second": 373.531,
"eval_steps_per_second": 1.494,
"step": 19314
},
{
"epoch": 58.56,
"grad_norm": 2.186086654663086,
"learning_rate": 4.1441441441441446e-06,
"loss": 0.119,
"step": 19500
},
{
"epoch": 59.0,
"eval_accuracy": 0.9897,
"eval_loss": 0.04308394715189934,
"eval_runtime": 27.2975,
"eval_samples_per_second": 366.334,
"eval_steps_per_second": 1.465,
"step": 19647
},
{
"epoch": 60.0,
"eval_accuracy": 0.9894,
"eval_loss": 0.0437154695391655,
"eval_runtime": 27.418,
"eval_samples_per_second": 364.724,
"eval_steps_per_second": 1.459,
"step": 19980
},
{
"epoch": 60.06,
"grad_norm": 1.6354001760482788,
"learning_rate": 3.993993993993994e-06,
"loss": 0.1165,
"step": 20000
},
{
"epoch": 61.0,
"eval_accuracy": 0.9889,
"eval_loss": 0.04698378965258598,
"eval_runtime": 26.8506,
"eval_samples_per_second": 372.431,
"eval_steps_per_second": 1.49,
"step": 20313
},
{
"epoch": 61.56,
"grad_norm": 1.5250356197357178,
"learning_rate": 3.843843843843844e-06,
"loss": 0.1146,
"step": 20500
},
{
"epoch": 62.0,
"eval_accuracy": 0.989,
"eval_loss": 0.04715728014707565,
"eval_runtime": 26.5635,
"eval_samples_per_second": 376.456,
"eval_steps_per_second": 1.506,
"step": 20646
},
{
"epoch": 63.0,
"eval_accuracy": 0.9894,
"eval_loss": 0.044496119022369385,
"eval_runtime": 28.0733,
"eval_samples_per_second": 356.211,
"eval_steps_per_second": 1.425,
"step": 20979
},
{
"epoch": 63.06,
"grad_norm": 2.1367080211639404,
"learning_rate": 3.693693693693694e-06,
"loss": 0.1147,
"step": 21000
},
{
"epoch": 64.0,
"eval_accuracy": 0.9894,
"eval_loss": 0.04541689530014992,
"eval_runtime": 26.2197,
"eval_samples_per_second": 381.393,
"eval_steps_per_second": 1.526,
"step": 21312
},
{
"epoch": 64.56,
"grad_norm": 1.8817474842071533,
"learning_rate": 3.5435435435435437e-06,
"loss": 0.1117,
"step": 21500
},
{
"epoch": 65.0,
"eval_accuracy": 0.9899,
"eval_loss": 0.04460064694285393,
"eval_runtime": 27.2042,
"eval_samples_per_second": 367.59,
"eval_steps_per_second": 1.47,
"step": 21645
},
{
"epoch": 66.0,
"eval_accuracy": 0.989,
"eval_loss": 0.04820993170142174,
"eval_runtime": 27.2216,
"eval_samples_per_second": 367.355,
"eval_steps_per_second": 1.469,
"step": 21978
},
{
"epoch": 66.07,
"grad_norm": 4.140334129333496,
"learning_rate": 3.393393393393394e-06,
"loss": 0.1137,
"step": 22000
},
{
"epoch": 67.0,
"eval_accuracy": 0.9895,
"eval_loss": 0.04575618728995323,
"eval_runtime": 27.7597,
"eval_samples_per_second": 360.234,
"eval_steps_per_second": 1.441,
"step": 22311
},
{
"epoch": 67.57,
"grad_norm": 1.3289170265197754,
"learning_rate": 3.2432432432432437e-06,
"loss": 0.1145,
"step": 22500
},
{
"epoch": 68.0,
"eval_accuracy": 0.989,
"eval_loss": 0.046150561422109604,
"eval_runtime": 27.6364,
"eval_samples_per_second": 361.842,
"eval_steps_per_second": 1.447,
"step": 22644
},
{
"epoch": 69.0,
"eval_accuracy": 0.9894,
"eval_loss": 0.04607143998146057,
"eval_runtime": 27.3526,
"eval_samples_per_second": 365.596,
"eval_steps_per_second": 1.462,
"step": 22977
},
{
"epoch": 69.07,
"grad_norm": 3.2696564197540283,
"learning_rate": 3.0930930930930935e-06,
"loss": 0.1136,
"step": 23000
},
{
"epoch": 70.0,
"eval_accuracy": 0.9894,
"eval_loss": 0.04546944424510002,
"eval_runtime": 26.8405,
"eval_samples_per_second": 372.571,
"eval_steps_per_second": 1.49,
"step": 23310
},
{
"epoch": 70.57,
"grad_norm": 1.5183466672897339,
"learning_rate": 2.942942942942943e-06,
"loss": 0.1144,
"step": 23500
},
{
"epoch": 71.0,
"eval_accuracy": 0.9896,
"eval_loss": 0.04547298699617386,
"eval_runtime": 28.012,
"eval_samples_per_second": 356.99,
"eval_steps_per_second": 1.428,
"step": 23643
},
{
"epoch": 72.0,
"eval_accuracy": 0.9891,
"eval_loss": 0.045761577785015106,
"eval_runtime": 26.7415,
"eval_samples_per_second": 373.951,
"eval_steps_per_second": 1.496,
"step": 23976
},
{
"epoch": 72.07,
"grad_norm": 2.621833324432373,
"learning_rate": 2.7927927927927926e-06,
"loss": 0.1126,
"step": 24000
},
{
"epoch": 73.0,
"eval_accuracy": 0.989,
"eval_loss": 0.04621967673301697,
"eval_runtime": 26.6469,
"eval_samples_per_second": 375.278,
"eval_steps_per_second": 1.501,
"step": 24309
},
{
"epoch": 73.57,
"grad_norm": 1.9195489883422852,
"learning_rate": 2.642642642642643e-06,
"loss": 0.1065,
"step": 24500
},
{
"epoch": 74.0,
"eval_accuracy": 0.9894,
"eval_loss": 0.04627339914441109,
"eval_runtime": 26.8321,
"eval_samples_per_second": 372.687,
"eval_steps_per_second": 1.491,
"step": 24642
},
{
"epoch": 75.0,
"eval_accuracy": 0.9895,
"eval_loss": 0.04610699415206909,
"eval_runtime": 26.787,
"eval_samples_per_second": 373.316,
"eval_steps_per_second": 1.493,
"step": 24975
},
{
"epoch": 75.08,
"grad_norm": 2.639979839324951,
"learning_rate": 2.4924924924924926e-06,
"loss": 0.1136,
"step": 25000
},
{
"epoch": 76.0,
"eval_accuracy": 0.9893,
"eval_loss": 0.04624738171696663,
"eval_runtime": 27.255,
"eval_samples_per_second": 366.905,
"eval_steps_per_second": 1.468,
"step": 25308
},
{
"epoch": 76.58,
"grad_norm": 1.8884766101837158,
"learning_rate": 2.3423423423423424e-06,
"loss": 0.1117,
"step": 25500
},
{
"epoch": 77.0,
"eval_accuracy": 0.9886,
"eval_loss": 0.04540451616048813,
"eval_runtime": 26.8153,
"eval_samples_per_second": 372.921,
"eval_steps_per_second": 1.492,
"step": 25641
},
{
"epoch": 78.0,
"eval_accuracy": 0.9889,
"eval_loss": 0.045627232640981674,
"eval_runtime": 27.2138,
"eval_samples_per_second": 367.46,
"eval_steps_per_second": 1.47,
"step": 25974
},
{
"epoch": 78.08,
"grad_norm": 2.558985948562622,
"learning_rate": 2.192192192192192e-06,
"loss": 0.1106,
"step": 26000
},
{
"epoch": 79.0,
"eval_accuracy": 0.9887,
"eval_loss": 0.0453827828168869,
"eval_runtime": 26.7871,
"eval_samples_per_second": 373.313,
"eval_steps_per_second": 1.493,
"step": 26307
},
{
"epoch": 79.58,
"grad_norm": 1.8785403966903687,
"learning_rate": 2.0420420420420424e-06,
"loss": 0.1085,
"step": 26500
},
{
"epoch": 80.0,
"eval_accuracy": 0.9893,
"eval_loss": 0.04575396701693535,
"eval_runtime": 26.9022,
"eval_samples_per_second": 371.716,
"eval_steps_per_second": 1.487,
"step": 26640
},
{
"epoch": 81.0,
"eval_accuracy": 0.9892,
"eval_loss": 0.045756805688142776,
"eval_runtime": 27.2761,
"eval_samples_per_second": 366.621,
"eval_steps_per_second": 1.466,
"step": 26973
},
{
"epoch": 81.08,
"grad_norm": 1.840050458908081,
"learning_rate": 1.8918918918918922e-06,
"loss": 0.107,
"step": 27000
},
{
"epoch": 82.0,
"eval_accuracy": 0.9894,
"eval_loss": 0.04503399878740311,
"eval_runtime": 26.7515,
"eval_samples_per_second": 373.811,
"eval_steps_per_second": 1.495,
"step": 27306
},
{
"epoch": 82.58,
"grad_norm": 0.8984606862068176,
"learning_rate": 1.7417417417417418e-06,
"loss": 0.1112,
"step": 27500
},
{
"epoch": 83.0,
"eval_accuracy": 0.9896,
"eval_loss": 0.043780211359262466,
"eval_runtime": 27.2885,
"eval_samples_per_second": 366.454,
"eval_steps_per_second": 1.466,
"step": 27639
},
{
"epoch": 84.0,
"eval_accuracy": 0.9891,
"eval_loss": 0.045303359627723694,
"eval_runtime": 27.2638,
"eval_samples_per_second": 366.786,
"eval_steps_per_second": 1.467,
"step": 27972
},
{
"epoch": 84.08,
"grad_norm": 2.417297124862671,
"learning_rate": 1.5915915915915916e-06,
"loss": 0.1073,
"step": 28000
},
{
"epoch": 85.0,
"eval_accuracy": 0.9893,
"eval_loss": 0.04447495564818382,
"eval_runtime": 27.5559,
"eval_samples_per_second": 362.898,
"eval_steps_per_second": 1.452,
"step": 28305
},
{
"epoch": 85.59,
"grad_norm": 2.735337734222412,
"learning_rate": 1.4414414414414416e-06,
"loss": 0.1103,
"step": 28500
},
{
"epoch": 86.0,
"eval_accuracy": 0.9892,
"eval_loss": 0.044364869594573975,
"eval_runtime": 26.7863,
"eval_samples_per_second": 373.326,
"eval_steps_per_second": 1.493,
"step": 28638
},
{
"epoch": 87.0,
"eval_accuracy": 0.9891,
"eval_loss": 0.044344834983348846,
"eval_runtime": 26.6701,
"eval_samples_per_second": 374.952,
"eval_steps_per_second": 1.5,
"step": 28971
},
{
"epoch": 87.09,
"grad_norm": 2.378945827484131,
"learning_rate": 1.2912912912912913e-06,
"loss": 0.1074,
"step": 29000
},
{
"epoch": 88.0,
"eval_accuracy": 0.9893,
"eval_loss": 0.04596693813800812,
"eval_runtime": 27.7741,
"eval_samples_per_second": 360.047,
"eval_steps_per_second": 1.44,
"step": 29304
},
{
"epoch": 88.59,
"grad_norm": 1.8432105779647827,
"learning_rate": 1.1411411411411411e-06,
"loss": 0.1041,
"step": 29500
},
{
"epoch": 89.0,
"eval_accuracy": 0.9891,
"eval_loss": 0.045480720698833466,
"eval_runtime": 27.1037,
"eval_samples_per_second": 368.953,
"eval_steps_per_second": 1.476,
"step": 29637
},
{
"epoch": 90.0,
"eval_accuracy": 0.9894,
"eval_loss": 0.04399260878562927,
"eval_runtime": 26.7306,
"eval_samples_per_second": 374.102,
"eval_steps_per_second": 1.496,
"step": 29970
},
{
"epoch": 90.09,
"grad_norm": 2.23136305809021,
"learning_rate": 9.909909909909911e-07,
"loss": 0.1054,
"step": 30000
},
{
"epoch": 91.0,
"eval_accuracy": 0.9894,
"eval_loss": 0.04531684145331383,
"eval_runtime": 26.671,
"eval_samples_per_second": 374.939,
"eval_steps_per_second": 1.5,
"step": 30303
},
{
"epoch": 91.59,
"grad_norm": 2.097313642501831,
"learning_rate": 8.40840840840841e-07,
"loss": 0.1069,
"step": 30500
},
{
"epoch": 92.0,
"eval_accuracy": 0.989,
"eval_loss": 0.04511631652712822,
"eval_runtime": 27.1707,
"eval_samples_per_second": 368.043,
"eval_steps_per_second": 1.472,
"step": 30636
},
{
"epoch": 93.0,
"eval_accuracy": 0.9894,
"eval_loss": 0.044860344380140305,
"eval_runtime": 26.5648,
"eval_samples_per_second": 376.438,
"eval_steps_per_second": 1.506,
"step": 30969
},
{
"epoch": 93.09,
"grad_norm": 2.0008368492126465,
"learning_rate": 6.906906906906907e-07,
"loss": 0.1056,
"step": 31000
},
{
"epoch": 94.0,
"eval_accuracy": 0.9892,
"eval_loss": 0.04571190103888512,
"eval_runtime": 27.2069,
"eval_samples_per_second": 367.554,
"eval_steps_per_second": 1.47,
"step": 31302
},
{
"epoch": 94.59,
"grad_norm": 2.618077039718628,
"learning_rate": 5.405405405405406e-07,
"loss": 0.1069,
"step": 31500
},
{
"epoch": 95.0,
"eval_accuracy": 0.9892,
"eval_loss": 0.04491310566663742,
"eval_runtime": 26.6385,
"eval_samples_per_second": 375.396,
"eval_steps_per_second": 1.502,
"step": 31635
},
{
"epoch": 96.0,
"eval_accuracy": 0.9892,
"eval_loss": 0.0449623242020607,
"eval_runtime": 26.4041,
"eval_samples_per_second": 378.729,
"eval_steps_per_second": 1.515,
"step": 31968
},
{
"epoch": 96.1,
"grad_norm": 1.9157476425170898,
"learning_rate": 3.903903903903904e-07,
"loss": 0.1053,
"step": 32000
},
{
"epoch": 97.0,
"eval_accuracy": 0.9896,
"eval_loss": 0.044889744371175766,
"eval_runtime": 27.0991,
"eval_samples_per_second": 369.016,
"eval_steps_per_second": 1.476,
"step": 32301
},
{
"epoch": 97.6,
"grad_norm": 2.6996421813964844,
"learning_rate": 2.4024024024024026e-07,
"loss": 0.1068,
"step": 32500
},
{
"epoch": 98.0,
"eval_accuracy": 0.9893,
"eval_loss": 0.04525148868560791,
"eval_runtime": 27.1528,
"eval_samples_per_second": 368.286,
"eval_steps_per_second": 1.473,
"step": 32634
},
{
"epoch": 99.0,
"eval_accuracy": 0.9891,
"eval_loss": 0.045253388583660126,
"eval_runtime": 27.5963,
"eval_samples_per_second": 362.368,
"eval_steps_per_second": 1.449,
"step": 32967
},
{
"epoch": 99.1,
"grad_norm": 2.057509422302246,
"learning_rate": 9.00900900900901e-08,
"loss": 0.1059,
"step": 33000
},
{
"epoch": 100.0,
"eval_accuracy": 0.9892,
"eval_loss": 0.045226842164993286,
"eval_runtime": 26.356,
"eval_samples_per_second": 379.421,
"eval_steps_per_second": 1.518,
"step": 33300
},
{
"epoch": 100.0,
"step": 33300,
"total_flos": 3.293645700925431e+20,
"train_loss": 0.08339859163438952,
"train_runtime": 26101.948,
"train_samples_per_second": 162.823,
"train_steps_per_second": 1.276
}
],
"logging_steps": 500,
"max_steps": 33300,
"num_input_tokens_seen": 0,
"num_train_epochs": 100,
"save_steps": 500,
"total_flos": 3.293645700925431e+20,
"train_batch_size": 128,
"trial_name": null,
"trial_params": null
}