mbert_sin-sinh / trainer_state.json
DGurgurov's picture
Uploading checkpoint-88000 for mbert - sin-sinh
6fb0da5 verified
{
"best_metric": 0.7817407250404358,
"best_model_checkpoint": "./model_fine-tune/glot/mbert/sin-Sinh/checkpoint-88000",
"epoch": 57.40378343118069,
"eval_steps": 500,
"global_step": 88000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.32615786040443573,
"grad_norm": 11.542426109313965,
"learning_rate": 9.95e-05,
"loss": 1.3665,
"step": 500
},
{
"epoch": 0.32615786040443573,
"eval_accuracy": 0.7435959972211136,
"eval_loss": 1.184606671333313,
"eval_runtime": 96.3373,
"eval_samples_per_second": 127.282,
"eval_steps_per_second": 3.986,
"step": 500
},
{
"epoch": 0.6523157208088715,
"grad_norm": 4.45168399810791,
"learning_rate": 9.900000000000001e-05,
"loss": 1.2146,
"step": 1000
},
{
"epoch": 0.6523157208088715,
"eval_accuracy": 0.7612149949461918,
"eval_loss": 1.1183263063430786,
"eval_runtime": 95.7412,
"eval_samples_per_second": 128.074,
"eval_steps_per_second": 4.011,
"step": 1000
},
{
"epoch": 0.9784735812133072,
"grad_norm": 4.524835109710693,
"learning_rate": 9.850000000000001e-05,
"loss": 1.1681,
"step": 1500
},
{
"epoch": 0.9784735812133072,
"eval_accuracy": 0.768949081178049,
"eval_loss": 1.0730011463165283,
"eval_runtime": 95.6577,
"eval_samples_per_second": 128.186,
"eval_steps_per_second": 4.014,
"step": 1500
},
{
"epoch": 1.304631441617743,
"grad_norm": 6.722175598144531,
"learning_rate": 9.8e-05,
"loss": 1.1172,
"step": 2000
},
{
"epoch": 1.304631441617743,
"eval_accuracy": 0.7709548983117451,
"eval_loss": 1.0751733779907227,
"eval_runtime": 96.0121,
"eval_samples_per_second": 127.713,
"eval_steps_per_second": 3.999,
"step": 2000
},
{
"epoch": 1.6307893020221789,
"grad_norm": 4.5525407791137695,
"learning_rate": 9.75e-05,
"loss": 1.0856,
"step": 2500
},
{
"epoch": 1.6307893020221789,
"eval_accuracy": 0.7754712373171111,
"eval_loss": 1.0482443571090698,
"eval_runtime": 96.0314,
"eval_samples_per_second": 127.687,
"eval_steps_per_second": 3.999,
"step": 2500
},
{
"epoch": 1.9569471624266144,
"grad_norm": 3.725614309310913,
"learning_rate": 9.7e-05,
"loss": 1.0682,
"step": 3000
},
{
"epoch": 1.9569471624266144,
"eval_accuracy": 0.7820907707722566,
"eval_loss": 1.017106056213379,
"eval_runtime": 96.2201,
"eval_samples_per_second": 127.437,
"eval_steps_per_second": 3.991,
"step": 3000
},
{
"epoch": 2.2831050228310503,
"grad_norm": 4.34595251083374,
"learning_rate": 9.65e-05,
"loss": 1.046,
"step": 3500
},
{
"epoch": 2.2831050228310503,
"eval_accuracy": 0.7852632360034133,
"eval_loss": 1.006613850593567,
"eval_runtime": 95.8046,
"eval_samples_per_second": 127.99,
"eval_steps_per_second": 4.008,
"step": 3500
},
{
"epoch": 2.609262883235486,
"grad_norm": 3.8304195404052734,
"learning_rate": 9.6e-05,
"loss": 1.0252,
"step": 4000
},
{
"epoch": 2.609262883235486,
"eval_accuracy": 0.7844518509171077,
"eval_loss": 0.9925754070281982,
"eval_runtime": 96.0595,
"eval_samples_per_second": 127.65,
"eval_steps_per_second": 3.998,
"step": 4000
},
{
"epoch": 2.935420743639922,
"grad_norm": 4.004533290863037,
"learning_rate": 9.55e-05,
"loss": 1.0202,
"step": 4500
},
{
"epoch": 2.935420743639922,
"eval_accuracy": 0.7862023720892519,
"eval_loss": 0.9879273772239685,
"eval_runtime": 96.1151,
"eval_samples_per_second": 127.576,
"eval_steps_per_second": 3.995,
"step": 4500
},
{
"epoch": 3.2615786040443573,
"grad_norm": 5.68316650390625,
"learning_rate": 9.5e-05,
"loss": 0.9935,
"step": 5000
},
{
"epoch": 3.2615786040443573,
"eval_accuracy": 0.7892981535025771,
"eval_loss": 0.9794703722000122,
"eval_runtime": 95.6967,
"eval_samples_per_second": 128.134,
"eval_steps_per_second": 4.013,
"step": 5000
},
{
"epoch": 3.5877364644487932,
"grad_norm": 3.8201630115509033,
"learning_rate": 9.449999999999999e-05,
"loss": 0.9965,
"step": 5500
},
{
"epoch": 3.5877364644487932,
"eval_accuracy": 0.7902053479095879,
"eval_loss": 0.9639435410499573,
"eval_runtime": 96.059,
"eval_samples_per_second": 127.651,
"eval_steps_per_second": 3.998,
"step": 5500
},
{
"epoch": 3.9138943248532287,
"grad_norm": 4.568619728088379,
"learning_rate": 9.4e-05,
"loss": 0.9773,
"step": 6000
},
{
"epoch": 3.9138943248532287,
"eval_accuracy": 0.7901874809885227,
"eval_loss": 0.9642708897590637,
"eval_runtime": 96.0775,
"eval_samples_per_second": 127.626,
"eval_steps_per_second": 3.997,
"step": 6000
},
{
"epoch": 4.240052185257665,
"grad_norm": 2.4079413414001465,
"learning_rate": 9.350000000000001e-05,
"loss": 0.9573,
"step": 6500
},
{
"epoch": 4.240052185257665,
"eval_accuracy": 0.7912424302921408,
"eval_loss": 0.9602788686752319,
"eval_runtime": 95.6177,
"eval_samples_per_second": 128.24,
"eval_steps_per_second": 4.016,
"step": 6500
},
{
"epoch": 4.566210045662101,
"grad_norm": 3.638129711151123,
"learning_rate": 9.300000000000001e-05,
"loss": 0.9655,
"step": 7000
},
{
"epoch": 4.566210045662101,
"eval_accuracy": 0.7932507570316953,
"eval_loss": 0.9564482569694519,
"eval_runtime": 96.2782,
"eval_samples_per_second": 127.36,
"eval_steps_per_second": 3.988,
"step": 7000
},
{
"epoch": 4.892367906066536,
"grad_norm": 6.558393955230713,
"learning_rate": 9.250000000000001e-05,
"loss": 0.9555,
"step": 7500
},
{
"epoch": 4.892367906066536,
"eval_accuracy": 0.7944629506255089,
"eval_loss": 0.9509521722793579,
"eval_runtime": 96.1362,
"eval_samples_per_second": 127.548,
"eval_steps_per_second": 3.994,
"step": 7500
},
{
"epoch": 5.218525766470972,
"grad_norm": 3.637993335723877,
"learning_rate": 9.200000000000001e-05,
"loss": 0.9378,
"step": 8000
},
{
"epoch": 5.218525766470972,
"eval_accuracy": 0.7949035700239684,
"eval_loss": 0.9451656937599182,
"eval_runtime": 95.4917,
"eval_samples_per_second": 128.409,
"eval_steps_per_second": 4.021,
"step": 8000
},
{
"epoch": 5.544683626875408,
"grad_norm": 3.2590765953063965,
"learning_rate": 9.15e-05,
"loss": 0.942,
"step": 8500
},
{
"epoch": 5.544683626875408,
"eval_accuracy": 0.7976977603049797,
"eval_loss": 0.9348493814468384,
"eval_runtime": 95.6296,
"eval_samples_per_second": 128.224,
"eval_steps_per_second": 4.015,
"step": 8500
},
{
"epoch": 5.870841487279844,
"grad_norm": 6.074111461639404,
"learning_rate": 9.1e-05,
"loss": 0.9189,
"step": 9000
},
{
"epoch": 5.870841487279844,
"eval_accuracy": 0.7951165602308211,
"eval_loss": 0.9503134489059448,
"eval_runtime": 95.929,
"eval_samples_per_second": 127.824,
"eval_steps_per_second": 4.003,
"step": 9000
},
{
"epoch": 6.1969993476842795,
"grad_norm": 4.091071605682373,
"learning_rate": 9.05e-05,
"loss": 0.9096,
"step": 9500
},
{
"epoch": 6.1969993476842795,
"eval_accuracy": 0.7966632678695453,
"eval_loss": 0.9422316551208496,
"eval_runtime": 95.9886,
"eval_samples_per_second": 127.744,
"eval_steps_per_second": 4.0,
"step": 9500
},
{
"epoch": 6.523157208088715,
"grad_norm": 4.287664890289307,
"learning_rate": 9e-05,
"loss": 0.9207,
"step": 10000
},
{
"epoch": 6.523157208088715,
"eval_accuracy": 0.7982671256149266,
"eval_loss": 0.9268999695777893,
"eval_runtime": 95.9908,
"eval_samples_per_second": 127.741,
"eval_steps_per_second": 4.0,
"step": 10000
},
{
"epoch": 6.8493150684931505,
"grad_norm": 4.4528703689575195,
"learning_rate": 8.950000000000001e-05,
"loss": 0.9067,
"step": 10500
},
{
"epoch": 6.8493150684931505,
"eval_accuracy": 0.8010467333934564,
"eval_loss": 0.9209058284759521,
"eval_runtime": 95.8309,
"eval_samples_per_second": 127.955,
"eval_steps_per_second": 4.007,
"step": 10500
},
{
"epoch": 7.1754729288975865,
"grad_norm": 4.453877925872803,
"learning_rate": 8.900000000000001e-05,
"loss": 0.8987,
"step": 11000
},
{
"epoch": 7.1754729288975865,
"eval_accuracy": 0.8013618864649552,
"eval_loss": 0.9242467284202576,
"eval_runtime": 95.5629,
"eval_samples_per_second": 128.313,
"eval_steps_per_second": 4.018,
"step": 11000
},
{
"epoch": 7.501630789302022,
"grad_norm": 4.282083988189697,
"learning_rate": 8.850000000000001e-05,
"loss": 0.8906,
"step": 11500
},
{
"epoch": 7.501630789302022,
"eval_accuracy": 0.8037603951451062,
"eval_loss": 0.9170116186141968,
"eval_runtime": 96.0767,
"eval_samples_per_second": 127.627,
"eval_steps_per_second": 3.997,
"step": 11500
},
{
"epoch": 7.8277886497064575,
"grad_norm": 4.653249740600586,
"learning_rate": 8.800000000000001e-05,
"loss": 0.8976,
"step": 12000
},
{
"epoch": 7.8277886497064575,
"eval_accuracy": 0.8042002012072434,
"eval_loss": 0.9147453308105469,
"eval_runtime": 95.4281,
"eval_samples_per_second": 128.495,
"eval_steps_per_second": 4.024,
"step": 12000
},
{
"epoch": 8.153946510110893,
"grad_norm": 3.689419984817505,
"learning_rate": 8.75e-05,
"loss": 0.8822,
"step": 12500
},
{
"epoch": 8.153946510110893,
"eval_accuracy": 0.8048982122558644,
"eval_loss": 0.8996030688285828,
"eval_runtime": 96.1046,
"eval_samples_per_second": 127.59,
"eval_steps_per_second": 3.996,
"step": 12500
},
{
"epoch": 8.48010437051533,
"grad_norm": 3.341475009918213,
"learning_rate": 8.7e-05,
"loss": 0.8643,
"step": 13000
},
{
"epoch": 8.48010437051533,
"eval_accuracy": 0.8045987248855143,
"eval_loss": 0.9084787368774414,
"eval_runtime": 95.8893,
"eval_samples_per_second": 127.877,
"eval_steps_per_second": 4.005,
"step": 13000
},
{
"epoch": 8.806262230919765,
"grad_norm": 3.8555312156677246,
"learning_rate": 8.65e-05,
"loss": 0.8652,
"step": 13500
},
{
"epoch": 8.806262230919765,
"eval_accuracy": 0.8064863340113129,
"eval_loss": 0.891981840133667,
"eval_runtime": 96.0373,
"eval_samples_per_second": 127.68,
"eval_steps_per_second": 3.998,
"step": 13500
},
{
"epoch": 9.132420091324201,
"grad_norm": 3.0453758239746094,
"learning_rate": 8.6e-05,
"loss": 0.8624,
"step": 14000
},
{
"epoch": 9.132420091324201,
"eval_accuracy": 0.807120171356821,
"eval_loss": 0.8966971039772034,
"eval_runtime": 96.0645,
"eval_samples_per_second": 127.643,
"eval_steps_per_second": 3.997,
"step": 14000
},
{
"epoch": 9.458577951728637,
"grad_norm": 2.872584342956543,
"learning_rate": 8.55e-05,
"loss": 0.849,
"step": 14500
},
{
"epoch": 9.458577951728637,
"eval_accuracy": 0.805082546907549,
"eval_loss": 0.8947042226791382,
"eval_runtime": 95.5177,
"eval_samples_per_second": 128.374,
"eval_steps_per_second": 4.02,
"step": 14500
},
{
"epoch": 9.784735812133073,
"grad_norm": 3.990968942642212,
"learning_rate": 8.5e-05,
"loss": 0.8464,
"step": 15000
},
{
"epoch": 9.784735812133073,
"eval_accuracy": 0.8059118422518474,
"eval_loss": 0.8978227376937866,
"eval_runtime": 96.134,
"eval_samples_per_second": 127.551,
"eval_steps_per_second": 3.994,
"step": 15000
},
{
"epoch": 10.110893672537507,
"grad_norm": 4.263514995574951,
"learning_rate": 8.450000000000001e-05,
"loss": 0.8502,
"step": 15500
},
{
"epoch": 10.110893672537507,
"eval_accuracy": 0.8088704201234094,
"eval_loss": 0.8844937086105347,
"eval_runtime": 96.0116,
"eval_samples_per_second": 127.714,
"eval_steps_per_second": 4.0,
"step": 15500
},
{
"epoch": 10.437051532941943,
"grad_norm": 4.85188102722168,
"learning_rate": 8.4e-05,
"loss": 0.8368,
"step": 16000
},
{
"epoch": 10.437051532941943,
"eval_accuracy": 0.8090195118695352,
"eval_loss": 0.8942297101020813,
"eval_runtime": 95.8669,
"eval_samples_per_second": 127.907,
"eval_steps_per_second": 4.006,
"step": 16000
},
{
"epoch": 10.76320939334638,
"grad_norm": 4.126960754394531,
"learning_rate": 8.35e-05,
"loss": 0.8443,
"step": 16500
},
{
"epoch": 10.76320939334638,
"eval_accuracy": 0.80988295142817,
"eval_loss": 0.8783074021339417,
"eval_runtime": 95.4865,
"eval_samples_per_second": 128.416,
"eval_steps_per_second": 4.022,
"step": 16500
},
{
"epoch": 11.089367253750815,
"grad_norm": 3.0771892070770264,
"learning_rate": 8.3e-05,
"loss": 0.8407,
"step": 17000
},
{
"epoch": 11.089367253750815,
"eval_accuracy": 0.810145120489948,
"eval_loss": 0.8840489983558655,
"eval_runtime": 95.9033,
"eval_samples_per_second": 127.858,
"eval_steps_per_second": 4.004,
"step": 17000
},
{
"epoch": 11.415525114155251,
"grad_norm": 3.296961784362793,
"learning_rate": 8.25e-05,
"loss": 0.822,
"step": 17500
},
{
"epoch": 11.415525114155251,
"eval_accuracy": 0.8105256945571506,
"eval_loss": 0.8853065371513367,
"eval_runtime": 95.4717,
"eval_samples_per_second": 128.436,
"eval_steps_per_second": 4.022,
"step": 17500
},
{
"epoch": 11.741682974559687,
"grad_norm": 2.933551788330078,
"learning_rate": 8.2e-05,
"loss": 0.8373,
"step": 18000
},
{
"epoch": 11.741682974559687,
"eval_accuracy": 0.8086732617590695,
"eval_loss": 0.875987708568573,
"eval_runtime": 95.5854,
"eval_samples_per_second": 128.283,
"eval_steps_per_second": 4.017,
"step": 18000
},
{
"epoch": 12.067840834964123,
"grad_norm": 2.9668538570404053,
"learning_rate": 8.15e-05,
"loss": 0.82,
"step": 18500
},
{
"epoch": 12.067840834964123,
"eval_accuracy": 0.8100703443169197,
"eval_loss": 0.8864119648933411,
"eval_runtime": 95.6174,
"eval_samples_per_second": 128.24,
"eval_steps_per_second": 4.016,
"step": 18500
},
{
"epoch": 12.393998695368559,
"grad_norm": 3.0517213344573975,
"learning_rate": 8.1e-05,
"loss": 0.8075,
"step": 19000
},
{
"epoch": 12.393998695368559,
"eval_accuracy": 0.8106054996964357,
"eval_loss": 0.8773519396781921,
"eval_runtime": 95.5913,
"eval_samples_per_second": 128.275,
"eval_steps_per_second": 4.017,
"step": 19000
},
{
"epoch": 12.720156555772995,
"grad_norm": 2.811016321182251,
"learning_rate": 8.05e-05,
"loss": 0.8034,
"step": 19500
},
{
"epoch": 12.720156555772995,
"eval_accuracy": 0.812250382040057,
"eval_loss": 0.8746283650398254,
"eval_runtime": 96.0236,
"eval_samples_per_second": 127.698,
"eval_steps_per_second": 3.999,
"step": 19500
},
{
"epoch": 13.04631441617743,
"grad_norm": 3.343775987625122,
"learning_rate": 8e-05,
"loss": 0.812,
"step": 20000
},
{
"epoch": 13.04631441617743,
"eval_accuracy": 0.8116936347456187,
"eval_loss": 0.8740746974945068,
"eval_runtime": 96.1535,
"eval_samples_per_second": 127.525,
"eval_steps_per_second": 3.994,
"step": 20000
},
{
"epoch": 13.372472276581865,
"grad_norm": 3.361279010772705,
"learning_rate": 7.950000000000001e-05,
"loss": 0.7896,
"step": 20500
},
{
"epoch": 13.372472276581865,
"eval_accuracy": 0.8125255285808708,
"eval_loss": 0.8752043843269348,
"eval_runtime": 95.5714,
"eval_samples_per_second": 128.302,
"eval_steps_per_second": 4.018,
"step": 20500
},
{
"epoch": 13.698630136986301,
"grad_norm": 4.277713298797607,
"learning_rate": 7.900000000000001e-05,
"loss": 0.808,
"step": 21000
},
{
"epoch": 13.698630136986301,
"eval_accuracy": 0.8146125537606598,
"eval_loss": 0.8755714893341064,
"eval_runtime": 95.9658,
"eval_samples_per_second": 127.775,
"eval_steps_per_second": 4.001,
"step": 21000
},
{
"epoch": 14.024787997390737,
"grad_norm": 4.07389497756958,
"learning_rate": 7.850000000000001e-05,
"loss": 0.8066,
"step": 21500
},
{
"epoch": 14.024787997390737,
"eval_accuracy": 0.810667004970075,
"eval_loss": 0.8822521567344666,
"eval_runtime": 96.0854,
"eval_samples_per_second": 127.616,
"eval_steps_per_second": 3.996,
"step": 21500
},
{
"epoch": 14.350945857795173,
"grad_norm": 3.8551950454711914,
"learning_rate": 7.800000000000001e-05,
"loss": 0.7873,
"step": 22000
},
{
"epoch": 14.350945857795173,
"eval_accuracy": 0.8103896295640106,
"eval_loss": 0.8803927302360535,
"eval_runtime": 95.4874,
"eval_samples_per_second": 128.415,
"eval_steps_per_second": 4.021,
"step": 22000
},
{
"epoch": 14.677103718199609,
"grad_norm": 3.6163582801818848,
"learning_rate": 7.75e-05,
"loss": 0.7853,
"step": 22500
},
{
"epoch": 14.677103718199609,
"eval_accuracy": 0.8146700066154773,
"eval_loss": 0.862837016582489,
"eval_runtime": 95.5698,
"eval_samples_per_second": 128.304,
"eval_steps_per_second": 4.018,
"step": 22500
},
{
"epoch": 15.003261578604045,
"grad_norm": 2.713881492614746,
"learning_rate": 7.7e-05,
"loss": 0.7852,
"step": 23000
},
{
"epoch": 15.003261578604045,
"eval_accuracy": 0.8136574074074074,
"eval_loss": 0.8718428611755371,
"eval_runtime": 95.7231,
"eval_samples_per_second": 128.099,
"eval_steps_per_second": 4.012,
"step": 23000
},
{
"epoch": 15.32941943900848,
"grad_norm": 3.4165608882904053,
"learning_rate": 7.65e-05,
"loss": 0.7733,
"step": 23500
},
{
"epoch": 15.32941943900848,
"eval_accuracy": 0.813523767016291,
"eval_loss": 0.8665691614151001,
"eval_runtime": 96.1635,
"eval_samples_per_second": 127.512,
"eval_steps_per_second": 3.993,
"step": 23500
},
{
"epoch": 15.655577299412915,
"grad_norm": 4.0435028076171875,
"learning_rate": 7.6e-05,
"loss": 0.7837,
"step": 24000
},
{
"epoch": 15.655577299412915,
"eval_accuracy": 0.8137777942210218,
"eval_loss": 0.8687108159065247,
"eval_runtime": 95.6602,
"eval_samples_per_second": 128.183,
"eval_steps_per_second": 4.014,
"step": 24000
},
{
"epoch": 15.981735159817351,
"grad_norm": 4.136204719543457,
"learning_rate": 7.55e-05,
"loss": 0.7826,
"step": 24500
},
{
"epoch": 15.981735159817351,
"eval_accuracy": 0.8160837708759752,
"eval_loss": 0.8521152138710022,
"eval_runtime": 95.6741,
"eval_samples_per_second": 128.164,
"eval_steps_per_second": 4.014,
"step": 24500
},
{
"epoch": 16.307893020221787,
"grad_norm": 3.419949769973755,
"learning_rate": 7.500000000000001e-05,
"loss": 0.7583,
"step": 25000
},
{
"epoch": 16.307893020221787,
"eval_accuracy": 0.8170063016613471,
"eval_loss": 0.8535459041595459,
"eval_runtime": 95.3894,
"eval_samples_per_second": 128.547,
"eval_steps_per_second": 4.026,
"step": 25000
},
{
"epoch": 16.634050880626223,
"grad_norm": 3.9244208335876465,
"learning_rate": 7.450000000000001e-05,
"loss": 0.7598,
"step": 25500
},
{
"epoch": 16.634050880626223,
"eval_accuracy": 0.8164982668338373,
"eval_loss": 0.8608457446098328,
"eval_runtime": 96.0993,
"eval_samples_per_second": 127.597,
"eval_steps_per_second": 3.996,
"step": 25500
},
{
"epoch": 16.96020874103066,
"grad_norm": 3.6305317878723145,
"learning_rate": 7.4e-05,
"loss": 0.7802,
"step": 26000
},
{
"epoch": 16.96020874103066,
"eval_accuracy": 0.8151692519974184,
"eval_loss": 0.8590179681777954,
"eval_runtime": 95.6373,
"eval_samples_per_second": 128.214,
"eval_steps_per_second": 4.015,
"step": 26000
},
{
"epoch": 17.286366601435095,
"grad_norm": 2.6497814655303955,
"learning_rate": 7.35e-05,
"loss": 0.756,
"step": 26500
},
{
"epoch": 17.286366601435095,
"eval_accuracy": 0.817159829218489,
"eval_loss": 0.8574303388595581,
"eval_runtime": 96.134,
"eval_samples_per_second": 127.551,
"eval_steps_per_second": 3.994,
"step": 26500
},
{
"epoch": 17.61252446183953,
"grad_norm": 3.486994504928589,
"learning_rate": 7.3e-05,
"loss": 0.7676,
"step": 27000
},
{
"epoch": 17.61252446183953,
"eval_accuracy": 0.816771978631783,
"eval_loss": 0.8543539047241211,
"eval_runtime": 96.1458,
"eval_samples_per_second": 127.535,
"eval_steps_per_second": 3.994,
"step": 27000
},
{
"epoch": 17.938682322243967,
"grad_norm": 3.2243542671203613,
"learning_rate": 7.25e-05,
"loss": 0.7547,
"step": 27500
},
{
"epoch": 17.938682322243967,
"eval_accuracy": 0.8156156378296409,
"eval_loss": 0.8701485395431519,
"eval_runtime": 96.0383,
"eval_samples_per_second": 127.678,
"eval_steps_per_second": 3.998,
"step": 27500
},
{
"epoch": 18.264840182648403,
"grad_norm": 2.7559664249420166,
"learning_rate": 7.2e-05,
"loss": 0.7561,
"step": 28000
},
{
"epoch": 18.264840182648403,
"eval_accuracy": 0.8179125205918637,
"eval_loss": 0.8573756217956543,
"eval_runtime": 95.5829,
"eval_samples_per_second": 128.287,
"eval_steps_per_second": 4.017,
"step": 28000
},
{
"epoch": 18.59099804305284,
"grad_norm": 3.51413631439209,
"learning_rate": 7.15e-05,
"loss": 0.749,
"step": 28500
},
{
"epoch": 18.59099804305284,
"eval_accuracy": 0.8171728232288245,
"eval_loss": 0.8583955764770508,
"eval_runtime": 95.9961,
"eval_samples_per_second": 127.734,
"eval_steps_per_second": 4.0,
"step": 28500
},
{
"epoch": 18.917155903457274,
"grad_norm": 4.024128437042236,
"learning_rate": 7.1e-05,
"loss": 0.7537,
"step": 29000
},
{
"epoch": 18.917155903457274,
"eval_accuracy": 0.8167475998072432,
"eval_loss": 0.8595439195632935,
"eval_runtime": 95.6024,
"eval_samples_per_second": 128.26,
"eval_steps_per_second": 4.017,
"step": 29000
},
{
"epoch": 19.24331376386171,
"grad_norm": 3.6477324962615967,
"learning_rate": 7.05e-05,
"loss": 0.7487,
"step": 29500
},
{
"epoch": 19.24331376386171,
"eval_accuracy": 0.8184730854235579,
"eval_loss": 0.853278636932373,
"eval_runtime": 96.1748,
"eval_samples_per_second": 127.497,
"eval_steps_per_second": 3.993,
"step": 29500
},
{
"epoch": 19.569471624266146,
"grad_norm": 2.851353406906128,
"learning_rate": 7e-05,
"loss": 0.7344,
"step": 30000
},
{
"epoch": 19.569471624266146,
"eval_accuracy": 0.8171112162521809,
"eval_loss": 0.8451135158538818,
"eval_runtime": 96.1975,
"eval_samples_per_second": 127.467,
"eval_steps_per_second": 3.992,
"step": 30000
},
{
"epoch": 19.89562948467058,
"grad_norm": 3.1359875202178955,
"learning_rate": 6.95e-05,
"loss": 0.7403,
"step": 30500
},
{
"epoch": 19.89562948467058,
"eval_accuracy": 0.8189718509502995,
"eval_loss": 0.846347987651825,
"eval_runtime": 96.1126,
"eval_samples_per_second": 127.579,
"eval_steps_per_second": 3.995,
"step": 30500
},
{
"epoch": 20.221787345075015,
"grad_norm": 3.7490246295928955,
"learning_rate": 6.9e-05,
"loss": 0.7309,
"step": 31000
},
{
"epoch": 20.221787345075015,
"eval_accuracy": 0.818888083223582,
"eval_loss": 0.8462125658988953,
"eval_runtime": 95.5039,
"eval_samples_per_second": 128.393,
"eval_steps_per_second": 4.021,
"step": 31000
},
{
"epoch": 20.54794520547945,
"grad_norm": 4.7173590660095215,
"learning_rate": 6.850000000000001e-05,
"loss": 0.7343,
"step": 31500
},
{
"epoch": 20.54794520547945,
"eval_accuracy": 0.8196272538269742,
"eval_loss": 0.8415057063102722,
"eval_runtime": 95.5642,
"eval_samples_per_second": 128.312,
"eval_steps_per_second": 4.018,
"step": 31500
},
{
"epoch": 20.874103065883887,
"grad_norm": 2.962167501449585,
"learning_rate": 6.800000000000001e-05,
"loss": 0.7232,
"step": 32000
},
{
"epoch": 20.874103065883887,
"eval_accuracy": 0.8207100504439659,
"eval_loss": 0.8412309288978577,
"eval_runtime": 95.5324,
"eval_samples_per_second": 128.354,
"eval_steps_per_second": 4.02,
"step": 32000
},
{
"epoch": 21.200260926288323,
"grad_norm": 4.313079357147217,
"learning_rate": 6.750000000000001e-05,
"loss": 0.7354,
"step": 32500
},
{
"epoch": 21.200260926288323,
"eval_accuracy": 0.8217455730903164,
"eval_loss": 0.8336274027824402,
"eval_runtime": 96.048,
"eval_samples_per_second": 127.665,
"eval_steps_per_second": 3.998,
"step": 32500
},
{
"epoch": 21.52641878669276,
"grad_norm": 3.247512102127075,
"learning_rate": 6.7e-05,
"loss": 0.7157,
"step": 33000
},
{
"epoch": 21.52641878669276,
"eval_accuracy": 0.821757600384175,
"eval_loss": 0.8315772414207458,
"eval_runtime": 96.0261,
"eval_samples_per_second": 127.694,
"eval_steps_per_second": 3.999,
"step": 33000
},
{
"epoch": 21.852576647097194,
"grad_norm": 4.300323486328125,
"learning_rate": 6.65e-05,
"loss": 0.7122,
"step": 33500
},
{
"epoch": 21.852576647097194,
"eval_accuracy": 0.8213358263218032,
"eval_loss": 0.8339926600456238,
"eval_runtime": 96.0296,
"eval_samples_per_second": 127.69,
"eval_steps_per_second": 3.999,
"step": 33500
},
{
"epoch": 22.17873450750163,
"grad_norm": 3.0622804164886475,
"learning_rate": 6.6e-05,
"loss": 0.7069,
"step": 34000
},
{
"epoch": 22.17873450750163,
"eval_accuracy": 0.8231302606696578,
"eval_loss": 0.8308265805244446,
"eval_runtime": 96.0547,
"eval_samples_per_second": 127.656,
"eval_steps_per_second": 3.998,
"step": 34000
},
{
"epoch": 22.504892367906066,
"grad_norm": 2.5095720291137695,
"learning_rate": 6.55e-05,
"loss": 0.7033,
"step": 34500
},
{
"epoch": 22.504892367906066,
"eval_accuracy": 0.8223606059040972,
"eval_loss": 0.8291507959365845,
"eval_runtime": 96.0539,
"eval_samples_per_second": 127.658,
"eval_steps_per_second": 3.998,
"step": 34500
},
{
"epoch": 22.831050228310502,
"grad_norm": 5.5047526359558105,
"learning_rate": 6.500000000000001e-05,
"loss": 0.7065,
"step": 35000
},
{
"epoch": 22.831050228310502,
"eval_accuracy": 0.8222459148693573,
"eval_loss": 0.8312565684318542,
"eval_runtime": 95.5399,
"eval_samples_per_second": 128.344,
"eval_steps_per_second": 4.019,
"step": 35000
},
{
"epoch": 23.15720808871494,
"grad_norm": 4.183090686798096,
"learning_rate": 6.450000000000001e-05,
"loss": 0.7172,
"step": 35500
},
{
"epoch": 23.15720808871494,
"eval_accuracy": 0.8214502944798018,
"eval_loss": 0.8354819416999817,
"eval_runtime": 95.6311,
"eval_samples_per_second": 128.222,
"eval_steps_per_second": 4.015,
"step": 35500
},
{
"epoch": 23.483365949119374,
"grad_norm": 2.854464292526245,
"learning_rate": 6.400000000000001e-05,
"loss": 0.7011,
"step": 36000
},
{
"epoch": 23.483365949119374,
"eval_accuracy": 0.8203978144669242,
"eval_loss": 0.8381808400154114,
"eval_runtime": 95.5013,
"eval_samples_per_second": 128.396,
"eval_steps_per_second": 4.021,
"step": 36000
},
{
"epoch": 23.80952380952381,
"grad_norm": 3.004467725753784,
"learning_rate": 6.35e-05,
"loss": 0.7038,
"step": 36500
},
{
"epoch": 23.80952380952381,
"eval_accuracy": 0.8229179786185795,
"eval_loss": 0.8298040628433228,
"eval_runtime": 95.9071,
"eval_samples_per_second": 127.853,
"eval_steps_per_second": 4.004,
"step": 36500
},
{
"epoch": 24.135681669928246,
"grad_norm": 4.745896816253662,
"learning_rate": 6.3e-05,
"loss": 0.6954,
"step": 37000
},
{
"epoch": 24.135681669928246,
"eval_accuracy": 0.8235633886255924,
"eval_loss": 0.828513503074646,
"eval_runtime": 96.1965,
"eval_samples_per_second": 127.468,
"eval_steps_per_second": 3.992,
"step": 37000
},
{
"epoch": 24.461839530332682,
"grad_norm": 2.830902338027954,
"learning_rate": 6.25e-05,
"loss": 0.6934,
"step": 37500
},
{
"epoch": 24.461839530332682,
"eval_accuracy": 0.8210810246933818,
"eval_loss": 0.8337345123291016,
"eval_runtime": 96.1694,
"eval_samples_per_second": 127.504,
"eval_steps_per_second": 3.993,
"step": 37500
},
{
"epoch": 24.787997390737118,
"grad_norm": 4.337691783905029,
"learning_rate": 6.2e-05,
"loss": 0.7033,
"step": 38000
},
{
"epoch": 24.787997390737118,
"eval_accuracy": 0.8232294407589964,
"eval_loss": 0.833626925945282,
"eval_runtime": 96.0655,
"eval_samples_per_second": 127.642,
"eval_steps_per_second": 3.997,
"step": 38000
},
{
"epoch": 25.114155251141554,
"grad_norm": 3.392941951751709,
"learning_rate": 6.15e-05,
"loss": 0.6916,
"step": 38500
},
{
"epoch": 25.114155251141554,
"eval_accuracy": 0.823752687374898,
"eval_loss": 0.8312628865242004,
"eval_runtime": 96.059,
"eval_samples_per_second": 127.651,
"eval_steps_per_second": 3.998,
"step": 38500
},
{
"epoch": 25.44031311154599,
"grad_norm": 2.851759910583496,
"learning_rate": 6.1e-05,
"loss": 0.686,
"step": 39000
},
{
"epoch": 25.44031311154599,
"eval_accuracy": 0.8232236734329158,
"eval_loss": 0.832546055316925,
"eval_runtime": 95.5576,
"eval_samples_per_second": 128.321,
"eval_steps_per_second": 4.019,
"step": 39000
},
{
"epoch": 25.766470971950422,
"grad_norm": 3.790151596069336,
"learning_rate": 6.05e-05,
"loss": 0.6905,
"step": 39500
},
{
"epoch": 25.766470971950422,
"eval_accuracy": 0.823065086481128,
"eval_loss": 0.8281510472297668,
"eval_runtime": 95.5599,
"eval_samples_per_second": 128.317,
"eval_steps_per_second": 4.018,
"step": 39500
},
{
"epoch": 26.09262883235486,
"grad_norm": 6.393603324890137,
"learning_rate": 6e-05,
"loss": 0.691,
"step": 40000
},
{
"epoch": 26.09262883235486,
"eval_accuracy": 0.8235015047960742,
"eval_loss": 0.8384564518928528,
"eval_runtime": 95.5299,
"eval_samples_per_second": 128.358,
"eval_steps_per_second": 4.02,
"step": 40000
},
{
"epoch": 26.418786692759294,
"grad_norm": 4.260555267333984,
"learning_rate": 5.95e-05,
"loss": 0.6893,
"step": 40500
},
{
"epoch": 26.418786692759294,
"eval_accuracy": 0.8240677513211805,
"eval_loss": 0.8298010230064392,
"eval_runtime": 95.5669,
"eval_samples_per_second": 128.308,
"eval_steps_per_second": 4.018,
"step": 40500
},
{
"epoch": 26.74494455316373,
"grad_norm": 2.733369827270508,
"learning_rate": 5.9e-05,
"loss": 0.6701,
"step": 41000
},
{
"epoch": 26.74494455316373,
"eval_accuracy": 0.8248589664707147,
"eval_loss": 0.8331744074821472,
"eval_runtime": 95.5789,
"eval_samples_per_second": 128.292,
"eval_steps_per_second": 4.018,
"step": 41000
},
{
"epoch": 27.071102413568166,
"grad_norm": 4.225335597991943,
"learning_rate": 5.85e-05,
"loss": 0.671,
"step": 41500
},
{
"epoch": 27.071102413568166,
"eval_accuracy": 0.8248499337562081,
"eval_loss": 0.8276962041854858,
"eval_runtime": 95.6366,
"eval_samples_per_second": 128.214,
"eval_steps_per_second": 4.015,
"step": 41500
},
{
"epoch": 27.397260273972602,
"grad_norm": 3.768827199935913,
"learning_rate": 5.8e-05,
"loss": 0.6702,
"step": 42000
},
{
"epoch": 27.397260273972602,
"eval_accuracy": 0.8253913197493783,
"eval_loss": 0.8237897753715515,
"eval_runtime": 95.5458,
"eval_samples_per_second": 128.336,
"eval_steps_per_second": 4.019,
"step": 42000
},
{
"epoch": 27.723418134377038,
"grad_norm": 3.414738178253174,
"learning_rate": 5.7499999999999995e-05,
"loss": 0.673,
"step": 42500
},
{
"epoch": 27.723418134377038,
"eval_accuracy": 0.8261082246911201,
"eval_loss": 0.8088709712028503,
"eval_runtime": 95.7571,
"eval_samples_per_second": 128.053,
"eval_steps_per_second": 4.01,
"step": 42500
},
{
"epoch": 28.049575994781474,
"grad_norm": 3.5771758556365967,
"learning_rate": 5.6999999999999996e-05,
"loss": 0.6634,
"step": 43000
},
{
"epoch": 28.049575994781474,
"eval_accuracy": 0.8264020093800707,
"eval_loss": 0.818773090839386,
"eval_runtime": 96.1431,
"eval_samples_per_second": 127.539,
"eval_steps_per_second": 3.994,
"step": 43000
},
{
"epoch": 28.37573385518591,
"grad_norm": 3.543851852416992,
"learning_rate": 5.65e-05,
"loss": 0.6646,
"step": 43500
},
{
"epoch": 28.37573385518591,
"eval_accuracy": 0.8269009920796627,
"eval_loss": 0.8075844049453735,
"eval_runtime": 96.1529,
"eval_samples_per_second": 127.526,
"eval_steps_per_second": 3.994,
"step": 43500
},
{
"epoch": 28.701891715590346,
"grad_norm": 2.2786142826080322,
"learning_rate": 5.6000000000000006e-05,
"loss": 0.6531,
"step": 44000
},
{
"epoch": 28.701891715590346,
"eval_accuracy": 0.8257994932295168,
"eval_loss": 0.8344977498054504,
"eval_runtime": 95.968,
"eval_samples_per_second": 127.772,
"eval_steps_per_second": 4.001,
"step": 44000
},
{
"epoch": 29.028049575994782,
"grad_norm": 1.8194258213043213,
"learning_rate": 5.550000000000001e-05,
"loss": 0.6614,
"step": 44500
},
{
"epoch": 29.028049575994782,
"eval_accuracy": 0.8255736392742796,
"eval_loss": 0.8443058729171753,
"eval_runtime": 95.5717,
"eval_samples_per_second": 128.302,
"eval_steps_per_second": 4.018,
"step": 44500
},
{
"epoch": 29.354207436399218,
"grad_norm": 3.3961451053619385,
"learning_rate": 5.500000000000001e-05,
"loss": 0.6449,
"step": 45000
},
{
"epoch": 29.354207436399218,
"eval_accuracy": 0.8267738583860818,
"eval_loss": 0.8181779384613037,
"eval_runtime": 95.6353,
"eval_samples_per_second": 128.216,
"eval_steps_per_second": 4.015,
"step": 45000
},
{
"epoch": 29.680365296803654,
"grad_norm": 2.9679033756256104,
"learning_rate": 5.45e-05,
"loss": 0.6635,
"step": 45500
},
{
"epoch": 29.680365296803654,
"eval_accuracy": 0.8279328287606433,
"eval_loss": 0.8159452676773071,
"eval_runtime": 95.4111,
"eval_samples_per_second": 128.517,
"eval_steps_per_second": 4.025,
"step": 45500
},
{
"epoch": 30.00652315720809,
"grad_norm": 2.6850576400756836,
"learning_rate": 5.4000000000000005e-05,
"loss": 0.6447,
"step": 46000
},
{
"epoch": 30.00652315720809,
"eval_accuracy": 0.8257240166507135,
"eval_loss": 0.8173608183860779,
"eval_runtime": 96.0834,
"eval_samples_per_second": 127.618,
"eval_steps_per_second": 3.997,
"step": 46000
},
{
"epoch": 30.332681017612526,
"grad_norm": 3.2325327396392822,
"learning_rate": 5.3500000000000006e-05,
"loss": 0.6485,
"step": 46500
},
{
"epoch": 30.332681017612526,
"eval_accuracy": 0.8278799641462891,
"eval_loss": 0.8071762919425964,
"eval_runtime": 95.5652,
"eval_samples_per_second": 128.31,
"eval_steps_per_second": 4.018,
"step": 46500
},
{
"epoch": 30.65883887801696,
"grad_norm": 2.906803607940674,
"learning_rate": 5.300000000000001e-05,
"loss": 0.6469,
"step": 47000
},
{
"epoch": 30.65883887801696,
"eval_accuracy": 0.8256298324146351,
"eval_loss": 0.8320009112358093,
"eval_runtime": 96.0736,
"eval_samples_per_second": 127.631,
"eval_steps_per_second": 3.997,
"step": 47000
},
{
"epoch": 30.984996738421398,
"grad_norm": 3.0411088466644287,
"learning_rate": 5.25e-05,
"loss": 0.646,
"step": 47500
},
{
"epoch": 30.984996738421398,
"eval_accuracy": 0.8279118105560092,
"eval_loss": 0.8084993958473206,
"eval_runtime": 96.0838,
"eval_samples_per_second": 127.618,
"eval_steps_per_second": 3.997,
"step": 47500
},
{
"epoch": 31.31115459882583,
"grad_norm": 2.555142879486084,
"learning_rate": 5.2000000000000004e-05,
"loss": 0.6406,
"step": 48000
},
{
"epoch": 31.31115459882583,
"eval_accuracy": 0.8262719717693807,
"eval_loss": 0.8186313509941101,
"eval_runtime": 96.129,
"eval_samples_per_second": 127.558,
"eval_steps_per_second": 3.995,
"step": 48000
},
{
"epoch": 31.637312459230266,
"grad_norm": 3.7424721717834473,
"learning_rate": 5.1500000000000005e-05,
"loss": 0.6288,
"step": 48500
},
{
"epoch": 31.637312459230266,
"eval_accuracy": 0.8274525815671282,
"eval_loss": 0.8299573659896851,
"eval_runtime": 95.5383,
"eval_samples_per_second": 128.346,
"eval_steps_per_second": 4.019,
"step": 48500
},
{
"epoch": 31.963470319634702,
"grad_norm": 3.879730701446533,
"learning_rate": 5.1000000000000006e-05,
"loss": 0.65,
"step": 49000
},
{
"epoch": 31.963470319634702,
"eval_accuracy": 0.8267922127873606,
"eval_loss": 0.8268994092941284,
"eval_runtime": 96.0503,
"eval_samples_per_second": 127.662,
"eval_steps_per_second": 3.998,
"step": 49000
},
{
"epoch": 32.28962818003914,
"grad_norm": 3.536586046218872,
"learning_rate": 5.05e-05,
"loss": 0.6274,
"step": 49500
},
{
"epoch": 32.28962818003914,
"eval_accuracy": 0.8275685236934419,
"eval_loss": 0.8197194933891296,
"eval_runtime": 95.5627,
"eval_samples_per_second": 128.314,
"eval_steps_per_second": 4.018,
"step": 49500
},
{
"epoch": 32.615786040443574,
"grad_norm": 3.459678888320923,
"learning_rate": 5e-05,
"loss": 0.6356,
"step": 50000
},
{
"epoch": 32.615786040443574,
"eval_accuracy": 0.8304228315712264,
"eval_loss": 0.8121655583381653,
"eval_runtime": 96.0794,
"eval_samples_per_second": 127.624,
"eval_steps_per_second": 3.997,
"step": 50000
},
{
"epoch": 32.94194390084801,
"grad_norm": 2.755117177963257,
"learning_rate": 4.9500000000000004e-05,
"loss": 0.635,
"step": 50500
},
{
"epoch": 32.94194390084801,
"eval_accuracy": 0.828548567533728,
"eval_loss": 0.8212075233459473,
"eval_runtime": 96.1005,
"eval_samples_per_second": 127.596,
"eval_steps_per_second": 3.996,
"step": 50500
},
{
"epoch": 33.268101761252446,
"grad_norm": 3.8209736347198486,
"learning_rate": 4.9e-05,
"loss": 0.6281,
"step": 51000
},
{
"epoch": 33.268101761252446,
"eval_accuracy": 0.8276856793583589,
"eval_loss": 0.8094030022621155,
"eval_runtime": 95.616,
"eval_samples_per_second": 128.242,
"eval_steps_per_second": 4.016,
"step": 51000
},
{
"epoch": 33.59425962165688,
"grad_norm": 3.2926814556121826,
"learning_rate": 4.85e-05,
"loss": 0.6307,
"step": 51500
},
{
"epoch": 33.59425962165688,
"eval_accuracy": 0.8286809543543899,
"eval_loss": 0.8193591237068176,
"eval_runtime": 95.5699,
"eval_samples_per_second": 128.304,
"eval_steps_per_second": 4.018,
"step": 51500
},
{
"epoch": 33.92041748206132,
"grad_norm": 3.2185568809509277,
"learning_rate": 4.8e-05,
"loss": 0.6321,
"step": 52000
},
{
"epoch": 33.92041748206132,
"eval_accuracy": 0.8282692108265063,
"eval_loss": 0.8121231198310852,
"eval_runtime": 96.125,
"eval_samples_per_second": 127.563,
"eval_steps_per_second": 3.995,
"step": 52000
},
{
"epoch": 34.24657534246575,
"grad_norm": 3.80593204498291,
"learning_rate": 4.75e-05,
"loss": 0.6193,
"step": 52500
},
{
"epoch": 34.24657534246575,
"eval_accuracy": 0.829132275838543,
"eval_loss": 0.820734441280365,
"eval_runtime": 96.0513,
"eval_samples_per_second": 127.661,
"eval_steps_per_second": 3.998,
"step": 52500
},
{
"epoch": 34.57273320287019,
"grad_norm": 2.613852024078369,
"learning_rate": 4.7e-05,
"loss": 0.623,
"step": 53000
},
{
"epoch": 34.57273320287019,
"eval_accuracy": 0.8284756446565424,
"eval_loss": 0.8152901530265808,
"eval_runtime": 95.6772,
"eval_samples_per_second": 128.16,
"eval_steps_per_second": 4.013,
"step": 53000
},
{
"epoch": 34.898891063274625,
"grad_norm": 3.2367916107177734,
"learning_rate": 4.6500000000000005e-05,
"loss": 0.6145,
"step": 53500
},
{
"epoch": 34.898891063274625,
"eval_accuracy": 0.8289422000874521,
"eval_loss": 0.8108070492744446,
"eval_runtime": 96.0171,
"eval_samples_per_second": 127.706,
"eval_steps_per_second": 3.999,
"step": 53500
},
{
"epoch": 35.22504892367906,
"grad_norm": 3.648937225341797,
"learning_rate": 4.600000000000001e-05,
"loss": 0.6239,
"step": 54000
},
{
"epoch": 35.22504892367906,
"eval_accuracy": 0.8282779590625927,
"eval_loss": 0.823903501033783,
"eval_runtime": 95.6837,
"eval_samples_per_second": 128.151,
"eval_steps_per_second": 4.013,
"step": 54000
},
{
"epoch": 35.5512067840835,
"grad_norm": 3.7973756790161133,
"learning_rate": 4.55e-05,
"loss": 0.6156,
"step": 54500
},
{
"epoch": 35.5512067840835,
"eval_accuracy": 0.8285820066641985,
"eval_loss": 0.8196555972099304,
"eval_runtime": 96.1175,
"eval_samples_per_second": 127.573,
"eval_steps_per_second": 3.995,
"step": 54500
},
{
"epoch": 35.87736464448793,
"grad_norm": 2.7111735343933105,
"learning_rate": 4.5e-05,
"loss": 0.6145,
"step": 55000
},
{
"epoch": 35.87736464448793,
"eval_accuracy": 0.8307134489048389,
"eval_loss": 0.8130238652229309,
"eval_runtime": 95.5978,
"eval_samples_per_second": 128.266,
"eval_steps_per_second": 4.017,
"step": 55000
},
{
"epoch": 36.20352250489237,
"grad_norm": 3.3603804111480713,
"learning_rate": 4.4500000000000004e-05,
"loss": 0.6039,
"step": 55500
},
{
"epoch": 36.20352250489237,
"eval_accuracy": 0.8315611388544674,
"eval_loss": 0.8143065571784973,
"eval_runtime": 95.5067,
"eval_samples_per_second": 128.389,
"eval_steps_per_second": 4.021,
"step": 55500
},
{
"epoch": 36.529680365296805,
"grad_norm": 4.012826919555664,
"learning_rate": 4.4000000000000006e-05,
"loss": 0.6146,
"step": 56000
},
{
"epoch": 36.529680365296805,
"eval_accuracy": 0.8305585729381424,
"eval_loss": 0.8082120418548584,
"eval_runtime": 96.1141,
"eval_samples_per_second": 127.578,
"eval_steps_per_second": 3.995,
"step": 56000
},
{
"epoch": 36.85583822570124,
"grad_norm": 3.831219434738159,
"learning_rate": 4.35e-05,
"loss": 0.6008,
"step": 56500
},
{
"epoch": 36.85583822570124,
"eval_accuracy": 0.8321564326943739,
"eval_loss": 0.8039395213127136,
"eval_runtime": 95.527,
"eval_samples_per_second": 128.362,
"eval_steps_per_second": 4.02,
"step": 56500
},
{
"epoch": 37.18199608610568,
"grad_norm": 3.3420674800872803,
"learning_rate": 4.3e-05,
"loss": 0.6025,
"step": 57000
},
{
"epoch": 37.18199608610568,
"eval_accuracy": 0.831398227733968,
"eval_loss": 0.8035178780555725,
"eval_runtime": 95.5597,
"eval_samples_per_second": 128.318,
"eval_steps_per_second": 4.018,
"step": 57000
},
{
"epoch": 37.50815394651011,
"grad_norm": 3.31644344329834,
"learning_rate": 4.25e-05,
"loss": 0.6014,
"step": 57500
},
{
"epoch": 37.50815394651011,
"eval_accuracy": 0.8313993123480942,
"eval_loss": 0.8076711893081665,
"eval_runtime": 95.5701,
"eval_samples_per_second": 128.304,
"eval_steps_per_second": 4.018,
"step": 57500
},
{
"epoch": 37.83431180691455,
"grad_norm": 2.9908030033111572,
"learning_rate": 4.2e-05,
"loss": 0.5964,
"step": 58000
},
{
"epoch": 37.83431180691455,
"eval_accuracy": 0.8307974621476649,
"eval_loss": 0.8147750496864319,
"eval_runtime": 95.5728,
"eval_samples_per_second": 128.3,
"eval_steps_per_second": 4.018,
"step": 58000
},
{
"epoch": 38.160469667318985,
"grad_norm": 3.3767218589782715,
"learning_rate": 4.15e-05,
"loss": 0.5919,
"step": 58500
},
{
"epoch": 38.160469667318985,
"eval_accuracy": 0.8321526738027666,
"eval_loss": 0.8096674084663391,
"eval_runtime": 95.502,
"eval_samples_per_second": 128.395,
"eval_steps_per_second": 4.021,
"step": 58500
},
{
"epoch": 38.48662752772342,
"grad_norm": 3.17134428024292,
"learning_rate": 4.1e-05,
"loss": 0.5915,
"step": 59000
},
{
"epoch": 38.48662752772342,
"eval_accuracy": 0.8314462895539176,
"eval_loss": 0.8136842846870422,
"eval_runtime": 96.17,
"eval_samples_per_second": 127.503,
"eval_steps_per_second": 3.993,
"step": 59000
},
{
"epoch": 38.81278538812786,
"grad_norm": 2.8910727500915527,
"learning_rate": 4.05e-05,
"loss": 0.6036,
"step": 59500
},
{
"epoch": 38.81278538812786,
"eval_accuracy": 0.8307667164389769,
"eval_loss": 0.8048545718193054,
"eval_runtime": 96.2445,
"eval_samples_per_second": 127.405,
"eval_steps_per_second": 3.99,
"step": 59500
},
{
"epoch": 39.13894324853229,
"grad_norm": 3.3615550994873047,
"learning_rate": 4e-05,
"loss": 0.5865,
"step": 60000
},
{
"epoch": 39.13894324853229,
"eval_accuracy": 0.8312174034922063,
"eval_loss": 0.8044449090957642,
"eval_runtime": 95.555,
"eval_samples_per_second": 128.324,
"eval_steps_per_second": 4.019,
"step": 60000
},
{
"epoch": 39.46510110893673,
"grad_norm": 2.6367311477661133,
"learning_rate": 3.9500000000000005e-05,
"loss": 0.588,
"step": 60500
},
{
"epoch": 39.46510110893673,
"eval_accuracy": 0.8304663566029479,
"eval_loss": 0.821357250213623,
"eval_runtime": 95.9701,
"eval_samples_per_second": 127.769,
"eval_steps_per_second": 4.001,
"step": 60500
},
{
"epoch": 39.79125896934116,
"grad_norm": 2.833712100982666,
"learning_rate": 3.9000000000000006e-05,
"loss": 0.586,
"step": 61000
},
{
"epoch": 39.79125896934116,
"eval_accuracy": 0.829917319891339,
"eval_loss": 0.8134341239929199,
"eval_runtime": 95.9703,
"eval_samples_per_second": 127.769,
"eval_steps_per_second": 4.001,
"step": 61000
},
{
"epoch": 40.11741682974559,
"grad_norm": 2.9397034645080566,
"learning_rate": 3.85e-05,
"loss": 0.571,
"step": 61500
},
{
"epoch": 40.11741682974559,
"eval_accuracy": 0.8292021602772011,
"eval_loss": 0.8118214011192322,
"eval_runtime": 95.6187,
"eval_samples_per_second": 128.239,
"eval_steps_per_second": 4.016,
"step": 61500
},
{
"epoch": 40.44357469015003,
"grad_norm": 2.5662078857421875,
"learning_rate": 3.8e-05,
"loss": 0.5749,
"step": 62000
},
{
"epoch": 40.44357469015003,
"eval_accuracy": 0.8333482202670716,
"eval_loss": 0.8056595325469971,
"eval_runtime": 96.04,
"eval_samples_per_second": 127.676,
"eval_steps_per_second": 3.998,
"step": 62000
},
{
"epoch": 40.769732550554465,
"grad_norm": 3.049619674682617,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.5793,
"step": 62500
},
{
"epoch": 40.769732550554465,
"eval_accuracy": 0.8336420510760487,
"eval_loss": 0.8024507164955139,
"eval_runtime": 95.6023,
"eval_samples_per_second": 128.261,
"eval_steps_per_second": 4.017,
"step": 62500
},
{
"epoch": 41.0958904109589,
"grad_norm": 2.7201578617095947,
"learning_rate": 3.7e-05,
"loss": 0.5778,
"step": 63000
},
{
"epoch": 41.0958904109589,
"eval_accuracy": 0.8319174128616887,
"eval_loss": 0.8089193105697632,
"eval_runtime": 95.6644,
"eval_samples_per_second": 128.177,
"eval_steps_per_second": 4.014,
"step": 63000
},
{
"epoch": 41.42204827136334,
"grad_norm": 2.6951823234558105,
"learning_rate": 3.65e-05,
"loss": 0.5612,
"step": 63500
},
{
"epoch": 41.42204827136334,
"eval_accuracy": 0.8335347454422788,
"eval_loss": 0.80367112159729,
"eval_runtime": 95.6191,
"eval_samples_per_second": 128.238,
"eval_steps_per_second": 4.016,
"step": 63500
},
{
"epoch": 41.74820613176777,
"grad_norm": 4.181463718414307,
"learning_rate": 3.6e-05,
"loss": 0.573,
"step": 64000
},
{
"epoch": 41.74820613176777,
"eval_accuracy": 0.8344243842473971,
"eval_loss": 0.7983211278915405,
"eval_runtime": 95.629,
"eval_samples_per_second": 128.225,
"eval_steps_per_second": 4.016,
"step": 64000
},
{
"epoch": 42.07436399217221,
"grad_norm": 3.2401015758514404,
"learning_rate": 3.55e-05,
"loss": 0.5696,
"step": 64500
},
{
"epoch": 42.07436399217221,
"eval_accuracy": 0.8310029766521158,
"eval_loss": 0.8181082606315613,
"eval_runtime": 96.0081,
"eval_samples_per_second": 127.718,
"eval_steps_per_second": 4.0,
"step": 64500
},
{
"epoch": 42.400521852576645,
"grad_norm": 3.277033805847168,
"learning_rate": 3.5e-05,
"loss": 0.564,
"step": 65000
},
{
"epoch": 42.400521852576645,
"eval_accuracy": 0.8335396635951874,
"eval_loss": 0.8013662695884705,
"eval_runtime": 95.6901,
"eval_samples_per_second": 128.143,
"eval_steps_per_second": 4.013,
"step": 65000
},
{
"epoch": 42.72667971298108,
"grad_norm": 4.847507476806641,
"learning_rate": 3.45e-05,
"loss": 0.5686,
"step": 65500
},
{
"epoch": 42.72667971298108,
"eval_accuracy": 0.833032886000638,
"eval_loss": 0.8119781613349915,
"eval_runtime": 96.1047,
"eval_samples_per_second": 127.59,
"eval_steps_per_second": 3.996,
"step": 65500
},
{
"epoch": 43.05283757338552,
"grad_norm": 3.5789191722869873,
"learning_rate": 3.4000000000000007e-05,
"loss": 0.5599,
"step": 66000
},
{
"epoch": 43.05283757338552,
"eval_accuracy": 0.8339582654059341,
"eval_loss": 0.8123458027839661,
"eval_runtime": 96.2815,
"eval_samples_per_second": 127.356,
"eval_steps_per_second": 3.988,
"step": 66000
},
{
"epoch": 43.37899543378995,
"grad_norm": 3.250049352645874,
"learning_rate": 3.35e-05,
"loss": 0.5555,
"step": 66500
},
{
"epoch": 43.37899543378995,
"eval_accuracy": 0.8328555366091573,
"eval_loss": 0.8025438785552979,
"eval_runtime": 96.065,
"eval_samples_per_second": 127.643,
"eval_steps_per_second": 3.997,
"step": 66500
},
{
"epoch": 43.70515329419439,
"grad_norm": 2.0880448818206787,
"learning_rate": 3.3e-05,
"loss": 0.5599,
"step": 67000
},
{
"epoch": 43.70515329419439,
"eval_accuracy": 0.8333740804125086,
"eval_loss": 0.8076988458633423,
"eval_runtime": 95.9358,
"eval_samples_per_second": 127.815,
"eval_steps_per_second": 4.003,
"step": 67000
},
{
"epoch": 44.031311154598825,
"grad_norm": 4.393618106842041,
"learning_rate": 3.2500000000000004e-05,
"loss": 0.557,
"step": 67500
},
{
"epoch": 44.031311154598825,
"eval_accuracy": 0.8331855464420306,
"eval_loss": 0.8098081946372986,
"eval_runtime": 95.4292,
"eval_samples_per_second": 128.493,
"eval_steps_per_second": 4.024,
"step": 67500
},
{
"epoch": 44.35746901500326,
"grad_norm": 5.500895023345947,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.5488,
"step": 68000
},
{
"epoch": 44.35746901500326,
"eval_accuracy": 0.8343688661284848,
"eval_loss": 0.8053340315818787,
"eval_runtime": 95.5428,
"eval_samples_per_second": 128.34,
"eval_steps_per_second": 4.019,
"step": 68000
},
{
"epoch": 44.6836268754077,
"grad_norm": 2.8524134159088135,
"learning_rate": 3.15e-05,
"loss": 0.5504,
"step": 68500
},
{
"epoch": 44.6836268754077,
"eval_accuracy": 0.8342111492484318,
"eval_loss": 0.7959148287773132,
"eval_runtime": 96.1022,
"eval_samples_per_second": 127.593,
"eval_steps_per_second": 3.996,
"step": 68500
},
{
"epoch": 45.00978473581213,
"grad_norm": 3.5730416774749756,
"learning_rate": 3.1e-05,
"loss": 0.5505,
"step": 69000
},
{
"epoch": 45.00978473581213,
"eval_accuracy": 0.836165363858391,
"eval_loss": 0.7981916069984436,
"eval_runtime": 95.9912,
"eval_samples_per_second": 127.741,
"eval_steps_per_second": 4.0,
"step": 69000
},
{
"epoch": 45.33594259621657,
"grad_norm": 4.396024703979492,
"learning_rate": 3.05e-05,
"loss": 0.5486,
"step": 69500
},
{
"epoch": 45.33594259621657,
"eval_accuracy": 0.8332422855455485,
"eval_loss": 0.803587794303894,
"eval_runtime": 95.7722,
"eval_samples_per_second": 128.033,
"eval_steps_per_second": 4.01,
"step": 69500
},
{
"epoch": 45.662100456621005,
"grad_norm": 2.496528387069702,
"learning_rate": 3e-05,
"loss": 0.5448,
"step": 70000
},
{
"epoch": 45.662100456621005,
"eval_accuracy": 0.8342580056510478,
"eval_loss": 0.807097315788269,
"eval_runtime": 95.754,
"eval_samples_per_second": 128.057,
"eval_steps_per_second": 4.01,
"step": 70000
},
{
"epoch": 45.98825831702544,
"grad_norm": 3.3834452629089355,
"learning_rate": 2.95e-05,
"loss": 0.5464,
"step": 70500
},
{
"epoch": 45.98825831702544,
"eval_accuracy": 0.8337018788400463,
"eval_loss": 0.8085704445838928,
"eval_runtime": 95.6878,
"eval_samples_per_second": 128.146,
"eval_steps_per_second": 4.013,
"step": 70500
},
{
"epoch": 46.31441617742988,
"grad_norm": 2.3634603023529053,
"learning_rate": 2.9e-05,
"loss": 0.5438,
"step": 71000
},
{
"epoch": 46.31441617742988,
"eval_accuracy": 0.8341511430834018,
"eval_loss": 0.8026093244552612,
"eval_runtime": 96.1617,
"eval_samples_per_second": 127.514,
"eval_steps_per_second": 3.993,
"step": 71000
},
{
"epoch": 46.64057403783431,
"grad_norm": 3.5233588218688965,
"learning_rate": 2.8499999999999998e-05,
"loss": 0.5394,
"step": 71500
},
{
"epoch": 46.64057403783431,
"eval_accuracy": 0.8342311802203471,
"eval_loss": 0.8086482882499695,
"eval_runtime": 96.1007,
"eval_samples_per_second": 127.595,
"eval_steps_per_second": 3.996,
"step": 71500
},
{
"epoch": 46.96673189823875,
"grad_norm": 2.4381179809570312,
"learning_rate": 2.8000000000000003e-05,
"loss": 0.5349,
"step": 72000
},
{
"epoch": 46.96673189823875,
"eval_accuracy": 0.8354184145463185,
"eval_loss": 0.805791437625885,
"eval_runtime": 96.0711,
"eval_samples_per_second": 127.635,
"eval_steps_per_second": 3.997,
"step": 72000
},
{
"epoch": 47.292889758643184,
"grad_norm": 3.008922576904297,
"learning_rate": 2.7500000000000004e-05,
"loss": 0.5327,
"step": 72500
},
{
"epoch": 47.292889758643184,
"eval_accuracy": 0.8362029367748818,
"eval_loss": 0.7979453206062317,
"eval_runtime": 96.1804,
"eval_samples_per_second": 127.49,
"eval_steps_per_second": 3.992,
"step": 72500
},
{
"epoch": 47.61904761904762,
"grad_norm": 3.482806444168091,
"learning_rate": 2.7000000000000002e-05,
"loss": 0.5334,
"step": 73000
},
{
"epoch": 47.61904761904762,
"eval_accuracy": 0.8365827562635599,
"eval_loss": 0.8000433444976807,
"eval_runtime": 95.6103,
"eval_samples_per_second": 128.25,
"eval_steps_per_second": 4.016,
"step": 73000
},
{
"epoch": 47.945205479452056,
"grad_norm": 2.3070437908172607,
"learning_rate": 2.6500000000000004e-05,
"loss": 0.5339,
"step": 73500
},
{
"epoch": 47.945205479452056,
"eval_accuracy": 0.8357628837739592,
"eval_loss": 0.8132687211036682,
"eval_runtime": 96.239,
"eval_samples_per_second": 127.412,
"eval_steps_per_second": 3.99,
"step": 73500
},
{
"epoch": 48.27136333985649,
"grad_norm": 3.9446592330932617,
"learning_rate": 2.6000000000000002e-05,
"loss": 0.523,
"step": 74000
},
{
"epoch": 48.27136333985649,
"eval_accuracy": 0.8368259826536262,
"eval_loss": 0.786713719367981,
"eval_runtime": 96.19,
"eval_samples_per_second": 127.477,
"eval_steps_per_second": 3.992,
"step": 74000
},
{
"epoch": 48.59752120026093,
"grad_norm": 3.631774663925171,
"learning_rate": 2.5500000000000003e-05,
"loss": 0.527,
"step": 74500
},
{
"epoch": 48.59752120026093,
"eval_accuracy": 0.8353441631588309,
"eval_loss": 0.7979721426963806,
"eval_runtime": 96.3234,
"eval_samples_per_second": 127.3,
"eval_steps_per_second": 3.987,
"step": 74500
},
{
"epoch": 48.923679060665364,
"grad_norm": 3.334470510482788,
"learning_rate": 2.5e-05,
"loss": 0.5318,
"step": 75000
},
{
"epoch": 48.923679060665364,
"eval_accuracy": 0.8345138108475779,
"eval_loss": 0.8020584583282471,
"eval_runtime": 96.0458,
"eval_samples_per_second": 127.668,
"eval_steps_per_second": 3.998,
"step": 75000
},
{
"epoch": 49.2498369210698,
"grad_norm": 3.5331332683563232,
"learning_rate": 2.45e-05,
"loss": 0.5207,
"step": 75500
},
{
"epoch": 49.2498369210698,
"eval_accuracy": 0.8387281134722294,
"eval_loss": 0.7923426628112793,
"eval_runtime": 96.1278,
"eval_samples_per_second": 127.559,
"eval_steps_per_second": 3.995,
"step": 75500
},
{
"epoch": 49.575994781474236,
"grad_norm": 3.9823110103607178,
"learning_rate": 2.4e-05,
"loss": 0.5209,
"step": 76000
},
{
"epoch": 49.575994781474236,
"eval_accuracy": 0.8363262984494693,
"eval_loss": 0.8024851083755493,
"eval_runtime": 96.1881,
"eval_samples_per_second": 127.479,
"eval_steps_per_second": 3.992,
"step": 76000
},
{
"epoch": 49.90215264187867,
"grad_norm": 3.4313673973083496,
"learning_rate": 2.35e-05,
"loss": 0.5201,
"step": 76500
},
{
"epoch": 49.90215264187867,
"eval_accuracy": 0.836565998919567,
"eval_loss": 0.8021891713142395,
"eval_runtime": 96.0086,
"eval_samples_per_second": 127.718,
"eval_steps_per_second": 4.0,
"step": 76500
},
{
"epoch": 50.22831050228311,
"grad_norm": 3.1152913570404053,
"learning_rate": 2.3000000000000003e-05,
"loss": 0.5139,
"step": 77000
},
{
"epoch": 50.22831050228311,
"eval_accuracy": 0.8371820454427441,
"eval_loss": 0.8075475692749023,
"eval_runtime": 96.1279,
"eval_samples_per_second": 127.559,
"eval_steps_per_second": 3.995,
"step": 77000
},
{
"epoch": 50.554468362687544,
"grad_norm": 4.154286861419678,
"learning_rate": 2.25e-05,
"loss": 0.5136,
"step": 77500
},
{
"epoch": 50.554468362687544,
"eval_accuracy": 0.8396914258560779,
"eval_loss": 0.7858642339706421,
"eval_runtime": 96.2083,
"eval_samples_per_second": 127.453,
"eval_steps_per_second": 3.991,
"step": 77500
},
{
"epoch": 50.88062622309198,
"grad_norm": 2.3929026126861572,
"learning_rate": 2.2000000000000003e-05,
"loss": 0.5116,
"step": 78000
},
{
"epoch": 50.88062622309198,
"eval_accuracy": 0.8367701163066699,
"eval_loss": 0.7931028604507446,
"eval_runtime": 95.7046,
"eval_samples_per_second": 128.123,
"eval_steps_per_second": 4.012,
"step": 78000
},
{
"epoch": 51.20678408349641,
"grad_norm": 3.0909852981567383,
"learning_rate": 2.15e-05,
"loss": 0.5189,
"step": 78500
},
{
"epoch": 51.20678408349641,
"eval_accuracy": 0.8370578517272915,
"eval_loss": 0.799592137336731,
"eval_runtime": 96.1212,
"eval_samples_per_second": 127.568,
"eval_steps_per_second": 3.995,
"step": 78500
},
{
"epoch": 51.532941943900845,
"grad_norm": 3.350635051727295,
"learning_rate": 2.1e-05,
"loss": 0.5095,
"step": 79000
},
{
"epoch": 51.532941943900845,
"eval_accuracy": 0.836364720171079,
"eval_loss": 0.8035129308700562,
"eval_runtime": 95.5727,
"eval_samples_per_second": 128.3,
"eval_steps_per_second": 4.018,
"step": 79000
},
{
"epoch": 51.85909980430528,
"grad_norm": 4.1038432121276855,
"learning_rate": 2.05e-05,
"loss": 0.51,
"step": 79500
},
{
"epoch": 51.85909980430528,
"eval_accuracy": 0.836879590639366,
"eval_loss": 0.8043432235717773,
"eval_runtime": 96.0977,
"eval_samples_per_second": 127.599,
"eval_steps_per_second": 3.996,
"step": 79500
},
{
"epoch": 52.18525766470972,
"grad_norm": 3.108510971069336,
"learning_rate": 2e-05,
"loss": 0.5036,
"step": 80000
},
{
"epoch": 52.18525766470972,
"eval_accuracy": 0.8368426134295879,
"eval_loss": 0.8010953664779663,
"eval_runtime": 95.5615,
"eval_samples_per_second": 128.315,
"eval_steps_per_second": 4.018,
"step": 80000
},
{
"epoch": 52.51141552511415,
"grad_norm": 3.1937355995178223,
"learning_rate": 1.9500000000000003e-05,
"loss": 0.5127,
"step": 80500
},
{
"epoch": 52.51141552511415,
"eval_accuracy": 0.8369965405101081,
"eval_loss": 0.7906477451324463,
"eval_runtime": 95.7155,
"eval_samples_per_second": 128.109,
"eval_steps_per_second": 4.012,
"step": 80500
},
{
"epoch": 52.83757338551859,
"grad_norm": 3.720996141433716,
"learning_rate": 1.9e-05,
"loss": 0.5035,
"step": 81000
},
{
"epoch": 52.83757338551859,
"eval_accuracy": 0.8397895174956149,
"eval_loss": 0.7947555184364319,
"eval_runtime": 95.6287,
"eval_samples_per_second": 128.225,
"eval_steps_per_second": 4.016,
"step": 81000
},
{
"epoch": 53.163731245923024,
"grad_norm": 2.5395772457122803,
"learning_rate": 1.85e-05,
"loss": 0.4958,
"step": 81500
},
{
"epoch": 53.163731245923024,
"eval_accuracy": 0.8374669324421457,
"eval_loss": 0.8044614791870117,
"eval_runtime": 96.0682,
"eval_samples_per_second": 127.638,
"eval_steps_per_second": 3.997,
"step": 81500
},
{
"epoch": 53.48988910632746,
"grad_norm": 3.4834647178649902,
"learning_rate": 1.8e-05,
"loss": 0.5056,
"step": 82000
},
{
"epoch": 53.48988910632746,
"eval_accuracy": 0.8400154459651129,
"eval_loss": 0.789566695690155,
"eval_runtime": 96.0293,
"eval_samples_per_second": 127.69,
"eval_steps_per_second": 3.999,
"step": 82000
},
{
"epoch": 53.816046966731896,
"grad_norm": 4.184600830078125,
"learning_rate": 1.75e-05,
"loss": 0.4952,
"step": 82500
},
{
"epoch": 53.816046966731896,
"eval_accuracy": 0.8376462588050942,
"eval_loss": 0.811129629611969,
"eval_runtime": 96.2175,
"eval_samples_per_second": 127.44,
"eval_steps_per_second": 3.991,
"step": 82500
},
{
"epoch": 54.14220482713633,
"grad_norm": 2.301683187484741,
"learning_rate": 1.7000000000000003e-05,
"loss": 0.4987,
"step": 83000
},
{
"epoch": 54.14220482713633,
"eval_accuracy": 0.8384409578370847,
"eval_loss": 0.7945307493209839,
"eval_runtime": 96.0056,
"eval_samples_per_second": 127.722,
"eval_steps_per_second": 4.0,
"step": 83000
},
{
"epoch": 54.46836268754077,
"grad_norm": 3.919367551803589,
"learning_rate": 1.65e-05,
"loss": 0.496,
"step": 83500
},
{
"epoch": 54.46836268754077,
"eval_accuracy": 0.8394707240542301,
"eval_loss": 0.7912357449531555,
"eval_runtime": 96.1449,
"eval_samples_per_second": 127.537,
"eval_steps_per_second": 3.994,
"step": 83500
},
{
"epoch": 54.794520547945204,
"grad_norm": 2.68684983253479,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.4895,
"step": 84000
},
{
"epoch": 54.794520547945204,
"eval_accuracy": 0.8381230887985037,
"eval_loss": 0.788875162601471,
"eval_runtime": 96.2309,
"eval_samples_per_second": 127.423,
"eval_steps_per_second": 3.99,
"step": 84000
},
{
"epoch": 55.12067840834964,
"grad_norm": 4.197926044464111,
"learning_rate": 1.55e-05,
"loss": 0.4946,
"step": 84500
},
{
"epoch": 55.12067840834964,
"eval_accuracy": 0.8396828466447496,
"eval_loss": 0.7965431213378906,
"eval_runtime": 96.0388,
"eval_samples_per_second": 127.678,
"eval_steps_per_second": 3.998,
"step": 84500
},
{
"epoch": 55.446836268754076,
"grad_norm": 2.5157694816589355,
"learning_rate": 1.5e-05,
"loss": 0.4886,
"step": 85000
},
{
"epoch": 55.446836268754076,
"eval_accuracy": 0.8387194828110288,
"eval_loss": 0.8012397289276123,
"eval_runtime": 96.2067,
"eval_samples_per_second": 127.455,
"eval_steps_per_second": 3.991,
"step": 85000
},
{
"epoch": 55.77299412915851,
"grad_norm": 2.7158167362213135,
"learning_rate": 1.45e-05,
"loss": 0.484,
"step": 85500
},
{
"epoch": 55.77299412915851,
"eval_accuracy": 0.8395588257109374,
"eval_loss": 0.7894542813301086,
"eval_runtime": 95.6318,
"eval_samples_per_second": 128.221,
"eval_steps_per_second": 4.015,
"step": 85500
},
{
"epoch": 56.09915198956295,
"grad_norm": 3.833855390548706,
"learning_rate": 1.4000000000000001e-05,
"loss": 0.4921,
"step": 86000
},
{
"epoch": 56.09915198956295,
"eval_accuracy": 0.8383367532042704,
"eval_loss": 0.8041108250617981,
"eval_runtime": 95.6095,
"eval_samples_per_second": 128.251,
"eval_steps_per_second": 4.016,
"step": 86000
},
{
"epoch": 56.425309849967384,
"grad_norm": 3.353053331375122,
"learning_rate": 1.3500000000000001e-05,
"loss": 0.4866,
"step": 86500
},
{
"epoch": 56.425309849967384,
"eval_accuracy": 0.8387854971453993,
"eval_loss": 0.7873290181159973,
"eval_runtime": 96.098,
"eval_samples_per_second": 127.599,
"eval_steps_per_second": 3.996,
"step": 86500
},
{
"epoch": 56.75146771037182,
"grad_norm": 3.2117395401000977,
"learning_rate": 1.3000000000000001e-05,
"loss": 0.4878,
"step": 87000
},
{
"epoch": 56.75146771037182,
"eval_accuracy": 0.8396631660025774,
"eval_loss": 0.7974265813827515,
"eval_runtime": 95.6357,
"eval_samples_per_second": 128.216,
"eval_steps_per_second": 4.015,
"step": 87000
},
{
"epoch": 57.077625570776256,
"grad_norm": 3.84195613861084,
"learning_rate": 1.25e-05,
"loss": 0.4757,
"step": 87500
},
{
"epoch": 57.077625570776256,
"eval_accuracy": 0.8371895013512013,
"eval_loss": 0.8067038655281067,
"eval_runtime": 96.1865,
"eval_samples_per_second": 127.481,
"eval_steps_per_second": 3.992,
"step": 87500
},
{
"epoch": 57.40378343118069,
"grad_norm": 2.6807780265808105,
"learning_rate": 1.2e-05,
"loss": 0.4793,
"step": 88000
},
{
"epoch": 57.40378343118069,
"eval_accuracy": 0.840466263302501,
"eval_loss": 0.7817407250404358,
"eval_runtime": 96.1004,
"eval_samples_per_second": 127.596,
"eval_steps_per_second": 3.996,
"step": 88000
}
],
"logging_steps": 500,
"max_steps": 100000,
"num_input_tokens_seen": 0,
"num_train_epochs": 66,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.419534048504054e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}