pixel-intermediate / last-checkpoint /trainer_state.json
Nadav's picture
Training in progress, step 100000
4ff1820
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 8.506294658046954,
"global_step": 100000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04,
"learning_rate": 9.999999999999999e-06,
"loss": 0.5192,
"step": 500
},
{
"epoch": 0.09,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4978,
"step": 1000
},
{
"epoch": 0.09,
"eval_loss": 0.4652232229709625,
"eval_runtime": 21.3928,
"eval_samples_per_second": 23.372,
"eval_steps_per_second": 0.748,
"step": 1000
},
{
"epoch": 0.13,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4877,
"step": 1500
},
{
"epoch": 0.17,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4811,
"step": 2000
},
{
"epoch": 0.17,
"eval_loss": 0.4523410201072693,
"eval_runtime": 15.3182,
"eval_samples_per_second": 32.641,
"eval_steps_per_second": 1.045,
"step": 2000
},
{
"epoch": 0.21,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4747,
"step": 2500
},
{
"epoch": 0.26,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4714,
"step": 3000
},
{
"epoch": 0.26,
"eval_loss": 0.44367074966430664,
"eval_runtime": 16.026,
"eval_samples_per_second": 31.199,
"eval_steps_per_second": 0.998,
"step": 3000
},
{
"epoch": 0.3,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4671,
"step": 3500
},
{
"epoch": 0.34,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4648,
"step": 4000
},
{
"epoch": 0.34,
"eval_loss": 0.4375583827495575,
"eval_runtime": 16.9713,
"eval_samples_per_second": 29.461,
"eval_steps_per_second": 0.943,
"step": 4000
},
{
"epoch": 0.38,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4628,
"step": 4500
},
{
"epoch": 0.43,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4611,
"step": 5000
},
{
"epoch": 0.43,
"eval_loss": 0.4329264163970947,
"eval_runtime": 20.3173,
"eval_samples_per_second": 24.61,
"eval_steps_per_second": 0.788,
"step": 5000
},
{
"epoch": 0.47,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4588,
"step": 5500
},
{
"epoch": 0.51,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4567,
"step": 6000
},
{
"epoch": 0.51,
"eval_loss": 0.4276145100593567,
"eval_runtime": 16.3756,
"eval_samples_per_second": 30.533,
"eval_steps_per_second": 0.977,
"step": 6000
},
{
"epoch": 0.55,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4549,
"step": 6500
},
{
"epoch": 0.6,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4527,
"step": 7000
},
{
"epoch": 0.6,
"eval_loss": 0.42289844155311584,
"eval_runtime": 15.9391,
"eval_samples_per_second": 31.369,
"eval_steps_per_second": 1.004,
"step": 7000
},
{
"epoch": 0.64,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4523,
"step": 7500
},
{
"epoch": 0.68,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4504,
"step": 8000
},
{
"epoch": 0.68,
"eval_loss": 0.4213045537471771,
"eval_runtime": 15.5457,
"eval_samples_per_second": 32.163,
"eval_steps_per_second": 1.029,
"step": 8000
},
{
"epoch": 0.72,
"learning_rate": 9.999999999999999e-06,
"loss": 0.449,
"step": 8500
},
{
"epoch": 0.77,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4473,
"step": 9000
},
{
"epoch": 0.77,
"eval_loss": 0.41637736558914185,
"eval_runtime": 15.7487,
"eval_samples_per_second": 31.749,
"eval_steps_per_second": 1.016,
"step": 9000
},
{
"epoch": 0.81,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4468,
"step": 9500
},
{
"epoch": 0.85,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4447,
"step": 10000
},
{
"epoch": 0.85,
"eval_loss": 0.4148881733417511,
"eval_runtime": 15.3622,
"eval_samples_per_second": 32.547,
"eval_steps_per_second": 1.042,
"step": 10000
},
{
"epoch": 0.89,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4437,
"step": 10500
},
{
"epoch": 0.94,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4433,
"step": 11000
},
{
"epoch": 0.94,
"eval_loss": 0.4144207835197449,
"eval_runtime": 30.4128,
"eval_samples_per_second": 16.44,
"eval_steps_per_second": 0.526,
"step": 11000
},
{
"epoch": 0.98,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4431,
"step": 11500
},
{
"epoch": 1.02,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4413,
"step": 12000
},
{
"epoch": 1.02,
"eval_loss": 0.4120546877384186,
"eval_runtime": 14.9708,
"eval_samples_per_second": 33.398,
"eval_steps_per_second": 1.069,
"step": 12000
},
{
"epoch": 1.06,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4401,
"step": 12500
},
{
"epoch": 1.11,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4395,
"step": 13000
},
{
"epoch": 1.11,
"eval_loss": 0.40858784317970276,
"eval_runtime": 16.4691,
"eval_samples_per_second": 30.36,
"eval_steps_per_second": 0.972,
"step": 13000
},
{
"epoch": 1.15,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4411,
"step": 13500
},
{
"epoch": 1.19,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4391,
"step": 14000
},
{
"epoch": 1.19,
"eval_loss": 0.40859168767929077,
"eval_runtime": 16.6715,
"eval_samples_per_second": 29.991,
"eval_steps_per_second": 0.96,
"step": 14000
},
{
"epoch": 1.23,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4385,
"step": 14500
},
{
"epoch": 1.28,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4371,
"step": 15000
},
{
"epoch": 1.28,
"eval_loss": 0.4050961434841156,
"eval_runtime": 14.6709,
"eval_samples_per_second": 34.081,
"eval_steps_per_second": 1.091,
"step": 15000
},
{
"epoch": 1.32,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4358,
"step": 15500
},
{
"epoch": 1.36,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4363,
"step": 16000
},
{
"epoch": 1.36,
"eval_loss": 0.4048325717449188,
"eval_runtime": 16.0756,
"eval_samples_per_second": 31.103,
"eval_steps_per_second": 0.995,
"step": 16000
},
{
"epoch": 1.4,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4352,
"step": 16500
},
{
"epoch": 1.45,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4346,
"step": 17000
},
{
"epoch": 1.45,
"eval_loss": 0.4037468731403351,
"eval_runtime": 16.4235,
"eval_samples_per_second": 30.444,
"eval_steps_per_second": 0.974,
"step": 17000
},
{
"epoch": 1.49,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4336,
"step": 17500
},
{
"epoch": 1.53,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4335,
"step": 18000
},
{
"epoch": 1.53,
"eval_loss": 0.402103453874588,
"eval_runtime": 28.6118,
"eval_samples_per_second": 17.475,
"eval_steps_per_second": 0.559,
"step": 18000
},
{
"epoch": 1.57,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4325,
"step": 18500
},
{
"epoch": 1.62,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4319,
"step": 19000
},
{
"epoch": 1.62,
"eval_loss": 0.4030299186706543,
"eval_runtime": 16.452,
"eval_samples_per_second": 30.391,
"eval_steps_per_second": 0.973,
"step": 19000
},
{
"epoch": 1.66,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4311,
"step": 19500
},
{
"epoch": 1.7,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4317,
"step": 20000
},
{
"epoch": 1.7,
"eval_loss": 0.40188169479370117,
"eval_runtime": 15.416,
"eval_samples_per_second": 32.434,
"eval_steps_per_second": 1.038,
"step": 20000
},
{
"epoch": 1.74,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4313,
"step": 20500
},
{
"epoch": 1.79,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4296,
"step": 21000
},
{
"epoch": 1.79,
"eval_loss": 0.39878711104393005,
"eval_runtime": 16.1844,
"eval_samples_per_second": 30.894,
"eval_steps_per_second": 0.989,
"step": 21000
},
{
"epoch": 1.83,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4288,
"step": 21500
},
{
"epoch": 1.87,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4278,
"step": 22000
},
{
"epoch": 1.87,
"eval_loss": 0.3984658718109131,
"eval_runtime": 17.0912,
"eval_samples_per_second": 29.255,
"eval_steps_per_second": 0.936,
"step": 22000
},
{
"epoch": 1.91,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4278,
"step": 22500
},
{
"epoch": 1.96,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4276,
"step": 23000
},
{
"epoch": 1.96,
"eval_loss": 0.3981262743473053,
"eval_runtime": 16.5906,
"eval_samples_per_second": 30.138,
"eval_steps_per_second": 0.964,
"step": 23000
},
{
"epoch": 2.0,
"learning_rate": 9.999999999999999e-06,
"loss": 0.428,
"step": 23500
},
{
"epoch": 2.04,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4264,
"step": 24000
},
{
"epoch": 2.04,
"eval_loss": 0.39774054288864136,
"eval_runtime": 24.4452,
"eval_samples_per_second": 20.454,
"eval_steps_per_second": 0.655,
"step": 24000
},
{
"epoch": 2.08,
"learning_rate": 9.999999999999999e-06,
"loss": 0.427,
"step": 24500
},
{
"epoch": 2.13,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4267,
"step": 25000
},
{
"epoch": 2.13,
"eval_loss": 0.3962687849998474,
"eval_runtime": 16.5048,
"eval_samples_per_second": 30.294,
"eval_steps_per_second": 0.969,
"step": 25000
},
{
"epoch": 2.17,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4271,
"step": 25500
},
{
"epoch": 2.21,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4252,
"step": 26000
},
{
"epoch": 2.21,
"eval_loss": 0.3965121805667877,
"eval_runtime": 16.1623,
"eval_samples_per_second": 30.936,
"eval_steps_per_second": 0.99,
"step": 26000
},
{
"epoch": 2.25,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4255,
"step": 26500
},
{
"epoch": 2.3,
"learning_rate": 9.999999999999999e-06,
"loss": 0.425,
"step": 27000
},
{
"epoch": 2.3,
"eval_loss": 0.39477214217185974,
"eval_runtime": 15.7512,
"eval_samples_per_second": 31.744,
"eval_steps_per_second": 1.016,
"step": 27000
},
{
"epoch": 2.34,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4248,
"step": 27500
},
{
"epoch": 2.38,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4248,
"step": 28000
},
{
"epoch": 2.38,
"eval_loss": 0.395481139421463,
"eval_runtime": 15.4129,
"eval_samples_per_second": 32.44,
"eval_steps_per_second": 1.038,
"step": 28000
},
{
"epoch": 2.42,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4246,
"step": 28500
},
{
"epoch": 2.47,
"learning_rate": 9.999999999999999e-06,
"loss": 0.424,
"step": 29000
},
{
"epoch": 2.47,
"eval_loss": 0.3951389193534851,
"eval_runtime": 15.7676,
"eval_samples_per_second": 31.711,
"eval_steps_per_second": 1.015,
"step": 29000
},
{
"epoch": 2.51,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4241,
"step": 29500
},
{
"epoch": 2.55,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4234,
"step": 30000
},
{
"epoch": 2.55,
"eval_loss": 0.3956534266471863,
"eval_runtime": 15.8104,
"eval_samples_per_second": 31.625,
"eval_steps_per_second": 1.012,
"step": 30000
},
{
"epoch": 2.59,
"learning_rate": 9.999999999999999e-06,
"loss": 0.422,
"step": 30500
},
{
"epoch": 2.64,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4227,
"step": 31000
},
{
"epoch": 2.64,
"eval_loss": 0.3907557427883148,
"eval_runtime": 16.5808,
"eval_samples_per_second": 30.155,
"eval_steps_per_second": 0.965,
"step": 31000
},
{
"epoch": 2.68,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4213,
"step": 31500
},
{
"epoch": 2.72,
"learning_rate": 9.999999999999999e-06,
"loss": 0.421,
"step": 32000
},
{
"epoch": 2.72,
"eval_loss": 0.3934537172317505,
"eval_runtime": 24.4217,
"eval_samples_per_second": 20.474,
"eval_steps_per_second": 0.655,
"step": 32000
},
{
"epoch": 2.76,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4207,
"step": 32500
},
{
"epoch": 2.81,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4206,
"step": 33000
},
{
"epoch": 2.81,
"eval_loss": 0.3901897072792053,
"eval_runtime": 16.8693,
"eval_samples_per_second": 29.64,
"eval_steps_per_second": 0.948,
"step": 33000
},
{
"epoch": 2.85,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4202,
"step": 33500
},
{
"epoch": 2.89,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4196,
"step": 34000
},
{
"epoch": 2.89,
"eval_loss": 0.3905479609966278,
"eval_runtime": 16.5144,
"eval_samples_per_second": 30.277,
"eval_steps_per_second": 0.969,
"step": 34000
},
{
"epoch": 2.93,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4191,
"step": 34500
},
{
"epoch": 2.98,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4205,
"step": 35000
},
{
"epoch": 2.98,
"eval_loss": 0.390372633934021,
"eval_runtime": 16.8904,
"eval_samples_per_second": 29.603,
"eval_steps_per_second": 0.947,
"step": 35000
},
{
"epoch": 3.02,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4195,
"step": 35500
},
{
"epoch": 3.06,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4194,
"step": 36000
},
{
"epoch": 3.06,
"eval_loss": 0.38856348395347595,
"eval_runtime": 16.5028,
"eval_samples_per_second": 30.298,
"eval_steps_per_second": 0.97,
"step": 36000
},
{
"epoch": 3.1,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4193,
"step": 36500
},
{
"epoch": 3.15,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4208,
"step": 37000
},
{
"epoch": 3.15,
"eval_loss": 0.3889642059803009,
"eval_runtime": 28.0106,
"eval_samples_per_second": 17.85,
"eval_steps_per_second": 0.571,
"step": 37000
},
{
"epoch": 3.19,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4189,
"step": 37500
},
{
"epoch": 3.23,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4187,
"step": 38000
},
{
"epoch": 3.23,
"eval_loss": 0.3886989653110504,
"eval_runtime": 15.6007,
"eval_samples_per_second": 32.05,
"eval_steps_per_second": 1.026,
"step": 38000
},
{
"epoch": 3.27,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4181,
"step": 38500
},
{
"epoch": 3.32,
"learning_rate": 9.999999999999999e-06,
"loss": 0.417,
"step": 39000
},
{
"epoch": 3.32,
"eval_loss": 0.3878667950630188,
"eval_runtime": 14.893,
"eval_samples_per_second": 33.573,
"eval_steps_per_second": 1.074,
"step": 39000
},
{
"epoch": 3.36,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4176,
"step": 39500
},
{
"epoch": 3.4,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4164,
"step": 40000
},
{
"epoch": 3.4,
"eval_loss": 0.3855785131454468,
"eval_runtime": 15.2409,
"eval_samples_per_second": 32.806,
"eval_steps_per_second": 1.05,
"step": 40000
},
{
"epoch": 3.45,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4167,
"step": 40500
},
{
"epoch": 3.49,
"learning_rate": 9.999999999999999e-06,
"loss": 0.417,
"step": 41000
},
{
"epoch": 3.49,
"eval_loss": 0.38663551211357117,
"eval_runtime": 24.5074,
"eval_samples_per_second": 20.402,
"eval_steps_per_second": 0.653,
"step": 41000
},
{
"epoch": 3.53,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4168,
"step": 41500
},
{
"epoch": 3.57,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4159,
"step": 42000
},
{
"epoch": 3.57,
"eval_loss": 0.38440173864364624,
"eval_runtime": 30.9795,
"eval_samples_per_second": 16.14,
"eval_steps_per_second": 0.516,
"step": 42000
},
{
"epoch": 3.62,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4151,
"step": 42500
},
{
"epoch": 3.66,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4155,
"step": 43000
},
{
"epoch": 3.66,
"eval_loss": 0.3864738941192627,
"eval_runtime": 24.9969,
"eval_samples_per_second": 20.002,
"eval_steps_per_second": 0.64,
"step": 43000
},
{
"epoch": 3.7,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4157,
"step": 43500
},
{
"epoch": 3.74,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4158,
"step": 44000
},
{
"epoch": 3.74,
"eval_loss": 0.3862515091896057,
"eval_runtime": 28.5688,
"eval_samples_per_second": 17.502,
"eval_steps_per_second": 0.56,
"step": 44000
},
{
"epoch": 3.79,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4147,
"step": 44500
},
{
"epoch": 3.83,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4134,
"step": 45000
},
{
"epoch": 3.83,
"eval_loss": 0.38480713963508606,
"eval_runtime": 27.3513,
"eval_samples_per_second": 18.281,
"eval_steps_per_second": 0.585,
"step": 45000
},
{
"epoch": 3.87,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4147,
"step": 45500
},
{
"epoch": 3.91,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4145,
"step": 46000
},
{
"epoch": 3.91,
"eval_loss": 0.3854221701622009,
"eval_runtime": 27.205,
"eval_samples_per_second": 18.379,
"eval_steps_per_second": 0.588,
"step": 46000
},
{
"epoch": 3.96,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4149,
"step": 46500
},
{
"epoch": 4.0,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4143,
"step": 47000
},
{
"epoch": 4.0,
"eval_loss": 0.38265106081962585,
"eval_runtime": 26.169,
"eval_samples_per_second": 19.107,
"eval_steps_per_second": 0.611,
"step": 47000
},
{
"epoch": 4.04,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4129,
"step": 47500
},
{
"epoch": 4.08,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4144,
"step": 48000
},
{
"epoch": 4.08,
"eval_loss": 0.382869690656662,
"eval_runtime": 25.2103,
"eval_samples_per_second": 19.833,
"eval_steps_per_second": 0.635,
"step": 48000
},
{
"epoch": 4.13,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4131,
"step": 48500
},
{
"epoch": 4.17,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4147,
"step": 49000
},
{
"epoch": 4.17,
"eval_loss": 0.38291990756988525,
"eval_runtime": 36.6033,
"eval_samples_per_second": 13.66,
"eval_steps_per_second": 0.437,
"step": 49000
},
{
"epoch": 4.21,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4125,
"step": 49500
},
{
"epoch": 4.25,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4143,
"step": 50000
},
{
"epoch": 4.25,
"eval_loss": 0.3828723728656769,
"eval_runtime": 27.6434,
"eval_samples_per_second": 18.088,
"eval_steps_per_second": 0.579,
"step": 50000
},
{
"epoch": 4.3,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4127,
"step": 50500
},
{
"epoch": 4.34,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4131,
"step": 51000
},
{
"epoch": 4.34,
"eval_loss": 0.3833463191986084,
"eval_runtime": 50.874,
"eval_samples_per_second": 9.828,
"eval_steps_per_second": 0.315,
"step": 51000
},
{
"epoch": 4.38,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4129,
"step": 51500
},
{
"epoch": 4.42,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4129,
"step": 52000
},
{
"epoch": 4.42,
"eval_loss": 0.38282835483551025,
"eval_runtime": 18.4975,
"eval_samples_per_second": 27.031,
"eval_steps_per_second": 0.865,
"step": 52000
},
{
"epoch": 4.47,
"learning_rate": 9.999999999999999e-06,
"loss": 0.412,
"step": 52500
},
{
"epoch": 4.51,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4121,
"step": 53000
},
{
"epoch": 4.51,
"eval_loss": 0.3821110427379608,
"eval_runtime": 19.4342,
"eval_samples_per_second": 25.728,
"eval_steps_per_second": 0.823,
"step": 53000
},
{
"epoch": 4.55,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4112,
"step": 53500
},
{
"epoch": 4.59,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4116,
"step": 54000
},
{
"epoch": 4.59,
"eval_loss": 0.3829655051231384,
"eval_runtime": 21.2757,
"eval_samples_per_second": 23.501,
"eval_steps_per_second": 0.752,
"step": 54000
},
{
"epoch": 4.64,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4108,
"step": 54500
},
{
"epoch": 4.68,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4104,
"step": 55000
},
{
"epoch": 4.68,
"eval_loss": 0.3811788260936737,
"eval_runtime": 16.6571,
"eval_samples_per_second": 30.017,
"eval_steps_per_second": 0.961,
"step": 55000
},
{
"epoch": 4.72,
"learning_rate": 9.999999999999999e-06,
"loss": 0.411,
"step": 55500
},
{
"epoch": 4.76,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4108,
"step": 56000
},
{
"epoch": 4.76,
"eval_loss": 0.38048413395881653,
"eval_runtime": 27.2288,
"eval_samples_per_second": 18.363,
"eval_steps_per_second": 0.588,
"step": 56000
},
{
"epoch": 4.81,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4112,
"step": 56500
},
{
"epoch": 4.85,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4098,
"step": 57000
},
{
"epoch": 4.85,
"eval_loss": 0.3806820809841156,
"eval_runtime": 28.3585,
"eval_samples_per_second": 17.631,
"eval_steps_per_second": 0.564,
"step": 57000
},
{
"epoch": 4.89,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4083,
"step": 57500
},
{
"epoch": 4.93,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4097,
"step": 58000
},
{
"epoch": 4.93,
"eval_loss": 0.38133466243743896,
"eval_runtime": 31.8927,
"eval_samples_per_second": 15.678,
"eval_steps_per_second": 0.502,
"step": 58000
},
{
"epoch": 4.98,
"learning_rate": 9.999999999999999e-06,
"loss": 0.41,
"step": 58500
},
{
"epoch": 5.02,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4098,
"step": 59000
},
{
"epoch": 5.02,
"eval_loss": 0.380397766828537,
"eval_runtime": 29.3164,
"eval_samples_per_second": 17.055,
"eval_steps_per_second": 0.546,
"step": 59000
},
{
"epoch": 5.06,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4094,
"step": 59500
},
{
"epoch": 5.1,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4092,
"step": 60000
},
{
"epoch": 5.1,
"eval_loss": 0.38139721751213074,
"eval_runtime": 19.7764,
"eval_samples_per_second": 25.283,
"eval_steps_per_second": 0.809,
"step": 60000
},
{
"epoch": 5.15,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4108,
"step": 60500
},
{
"epoch": 5.19,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4094,
"step": 61000
},
{
"epoch": 5.19,
"eval_loss": 0.3796501159667969,
"eval_runtime": 18.1293,
"eval_samples_per_second": 27.58,
"eval_steps_per_second": 0.883,
"step": 61000
},
{
"epoch": 5.23,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4092,
"step": 61500
},
{
"epoch": 5.27,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4091,
"step": 62000
},
{
"epoch": 5.27,
"eval_loss": 0.3790924549102783,
"eval_runtime": 20.9048,
"eval_samples_per_second": 23.918,
"eval_steps_per_second": 0.765,
"step": 62000
},
{
"epoch": 5.32,
"learning_rate": 9.999999999999999e-06,
"loss": 0.408,
"step": 62500
},
{
"epoch": 5.36,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4102,
"step": 63000
},
{
"epoch": 5.36,
"eval_loss": 0.3805426061153412,
"eval_runtime": 27.4404,
"eval_samples_per_second": 18.221,
"eval_steps_per_second": 0.583,
"step": 63000
},
{
"epoch": 5.4,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4086,
"step": 63500
},
{
"epoch": 5.44,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4087,
"step": 64000
},
{
"epoch": 5.44,
"eval_loss": 0.37830984592437744,
"eval_runtime": 14.8851,
"eval_samples_per_second": 33.591,
"eval_steps_per_second": 1.075,
"step": 64000
},
{
"epoch": 5.49,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4081,
"step": 64500
},
{
"epoch": 5.53,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4083,
"step": 65000
},
{
"epoch": 5.53,
"eval_loss": 0.3796636164188385,
"eval_runtime": 17.3567,
"eval_samples_per_second": 28.807,
"eval_steps_per_second": 0.922,
"step": 65000
},
{
"epoch": 5.57,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4078,
"step": 65500
},
{
"epoch": 5.61,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4078,
"step": 66000
},
{
"epoch": 5.61,
"eval_loss": 0.3783106803894043,
"eval_runtime": 29.6676,
"eval_samples_per_second": 16.853,
"eval_steps_per_second": 0.539,
"step": 66000
},
{
"epoch": 5.66,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4067,
"step": 66500
},
{
"epoch": 5.7,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4072,
"step": 67000
},
{
"epoch": 5.7,
"eval_loss": 0.3780921399593353,
"eval_runtime": 15.2739,
"eval_samples_per_second": 32.736,
"eval_steps_per_second": 1.048,
"step": 67000
},
{
"epoch": 5.74,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4079,
"step": 67500
},
{
"epoch": 5.78,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4057,
"step": 68000
},
{
"epoch": 5.78,
"eval_loss": 0.37833890318870544,
"eval_runtime": 17.1263,
"eval_samples_per_second": 29.195,
"eval_steps_per_second": 0.934,
"step": 68000
},
{
"epoch": 5.83,
"learning_rate": 9.999999999999999e-06,
"loss": 0.406,
"step": 68500
},
{
"epoch": 5.87,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4065,
"step": 69000
},
{
"epoch": 5.87,
"eval_loss": 0.37815991044044495,
"eval_runtime": 19.0772,
"eval_samples_per_second": 26.209,
"eval_steps_per_second": 0.839,
"step": 69000
},
{
"epoch": 5.91,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4063,
"step": 69500
},
{
"epoch": 5.95,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4062,
"step": 70000
},
{
"epoch": 5.95,
"eval_loss": 0.3770570158958435,
"eval_runtime": 15.6266,
"eval_samples_per_second": 31.997,
"eval_steps_per_second": 1.024,
"step": 70000
},
{
"epoch": 6.0,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4073,
"step": 70500
},
{
"epoch": 6.04,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4051,
"step": 71000
},
{
"epoch": 6.04,
"eval_loss": 0.3775251507759094,
"eval_runtime": 16.0318,
"eval_samples_per_second": 31.188,
"eval_steps_per_second": 0.998,
"step": 71000
},
{
"epoch": 6.08,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4066,
"step": 71500
},
{
"epoch": 6.12,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4057,
"step": 72000
},
{
"epoch": 6.12,
"eval_loss": 0.37701237201690674,
"eval_runtime": 15.6982,
"eval_samples_per_second": 31.851,
"eval_steps_per_second": 1.019,
"step": 72000
},
{
"epoch": 6.17,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4067,
"step": 72500
},
{
"epoch": 6.21,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4061,
"step": 73000
},
{
"epoch": 6.21,
"eval_loss": 0.37806421518325806,
"eval_runtime": 15.7852,
"eval_samples_per_second": 31.675,
"eval_steps_per_second": 1.014,
"step": 73000
},
{
"epoch": 6.25,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4053,
"step": 73500
},
{
"epoch": 6.29,
"learning_rate": 9.999999999999999e-06,
"loss": 0.405,
"step": 74000
},
{
"epoch": 6.29,
"eval_loss": 0.3771826922893524,
"eval_runtime": 15.5158,
"eval_samples_per_second": 32.225,
"eval_steps_per_second": 1.031,
"step": 74000
},
{
"epoch": 6.34,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4064,
"step": 74500
},
{
"epoch": 6.38,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4053,
"step": 75000
},
{
"epoch": 6.38,
"eval_loss": 0.377290278673172,
"eval_runtime": 23.3698,
"eval_samples_per_second": 21.395,
"eval_steps_per_second": 0.685,
"step": 75000
},
{
"epoch": 6.42,
"learning_rate": 9.999999999999999e-06,
"loss": 0.406,
"step": 75500
},
{
"epoch": 6.46,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4054,
"step": 76000
},
{
"epoch": 6.46,
"eval_loss": 0.3762701749801636,
"eval_runtime": 15.2662,
"eval_samples_per_second": 32.752,
"eval_steps_per_second": 1.048,
"step": 76000
},
{
"epoch": 6.51,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4047,
"step": 76500
},
{
"epoch": 6.55,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4043,
"step": 77000
},
{
"epoch": 6.55,
"eval_loss": 0.3773665130138397,
"eval_runtime": 23.0339,
"eval_samples_per_second": 21.707,
"eval_steps_per_second": 0.695,
"step": 77000
},
{
"epoch": 6.59,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4041,
"step": 77500
},
{
"epoch": 6.63,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4044,
"step": 78000
},
{
"epoch": 6.63,
"eval_loss": 0.3738757371902466,
"eval_runtime": 16.5496,
"eval_samples_per_second": 30.212,
"eval_steps_per_second": 0.967,
"step": 78000
},
{
"epoch": 6.68,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4038,
"step": 78500
},
{
"epoch": 6.72,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4038,
"step": 79000
},
{
"epoch": 6.72,
"eval_loss": 0.37452879548072815,
"eval_runtime": 16.7684,
"eval_samples_per_second": 29.818,
"eval_steps_per_second": 0.954,
"step": 79000
},
{
"epoch": 6.76,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4039,
"step": 79500
},
{
"epoch": 6.81,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4045,
"step": 80000
},
{
"epoch": 6.81,
"eval_loss": 0.3761942684650421,
"eval_runtime": 16.6694,
"eval_samples_per_second": 29.995,
"eval_steps_per_second": 0.96,
"step": 80000
},
{
"epoch": 6.85,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4036,
"step": 80500
},
{
"epoch": 6.89,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4035,
"step": 81000
},
{
"epoch": 6.89,
"eval_loss": 0.3746860921382904,
"eval_runtime": 15.7109,
"eval_samples_per_second": 31.825,
"eval_steps_per_second": 1.018,
"step": 81000
},
{
"epoch": 6.93,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4037,
"step": 81500
},
{
"epoch": 6.98,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4045,
"step": 82000
},
{
"epoch": 6.98,
"eval_loss": 0.37363681197166443,
"eval_runtime": 22.9088,
"eval_samples_per_second": 21.826,
"eval_steps_per_second": 0.698,
"step": 82000
},
{
"epoch": 7.02,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4035,
"step": 82500
},
{
"epoch": 7.06,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4031,
"step": 83000
},
{
"epoch": 7.06,
"eval_loss": 0.37529370188713074,
"eval_runtime": 14.7314,
"eval_samples_per_second": 33.941,
"eval_steps_per_second": 1.086,
"step": 83000
},
{
"epoch": 7.1,
"learning_rate": 9.999999999999999e-06,
"loss": 0.402,
"step": 83500
},
{
"epoch": 7.15,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4042,
"step": 84000
},
{
"epoch": 7.15,
"eval_loss": 0.37475818395614624,
"eval_runtime": 15.8331,
"eval_samples_per_second": 31.579,
"eval_steps_per_second": 1.011,
"step": 84000
},
{
"epoch": 7.19,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4032,
"step": 84500
},
{
"epoch": 7.23,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4029,
"step": 85000
},
{
"epoch": 7.23,
"eval_loss": 0.3748987317085266,
"eval_runtime": 17.2956,
"eval_samples_per_second": 28.909,
"eval_steps_per_second": 0.925,
"step": 85000
},
{
"epoch": 7.27,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4034,
"step": 85500
},
{
"epoch": 7.32,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4029,
"step": 86000
},
{
"epoch": 7.32,
"eval_loss": 0.37344664335250854,
"eval_runtime": 15.9881,
"eval_samples_per_second": 31.273,
"eval_steps_per_second": 1.001,
"step": 86000
},
{
"epoch": 7.36,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4043,
"step": 86500
},
{
"epoch": 7.4,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4019,
"step": 87000
},
{
"epoch": 7.4,
"eval_loss": 0.3718353509902954,
"eval_runtime": 15.2483,
"eval_samples_per_second": 32.791,
"eval_steps_per_second": 1.049,
"step": 87000
},
{
"epoch": 7.44,
"learning_rate": 9.999999999999999e-06,
"loss": 0.403,
"step": 87500
},
{
"epoch": 7.49,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4023,
"step": 88000
},
{
"epoch": 7.49,
"eval_loss": 0.37316328287124634,
"eval_runtime": 26.8261,
"eval_samples_per_second": 18.639,
"eval_steps_per_second": 0.596,
"step": 88000
},
{
"epoch": 7.53,
"learning_rate": 9.999999999999999e-06,
"loss": 0.402,
"step": 88500
},
{
"epoch": 7.57,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4022,
"step": 89000
},
{
"epoch": 7.57,
"eval_loss": 0.3714210093021393,
"eval_runtime": 15.634,
"eval_samples_per_second": 31.982,
"eval_steps_per_second": 1.023,
"step": 89000
},
{
"epoch": 7.61,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4017,
"step": 89500
},
{
"epoch": 7.66,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4019,
"step": 90000
},
{
"epoch": 7.66,
"eval_loss": 0.3728122413158417,
"eval_runtime": 16.3123,
"eval_samples_per_second": 30.652,
"eval_steps_per_second": 0.981,
"step": 90000
},
{
"epoch": 7.7,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4017,
"step": 90500
},
{
"epoch": 7.74,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4016,
"step": 91000
},
{
"epoch": 7.74,
"eval_loss": 0.3734327256679535,
"eval_runtime": 17.5409,
"eval_samples_per_second": 28.505,
"eval_steps_per_second": 0.912,
"step": 91000
},
{
"epoch": 7.78,
"learning_rate": 9.999999999999999e-06,
"loss": 0.3998,
"step": 91500
},
{
"epoch": 7.83,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4006,
"step": 92000
},
{
"epoch": 7.83,
"eval_loss": 0.3747243583202362,
"eval_runtime": 17.4755,
"eval_samples_per_second": 28.611,
"eval_steps_per_second": 0.916,
"step": 92000
},
{
"epoch": 7.87,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4013,
"step": 92500
},
{
"epoch": 7.91,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4008,
"step": 93000
},
{
"epoch": 7.91,
"eval_loss": 0.37303251028060913,
"eval_runtime": 17.0856,
"eval_samples_per_second": 29.264,
"eval_steps_per_second": 0.936,
"step": 93000
},
{
"epoch": 7.95,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4008,
"step": 93500
},
{
"epoch": 8.0,
"learning_rate": 9.999999999999999e-06,
"loss": 0.402,
"step": 94000
},
{
"epoch": 8.0,
"eval_loss": 0.37281692028045654,
"eval_runtime": 17.9894,
"eval_samples_per_second": 27.794,
"eval_steps_per_second": 0.889,
"step": 94000
},
{
"epoch": 8.04,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4005,
"step": 94500
},
{
"epoch": 8.08,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4008,
"step": 95000
},
{
"epoch": 8.08,
"eval_loss": 0.37092164158821106,
"eval_runtime": 17.2285,
"eval_samples_per_second": 29.022,
"eval_steps_per_second": 0.929,
"step": 95000
},
{
"epoch": 8.12,
"learning_rate": 9.999999999999999e-06,
"loss": 0.3997,
"step": 95500
},
{
"epoch": 8.17,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4024,
"step": 96000
},
{
"epoch": 8.17,
"eval_loss": 0.37120264768600464,
"eval_runtime": 16.2125,
"eval_samples_per_second": 30.84,
"eval_steps_per_second": 0.987,
"step": 96000
},
{
"epoch": 8.21,
"learning_rate": 9.999999999999999e-06,
"loss": 0.3997,
"step": 96500
},
{
"epoch": 8.25,
"learning_rate": 9.999999999999999e-06,
"loss": 0.402,
"step": 97000
},
{
"epoch": 8.25,
"eval_loss": 0.37261128425598145,
"eval_runtime": 16.3463,
"eval_samples_per_second": 30.588,
"eval_steps_per_second": 0.979,
"step": 97000
},
{
"epoch": 8.29,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4004,
"step": 97500
},
{
"epoch": 8.34,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4003,
"step": 98000
},
{
"epoch": 8.34,
"eval_loss": 0.37027257680892944,
"eval_runtime": 20.0807,
"eval_samples_per_second": 24.9,
"eval_steps_per_second": 0.797,
"step": 98000
},
{
"epoch": 8.38,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4002,
"step": 98500
},
{
"epoch": 8.42,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4007,
"step": 99000
},
{
"epoch": 8.42,
"eval_loss": 0.37140411138534546,
"eval_runtime": 21.4309,
"eval_samples_per_second": 23.331,
"eval_steps_per_second": 0.747,
"step": 99000
},
{
"epoch": 8.46,
"learning_rate": 9.999999999999999e-06,
"loss": 0.4,
"step": 99500
},
{
"epoch": 8.51,
"learning_rate": 9.999999999999999e-06,
"loss": 0.3997,
"step": 100000
},
{
"epoch": 8.51,
"eval_loss": 0.3693406283855438,
"eval_runtime": 17.2208,
"eval_samples_per_second": 29.035,
"eval_steps_per_second": 0.929,
"step": 100000
}
],
"max_steps": 1000000,
"num_train_epochs": 86,
"total_flos": 4.600200440697905e+21,
"trial_name": null,
"trial_params": null
}