{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.506294658046954, "global_step": 100000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "learning_rate": 9.999999999999999e-06, "loss": 0.5192, "step": 500 }, { "epoch": 0.09, "learning_rate": 9.999999999999999e-06, "loss": 0.4978, "step": 1000 }, { "epoch": 0.09, "eval_loss": 0.4652232229709625, "eval_runtime": 21.3928, "eval_samples_per_second": 23.372, "eval_steps_per_second": 0.748, "step": 1000 }, { "epoch": 0.13, "learning_rate": 9.999999999999999e-06, "loss": 0.4877, "step": 1500 }, { "epoch": 0.17, "learning_rate": 9.999999999999999e-06, "loss": 0.4811, "step": 2000 }, { "epoch": 0.17, "eval_loss": 0.4523410201072693, "eval_runtime": 15.3182, "eval_samples_per_second": 32.641, "eval_steps_per_second": 1.045, "step": 2000 }, { "epoch": 0.21, "learning_rate": 9.999999999999999e-06, "loss": 0.4747, "step": 2500 }, { "epoch": 0.26, "learning_rate": 9.999999999999999e-06, "loss": 0.4714, "step": 3000 }, { "epoch": 0.26, "eval_loss": 0.44367074966430664, "eval_runtime": 16.026, "eval_samples_per_second": 31.199, "eval_steps_per_second": 0.998, "step": 3000 }, { "epoch": 0.3, "learning_rate": 9.999999999999999e-06, "loss": 0.4671, "step": 3500 }, { "epoch": 0.34, "learning_rate": 9.999999999999999e-06, "loss": 0.4648, "step": 4000 }, { "epoch": 0.34, "eval_loss": 0.4375583827495575, "eval_runtime": 16.9713, "eval_samples_per_second": 29.461, "eval_steps_per_second": 0.943, "step": 4000 }, { "epoch": 0.38, "learning_rate": 9.999999999999999e-06, "loss": 0.4628, "step": 4500 }, { "epoch": 0.43, "learning_rate": 9.999999999999999e-06, "loss": 0.4611, "step": 5000 }, { "epoch": 0.43, "eval_loss": 0.4329264163970947, "eval_runtime": 20.3173, "eval_samples_per_second": 24.61, "eval_steps_per_second": 0.788, "step": 5000 }, { "epoch": 0.47, "learning_rate": 9.999999999999999e-06, "loss": 0.4588, "step": 5500 }, { "epoch": 0.51, "learning_rate": 9.999999999999999e-06, "loss": 0.4567, "step": 6000 }, { "epoch": 0.51, "eval_loss": 0.4276145100593567, "eval_runtime": 16.3756, "eval_samples_per_second": 30.533, "eval_steps_per_second": 0.977, "step": 6000 }, { "epoch": 0.55, "learning_rate": 9.999999999999999e-06, "loss": 0.4549, "step": 6500 }, { "epoch": 0.6, "learning_rate": 9.999999999999999e-06, "loss": 0.4527, "step": 7000 }, { "epoch": 0.6, "eval_loss": 0.42289844155311584, "eval_runtime": 15.9391, "eval_samples_per_second": 31.369, "eval_steps_per_second": 1.004, "step": 7000 }, { "epoch": 0.64, "learning_rate": 9.999999999999999e-06, "loss": 0.4523, "step": 7500 }, { "epoch": 0.68, "learning_rate": 9.999999999999999e-06, "loss": 0.4504, "step": 8000 }, { "epoch": 0.68, "eval_loss": 0.4213045537471771, "eval_runtime": 15.5457, "eval_samples_per_second": 32.163, "eval_steps_per_second": 1.029, "step": 8000 }, { "epoch": 0.72, "learning_rate": 9.999999999999999e-06, "loss": 0.449, "step": 8500 }, { "epoch": 0.77, "learning_rate": 9.999999999999999e-06, "loss": 0.4473, "step": 9000 }, { "epoch": 0.77, "eval_loss": 0.41637736558914185, "eval_runtime": 15.7487, "eval_samples_per_second": 31.749, "eval_steps_per_second": 1.016, "step": 9000 }, { "epoch": 0.81, "learning_rate": 9.999999999999999e-06, "loss": 0.4468, "step": 9500 }, { "epoch": 0.85, "learning_rate": 9.999999999999999e-06, "loss": 0.4447, "step": 10000 }, { "epoch": 0.85, "eval_loss": 0.4148881733417511, "eval_runtime": 15.3622, "eval_samples_per_second": 32.547, "eval_steps_per_second": 1.042, "step": 10000 }, { "epoch": 0.89, "learning_rate": 9.999999999999999e-06, "loss": 0.4437, "step": 10500 }, { "epoch": 0.94, "learning_rate": 9.999999999999999e-06, "loss": 0.4433, "step": 11000 }, { "epoch": 0.94, "eval_loss": 0.4144207835197449, "eval_runtime": 30.4128, "eval_samples_per_second": 16.44, "eval_steps_per_second": 0.526, "step": 11000 }, { "epoch": 0.98, "learning_rate": 9.999999999999999e-06, "loss": 0.4431, "step": 11500 }, { "epoch": 1.02, "learning_rate": 9.999999999999999e-06, "loss": 0.4413, "step": 12000 }, { "epoch": 1.02, "eval_loss": 0.4120546877384186, "eval_runtime": 14.9708, "eval_samples_per_second": 33.398, "eval_steps_per_second": 1.069, "step": 12000 }, { "epoch": 1.06, "learning_rate": 9.999999999999999e-06, "loss": 0.4401, "step": 12500 }, { "epoch": 1.11, "learning_rate": 9.999999999999999e-06, "loss": 0.4395, "step": 13000 }, { "epoch": 1.11, "eval_loss": 0.40858784317970276, "eval_runtime": 16.4691, "eval_samples_per_second": 30.36, "eval_steps_per_second": 0.972, "step": 13000 }, { "epoch": 1.15, "learning_rate": 9.999999999999999e-06, "loss": 0.4411, "step": 13500 }, { "epoch": 1.19, "learning_rate": 9.999999999999999e-06, "loss": 0.4391, "step": 14000 }, { "epoch": 1.19, "eval_loss": 0.40859168767929077, "eval_runtime": 16.6715, "eval_samples_per_second": 29.991, "eval_steps_per_second": 0.96, "step": 14000 }, { "epoch": 1.23, "learning_rate": 9.999999999999999e-06, "loss": 0.4385, "step": 14500 }, { "epoch": 1.28, "learning_rate": 9.999999999999999e-06, "loss": 0.4371, "step": 15000 }, { "epoch": 1.28, "eval_loss": 0.4050961434841156, "eval_runtime": 14.6709, "eval_samples_per_second": 34.081, "eval_steps_per_second": 1.091, "step": 15000 }, { "epoch": 1.32, "learning_rate": 9.999999999999999e-06, "loss": 0.4358, "step": 15500 }, { "epoch": 1.36, "learning_rate": 9.999999999999999e-06, "loss": 0.4363, "step": 16000 }, { "epoch": 1.36, "eval_loss": 0.4048325717449188, "eval_runtime": 16.0756, "eval_samples_per_second": 31.103, "eval_steps_per_second": 0.995, "step": 16000 }, { "epoch": 1.4, "learning_rate": 9.999999999999999e-06, "loss": 0.4352, "step": 16500 }, { "epoch": 1.45, "learning_rate": 9.999999999999999e-06, "loss": 0.4346, "step": 17000 }, { "epoch": 1.45, "eval_loss": 0.4037468731403351, "eval_runtime": 16.4235, "eval_samples_per_second": 30.444, "eval_steps_per_second": 0.974, "step": 17000 }, { "epoch": 1.49, "learning_rate": 9.999999999999999e-06, "loss": 0.4336, "step": 17500 }, { "epoch": 1.53, "learning_rate": 9.999999999999999e-06, "loss": 0.4335, "step": 18000 }, { "epoch": 1.53, "eval_loss": 0.402103453874588, "eval_runtime": 28.6118, "eval_samples_per_second": 17.475, "eval_steps_per_second": 0.559, "step": 18000 }, { "epoch": 1.57, "learning_rate": 9.999999999999999e-06, "loss": 0.4325, "step": 18500 }, { "epoch": 1.62, "learning_rate": 9.999999999999999e-06, "loss": 0.4319, "step": 19000 }, { "epoch": 1.62, "eval_loss": 0.4030299186706543, "eval_runtime": 16.452, "eval_samples_per_second": 30.391, "eval_steps_per_second": 0.973, "step": 19000 }, { "epoch": 1.66, "learning_rate": 9.999999999999999e-06, "loss": 0.4311, "step": 19500 }, { "epoch": 1.7, "learning_rate": 9.999999999999999e-06, "loss": 0.4317, "step": 20000 }, { "epoch": 1.7, "eval_loss": 0.40188169479370117, "eval_runtime": 15.416, "eval_samples_per_second": 32.434, "eval_steps_per_second": 1.038, "step": 20000 }, { "epoch": 1.74, "learning_rate": 9.999999999999999e-06, "loss": 0.4313, "step": 20500 }, { "epoch": 1.79, "learning_rate": 9.999999999999999e-06, "loss": 0.4296, "step": 21000 }, { "epoch": 1.79, "eval_loss": 0.39878711104393005, "eval_runtime": 16.1844, "eval_samples_per_second": 30.894, "eval_steps_per_second": 0.989, "step": 21000 }, { "epoch": 1.83, "learning_rate": 9.999999999999999e-06, "loss": 0.4288, "step": 21500 }, { "epoch": 1.87, "learning_rate": 9.999999999999999e-06, "loss": 0.4278, "step": 22000 }, { "epoch": 1.87, "eval_loss": 0.3984658718109131, "eval_runtime": 17.0912, "eval_samples_per_second": 29.255, "eval_steps_per_second": 0.936, "step": 22000 }, { "epoch": 1.91, "learning_rate": 9.999999999999999e-06, "loss": 0.4278, "step": 22500 }, { "epoch": 1.96, "learning_rate": 9.999999999999999e-06, "loss": 0.4276, "step": 23000 }, { "epoch": 1.96, "eval_loss": 0.3981262743473053, "eval_runtime": 16.5906, "eval_samples_per_second": 30.138, "eval_steps_per_second": 0.964, "step": 23000 }, { "epoch": 2.0, "learning_rate": 9.999999999999999e-06, "loss": 0.428, "step": 23500 }, { "epoch": 2.04, "learning_rate": 9.999999999999999e-06, "loss": 0.4264, "step": 24000 }, { "epoch": 2.04, "eval_loss": 0.39774054288864136, "eval_runtime": 24.4452, "eval_samples_per_second": 20.454, "eval_steps_per_second": 0.655, "step": 24000 }, { "epoch": 2.08, "learning_rate": 9.999999999999999e-06, "loss": 0.427, "step": 24500 }, { "epoch": 2.13, "learning_rate": 9.999999999999999e-06, "loss": 0.4267, "step": 25000 }, { "epoch": 2.13, "eval_loss": 0.3962687849998474, "eval_runtime": 16.5048, "eval_samples_per_second": 30.294, "eval_steps_per_second": 0.969, "step": 25000 }, { "epoch": 2.17, "learning_rate": 9.999999999999999e-06, "loss": 0.4271, "step": 25500 }, { "epoch": 2.21, "learning_rate": 9.999999999999999e-06, "loss": 0.4252, "step": 26000 }, { "epoch": 2.21, "eval_loss": 0.3965121805667877, "eval_runtime": 16.1623, "eval_samples_per_second": 30.936, "eval_steps_per_second": 0.99, "step": 26000 }, { "epoch": 2.25, "learning_rate": 9.999999999999999e-06, "loss": 0.4255, "step": 26500 }, { "epoch": 2.3, "learning_rate": 9.999999999999999e-06, "loss": 0.425, "step": 27000 }, { "epoch": 2.3, "eval_loss": 0.39477214217185974, "eval_runtime": 15.7512, "eval_samples_per_second": 31.744, "eval_steps_per_second": 1.016, "step": 27000 }, { "epoch": 2.34, "learning_rate": 9.999999999999999e-06, "loss": 0.4248, "step": 27500 }, { "epoch": 2.38, "learning_rate": 9.999999999999999e-06, "loss": 0.4248, "step": 28000 }, { "epoch": 2.38, "eval_loss": 0.395481139421463, "eval_runtime": 15.4129, "eval_samples_per_second": 32.44, "eval_steps_per_second": 1.038, "step": 28000 }, { "epoch": 2.42, "learning_rate": 9.999999999999999e-06, "loss": 0.4246, "step": 28500 }, { "epoch": 2.47, "learning_rate": 9.999999999999999e-06, "loss": 0.424, "step": 29000 }, { "epoch": 2.47, "eval_loss": 0.3951389193534851, "eval_runtime": 15.7676, "eval_samples_per_second": 31.711, "eval_steps_per_second": 1.015, "step": 29000 }, { "epoch": 2.51, "learning_rate": 9.999999999999999e-06, "loss": 0.4241, "step": 29500 }, { "epoch": 2.55, "learning_rate": 9.999999999999999e-06, "loss": 0.4234, "step": 30000 }, { "epoch": 2.55, "eval_loss": 0.3956534266471863, "eval_runtime": 15.8104, "eval_samples_per_second": 31.625, "eval_steps_per_second": 1.012, "step": 30000 }, { "epoch": 2.59, "learning_rate": 9.999999999999999e-06, "loss": 0.422, "step": 30500 }, { "epoch": 2.64, "learning_rate": 9.999999999999999e-06, "loss": 0.4227, "step": 31000 }, { "epoch": 2.64, "eval_loss": 0.3907557427883148, "eval_runtime": 16.5808, "eval_samples_per_second": 30.155, "eval_steps_per_second": 0.965, "step": 31000 }, { "epoch": 2.68, "learning_rate": 9.999999999999999e-06, "loss": 0.4213, "step": 31500 }, { "epoch": 2.72, "learning_rate": 9.999999999999999e-06, "loss": 0.421, "step": 32000 }, { "epoch": 2.72, "eval_loss": 0.3934537172317505, "eval_runtime": 24.4217, "eval_samples_per_second": 20.474, "eval_steps_per_second": 0.655, "step": 32000 }, { "epoch": 2.76, "learning_rate": 9.999999999999999e-06, "loss": 0.4207, "step": 32500 }, { "epoch": 2.81, "learning_rate": 9.999999999999999e-06, "loss": 0.4206, "step": 33000 }, { "epoch": 2.81, "eval_loss": 0.3901897072792053, "eval_runtime": 16.8693, "eval_samples_per_second": 29.64, "eval_steps_per_second": 0.948, "step": 33000 }, { "epoch": 2.85, "learning_rate": 9.999999999999999e-06, "loss": 0.4202, "step": 33500 }, { "epoch": 2.89, "learning_rate": 9.999999999999999e-06, "loss": 0.4196, "step": 34000 }, { "epoch": 2.89, "eval_loss": 0.3905479609966278, "eval_runtime": 16.5144, "eval_samples_per_second": 30.277, "eval_steps_per_second": 0.969, "step": 34000 }, { "epoch": 2.93, "learning_rate": 9.999999999999999e-06, "loss": 0.4191, "step": 34500 }, { "epoch": 2.98, "learning_rate": 9.999999999999999e-06, "loss": 0.4205, "step": 35000 }, { "epoch": 2.98, "eval_loss": 0.390372633934021, "eval_runtime": 16.8904, "eval_samples_per_second": 29.603, "eval_steps_per_second": 0.947, "step": 35000 }, { "epoch": 3.02, "learning_rate": 9.999999999999999e-06, "loss": 0.4195, "step": 35500 }, { "epoch": 3.06, "learning_rate": 9.999999999999999e-06, "loss": 0.4194, "step": 36000 }, { "epoch": 3.06, "eval_loss": 0.38856348395347595, "eval_runtime": 16.5028, "eval_samples_per_second": 30.298, "eval_steps_per_second": 0.97, "step": 36000 }, { "epoch": 3.1, "learning_rate": 9.999999999999999e-06, "loss": 0.4193, "step": 36500 }, { "epoch": 3.15, "learning_rate": 9.999999999999999e-06, "loss": 0.4208, "step": 37000 }, { "epoch": 3.15, "eval_loss": 0.3889642059803009, "eval_runtime": 28.0106, "eval_samples_per_second": 17.85, "eval_steps_per_second": 0.571, "step": 37000 }, { "epoch": 3.19, "learning_rate": 9.999999999999999e-06, "loss": 0.4189, "step": 37500 }, { "epoch": 3.23, "learning_rate": 9.999999999999999e-06, "loss": 0.4187, "step": 38000 }, { "epoch": 3.23, "eval_loss": 0.3886989653110504, "eval_runtime": 15.6007, "eval_samples_per_second": 32.05, "eval_steps_per_second": 1.026, "step": 38000 }, { "epoch": 3.27, "learning_rate": 9.999999999999999e-06, "loss": 0.4181, "step": 38500 }, { "epoch": 3.32, "learning_rate": 9.999999999999999e-06, "loss": 0.417, "step": 39000 }, { "epoch": 3.32, "eval_loss": 0.3878667950630188, "eval_runtime": 14.893, "eval_samples_per_second": 33.573, "eval_steps_per_second": 1.074, "step": 39000 }, { "epoch": 3.36, "learning_rate": 9.999999999999999e-06, "loss": 0.4176, "step": 39500 }, { "epoch": 3.4, "learning_rate": 9.999999999999999e-06, "loss": 0.4164, "step": 40000 }, { "epoch": 3.4, "eval_loss": 0.3855785131454468, "eval_runtime": 15.2409, "eval_samples_per_second": 32.806, "eval_steps_per_second": 1.05, "step": 40000 }, { "epoch": 3.45, "learning_rate": 9.999999999999999e-06, "loss": 0.4167, "step": 40500 }, { "epoch": 3.49, "learning_rate": 9.999999999999999e-06, "loss": 0.417, "step": 41000 }, { "epoch": 3.49, "eval_loss": 0.38663551211357117, "eval_runtime": 24.5074, "eval_samples_per_second": 20.402, "eval_steps_per_second": 0.653, "step": 41000 }, { "epoch": 3.53, "learning_rate": 9.999999999999999e-06, "loss": 0.4168, "step": 41500 }, { "epoch": 3.57, "learning_rate": 9.999999999999999e-06, "loss": 0.4159, "step": 42000 }, { "epoch": 3.57, "eval_loss": 0.38440173864364624, "eval_runtime": 30.9795, "eval_samples_per_second": 16.14, "eval_steps_per_second": 0.516, "step": 42000 }, { "epoch": 3.62, "learning_rate": 9.999999999999999e-06, "loss": 0.4151, "step": 42500 }, { "epoch": 3.66, "learning_rate": 9.999999999999999e-06, "loss": 0.4155, "step": 43000 }, { "epoch": 3.66, "eval_loss": 0.3864738941192627, "eval_runtime": 24.9969, "eval_samples_per_second": 20.002, "eval_steps_per_second": 0.64, "step": 43000 }, { "epoch": 3.7, "learning_rate": 9.999999999999999e-06, "loss": 0.4157, "step": 43500 }, { "epoch": 3.74, "learning_rate": 9.999999999999999e-06, "loss": 0.4158, "step": 44000 }, { "epoch": 3.74, "eval_loss": 0.3862515091896057, "eval_runtime": 28.5688, "eval_samples_per_second": 17.502, "eval_steps_per_second": 0.56, "step": 44000 }, { "epoch": 3.79, "learning_rate": 9.999999999999999e-06, "loss": 0.4147, "step": 44500 }, { "epoch": 3.83, "learning_rate": 9.999999999999999e-06, "loss": 0.4134, "step": 45000 }, { "epoch": 3.83, "eval_loss": 0.38480713963508606, "eval_runtime": 27.3513, "eval_samples_per_second": 18.281, "eval_steps_per_second": 0.585, "step": 45000 }, { "epoch": 3.87, "learning_rate": 9.999999999999999e-06, "loss": 0.4147, "step": 45500 }, { "epoch": 3.91, "learning_rate": 9.999999999999999e-06, "loss": 0.4145, "step": 46000 }, { "epoch": 3.91, "eval_loss": 0.3854221701622009, "eval_runtime": 27.205, "eval_samples_per_second": 18.379, "eval_steps_per_second": 0.588, "step": 46000 }, { "epoch": 3.96, "learning_rate": 9.999999999999999e-06, "loss": 0.4149, "step": 46500 }, { "epoch": 4.0, "learning_rate": 9.999999999999999e-06, "loss": 0.4143, "step": 47000 }, { "epoch": 4.0, "eval_loss": 0.38265106081962585, "eval_runtime": 26.169, "eval_samples_per_second": 19.107, "eval_steps_per_second": 0.611, "step": 47000 }, { "epoch": 4.04, "learning_rate": 9.999999999999999e-06, "loss": 0.4129, "step": 47500 }, { "epoch": 4.08, "learning_rate": 9.999999999999999e-06, "loss": 0.4144, "step": 48000 }, { "epoch": 4.08, "eval_loss": 0.382869690656662, "eval_runtime": 25.2103, "eval_samples_per_second": 19.833, "eval_steps_per_second": 0.635, "step": 48000 }, { "epoch": 4.13, "learning_rate": 9.999999999999999e-06, "loss": 0.4131, "step": 48500 }, { "epoch": 4.17, "learning_rate": 9.999999999999999e-06, "loss": 0.4147, "step": 49000 }, { "epoch": 4.17, "eval_loss": 0.38291990756988525, "eval_runtime": 36.6033, "eval_samples_per_second": 13.66, "eval_steps_per_second": 0.437, "step": 49000 }, { "epoch": 4.21, "learning_rate": 9.999999999999999e-06, "loss": 0.4125, "step": 49500 }, { "epoch": 4.25, "learning_rate": 9.999999999999999e-06, "loss": 0.4143, "step": 50000 }, { "epoch": 4.25, "eval_loss": 0.3828723728656769, "eval_runtime": 27.6434, "eval_samples_per_second": 18.088, "eval_steps_per_second": 0.579, "step": 50000 }, { "epoch": 4.3, "learning_rate": 9.999999999999999e-06, "loss": 0.4127, "step": 50500 }, { "epoch": 4.34, "learning_rate": 9.999999999999999e-06, "loss": 0.4131, "step": 51000 }, { "epoch": 4.34, "eval_loss": 0.3833463191986084, "eval_runtime": 50.874, "eval_samples_per_second": 9.828, "eval_steps_per_second": 0.315, "step": 51000 }, { "epoch": 4.38, "learning_rate": 9.999999999999999e-06, "loss": 0.4129, "step": 51500 }, { "epoch": 4.42, "learning_rate": 9.999999999999999e-06, "loss": 0.4129, "step": 52000 }, { "epoch": 4.42, "eval_loss": 0.38282835483551025, "eval_runtime": 18.4975, "eval_samples_per_second": 27.031, "eval_steps_per_second": 0.865, "step": 52000 }, { "epoch": 4.47, "learning_rate": 9.999999999999999e-06, "loss": 0.412, "step": 52500 }, { "epoch": 4.51, "learning_rate": 9.999999999999999e-06, "loss": 0.4121, "step": 53000 }, { "epoch": 4.51, "eval_loss": 0.3821110427379608, "eval_runtime": 19.4342, "eval_samples_per_second": 25.728, "eval_steps_per_second": 0.823, "step": 53000 }, { "epoch": 4.55, "learning_rate": 9.999999999999999e-06, "loss": 0.4112, "step": 53500 }, { "epoch": 4.59, "learning_rate": 9.999999999999999e-06, "loss": 0.4116, "step": 54000 }, { "epoch": 4.59, "eval_loss": 0.3829655051231384, "eval_runtime": 21.2757, "eval_samples_per_second": 23.501, "eval_steps_per_second": 0.752, "step": 54000 }, { "epoch": 4.64, "learning_rate": 9.999999999999999e-06, "loss": 0.4108, "step": 54500 }, { "epoch": 4.68, "learning_rate": 9.999999999999999e-06, "loss": 0.4104, "step": 55000 }, { "epoch": 4.68, "eval_loss": 0.3811788260936737, "eval_runtime": 16.6571, "eval_samples_per_second": 30.017, "eval_steps_per_second": 0.961, "step": 55000 }, { "epoch": 4.72, "learning_rate": 9.999999999999999e-06, "loss": 0.411, "step": 55500 }, { "epoch": 4.76, "learning_rate": 9.999999999999999e-06, "loss": 0.4108, "step": 56000 }, { "epoch": 4.76, "eval_loss": 0.38048413395881653, "eval_runtime": 27.2288, "eval_samples_per_second": 18.363, "eval_steps_per_second": 0.588, "step": 56000 }, { "epoch": 4.81, "learning_rate": 9.999999999999999e-06, "loss": 0.4112, "step": 56500 }, { "epoch": 4.85, "learning_rate": 9.999999999999999e-06, "loss": 0.4098, "step": 57000 }, { "epoch": 4.85, "eval_loss": 0.3806820809841156, "eval_runtime": 28.3585, "eval_samples_per_second": 17.631, "eval_steps_per_second": 0.564, "step": 57000 }, { "epoch": 4.89, "learning_rate": 9.999999999999999e-06, "loss": 0.4083, "step": 57500 }, { "epoch": 4.93, "learning_rate": 9.999999999999999e-06, "loss": 0.4097, "step": 58000 }, { "epoch": 4.93, "eval_loss": 0.38133466243743896, "eval_runtime": 31.8927, "eval_samples_per_second": 15.678, "eval_steps_per_second": 0.502, "step": 58000 }, { "epoch": 4.98, "learning_rate": 9.999999999999999e-06, "loss": 0.41, "step": 58500 }, { "epoch": 5.02, "learning_rate": 9.999999999999999e-06, "loss": 0.4098, "step": 59000 }, { "epoch": 5.02, "eval_loss": 0.380397766828537, "eval_runtime": 29.3164, "eval_samples_per_second": 17.055, "eval_steps_per_second": 0.546, "step": 59000 }, { "epoch": 5.06, "learning_rate": 9.999999999999999e-06, "loss": 0.4094, "step": 59500 }, { "epoch": 5.1, "learning_rate": 9.999999999999999e-06, "loss": 0.4092, "step": 60000 }, { "epoch": 5.1, "eval_loss": 0.38139721751213074, "eval_runtime": 19.7764, "eval_samples_per_second": 25.283, "eval_steps_per_second": 0.809, "step": 60000 }, { "epoch": 5.15, "learning_rate": 9.999999999999999e-06, "loss": 0.4108, "step": 60500 }, { "epoch": 5.19, "learning_rate": 9.999999999999999e-06, "loss": 0.4094, "step": 61000 }, { "epoch": 5.19, "eval_loss": 0.3796501159667969, "eval_runtime": 18.1293, "eval_samples_per_second": 27.58, "eval_steps_per_second": 0.883, "step": 61000 }, { "epoch": 5.23, "learning_rate": 9.999999999999999e-06, "loss": 0.4092, "step": 61500 }, { "epoch": 5.27, "learning_rate": 9.999999999999999e-06, "loss": 0.4091, "step": 62000 }, { "epoch": 5.27, "eval_loss": 0.3790924549102783, "eval_runtime": 20.9048, "eval_samples_per_second": 23.918, "eval_steps_per_second": 0.765, "step": 62000 }, { "epoch": 5.32, "learning_rate": 9.999999999999999e-06, "loss": 0.408, "step": 62500 }, { "epoch": 5.36, "learning_rate": 9.999999999999999e-06, "loss": 0.4102, "step": 63000 }, { "epoch": 5.36, "eval_loss": 0.3805426061153412, "eval_runtime": 27.4404, "eval_samples_per_second": 18.221, "eval_steps_per_second": 0.583, "step": 63000 }, { "epoch": 5.4, "learning_rate": 9.999999999999999e-06, "loss": 0.4086, "step": 63500 }, { "epoch": 5.44, "learning_rate": 9.999999999999999e-06, "loss": 0.4087, "step": 64000 }, { "epoch": 5.44, "eval_loss": 0.37830984592437744, "eval_runtime": 14.8851, "eval_samples_per_second": 33.591, "eval_steps_per_second": 1.075, "step": 64000 }, { "epoch": 5.49, "learning_rate": 9.999999999999999e-06, "loss": 0.4081, "step": 64500 }, { "epoch": 5.53, "learning_rate": 9.999999999999999e-06, "loss": 0.4083, "step": 65000 }, { "epoch": 5.53, "eval_loss": 0.3796636164188385, "eval_runtime": 17.3567, "eval_samples_per_second": 28.807, "eval_steps_per_second": 0.922, "step": 65000 }, { "epoch": 5.57, "learning_rate": 9.999999999999999e-06, "loss": 0.4078, "step": 65500 }, { "epoch": 5.61, "learning_rate": 9.999999999999999e-06, "loss": 0.4078, "step": 66000 }, { "epoch": 5.61, "eval_loss": 0.3783106803894043, "eval_runtime": 29.6676, "eval_samples_per_second": 16.853, "eval_steps_per_second": 0.539, "step": 66000 }, { "epoch": 5.66, "learning_rate": 9.999999999999999e-06, "loss": 0.4067, "step": 66500 }, { "epoch": 5.7, "learning_rate": 9.999999999999999e-06, "loss": 0.4072, "step": 67000 }, { "epoch": 5.7, "eval_loss": 0.3780921399593353, "eval_runtime": 15.2739, "eval_samples_per_second": 32.736, "eval_steps_per_second": 1.048, "step": 67000 }, { "epoch": 5.74, "learning_rate": 9.999999999999999e-06, "loss": 0.4079, "step": 67500 }, { "epoch": 5.78, "learning_rate": 9.999999999999999e-06, "loss": 0.4057, "step": 68000 }, { "epoch": 5.78, "eval_loss": 0.37833890318870544, "eval_runtime": 17.1263, "eval_samples_per_second": 29.195, "eval_steps_per_second": 0.934, "step": 68000 }, { "epoch": 5.83, "learning_rate": 9.999999999999999e-06, "loss": 0.406, "step": 68500 }, { "epoch": 5.87, "learning_rate": 9.999999999999999e-06, "loss": 0.4065, "step": 69000 }, { "epoch": 5.87, "eval_loss": 0.37815991044044495, "eval_runtime": 19.0772, "eval_samples_per_second": 26.209, "eval_steps_per_second": 0.839, "step": 69000 }, { "epoch": 5.91, "learning_rate": 9.999999999999999e-06, "loss": 0.4063, "step": 69500 }, { "epoch": 5.95, "learning_rate": 9.999999999999999e-06, "loss": 0.4062, "step": 70000 }, { "epoch": 5.95, "eval_loss": 0.3770570158958435, "eval_runtime": 15.6266, "eval_samples_per_second": 31.997, "eval_steps_per_second": 1.024, "step": 70000 }, { "epoch": 6.0, "learning_rate": 9.999999999999999e-06, "loss": 0.4073, "step": 70500 }, { "epoch": 6.04, "learning_rate": 9.999999999999999e-06, "loss": 0.4051, "step": 71000 }, { "epoch": 6.04, "eval_loss": 0.3775251507759094, "eval_runtime": 16.0318, "eval_samples_per_second": 31.188, "eval_steps_per_second": 0.998, "step": 71000 }, { "epoch": 6.08, "learning_rate": 9.999999999999999e-06, "loss": 0.4066, "step": 71500 }, { "epoch": 6.12, "learning_rate": 9.999999999999999e-06, "loss": 0.4057, "step": 72000 }, { "epoch": 6.12, "eval_loss": 0.37701237201690674, "eval_runtime": 15.6982, "eval_samples_per_second": 31.851, "eval_steps_per_second": 1.019, "step": 72000 }, { "epoch": 6.17, "learning_rate": 9.999999999999999e-06, "loss": 0.4067, "step": 72500 }, { "epoch": 6.21, "learning_rate": 9.999999999999999e-06, "loss": 0.4061, "step": 73000 }, { "epoch": 6.21, "eval_loss": 0.37806421518325806, "eval_runtime": 15.7852, "eval_samples_per_second": 31.675, "eval_steps_per_second": 1.014, "step": 73000 }, { "epoch": 6.25, "learning_rate": 9.999999999999999e-06, "loss": 0.4053, "step": 73500 }, { "epoch": 6.29, "learning_rate": 9.999999999999999e-06, "loss": 0.405, "step": 74000 }, { "epoch": 6.29, "eval_loss": 0.3771826922893524, "eval_runtime": 15.5158, "eval_samples_per_second": 32.225, "eval_steps_per_second": 1.031, "step": 74000 }, { "epoch": 6.34, "learning_rate": 9.999999999999999e-06, "loss": 0.4064, "step": 74500 }, { "epoch": 6.38, "learning_rate": 9.999999999999999e-06, "loss": 0.4053, "step": 75000 }, { "epoch": 6.38, "eval_loss": 0.377290278673172, "eval_runtime": 23.3698, "eval_samples_per_second": 21.395, "eval_steps_per_second": 0.685, "step": 75000 }, { "epoch": 6.42, "learning_rate": 9.999999999999999e-06, "loss": 0.406, "step": 75500 }, { "epoch": 6.46, "learning_rate": 9.999999999999999e-06, "loss": 0.4054, "step": 76000 }, { "epoch": 6.46, "eval_loss": 0.3762701749801636, "eval_runtime": 15.2662, "eval_samples_per_second": 32.752, "eval_steps_per_second": 1.048, "step": 76000 }, { "epoch": 6.51, "learning_rate": 9.999999999999999e-06, "loss": 0.4047, "step": 76500 }, { "epoch": 6.55, "learning_rate": 9.999999999999999e-06, "loss": 0.4043, "step": 77000 }, { "epoch": 6.55, "eval_loss": 0.3773665130138397, "eval_runtime": 23.0339, "eval_samples_per_second": 21.707, "eval_steps_per_second": 0.695, "step": 77000 }, { "epoch": 6.59, "learning_rate": 9.999999999999999e-06, "loss": 0.4041, "step": 77500 }, { "epoch": 6.63, "learning_rate": 9.999999999999999e-06, "loss": 0.4044, "step": 78000 }, { "epoch": 6.63, "eval_loss": 0.3738757371902466, "eval_runtime": 16.5496, "eval_samples_per_second": 30.212, "eval_steps_per_second": 0.967, "step": 78000 }, { "epoch": 6.68, "learning_rate": 9.999999999999999e-06, "loss": 0.4038, "step": 78500 }, { "epoch": 6.72, "learning_rate": 9.999999999999999e-06, "loss": 0.4038, "step": 79000 }, { "epoch": 6.72, "eval_loss": 0.37452879548072815, "eval_runtime": 16.7684, "eval_samples_per_second": 29.818, "eval_steps_per_second": 0.954, "step": 79000 }, { "epoch": 6.76, "learning_rate": 9.999999999999999e-06, "loss": 0.4039, "step": 79500 }, { "epoch": 6.81, "learning_rate": 9.999999999999999e-06, "loss": 0.4045, "step": 80000 }, { "epoch": 6.81, "eval_loss": 0.3761942684650421, "eval_runtime": 16.6694, "eval_samples_per_second": 29.995, "eval_steps_per_second": 0.96, "step": 80000 }, { "epoch": 6.85, "learning_rate": 9.999999999999999e-06, "loss": 0.4036, "step": 80500 }, { "epoch": 6.89, "learning_rate": 9.999999999999999e-06, "loss": 0.4035, "step": 81000 }, { "epoch": 6.89, "eval_loss": 0.3746860921382904, "eval_runtime": 15.7109, "eval_samples_per_second": 31.825, "eval_steps_per_second": 1.018, "step": 81000 }, { "epoch": 6.93, "learning_rate": 9.999999999999999e-06, "loss": 0.4037, "step": 81500 }, { "epoch": 6.98, "learning_rate": 9.999999999999999e-06, "loss": 0.4045, "step": 82000 }, { "epoch": 6.98, "eval_loss": 0.37363681197166443, "eval_runtime": 22.9088, "eval_samples_per_second": 21.826, "eval_steps_per_second": 0.698, "step": 82000 }, { "epoch": 7.02, "learning_rate": 9.999999999999999e-06, "loss": 0.4035, "step": 82500 }, { "epoch": 7.06, "learning_rate": 9.999999999999999e-06, "loss": 0.4031, "step": 83000 }, { "epoch": 7.06, "eval_loss": 0.37529370188713074, "eval_runtime": 14.7314, "eval_samples_per_second": 33.941, "eval_steps_per_second": 1.086, "step": 83000 }, { "epoch": 7.1, "learning_rate": 9.999999999999999e-06, "loss": 0.402, "step": 83500 }, { "epoch": 7.15, "learning_rate": 9.999999999999999e-06, "loss": 0.4042, "step": 84000 }, { "epoch": 7.15, "eval_loss": 0.37475818395614624, "eval_runtime": 15.8331, "eval_samples_per_second": 31.579, "eval_steps_per_second": 1.011, "step": 84000 }, { "epoch": 7.19, "learning_rate": 9.999999999999999e-06, "loss": 0.4032, "step": 84500 }, { "epoch": 7.23, "learning_rate": 9.999999999999999e-06, "loss": 0.4029, "step": 85000 }, { "epoch": 7.23, "eval_loss": 0.3748987317085266, "eval_runtime": 17.2956, "eval_samples_per_second": 28.909, "eval_steps_per_second": 0.925, "step": 85000 }, { "epoch": 7.27, "learning_rate": 9.999999999999999e-06, "loss": 0.4034, "step": 85500 }, { "epoch": 7.32, "learning_rate": 9.999999999999999e-06, "loss": 0.4029, "step": 86000 }, { "epoch": 7.32, "eval_loss": 0.37344664335250854, "eval_runtime": 15.9881, "eval_samples_per_second": 31.273, "eval_steps_per_second": 1.001, "step": 86000 }, { "epoch": 7.36, "learning_rate": 9.999999999999999e-06, "loss": 0.4043, "step": 86500 }, { "epoch": 7.4, "learning_rate": 9.999999999999999e-06, "loss": 0.4019, "step": 87000 }, { "epoch": 7.4, "eval_loss": 0.3718353509902954, "eval_runtime": 15.2483, "eval_samples_per_second": 32.791, "eval_steps_per_second": 1.049, "step": 87000 }, { "epoch": 7.44, "learning_rate": 9.999999999999999e-06, "loss": 0.403, "step": 87500 }, { "epoch": 7.49, "learning_rate": 9.999999999999999e-06, "loss": 0.4023, "step": 88000 }, { "epoch": 7.49, "eval_loss": 0.37316328287124634, "eval_runtime": 26.8261, "eval_samples_per_second": 18.639, "eval_steps_per_second": 0.596, "step": 88000 }, { "epoch": 7.53, "learning_rate": 9.999999999999999e-06, "loss": 0.402, "step": 88500 }, { "epoch": 7.57, "learning_rate": 9.999999999999999e-06, "loss": 0.4022, "step": 89000 }, { "epoch": 7.57, "eval_loss": 0.3714210093021393, "eval_runtime": 15.634, "eval_samples_per_second": 31.982, "eval_steps_per_second": 1.023, "step": 89000 }, { "epoch": 7.61, "learning_rate": 9.999999999999999e-06, "loss": 0.4017, "step": 89500 }, { "epoch": 7.66, "learning_rate": 9.999999999999999e-06, "loss": 0.4019, "step": 90000 }, { "epoch": 7.66, "eval_loss": 0.3728122413158417, "eval_runtime": 16.3123, "eval_samples_per_second": 30.652, "eval_steps_per_second": 0.981, "step": 90000 }, { "epoch": 7.7, "learning_rate": 9.999999999999999e-06, "loss": 0.4017, "step": 90500 }, { "epoch": 7.74, "learning_rate": 9.999999999999999e-06, "loss": 0.4016, "step": 91000 }, { "epoch": 7.74, "eval_loss": 0.3734327256679535, "eval_runtime": 17.5409, "eval_samples_per_second": 28.505, "eval_steps_per_second": 0.912, "step": 91000 }, { "epoch": 7.78, "learning_rate": 9.999999999999999e-06, "loss": 0.3998, "step": 91500 }, { "epoch": 7.83, "learning_rate": 9.999999999999999e-06, "loss": 0.4006, "step": 92000 }, { "epoch": 7.83, "eval_loss": 0.3747243583202362, "eval_runtime": 17.4755, "eval_samples_per_second": 28.611, "eval_steps_per_second": 0.916, "step": 92000 }, { "epoch": 7.87, "learning_rate": 9.999999999999999e-06, "loss": 0.4013, "step": 92500 }, { "epoch": 7.91, "learning_rate": 9.999999999999999e-06, "loss": 0.4008, "step": 93000 }, { "epoch": 7.91, "eval_loss": 0.37303251028060913, "eval_runtime": 17.0856, "eval_samples_per_second": 29.264, "eval_steps_per_second": 0.936, "step": 93000 }, { "epoch": 7.95, "learning_rate": 9.999999999999999e-06, "loss": 0.4008, "step": 93500 }, { "epoch": 8.0, "learning_rate": 9.999999999999999e-06, "loss": 0.402, "step": 94000 }, { "epoch": 8.0, "eval_loss": 0.37281692028045654, "eval_runtime": 17.9894, "eval_samples_per_second": 27.794, "eval_steps_per_second": 0.889, "step": 94000 }, { "epoch": 8.04, "learning_rate": 9.999999999999999e-06, "loss": 0.4005, "step": 94500 }, { "epoch": 8.08, "learning_rate": 9.999999999999999e-06, "loss": 0.4008, "step": 95000 }, { "epoch": 8.08, "eval_loss": 0.37092164158821106, "eval_runtime": 17.2285, "eval_samples_per_second": 29.022, "eval_steps_per_second": 0.929, "step": 95000 }, { "epoch": 8.12, "learning_rate": 9.999999999999999e-06, "loss": 0.3997, "step": 95500 }, { "epoch": 8.17, "learning_rate": 9.999999999999999e-06, "loss": 0.4024, "step": 96000 }, { "epoch": 8.17, "eval_loss": 0.37120264768600464, "eval_runtime": 16.2125, "eval_samples_per_second": 30.84, "eval_steps_per_second": 0.987, "step": 96000 }, { "epoch": 8.21, "learning_rate": 9.999999999999999e-06, "loss": 0.3997, "step": 96500 }, { "epoch": 8.25, "learning_rate": 9.999999999999999e-06, "loss": 0.402, "step": 97000 }, { "epoch": 8.25, "eval_loss": 0.37261128425598145, "eval_runtime": 16.3463, "eval_samples_per_second": 30.588, "eval_steps_per_second": 0.979, "step": 97000 }, { "epoch": 8.29, "learning_rate": 9.999999999999999e-06, "loss": 0.4004, "step": 97500 }, { "epoch": 8.34, "learning_rate": 9.999999999999999e-06, "loss": 0.4003, "step": 98000 }, { "epoch": 8.34, "eval_loss": 0.37027257680892944, "eval_runtime": 20.0807, "eval_samples_per_second": 24.9, "eval_steps_per_second": 0.797, "step": 98000 }, { "epoch": 8.38, "learning_rate": 9.999999999999999e-06, "loss": 0.4002, "step": 98500 }, { "epoch": 8.42, "learning_rate": 9.999999999999999e-06, "loss": 0.4007, "step": 99000 }, { "epoch": 8.42, "eval_loss": 0.37140411138534546, "eval_runtime": 21.4309, "eval_samples_per_second": 23.331, "eval_steps_per_second": 0.747, "step": 99000 }, { "epoch": 8.46, "learning_rate": 9.999999999999999e-06, "loss": 0.4, "step": 99500 }, { "epoch": 8.51, "learning_rate": 9.999999999999999e-06, "loss": 0.3997, "step": 100000 }, { "epoch": 8.51, "eval_loss": 0.3693406283855438, "eval_runtime": 17.2208, "eval_samples_per_second": 29.035, "eval_steps_per_second": 0.929, "step": 100000 } ], "max_steps": 1000000, "num_train_epochs": 86, "total_flos": 4.600200440697905e+21, "trial_name": null, "trial_params": null }