{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.8,
"eval_steps": 500,
"global_step": 19000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02,
"grad_norm": 5.5528106689453125,
"learning_rate": 4.000000000000001e-06,
"loss": 1.8395,
"step": 100
},
{
"epoch": 0.04,
"grad_norm": 5.654870986938477,
"learning_rate": 8.000000000000001e-06,
"loss": 1.6814,
"step": 200
},
{
"epoch": 0.06,
"grad_norm": 4.252018928527832,
"learning_rate": 1.2e-05,
"loss": 1.6776,
"step": 300
},
{
"epoch": 0.08,
"grad_norm": 4.398709774017334,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.6404,
"step": 400
},
{
"epoch": 0.1,
"grad_norm": 4.0121259689331055,
"learning_rate": 2e-05,
"loss": 1.6651,
"step": 500
},
{
"epoch": 0.1,
"eval_loss": 1.6371649503707886,
"eval_runtime": 32.351,
"eval_samples_per_second": 30.911,
"eval_steps_per_second": 7.728,
"step": 500
},
{
"epoch": 0.12,
"grad_norm": 3.420888900756836,
"learning_rate": 1.9932203389830512e-05,
"loss": 1.6651,
"step": 600
},
{
"epoch": 0.14,
"grad_norm": 2.4271743297576904,
"learning_rate": 1.986440677966102e-05,
"loss": 1.6271,
"step": 700
},
{
"epoch": 0.16,
"grad_norm": 5.406766891479492,
"learning_rate": 1.9796610169491527e-05,
"loss": 1.6201,
"step": 800
},
{
"epoch": 0.18,
"grad_norm": 3.1450321674346924,
"learning_rate": 1.9728813559322034e-05,
"loss": 1.6106,
"step": 900
},
{
"epoch": 0.2,
"grad_norm": 4.066416263580322,
"learning_rate": 1.9661016949152545e-05,
"loss": 1.5944,
"step": 1000
},
{
"epoch": 0.2,
"eval_loss": 1.6080243587493896,
"eval_runtime": 32.2909,
"eval_samples_per_second": 30.968,
"eval_steps_per_second": 7.742,
"step": 1000
},
{
"epoch": 0.22,
"grad_norm": 3.158970355987549,
"learning_rate": 1.9593220338983052e-05,
"loss": 1.5668,
"step": 1100
},
{
"epoch": 0.24,
"grad_norm": 4.5038371086120605,
"learning_rate": 1.9525423728813562e-05,
"loss": 1.5673,
"step": 1200
},
{
"epoch": 0.26,
"grad_norm": 3.8768396377563477,
"learning_rate": 1.945762711864407e-05,
"loss": 1.5599,
"step": 1300
},
{
"epoch": 0.28,
"grad_norm": 3.831594228744507,
"learning_rate": 1.938983050847458e-05,
"loss": 1.5649,
"step": 1400
},
{
"epoch": 0.3,
"grad_norm": 3.2771167755126953,
"learning_rate": 1.9322033898305087e-05,
"loss": 1.5328,
"step": 1500
},
{
"epoch": 0.3,
"eval_loss": 1.532382845878601,
"eval_runtime": 32.3114,
"eval_samples_per_second": 30.949,
"eval_steps_per_second": 7.737,
"step": 1500
},
{
"epoch": 0.32,
"grad_norm": 3.2504122257232666,
"learning_rate": 1.9254237288135595e-05,
"loss": 1.5412,
"step": 1600
},
{
"epoch": 0.34,
"grad_norm": 4.105425834655762,
"learning_rate": 1.9186440677966102e-05,
"loss": 1.5223,
"step": 1700
},
{
"epoch": 0.36,
"grad_norm": 3.0855212211608887,
"learning_rate": 1.9118644067796613e-05,
"loss": 1.5138,
"step": 1800
},
{
"epoch": 0.38,
"grad_norm": 3.650761604309082,
"learning_rate": 1.905084745762712e-05,
"loss": 1.5128,
"step": 1900
},
{
"epoch": 0.4,
"grad_norm": 2.0812368392944336,
"learning_rate": 1.898305084745763e-05,
"loss": 1.5115,
"step": 2000
},
{
"epoch": 0.4,
"eval_loss": 1.511965036392212,
"eval_runtime": 32.3915,
"eval_samples_per_second": 30.872,
"eval_steps_per_second": 7.718,
"step": 2000
},
{
"epoch": 0.42,
"grad_norm": 3.7894039154052734,
"learning_rate": 1.8915254237288138e-05,
"loss": 1.5003,
"step": 2100
},
{
"epoch": 0.44,
"grad_norm": 2.276301145553589,
"learning_rate": 1.8847457627118645e-05,
"loss": 1.4952,
"step": 2200
},
{
"epoch": 0.46,
"grad_norm": 3.70339035987854,
"learning_rate": 1.8779661016949152e-05,
"loss": 1.495,
"step": 2300
},
{
"epoch": 0.48,
"grad_norm": 2.6344492435455322,
"learning_rate": 1.8711864406779663e-05,
"loss": 1.4835,
"step": 2400
},
{
"epoch": 0.5,
"grad_norm": 3.2948801517486572,
"learning_rate": 1.8644745762711865e-05,
"loss": 1.474,
"step": 2500
},
{
"epoch": 0.5,
"eval_loss": 1.5233224630355835,
"eval_runtime": 32.3386,
"eval_samples_per_second": 30.923,
"eval_steps_per_second": 7.731,
"step": 2500
},
{
"epoch": 0.52,
"grad_norm": 2.458732843399048,
"learning_rate": 1.857762711864407e-05,
"loss": 1.4994,
"step": 2600
},
{
"epoch": 0.54,
"grad_norm": 2.2232306003570557,
"learning_rate": 1.850983050847458e-05,
"loss": 1.4879,
"step": 2700
},
{
"epoch": 0.56,
"grad_norm": 3.499060869216919,
"learning_rate": 1.8442033898305086e-05,
"loss": 1.4648,
"step": 2800
},
{
"epoch": 0.58,
"grad_norm": 3.17518949508667,
"learning_rate": 1.8374237288135593e-05,
"loss": 1.4717,
"step": 2900
},
{
"epoch": 0.6,
"grad_norm": 4.437788009643555,
"learning_rate": 1.8306440677966104e-05,
"loss": 1.4478,
"step": 3000
},
{
"epoch": 0.6,
"eval_loss": 1.4587255716323853,
"eval_runtime": 32.3711,
"eval_samples_per_second": 30.892,
"eval_steps_per_second": 7.723,
"step": 3000
},
{
"epoch": 0.62,
"grad_norm": 3.0833561420440674,
"learning_rate": 1.823864406779661e-05,
"loss": 1.4441,
"step": 3100
},
{
"epoch": 0.64,
"grad_norm": 2.600447416305542,
"learning_rate": 1.817084745762712e-05,
"loss": 1.4415,
"step": 3200
},
{
"epoch": 0.66,
"grad_norm": 3.669921636581421,
"learning_rate": 1.810305084745763e-05,
"loss": 1.4458,
"step": 3300
},
{
"epoch": 0.68,
"grad_norm": 3.342150926589966,
"learning_rate": 1.803525423728814e-05,
"loss": 1.4621,
"step": 3400
},
{
"epoch": 0.7,
"grad_norm": 4.060861110687256,
"learning_rate": 1.7967457627118647e-05,
"loss": 1.4491,
"step": 3500
},
{
"epoch": 0.7,
"eval_loss": 1.4404387474060059,
"eval_runtime": 32.3788,
"eval_samples_per_second": 30.884,
"eval_steps_per_second": 7.721,
"step": 3500
},
{
"epoch": 0.72,
"grad_norm": 3.0154595375061035,
"learning_rate": 1.7899661016949154e-05,
"loss": 1.4568,
"step": 3600
},
{
"epoch": 0.74,
"grad_norm": 2.4156243801116943,
"learning_rate": 1.783186440677966e-05,
"loss": 1.4254,
"step": 3700
},
{
"epoch": 0.76,
"grad_norm": 3.0124893188476562,
"learning_rate": 1.776406779661017e-05,
"loss": 1.4249,
"step": 3800
},
{
"epoch": 0.78,
"grad_norm": 3.8340814113616943,
"learning_rate": 1.769627118644068e-05,
"loss": 1.4386,
"step": 3900
},
{
"epoch": 0.8,
"grad_norm": 4.016916275024414,
"learning_rate": 1.762847457627119e-05,
"loss": 1.4098,
"step": 4000
},
{
"epoch": 0.8,
"eval_loss": 1.4542571306228638,
"eval_runtime": 32.2948,
"eval_samples_per_second": 30.965,
"eval_steps_per_second": 7.741,
"step": 4000
},
{
"epoch": 0.82,
"grad_norm": 4.036525249481201,
"learning_rate": 1.7560677966101697e-05,
"loss": 1.4232,
"step": 4100
},
{
"epoch": 0.84,
"grad_norm": 2.700068950653076,
"learning_rate": 1.74935593220339e-05,
"loss": 1.4081,
"step": 4200
},
{
"epoch": 0.86,
"grad_norm": 3.3095715045928955,
"learning_rate": 1.742576271186441e-05,
"loss": 1.4065,
"step": 4300
},
{
"epoch": 0.88,
"grad_norm": 2.9029970169067383,
"learning_rate": 1.7357966101694917e-05,
"loss": 1.4157,
"step": 4400
},
{
"epoch": 0.9,
"grad_norm": 3.567429542541504,
"learning_rate": 1.7290169491525424e-05,
"loss": 1.3841,
"step": 4500
},
{
"epoch": 0.9,
"eval_loss": 1.406548023223877,
"eval_runtime": 32.2949,
"eval_samples_per_second": 30.965,
"eval_steps_per_second": 7.741,
"step": 4500
},
{
"epoch": 0.92,
"grad_norm": 3.4792306423187256,
"learning_rate": 1.722237288135593e-05,
"loss": 1.393,
"step": 4600
},
{
"epoch": 0.94,
"grad_norm": 2.3991451263427734,
"learning_rate": 1.7154576271186442e-05,
"loss": 1.4066,
"step": 4700
},
{
"epoch": 0.96,
"grad_norm": 2.603165626525879,
"learning_rate": 1.708677966101695e-05,
"loss": 1.4169,
"step": 4800
},
{
"epoch": 0.98,
"grad_norm": 2.465501070022583,
"learning_rate": 1.701898305084746e-05,
"loss": 1.3909,
"step": 4900
},
{
"epoch": 1.0,
"grad_norm": 3.7463817596435547,
"learning_rate": 1.6951186440677967e-05,
"loss": 1.3661,
"step": 5000
},
{
"epoch": 1.0,
"eval_loss": 1.3650578260421753,
"eval_runtime": 32.2718,
"eval_samples_per_second": 30.987,
"eval_steps_per_second": 7.747,
"step": 5000
},
{
"epoch": 1.02,
"grad_norm": 2.694695472717285,
"learning_rate": 1.6883389830508478e-05,
"loss": 1.3752,
"step": 5100
},
{
"epoch": 1.04,
"grad_norm": 2.7569658756256104,
"learning_rate": 1.6815593220338985e-05,
"loss": 1.3567,
"step": 5200
},
{
"epoch": 1.06,
"grad_norm": 2.8121705055236816,
"learning_rate": 1.6747796610169492e-05,
"loss": 1.3727,
"step": 5300
},
{
"epoch": 1.08,
"grad_norm": 3.058004140853882,
"learning_rate": 1.668e-05,
"loss": 1.359,
"step": 5400
},
{
"epoch": 1.1,
"grad_norm": 4.126440525054932,
"learning_rate": 1.661220338983051e-05,
"loss": 1.3795,
"step": 5500
},
{
"epoch": 1.1,
"eval_loss": 1.4012497663497925,
"eval_runtime": 32.2512,
"eval_samples_per_second": 31.007,
"eval_steps_per_second": 7.752,
"step": 5500
},
{
"epoch": 1.12,
"grad_norm": 3.2426562309265137,
"learning_rate": 1.6544406779661017e-05,
"loss": 1.3641,
"step": 5600
},
{
"epoch": 1.14,
"grad_norm": 2.7895913124084473,
"learning_rate": 1.6476610169491528e-05,
"loss": 1.3548,
"step": 5700
},
{
"epoch": 1.16,
"grad_norm": 3.1663429737091064,
"learning_rate": 1.6408813559322035e-05,
"loss": 1.3569,
"step": 5800
},
{
"epoch": 1.18,
"grad_norm": 2.3783955574035645,
"learning_rate": 1.6341016949152542e-05,
"loss": 1.34,
"step": 5900
},
{
"epoch": 1.2,
"grad_norm": 2.695949077606201,
"learning_rate": 1.627322033898305e-05,
"loss": 1.3356,
"step": 6000
},
{
"epoch": 1.2,
"eval_loss": 1.3493778705596924,
"eval_runtime": 32.2453,
"eval_samples_per_second": 31.012,
"eval_steps_per_second": 7.753,
"step": 6000
},
{
"epoch": 1.22,
"grad_norm": 3.2092180252075195,
"learning_rate": 1.620542372881356e-05,
"loss": 1.3288,
"step": 6100
},
{
"epoch": 1.24,
"grad_norm": 2.4168381690979004,
"learning_rate": 1.6137627118644068e-05,
"loss": 1.3388,
"step": 6200
},
{
"epoch": 1.26,
"grad_norm": 3.560577154159546,
"learning_rate": 1.6069830508474578e-05,
"loss": 1.3515,
"step": 6300
},
{
"epoch": 1.28,
"grad_norm": 2.8059277534484863,
"learning_rate": 1.600203389830509e-05,
"loss": 1.3383,
"step": 6400
},
{
"epoch": 1.3,
"grad_norm": 3.603806495666504,
"learning_rate": 1.5934237288135596e-05,
"loss": 1.3303,
"step": 6500
},
{
"epoch": 1.3,
"eval_loss": 1.3483374118804932,
"eval_runtime": 32.2455,
"eval_samples_per_second": 31.012,
"eval_steps_per_second": 7.753,
"step": 6500
},
{
"epoch": 1.32,
"grad_norm": 2.532747268676758,
"learning_rate": 1.5866440677966103e-05,
"loss": 1.3303,
"step": 6600
},
{
"epoch": 1.34,
"grad_norm": 3.174362897872925,
"learning_rate": 1.579864406779661e-05,
"loss": 1.3436,
"step": 6700
},
{
"epoch": 1.36,
"grad_norm": 2.64054799079895,
"learning_rate": 1.573084745762712e-05,
"loss": 1.3435,
"step": 6800
},
{
"epoch": 1.38,
"grad_norm": 2.959552526473999,
"learning_rate": 1.5663050847457628e-05,
"loss": 1.3477,
"step": 6900
},
{
"epoch": 1.4,
"grad_norm": 3.166142225265503,
"learning_rate": 1.559525423728814e-05,
"loss": 1.3402,
"step": 7000
},
{
"epoch": 1.4,
"eval_loss": 1.357351303100586,
"eval_runtime": 32.3105,
"eval_samples_per_second": 30.95,
"eval_steps_per_second": 7.737,
"step": 7000
},
{
"epoch": 1.42,
"grad_norm": 4.034038543701172,
"learning_rate": 1.5527457627118646e-05,
"loss": 1.3141,
"step": 7100
},
{
"epoch": 1.44,
"grad_norm": 2.478321075439453,
"learning_rate": 1.5459661016949153e-05,
"loss": 1.3138,
"step": 7200
},
{
"epoch": 1.46,
"grad_norm": 3.06643009185791,
"learning_rate": 1.539186440677966e-05,
"loss": 1.3212,
"step": 7300
},
{
"epoch": 1.48,
"grad_norm": 2.68947434425354,
"learning_rate": 1.532406779661017e-05,
"loss": 1.3197,
"step": 7400
},
{
"epoch": 1.5,
"grad_norm": 2.618062973022461,
"learning_rate": 1.5256271186440678e-05,
"loss": 1.2973,
"step": 7500
},
{
"epoch": 1.5,
"eval_loss": 1.343194842338562,
"eval_runtime": 32.349,
"eval_samples_per_second": 30.913,
"eval_steps_per_second": 7.728,
"step": 7500
},
{
"epoch": 1.52,
"grad_norm": 2.5092427730560303,
"learning_rate": 1.5188474576271189e-05,
"loss": 1.3291,
"step": 7600
},
{
"epoch": 1.54,
"grad_norm": 2.6408796310424805,
"learning_rate": 1.5120677966101696e-05,
"loss": 1.3014,
"step": 7700
},
{
"epoch": 1.56,
"grad_norm": 3.846283197402954,
"learning_rate": 1.5052881355932205e-05,
"loss": 1.2986,
"step": 7800
},
{
"epoch": 1.58,
"grad_norm": 2.641146659851074,
"learning_rate": 1.4985084745762712e-05,
"loss": 1.2978,
"step": 7900
},
{
"epoch": 1.6,
"grad_norm": 2.446991443634033,
"learning_rate": 1.4917288135593221e-05,
"loss": 1.3095,
"step": 8000
},
{
"epoch": 1.6,
"eval_loss": 1.2667104005813599,
"eval_runtime": 32.376,
"eval_samples_per_second": 30.887,
"eval_steps_per_second": 7.722,
"step": 8000
},
{
"epoch": 1.62,
"grad_norm": 3.4024269580841064,
"learning_rate": 1.4849491525423729e-05,
"loss": 1.2927,
"step": 8100
},
{
"epoch": 1.64,
"grad_norm": 2.4591922760009766,
"learning_rate": 1.478169491525424e-05,
"loss": 1.2934,
"step": 8200
},
{
"epoch": 1.66,
"grad_norm": 3.165149450302124,
"learning_rate": 1.4714576271186442e-05,
"loss": 1.3119,
"step": 8300
},
{
"epoch": 1.68,
"grad_norm": 3.2600440979003906,
"learning_rate": 1.464677966101695e-05,
"loss": 1.2825,
"step": 8400
},
{
"epoch": 1.7,
"grad_norm": 4.034482479095459,
"learning_rate": 1.457898305084746e-05,
"loss": 1.2913,
"step": 8500
},
{
"epoch": 1.7,
"eval_loss": 1.276153326034546,
"eval_runtime": 32.3382,
"eval_samples_per_second": 30.923,
"eval_steps_per_second": 7.731,
"step": 8500
},
{
"epoch": 1.72,
"grad_norm": 4.266259670257568,
"learning_rate": 1.4511186440677967e-05,
"loss": 1.3002,
"step": 8600
},
{
"epoch": 1.74,
"grad_norm": 3.357360601425171,
"learning_rate": 1.4443389830508476e-05,
"loss": 1.2685,
"step": 8700
},
{
"epoch": 1.76,
"grad_norm": 3.463027000427246,
"learning_rate": 1.4375593220338983e-05,
"loss": 1.2937,
"step": 8800
},
{
"epoch": 1.78,
"grad_norm": 2.545639991760254,
"learning_rate": 1.4307796610169494e-05,
"loss": 1.2895,
"step": 8900
},
{
"epoch": 1.8,
"grad_norm": 3.091081142425537,
"learning_rate": 1.4240000000000001e-05,
"loss": 1.2932,
"step": 9000
},
{
"epoch": 1.8,
"eval_loss": 1.2490341663360596,
"eval_runtime": 32.3654,
"eval_samples_per_second": 30.897,
"eval_steps_per_second": 7.724,
"step": 9000
},
{
"epoch": 1.82,
"grad_norm": 2.9936749935150146,
"learning_rate": 1.417220338983051e-05,
"loss": 1.2867,
"step": 9100
},
{
"epoch": 1.84,
"grad_norm": 2.4961957931518555,
"learning_rate": 1.4104406779661017e-05,
"loss": 1.2899,
"step": 9200
},
{
"epoch": 1.86,
"grad_norm": 2.526224136352539,
"learning_rate": 1.4036610169491528e-05,
"loss": 1.2855,
"step": 9300
},
{
"epoch": 1.88,
"grad_norm": 3.532458543777466,
"learning_rate": 1.3968813559322035e-05,
"loss": 1.2566,
"step": 9400
},
{
"epoch": 1.9,
"grad_norm": 3.7112512588500977,
"learning_rate": 1.3901016949152544e-05,
"loss": 1.2645,
"step": 9500
},
{
"epoch": 1.9,
"eval_loss": 1.2843515872955322,
"eval_runtime": 32.3071,
"eval_samples_per_second": 30.953,
"eval_steps_per_second": 7.738,
"step": 9500
},
{
"epoch": 1.92,
"grad_norm": 2.6405515670776367,
"learning_rate": 1.3833220338983051e-05,
"loss": 1.2673,
"step": 9600
},
{
"epoch": 1.94,
"grad_norm": 3.5349998474121094,
"learning_rate": 1.376542372881356e-05,
"loss": 1.2811,
"step": 9700
},
{
"epoch": 1.96,
"grad_norm": 3.587463140487671,
"learning_rate": 1.3697627118644067e-05,
"loss": 1.2501,
"step": 9800
},
{
"epoch": 1.98,
"grad_norm": 2.2374660968780518,
"learning_rate": 1.3629830508474578e-05,
"loss": 1.2726,
"step": 9900
},
{
"epoch": 2.0,
"grad_norm": 2.9396588802337646,
"learning_rate": 1.3562033898305085e-05,
"loss": 1.2564,
"step": 10000
},
{
"epoch": 2.0,
"eval_loss": 1.3057665824890137,
"eval_runtime": 32.3132,
"eval_samples_per_second": 30.947,
"eval_steps_per_second": 7.737,
"step": 10000
},
{
"epoch": 2.02,
"grad_norm": 2.5827131271362305,
"learning_rate": 1.3494237288135594e-05,
"loss": 1.2423,
"step": 10100
},
{
"epoch": 2.04,
"grad_norm": 3.370476722717285,
"learning_rate": 1.3426440677966105e-05,
"loss": 1.2493,
"step": 10200
},
{
"epoch": 2.06,
"grad_norm": 3.8779571056365967,
"learning_rate": 1.3358644067796612e-05,
"loss": 1.2331,
"step": 10300
},
{
"epoch": 2.08,
"grad_norm": 2.460205078125,
"learning_rate": 1.329084745762712e-05,
"loss": 1.2425,
"step": 10400
},
{
"epoch": 2.1,
"grad_norm": 3.0094106197357178,
"learning_rate": 1.3223050847457628e-05,
"loss": 1.2036,
"step": 10500
},
{
"epoch": 2.1,
"eval_loss": 1.2363600730895996,
"eval_runtime": 32.2863,
"eval_samples_per_second": 30.973,
"eval_steps_per_second": 7.743,
"step": 10500
},
{
"epoch": 2.12,
"grad_norm": 3.705883741378784,
"learning_rate": 1.3155254237288137e-05,
"loss": 1.221,
"step": 10600
},
{
"epoch": 2.14,
"grad_norm": 4.502602577209473,
"learning_rate": 1.3087457627118644e-05,
"loss": 1.2481,
"step": 10700
},
{
"epoch": 2.16,
"grad_norm": 3.3677573204040527,
"learning_rate": 1.3019661016949155e-05,
"loss": 1.2156,
"step": 10800
},
{
"epoch": 2.18,
"grad_norm": 4.022857666015625,
"learning_rate": 1.2951864406779662e-05,
"loss": 1.2154,
"step": 10900
},
{
"epoch": 2.2,
"grad_norm": 3.1049187183380127,
"learning_rate": 1.2884067796610171e-05,
"loss": 1.2385,
"step": 11000
},
{
"epoch": 2.2,
"eval_loss": 1.284387469291687,
"eval_runtime": 32.2655,
"eval_samples_per_second": 30.993,
"eval_steps_per_second": 7.748,
"step": 11000
},
{
"epoch": 2.22,
"grad_norm": 2.980409622192383,
"learning_rate": 1.2816271186440678e-05,
"loss": 1.2451,
"step": 11100
},
{
"epoch": 2.24,
"grad_norm": 3.34755539894104,
"learning_rate": 1.2748474576271189e-05,
"loss": 1.2361,
"step": 11200
},
{
"epoch": 2.26,
"grad_norm": 2.9254653453826904,
"learning_rate": 1.2680677966101696e-05,
"loss": 1.2031,
"step": 11300
},
{
"epoch": 2.28,
"grad_norm": 4.1410698890686035,
"learning_rate": 1.2612881355932205e-05,
"loss": 1.2119,
"step": 11400
},
{
"epoch": 2.3,
"grad_norm": 3.1164631843566895,
"learning_rate": 1.2545084745762712e-05,
"loss": 1.2355,
"step": 11500
},
{
"epoch": 2.3,
"eval_loss": 1.2927731275558472,
"eval_runtime": 32.2864,
"eval_samples_per_second": 30.973,
"eval_steps_per_second": 7.743,
"step": 11500
},
{
"epoch": 2.32,
"grad_norm": 3.6003384590148926,
"learning_rate": 1.2477288135593221e-05,
"loss": 1.2143,
"step": 11600
},
{
"epoch": 2.34,
"grad_norm": 3.513211488723755,
"learning_rate": 1.240949152542373e-05,
"loss": 1.2193,
"step": 11700
},
{
"epoch": 2.36,
"grad_norm": 4.301449298858643,
"learning_rate": 1.2341694915254239e-05,
"loss": 1.2109,
"step": 11800
},
{
"epoch": 2.38,
"grad_norm": 2.6304187774658203,
"learning_rate": 1.2273898305084746e-05,
"loss": 1.2171,
"step": 11900
},
{
"epoch": 2.4,
"grad_norm": 3.6256394386291504,
"learning_rate": 1.220677966101695e-05,
"loss": 1.2273,
"step": 12000
},
{
"epoch": 2.4,
"eval_loss": 1.2177479267120361,
"eval_runtime": 32.2863,
"eval_samples_per_second": 30.973,
"eval_steps_per_second": 7.743,
"step": 12000
},
{
"epoch": 2.42,
"grad_norm": 3.3076181411743164,
"learning_rate": 1.213898305084746e-05,
"loss": 1.2202,
"step": 12100
},
{
"epoch": 2.44,
"grad_norm": 3.64410400390625,
"learning_rate": 1.2071864406779664e-05,
"loss": 1.1953,
"step": 12200
},
{
"epoch": 2.46,
"grad_norm": 3.324385643005371,
"learning_rate": 1.200406779661017e-05,
"loss": 1.2154,
"step": 12300
},
{
"epoch": 2.48,
"grad_norm": 4.0625176429748535,
"learning_rate": 1.193627118644068e-05,
"loss": 1.2229,
"step": 12400
},
{
"epoch": 2.5,
"grad_norm": 2.672346830368042,
"learning_rate": 1.1868474576271187e-05,
"loss": 1.214,
"step": 12500
},
{
"epoch": 2.5,
"eval_loss": 1.2213943004608154,
"eval_runtime": 32.2947,
"eval_samples_per_second": 30.965,
"eval_steps_per_second": 7.741,
"step": 12500
},
{
"epoch": 2.52,
"grad_norm": 3.8866512775421143,
"learning_rate": 1.1800677966101698e-05,
"loss": 1.1915,
"step": 12600
},
{
"epoch": 2.54,
"grad_norm": 3.414454460144043,
"learning_rate": 1.1732881355932205e-05,
"loss": 1.1973,
"step": 12700
},
{
"epoch": 2.56,
"grad_norm": 2.820164442062378,
"learning_rate": 1.1665084745762714e-05,
"loss": 1.1943,
"step": 12800
},
{
"epoch": 2.58,
"grad_norm": 3.2248144149780273,
"learning_rate": 1.1597288135593221e-05,
"loss": 1.2034,
"step": 12900
},
{
"epoch": 2.6,
"grad_norm": 2.916104793548584,
"learning_rate": 1.1530169491525425e-05,
"loss": 1.2177,
"step": 13000
},
{
"epoch": 2.6,
"eval_loss": 1.2506352663040161,
"eval_runtime": 32.3181,
"eval_samples_per_second": 30.942,
"eval_steps_per_second": 7.736,
"step": 13000
},
{
"epoch": 2.62,
"grad_norm": 2.8287951946258545,
"learning_rate": 1.1462372881355932e-05,
"loss": 1.1992,
"step": 13100
},
{
"epoch": 2.64,
"grad_norm": 3.4299583435058594,
"learning_rate": 1.1394576271186441e-05,
"loss": 1.2133,
"step": 13200
},
{
"epoch": 2.66,
"grad_norm": 2.9985878467559814,
"learning_rate": 1.132677966101695e-05,
"loss": 1.1709,
"step": 13300
},
{
"epoch": 2.68,
"grad_norm": 3.1843700408935547,
"learning_rate": 1.125898305084746e-05,
"loss": 1.2063,
"step": 13400
},
{
"epoch": 2.7,
"grad_norm": 3.3279943466186523,
"learning_rate": 1.1191186440677968e-05,
"loss": 1.1935,
"step": 13500
},
{
"epoch": 2.7,
"eval_loss": 1.2275168895721436,
"eval_runtime": 32.3349,
"eval_samples_per_second": 30.926,
"eval_steps_per_second": 7.732,
"step": 13500
},
{
"epoch": 2.72,
"grad_norm": 3.5022013187408447,
"learning_rate": 1.1123389830508475e-05,
"loss": 1.2001,
"step": 13600
},
{
"epoch": 2.74,
"grad_norm": 4.141532897949219,
"learning_rate": 1.1055593220338984e-05,
"loss": 1.1882,
"step": 13700
},
{
"epoch": 2.76,
"grad_norm": 3.668473958969116,
"learning_rate": 1.0987796610169492e-05,
"loss": 1.2004,
"step": 13800
},
{
"epoch": 2.78,
"grad_norm": 2.9693169593811035,
"learning_rate": 1.0920000000000002e-05,
"loss": 1.1908,
"step": 13900
},
{
"epoch": 2.8,
"grad_norm": 2.173802137374878,
"learning_rate": 1.085220338983051e-05,
"loss": 1.1891,
"step": 14000
},
{
"epoch": 2.8,
"eval_loss": 1.2043945789337158,
"eval_runtime": 32.3786,
"eval_samples_per_second": 30.885,
"eval_steps_per_second": 7.721,
"step": 14000
},
{
"epoch": 2.82,
"grad_norm": 2.77329158782959,
"learning_rate": 1.0784406779661018e-05,
"loss": 1.1952,
"step": 14100
},
{
"epoch": 2.84,
"grad_norm": 2.3273532390594482,
"learning_rate": 1.0716610169491526e-05,
"loss": 1.1894,
"step": 14200
},
{
"epoch": 2.86,
"grad_norm": 2.993412971496582,
"learning_rate": 1.0648813559322036e-05,
"loss": 1.168,
"step": 14300
},
{
"epoch": 2.88,
"grad_norm": 3.5041513442993164,
"learning_rate": 1.0581016949152543e-05,
"loss": 1.1891,
"step": 14400
},
{
"epoch": 2.9,
"grad_norm": 2.8928792476654053,
"learning_rate": 1.0513220338983052e-05,
"loss": 1.1771,
"step": 14500
},
{
"epoch": 2.9,
"eval_loss": 1.2012468576431274,
"eval_runtime": 32.3112,
"eval_samples_per_second": 30.949,
"eval_steps_per_second": 7.737,
"step": 14500
},
{
"epoch": 2.92,
"grad_norm": 2.976024627685547,
"learning_rate": 1.044542372881356e-05,
"loss": 1.1836,
"step": 14600
},
{
"epoch": 2.94,
"grad_norm": 3.1303913593292236,
"learning_rate": 1.0377627118644068e-05,
"loss": 1.1851,
"step": 14700
},
{
"epoch": 2.96,
"grad_norm": 2.8638460636138916,
"learning_rate": 1.0309830508474576e-05,
"loss": 1.1732,
"step": 14800
},
{
"epoch": 2.98,
"grad_norm": 2.416059732437134,
"learning_rate": 1.0242033898305086e-05,
"loss": 1.1905,
"step": 14900
},
{
"epoch": 3.0,
"grad_norm": 3.996770143508911,
"learning_rate": 1.0174237288135594e-05,
"loss": 1.2044,
"step": 15000
},
{
"epoch": 3.0,
"eval_loss": 1.1813915967941284,
"eval_runtime": 32.2878,
"eval_samples_per_second": 30.971,
"eval_steps_per_second": 7.743,
"step": 15000
},
{
"epoch": 3.02,
"grad_norm": 2.469172477722168,
"learning_rate": 1.0106440677966102e-05,
"loss": 1.148,
"step": 15100
},
{
"epoch": 3.04,
"grad_norm": 8.27697467803955,
"learning_rate": 1.003864406779661e-05,
"loss": 1.1733,
"step": 15200
},
{
"epoch": 3.06,
"grad_norm": 3.0315303802490234,
"learning_rate": 9.97084745762712e-06,
"loss": 1.1478,
"step": 15300
},
{
"epoch": 3.08,
"grad_norm": 2.41133189201355,
"learning_rate": 9.903050847457628e-06,
"loss": 1.122,
"step": 15400
},
{
"epoch": 3.1,
"grad_norm": 3.001695394515991,
"learning_rate": 9.835254237288136e-06,
"loss": 1.1345,
"step": 15500
},
{
"epoch": 3.1,
"eval_loss": 1.2133294343948364,
"eval_runtime": 32.2811,
"eval_samples_per_second": 30.978,
"eval_steps_per_second": 7.744,
"step": 15500
},
{
"epoch": 3.12,
"grad_norm": 5.538024425506592,
"learning_rate": 9.767457627118645e-06,
"loss": 1.1493,
"step": 15600
},
{
"epoch": 3.14,
"grad_norm": 4.214341640472412,
"learning_rate": 9.699661016949153e-06,
"loss": 1.1327,
"step": 15700
},
{
"epoch": 3.16,
"grad_norm": 3.586280345916748,
"learning_rate": 9.631864406779662e-06,
"loss": 1.1381,
"step": 15800
},
{
"epoch": 3.18,
"grad_norm": 4.103856563568115,
"learning_rate": 9.56406779661017e-06,
"loss": 1.1471,
"step": 15900
},
{
"epoch": 3.2,
"grad_norm": 3.964653491973877,
"learning_rate": 9.49627118644068e-06,
"loss": 1.1299,
"step": 16000
},
{
"epoch": 3.2,
"eval_loss": 1.2055881023406982,
"eval_runtime": 32.3191,
"eval_samples_per_second": 30.941,
"eval_steps_per_second": 7.735,
"step": 16000
},
{
"epoch": 3.22,
"grad_norm": 2.7966806888580322,
"learning_rate": 9.428474576271187e-06,
"loss": 1.1144,
"step": 16100
},
{
"epoch": 3.24,
"grad_norm": 3.176314115524292,
"learning_rate": 9.360677966101696e-06,
"loss": 1.1405,
"step": 16200
},
{
"epoch": 3.26,
"grad_norm": 4.957722187042236,
"learning_rate": 9.292881355932204e-06,
"loss": 1.1171,
"step": 16300
},
{
"epoch": 3.28,
"grad_norm": 3.398547410964966,
"learning_rate": 9.225084745762712e-06,
"loss": 1.1289,
"step": 16400
},
{
"epoch": 3.3,
"grad_norm": 3.477339267730713,
"learning_rate": 9.15728813559322e-06,
"loss": 1.1132,
"step": 16500
},
{
"epoch": 3.3,
"eval_loss": 1.2293468713760376,
"eval_runtime": 32.2612,
"eval_samples_per_second": 30.997,
"eval_steps_per_second": 7.749,
"step": 16500
},
{
"epoch": 3.32,
"grad_norm": 4.367581844329834,
"learning_rate": 9.08949152542373e-06,
"loss": 1.1241,
"step": 16600
},
{
"epoch": 3.34,
"grad_norm": 3.551278591156006,
"learning_rate": 9.021694915254238e-06,
"loss": 1.1188,
"step": 16700
},
{
"epoch": 3.36,
"grad_norm": 3.29950213432312,
"learning_rate": 8.953898305084746e-06,
"loss": 1.1299,
"step": 16800
},
{
"epoch": 3.38,
"grad_norm": 3.1226329803466797,
"learning_rate": 8.886101694915255e-06,
"loss": 1.1239,
"step": 16900
},
{
"epoch": 3.4,
"grad_norm": 2.9976165294647217,
"learning_rate": 8.818305084745764e-06,
"loss": 1.1329,
"step": 17000
},
{
"epoch": 3.4,
"eval_loss": 1.1932790279388428,
"eval_runtime": 32.2514,
"eval_samples_per_second": 31.006,
"eval_steps_per_second": 7.752,
"step": 17000
},
{
"epoch": 3.42,
"grad_norm": 2.9511375427246094,
"learning_rate": 8.75050847457627e-06,
"loss": 1.1306,
"step": 17100
},
{
"epoch": 3.44,
"grad_norm": 3.326470375061035,
"learning_rate": 8.68271186440678e-06,
"loss": 1.1232,
"step": 17200
},
{
"epoch": 3.46,
"grad_norm": 3.6301770210266113,
"learning_rate": 8.614915254237289e-06,
"loss": 1.1215,
"step": 17300
},
{
"epoch": 3.48,
"grad_norm": 3.658932685852051,
"learning_rate": 8.547118644067798e-06,
"loss": 1.115,
"step": 17400
},
{
"epoch": 3.5,
"grad_norm": 2.451982021331787,
"learning_rate": 8.479322033898306e-06,
"loss": 1.1253,
"step": 17500
},
{
"epoch": 3.5,
"eval_loss": 1.2195427417755127,
"eval_runtime": 32.2665,
"eval_samples_per_second": 30.992,
"eval_steps_per_second": 7.748,
"step": 17500
},
{
"epoch": 3.52,
"grad_norm": 3.728940725326538,
"learning_rate": 8.411525423728815e-06,
"loss": 1.1191,
"step": 17600
},
{
"epoch": 3.54,
"grad_norm": 4.087761878967285,
"learning_rate": 8.343728813559323e-06,
"loss": 1.1239,
"step": 17700
},
{
"epoch": 3.56,
"grad_norm": 3.1904852390289307,
"learning_rate": 8.275932203389832e-06,
"loss": 1.1036,
"step": 17800
},
{
"epoch": 3.58,
"grad_norm": 4.449623107910156,
"learning_rate": 8.20813559322034e-06,
"loss": 1.1139,
"step": 17900
},
{
"epoch": 3.6,
"grad_norm": 2.611001968383789,
"learning_rate": 8.140338983050848e-06,
"loss": 1.1096,
"step": 18000
},
{
"epoch": 3.6,
"eval_loss": 1.1555566787719727,
"eval_runtime": 32.2418,
"eval_samples_per_second": 31.016,
"eval_steps_per_second": 7.754,
"step": 18000
},
{
"epoch": 3.62,
"grad_norm": 3.380537271499634,
"learning_rate": 8.072542372881357e-06,
"loss": 1.1244,
"step": 18100
},
{
"epoch": 3.64,
"grad_norm": 3.485279083251953,
"learning_rate": 8.004745762711866e-06,
"loss": 1.1103,
"step": 18200
},
{
"epoch": 3.66,
"grad_norm": 3.244032382965088,
"learning_rate": 7.93762711864407e-06,
"loss": 1.1164,
"step": 18300
},
{
"epoch": 3.68,
"grad_norm": 4.062005996704102,
"learning_rate": 7.869830508474577e-06,
"loss": 1.0872,
"step": 18400
},
{
"epoch": 3.7,
"grad_norm": 4.482209205627441,
"learning_rate": 7.802033898305086e-06,
"loss": 1.111,
"step": 18500
},
{
"epoch": 3.7,
"eval_loss": 1.174954891204834,
"eval_runtime": 32.2583,
"eval_samples_per_second": 31.0,
"eval_steps_per_second": 7.75,
"step": 18500
},
{
"epoch": 3.72,
"grad_norm": 3.1390604972839355,
"learning_rate": 7.734237288135595e-06,
"loss": 1.1059,
"step": 18600
},
{
"epoch": 3.74,
"grad_norm": 3.1146981716156006,
"learning_rate": 7.666440677966102e-06,
"loss": 1.1409,
"step": 18700
},
{
"epoch": 3.76,
"grad_norm": 4.213539123535156,
"learning_rate": 7.598644067796611e-06,
"loss": 1.0965,
"step": 18800
},
{
"epoch": 3.78,
"grad_norm": 4.170618057250977,
"learning_rate": 7.53084745762712e-06,
"loss": 1.1003,
"step": 18900
},
{
"epoch": 3.8,
"grad_norm": 3.52750301361084,
"learning_rate": 7.463050847457628e-06,
"loss": 1.1183,
"step": 19000
},
{
"epoch": 3.8,
"eval_loss": 1.1892881393432617,
"eval_runtime": 32.2511,
"eval_samples_per_second": 31.007,
"eval_steps_per_second": 7.752,
"step": 19000
}
],
"logging_steps": 100,
"max_steps": 30000,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 500,
"total_flos": 1.793641609691136e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}