{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.8, "eval_steps": 500, "global_step": 19000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 5.5528106689453125, "learning_rate": 4.000000000000001e-06, "loss": 1.8395, "step": 100 }, { "epoch": 0.04, "grad_norm": 5.654870986938477, "learning_rate": 8.000000000000001e-06, "loss": 1.6814, "step": 200 }, { "epoch": 0.06, "grad_norm": 4.252018928527832, "learning_rate": 1.2e-05, "loss": 1.6776, "step": 300 }, { "epoch": 0.08, "grad_norm": 4.398709774017334, "learning_rate": 1.6000000000000003e-05, "loss": 1.6404, "step": 400 }, { "epoch": 0.1, "grad_norm": 4.0121259689331055, "learning_rate": 2e-05, "loss": 1.6651, "step": 500 }, { "epoch": 0.1, "eval_loss": 1.6371649503707886, "eval_runtime": 32.351, "eval_samples_per_second": 30.911, "eval_steps_per_second": 7.728, "step": 500 }, { "epoch": 0.12, "grad_norm": 3.420888900756836, "learning_rate": 1.9932203389830512e-05, "loss": 1.6651, "step": 600 }, { "epoch": 0.14, "grad_norm": 2.4271743297576904, "learning_rate": 1.986440677966102e-05, "loss": 1.6271, "step": 700 }, { "epoch": 0.16, "grad_norm": 5.406766891479492, "learning_rate": 1.9796610169491527e-05, "loss": 1.6201, "step": 800 }, { "epoch": 0.18, "grad_norm": 3.1450321674346924, "learning_rate": 1.9728813559322034e-05, "loss": 1.6106, "step": 900 }, { "epoch": 0.2, "grad_norm": 4.066416263580322, "learning_rate": 1.9661016949152545e-05, "loss": 1.5944, "step": 1000 }, { "epoch": 0.2, "eval_loss": 1.6080243587493896, "eval_runtime": 32.2909, "eval_samples_per_second": 30.968, "eval_steps_per_second": 7.742, "step": 1000 }, { "epoch": 0.22, "grad_norm": 3.158970355987549, "learning_rate": 1.9593220338983052e-05, "loss": 1.5668, "step": 1100 }, { "epoch": 0.24, "grad_norm": 4.5038371086120605, "learning_rate": 1.9525423728813562e-05, "loss": 1.5673, "step": 1200 }, { "epoch": 0.26, "grad_norm": 3.8768396377563477, "learning_rate": 1.945762711864407e-05, "loss": 1.5599, "step": 1300 }, { "epoch": 0.28, "grad_norm": 3.831594228744507, "learning_rate": 1.938983050847458e-05, "loss": 1.5649, "step": 1400 }, { "epoch": 0.3, "grad_norm": 3.2771167755126953, "learning_rate": 1.9322033898305087e-05, "loss": 1.5328, "step": 1500 }, { "epoch": 0.3, "eval_loss": 1.532382845878601, "eval_runtime": 32.3114, "eval_samples_per_second": 30.949, "eval_steps_per_second": 7.737, "step": 1500 }, { "epoch": 0.32, "grad_norm": 3.2504122257232666, "learning_rate": 1.9254237288135595e-05, "loss": 1.5412, "step": 1600 }, { "epoch": 0.34, "grad_norm": 4.105425834655762, "learning_rate": 1.9186440677966102e-05, "loss": 1.5223, "step": 1700 }, { "epoch": 0.36, "grad_norm": 3.0855212211608887, "learning_rate": 1.9118644067796613e-05, "loss": 1.5138, "step": 1800 }, { "epoch": 0.38, "grad_norm": 3.650761604309082, "learning_rate": 1.905084745762712e-05, "loss": 1.5128, "step": 1900 }, { "epoch": 0.4, "grad_norm": 2.0812368392944336, "learning_rate": 1.898305084745763e-05, "loss": 1.5115, "step": 2000 }, { "epoch": 0.4, "eval_loss": 1.511965036392212, "eval_runtime": 32.3915, "eval_samples_per_second": 30.872, "eval_steps_per_second": 7.718, "step": 2000 }, { "epoch": 0.42, "grad_norm": 3.7894039154052734, "learning_rate": 1.8915254237288138e-05, "loss": 1.5003, "step": 2100 }, { "epoch": 0.44, "grad_norm": 2.276301145553589, "learning_rate": 1.8847457627118645e-05, "loss": 1.4952, "step": 2200 }, { "epoch": 0.46, "grad_norm": 3.70339035987854, "learning_rate": 1.8779661016949152e-05, "loss": 1.495, "step": 2300 }, { "epoch": 0.48, "grad_norm": 2.6344492435455322, "learning_rate": 1.8711864406779663e-05, "loss": 1.4835, "step": 2400 }, { "epoch": 0.5, "grad_norm": 3.2948801517486572, "learning_rate": 1.8644745762711865e-05, "loss": 1.474, "step": 2500 }, { "epoch": 0.5, "eval_loss": 1.5233224630355835, "eval_runtime": 32.3386, "eval_samples_per_second": 30.923, "eval_steps_per_second": 7.731, "step": 2500 }, { "epoch": 0.52, "grad_norm": 2.458732843399048, "learning_rate": 1.857762711864407e-05, "loss": 1.4994, "step": 2600 }, { "epoch": 0.54, "grad_norm": 2.2232306003570557, "learning_rate": 1.850983050847458e-05, "loss": 1.4879, "step": 2700 }, { "epoch": 0.56, "grad_norm": 3.499060869216919, "learning_rate": 1.8442033898305086e-05, "loss": 1.4648, "step": 2800 }, { "epoch": 0.58, "grad_norm": 3.17518949508667, "learning_rate": 1.8374237288135593e-05, "loss": 1.4717, "step": 2900 }, { "epoch": 0.6, "grad_norm": 4.437788009643555, "learning_rate": 1.8306440677966104e-05, "loss": 1.4478, "step": 3000 }, { "epoch": 0.6, "eval_loss": 1.4587255716323853, "eval_runtime": 32.3711, "eval_samples_per_second": 30.892, "eval_steps_per_second": 7.723, "step": 3000 }, { "epoch": 0.62, "grad_norm": 3.0833561420440674, "learning_rate": 1.823864406779661e-05, "loss": 1.4441, "step": 3100 }, { "epoch": 0.64, "grad_norm": 2.600447416305542, "learning_rate": 1.817084745762712e-05, "loss": 1.4415, "step": 3200 }, { "epoch": 0.66, "grad_norm": 3.669921636581421, "learning_rate": 1.810305084745763e-05, "loss": 1.4458, "step": 3300 }, { "epoch": 0.68, "grad_norm": 3.342150926589966, "learning_rate": 1.803525423728814e-05, "loss": 1.4621, "step": 3400 }, { "epoch": 0.7, "grad_norm": 4.060861110687256, "learning_rate": 1.7967457627118647e-05, "loss": 1.4491, "step": 3500 }, { "epoch": 0.7, "eval_loss": 1.4404387474060059, "eval_runtime": 32.3788, "eval_samples_per_second": 30.884, "eval_steps_per_second": 7.721, "step": 3500 }, { "epoch": 0.72, "grad_norm": 3.0154595375061035, "learning_rate": 1.7899661016949154e-05, "loss": 1.4568, "step": 3600 }, { "epoch": 0.74, "grad_norm": 2.4156243801116943, "learning_rate": 1.783186440677966e-05, "loss": 1.4254, "step": 3700 }, { "epoch": 0.76, "grad_norm": 3.0124893188476562, "learning_rate": 1.776406779661017e-05, "loss": 1.4249, "step": 3800 }, { "epoch": 0.78, "grad_norm": 3.8340814113616943, "learning_rate": 1.769627118644068e-05, "loss": 1.4386, "step": 3900 }, { "epoch": 0.8, "grad_norm": 4.016916275024414, "learning_rate": 1.762847457627119e-05, "loss": 1.4098, "step": 4000 }, { "epoch": 0.8, "eval_loss": 1.4542571306228638, "eval_runtime": 32.2948, "eval_samples_per_second": 30.965, "eval_steps_per_second": 7.741, "step": 4000 }, { "epoch": 0.82, "grad_norm": 4.036525249481201, "learning_rate": 1.7560677966101697e-05, "loss": 1.4232, "step": 4100 }, { "epoch": 0.84, "grad_norm": 2.700068950653076, "learning_rate": 1.74935593220339e-05, "loss": 1.4081, "step": 4200 }, { "epoch": 0.86, "grad_norm": 3.3095715045928955, "learning_rate": 1.742576271186441e-05, "loss": 1.4065, "step": 4300 }, { "epoch": 0.88, "grad_norm": 2.9029970169067383, "learning_rate": 1.7357966101694917e-05, "loss": 1.4157, "step": 4400 }, { "epoch": 0.9, "grad_norm": 3.567429542541504, "learning_rate": 1.7290169491525424e-05, "loss": 1.3841, "step": 4500 }, { "epoch": 0.9, "eval_loss": 1.406548023223877, "eval_runtime": 32.2949, "eval_samples_per_second": 30.965, "eval_steps_per_second": 7.741, "step": 4500 }, { "epoch": 0.92, "grad_norm": 3.4792306423187256, "learning_rate": 1.722237288135593e-05, "loss": 1.393, "step": 4600 }, { "epoch": 0.94, "grad_norm": 2.3991451263427734, "learning_rate": 1.7154576271186442e-05, "loss": 1.4066, "step": 4700 }, { "epoch": 0.96, "grad_norm": 2.603165626525879, "learning_rate": 1.708677966101695e-05, "loss": 1.4169, "step": 4800 }, { "epoch": 0.98, "grad_norm": 2.465501070022583, "learning_rate": 1.701898305084746e-05, "loss": 1.3909, "step": 4900 }, { "epoch": 1.0, "grad_norm": 3.7463817596435547, "learning_rate": 1.6951186440677967e-05, "loss": 1.3661, "step": 5000 }, { "epoch": 1.0, "eval_loss": 1.3650578260421753, "eval_runtime": 32.2718, "eval_samples_per_second": 30.987, "eval_steps_per_second": 7.747, "step": 5000 }, { "epoch": 1.02, "grad_norm": 2.694695472717285, "learning_rate": 1.6883389830508478e-05, "loss": 1.3752, "step": 5100 }, { "epoch": 1.04, "grad_norm": 2.7569658756256104, "learning_rate": 1.6815593220338985e-05, "loss": 1.3567, "step": 5200 }, { "epoch": 1.06, "grad_norm": 2.8121705055236816, "learning_rate": 1.6747796610169492e-05, "loss": 1.3727, "step": 5300 }, { "epoch": 1.08, "grad_norm": 3.058004140853882, "learning_rate": 1.668e-05, "loss": 1.359, "step": 5400 }, { "epoch": 1.1, "grad_norm": 4.126440525054932, "learning_rate": 1.661220338983051e-05, "loss": 1.3795, "step": 5500 }, { "epoch": 1.1, "eval_loss": 1.4012497663497925, "eval_runtime": 32.2512, "eval_samples_per_second": 31.007, "eval_steps_per_second": 7.752, "step": 5500 }, { "epoch": 1.12, "grad_norm": 3.2426562309265137, "learning_rate": 1.6544406779661017e-05, "loss": 1.3641, "step": 5600 }, { "epoch": 1.14, "grad_norm": 2.7895913124084473, "learning_rate": 1.6476610169491528e-05, "loss": 1.3548, "step": 5700 }, { "epoch": 1.16, "grad_norm": 3.1663429737091064, "learning_rate": 1.6408813559322035e-05, "loss": 1.3569, "step": 5800 }, { "epoch": 1.18, "grad_norm": 2.3783955574035645, "learning_rate": 1.6341016949152542e-05, "loss": 1.34, "step": 5900 }, { "epoch": 1.2, "grad_norm": 2.695949077606201, "learning_rate": 1.627322033898305e-05, "loss": 1.3356, "step": 6000 }, { "epoch": 1.2, "eval_loss": 1.3493778705596924, "eval_runtime": 32.2453, "eval_samples_per_second": 31.012, "eval_steps_per_second": 7.753, "step": 6000 }, { "epoch": 1.22, "grad_norm": 3.2092180252075195, "learning_rate": 1.620542372881356e-05, "loss": 1.3288, "step": 6100 }, { "epoch": 1.24, "grad_norm": 2.4168381690979004, "learning_rate": 1.6137627118644068e-05, "loss": 1.3388, "step": 6200 }, { "epoch": 1.26, "grad_norm": 3.560577154159546, "learning_rate": 1.6069830508474578e-05, "loss": 1.3515, "step": 6300 }, { "epoch": 1.28, "grad_norm": 2.8059277534484863, "learning_rate": 1.600203389830509e-05, "loss": 1.3383, "step": 6400 }, { "epoch": 1.3, "grad_norm": 3.603806495666504, "learning_rate": 1.5934237288135596e-05, "loss": 1.3303, "step": 6500 }, { "epoch": 1.3, "eval_loss": 1.3483374118804932, "eval_runtime": 32.2455, "eval_samples_per_second": 31.012, "eval_steps_per_second": 7.753, "step": 6500 }, { "epoch": 1.32, "grad_norm": 2.532747268676758, "learning_rate": 1.5866440677966103e-05, "loss": 1.3303, "step": 6600 }, { "epoch": 1.34, "grad_norm": 3.174362897872925, "learning_rate": 1.579864406779661e-05, "loss": 1.3436, "step": 6700 }, { "epoch": 1.36, "grad_norm": 2.64054799079895, "learning_rate": 1.573084745762712e-05, "loss": 1.3435, "step": 6800 }, { "epoch": 1.38, "grad_norm": 2.959552526473999, "learning_rate": 1.5663050847457628e-05, "loss": 1.3477, "step": 6900 }, { "epoch": 1.4, "grad_norm": 3.166142225265503, "learning_rate": 1.559525423728814e-05, "loss": 1.3402, "step": 7000 }, { "epoch": 1.4, "eval_loss": 1.357351303100586, "eval_runtime": 32.3105, "eval_samples_per_second": 30.95, "eval_steps_per_second": 7.737, "step": 7000 }, { "epoch": 1.42, "grad_norm": 4.034038543701172, "learning_rate": 1.5527457627118646e-05, "loss": 1.3141, "step": 7100 }, { "epoch": 1.44, "grad_norm": 2.478321075439453, "learning_rate": 1.5459661016949153e-05, "loss": 1.3138, "step": 7200 }, { "epoch": 1.46, "grad_norm": 3.06643009185791, "learning_rate": 1.539186440677966e-05, "loss": 1.3212, "step": 7300 }, { "epoch": 1.48, "grad_norm": 2.68947434425354, "learning_rate": 1.532406779661017e-05, "loss": 1.3197, "step": 7400 }, { "epoch": 1.5, "grad_norm": 2.618062973022461, "learning_rate": 1.5256271186440678e-05, "loss": 1.2973, "step": 7500 }, { "epoch": 1.5, "eval_loss": 1.343194842338562, "eval_runtime": 32.349, "eval_samples_per_second": 30.913, "eval_steps_per_second": 7.728, "step": 7500 }, { "epoch": 1.52, "grad_norm": 2.5092427730560303, "learning_rate": 1.5188474576271189e-05, "loss": 1.3291, "step": 7600 }, { "epoch": 1.54, "grad_norm": 2.6408796310424805, "learning_rate": 1.5120677966101696e-05, "loss": 1.3014, "step": 7700 }, { "epoch": 1.56, "grad_norm": 3.846283197402954, "learning_rate": 1.5052881355932205e-05, "loss": 1.2986, "step": 7800 }, { "epoch": 1.58, "grad_norm": 2.641146659851074, "learning_rate": 1.4985084745762712e-05, "loss": 1.2978, "step": 7900 }, { "epoch": 1.6, "grad_norm": 2.446991443634033, "learning_rate": 1.4917288135593221e-05, "loss": 1.3095, "step": 8000 }, { "epoch": 1.6, "eval_loss": 1.2667104005813599, "eval_runtime": 32.376, "eval_samples_per_second": 30.887, "eval_steps_per_second": 7.722, "step": 8000 }, { "epoch": 1.62, "grad_norm": 3.4024269580841064, "learning_rate": 1.4849491525423729e-05, "loss": 1.2927, "step": 8100 }, { "epoch": 1.64, "grad_norm": 2.4591922760009766, "learning_rate": 1.478169491525424e-05, "loss": 1.2934, "step": 8200 }, { "epoch": 1.66, "grad_norm": 3.165149450302124, "learning_rate": 1.4714576271186442e-05, "loss": 1.3119, "step": 8300 }, { "epoch": 1.68, "grad_norm": 3.2600440979003906, "learning_rate": 1.464677966101695e-05, "loss": 1.2825, "step": 8400 }, { "epoch": 1.7, "grad_norm": 4.034482479095459, "learning_rate": 1.457898305084746e-05, "loss": 1.2913, "step": 8500 }, { "epoch": 1.7, "eval_loss": 1.276153326034546, "eval_runtime": 32.3382, "eval_samples_per_second": 30.923, "eval_steps_per_second": 7.731, "step": 8500 }, { "epoch": 1.72, "grad_norm": 4.266259670257568, "learning_rate": 1.4511186440677967e-05, "loss": 1.3002, "step": 8600 }, { "epoch": 1.74, "grad_norm": 3.357360601425171, "learning_rate": 1.4443389830508476e-05, "loss": 1.2685, "step": 8700 }, { "epoch": 1.76, "grad_norm": 3.463027000427246, "learning_rate": 1.4375593220338983e-05, "loss": 1.2937, "step": 8800 }, { "epoch": 1.78, "grad_norm": 2.545639991760254, "learning_rate": 1.4307796610169494e-05, "loss": 1.2895, "step": 8900 }, { "epoch": 1.8, "grad_norm": 3.091081142425537, "learning_rate": 1.4240000000000001e-05, "loss": 1.2932, "step": 9000 }, { "epoch": 1.8, "eval_loss": 1.2490341663360596, "eval_runtime": 32.3654, "eval_samples_per_second": 30.897, "eval_steps_per_second": 7.724, "step": 9000 }, { "epoch": 1.82, "grad_norm": 2.9936749935150146, "learning_rate": 1.417220338983051e-05, "loss": 1.2867, "step": 9100 }, { "epoch": 1.84, "grad_norm": 2.4961957931518555, "learning_rate": 1.4104406779661017e-05, "loss": 1.2899, "step": 9200 }, { "epoch": 1.86, "grad_norm": 2.526224136352539, "learning_rate": 1.4036610169491528e-05, "loss": 1.2855, "step": 9300 }, { "epoch": 1.88, "grad_norm": 3.532458543777466, "learning_rate": 1.3968813559322035e-05, "loss": 1.2566, "step": 9400 }, { "epoch": 1.9, "grad_norm": 3.7112512588500977, "learning_rate": 1.3901016949152544e-05, "loss": 1.2645, "step": 9500 }, { "epoch": 1.9, "eval_loss": 1.2843515872955322, "eval_runtime": 32.3071, "eval_samples_per_second": 30.953, "eval_steps_per_second": 7.738, "step": 9500 }, { "epoch": 1.92, "grad_norm": 2.6405515670776367, "learning_rate": 1.3833220338983051e-05, "loss": 1.2673, "step": 9600 }, { "epoch": 1.94, "grad_norm": 3.5349998474121094, "learning_rate": 1.376542372881356e-05, "loss": 1.2811, "step": 9700 }, { "epoch": 1.96, "grad_norm": 3.587463140487671, "learning_rate": 1.3697627118644067e-05, "loss": 1.2501, "step": 9800 }, { "epoch": 1.98, "grad_norm": 2.2374660968780518, "learning_rate": 1.3629830508474578e-05, "loss": 1.2726, "step": 9900 }, { "epoch": 2.0, "grad_norm": 2.9396588802337646, "learning_rate": 1.3562033898305085e-05, "loss": 1.2564, "step": 10000 }, { "epoch": 2.0, "eval_loss": 1.3057665824890137, "eval_runtime": 32.3132, "eval_samples_per_second": 30.947, "eval_steps_per_second": 7.737, "step": 10000 }, { "epoch": 2.02, "grad_norm": 2.5827131271362305, "learning_rate": 1.3494237288135594e-05, "loss": 1.2423, "step": 10100 }, { "epoch": 2.04, "grad_norm": 3.370476722717285, "learning_rate": 1.3426440677966105e-05, "loss": 1.2493, "step": 10200 }, { "epoch": 2.06, "grad_norm": 3.8779571056365967, "learning_rate": 1.3358644067796612e-05, "loss": 1.2331, "step": 10300 }, { "epoch": 2.08, "grad_norm": 2.460205078125, "learning_rate": 1.329084745762712e-05, "loss": 1.2425, "step": 10400 }, { "epoch": 2.1, "grad_norm": 3.0094106197357178, "learning_rate": 1.3223050847457628e-05, "loss": 1.2036, "step": 10500 }, { "epoch": 2.1, "eval_loss": 1.2363600730895996, "eval_runtime": 32.2863, "eval_samples_per_second": 30.973, "eval_steps_per_second": 7.743, "step": 10500 }, { "epoch": 2.12, "grad_norm": 3.705883741378784, "learning_rate": 1.3155254237288137e-05, "loss": 1.221, "step": 10600 }, { "epoch": 2.14, "grad_norm": 4.502602577209473, "learning_rate": 1.3087457627118644e-05, "loss": 1.2481, "step": 10700 }, { "epoch": 2.16, "grad_norm": 3.3677573204040527, "learning_rate": 1.3019661016949155e-05, "loss": 1.2156, "step": 10800 }, { "epoch": 2.18, "grad_norm": 4.022857666015625, "learning_rate": 1.2951864406779662e-05, "loss": 1.2154, "step": 10900 }, { "epoch": 2.2, "grad_norm": 3.1049187183380127, "learning_rate": 1.2884067796610171e-05, "loss": 1.2385, "step": 11000 }, { "epoch": 2.2, "eval_loss": 1.284387469291687, "eval_runtime": 32.2655, "eval_samples_per_second": 30.993, "eval_steps_per_second": 7.748, "step": 11000 }, { "epoch": 2.22, "grad_norm": 2.980409622192383, "learning_rate": 1.2816271186440678e-05, "loss": 1.2451, "step": 11100 }, { "epoch": 2.24, "grad_norm": 3.34755539894104, "learning_rate": 1.2748474576271189e-05, "loss": 1.2361, "step": 11200 }, { "epoch": 2.26, "grad_norm": 2.9254653453826904, "learning_rate": 1.2680677966101696e-05, "loss": 1.2031, "step": 11300 }, { "epoch": 2.28, "grad_norm": 4.1410698890686035, "learning_rate": 1.2612881355932205e-05, "loss": 1.2119, "step": 11400 }, { "epoch": 2.3, "grad_norm": 3.1164631843566895, "learning_rate": 1.2545084745762712e-05, "loss": 1.2355, "step": 11500 }, { "epoch": 2.3, "eval_loss": 1.2927731275558472, "eval_runtime": 32.2864, "eval_samples_per_second": 30.973, "eval_steps_per_second": 7.743, "step": 11500 }, { "epoch": 2.32, "grad_norm": 3.6003384590148926, "learning_rate": 1.2477288135593221e-05, "loss": 1.2143, "step": 11600 }, { "epoch": 2.34, "grad_norm": 3.513211488723755, "learning_rate": 1.240949152542373e-05, "loss": 1.2193, "step": 11700 }, { "epoch": 2.36, "grad_norm": 4.301449298858643, "learning_rate": 1.2341694915254239e-05, "loss": 1.2109, "step": 11800 }, { "epoch": 2.38, "grad_norm": 2.6304187774658203, "learning_rate": 1.2273898305084746e-05, "loss": 1.2171, "step": 11900 }, { "epoch": 2.4, "grad_norm": 3.6256394386291504, "learning_rate": 1.220677966101695e-05, "loss": 1.2273, "step": 12000 }, { "epoch": 2.4, "eval_loss": 1.2177479267120361, "eval_runtime": 32.2863, "eval_samples_per_second": 30.973, "eval_steps_per_second": 7.743, "step": 12000 }, { "epoch": 2.42, "grad_norm": 3.3076181411743164, "learning_rate": 1.213898305084746e-05, "loss": 1.2202, "step": 12100 }, { "epoch": 2.44, "grad_norm": 3.64410400390625, "learning_rate": 1.2071864406779664e-05, "loss": 1.1953, "step": 12200 }, { "epoch": 2.46, "grad_norm": 3.324385643005371, "learning_rate": 1.200406779661017e-05, "loss": 1.2154, "step": 12300 }, { "epoch": 2.48, "grad_norm": 4.0625176429748535, "learning_rate": 1.193627118644068e-05, "loss": 1.2229, "step": 12400 }, { "epoch": 2.5, "grad_norm": 2.672346830368042, "learning_rate": 1.1868474576271187e-05, "loss": 1.214, "step": 12500 }, { "epoch": 2.5, "eval_loss": 1.2213943004608154, "eval_runtime": 32.2947, "eval_samples_per_second": 30.965, "eval_steps_per_second": 7.741, "step": 12500 }, { "epoch": 2.52, "grad_norm": 3.8866512775421143, "learning_rate": 1.1800677966101698e-05, "loss": 1.1915, "step": 12600 }, { "epoch": 2.54, "grad_norm": 3.414454460144043, "learning_rate": 1.1732881355932205e-05, "loss": 1.1973, "step": 12700 }, { "epoch": 2.56, "grad_norm": 2.820164442062378, "learning_rate": 1.1665084745762714e-05, "loss": 1.1943, "step": 12800 }, { "epoch": 2.58, "grad_norm": 3.2248144149780273, "learning_rate": 1.1597288135593221e-05, "loss": 1.2034, "step": 12900 }, { "epoch": 2.6, "grad_norm": 2.916104793548584, "learning_rate": 1.1530169491525425e-05, "loss": 1.2177, "step": 13000 }, { "epoch": 2.6, "eval_loss": 1.2506352663040161, "eval_runtime": 32.3181, "eval_samples_per_second": 30.942, "eval_steps_per_second": 7.736, "step": 13000 }, { "epoch": 2.62, "grad_norm": 2.8287951946258545, "learning_rate": 1.1462372881355932e-05, "loss": 1.1992, "step": 13100 }, { "epoch": 2.64, "grad_norm": 3.4299583435058594, "learning_rate": 1.1394576271186441e-05, "loss": 1.2133, "step": 13200 }, { "epoch": 2.66, "grad_norm": 2.9985878467559814, "learning_rate": 1.132677966101695e-05, "loss": 1.1709, "step": 13300 }, { "epoch": 2.68, "grad_norm": 3.1843700408935547, "learning_rate": 1.125898305084746e-05, "loss": 1.2063, "step": 13400 }, { "epoch": 2.7, "grad_norm": 3.3279943466186523, "learning_rate": 1.1191186440677968e-05, "loss": 1.1935, "step": 13500 }, { "epoch": 2.7, "eval_loss": 1.2275168895721436, "eval_runtime": 32.3349, "eval_samples_per_second": 30.926, "eval_steps_per_second": 7.732, "step": 13500 }, { "epoch": 2.72, "grad_norm": 3.5022013187408447, "learning_rate": 1.1123389830508475e-05, "loss": 1.2001, "step": 13600 }, { "epoch": 2.74, "grad_norm": 4.141532897949219, "learning_rate": 1.1055593220338984e-05, "loss": 1.1882, "step": 13700 }, { "epoch": 2.76, "grad_norm": 3.668473958969116, "learning_rate": 1.0987796610169492e-05, "loss": 1.2004, "step": 13800 }, { "epoch": 2.78, "grad_norm": 2.9693169593811035, "learning_rate": 1.0920000000000002e-05, "loss": 1.1908, "step": 13900 }, { "epoch": 2.8, "grad_norm": 2.173802137374878, "learning_rate": 1.085220338983051e-05, "loss": 1.1891, "step": 14000 }, { "epoch": 2.8, "eval_loss": 1.2043945789337158, "eval_runtime": 32.3786, "eval_samples_per_second": 30.885, "eval_steps_per_second": 7.721, "step": 14000 }, { "epoch": 2.82, "grad_norm": 2.77329158782959, "learning_rate": 1.0784406779661018e-05, "loss": 1.1952, "step": 14100 }, { "epoch": 2.84, "grad_norm": 2.3273532390594482, "learning_rate": 1.0716610169491526e-05, "loss": 1.1894, "step": 14200 }, { "epoch": 2.86, "grad_norm": 2.993412971496582, "learning_rate": 1.0648813559322036e-05, "loss": 1.168, "step": 14300 }, { "epoch": 2.88, "grad_norm": 3.5041513442993164, "learning_rate": 1.0581016949152543e-05, "loss": 1.1891, "step": 14400 }, { "epoch": 2.9, "grad_norm": 2.8928792476654053, "learning_rate": 1.0513220338983052e-05, "loss": 1.1771, "step": 14500 }, { "epoch": 2.9, "eval_loss": 1.2012468576431274, "eval_runtime": 32.3112, "eval_samples_per_second": 30.949, "eval_steps_per_second": 7.737, "step": 14500 }, { "epoch": 2.92, "grad_norm": 2.976024627685547, "learning_rate": 1.044542372881356e-05, "loss": 1.1836, "step": 14600 }, { "epoch": 2.94, "grad_norm": 3.1303913593292236, "learning_rate": 1.0377627118644068e-05, "loss": 1.1851, "step": 14700 }, { "epoch": 2.96, "grad_norm": 2.8638460636138916, "learning_rate": 1.0309830508474576e-05, "loss": 1.1732, "step": 14800 }, { "epoch": 2.98, "grad_norm": 2.416059732437134, "learning_rate": 1.0242033898305086e-05, "loss": 1.1905, "step": 14900 }, { "epoch": 3.0, "grad_norm": 3.996770143508911, "learning_rate": 1.0174237288135594e-05, "loss": 1.2044, "step": 15000 }, { "epoch": 3.0, "eval_loss": 1.1813915967941284, "eval_runtime": 32.2878, "eval_samples_per_second": 30.971, "eval_steps_per_second": 7.743, "step": 15000 }, { "epoch": 3.02, "grad_norm": 2.469172477722168, "learning_rate": 1.0106440677966102e-05, "loss": 1.148, "step": 15100 }, { "epoch": 3.04, "grad_norm": 8.27697467803955, "learning_rate": 1.003864406779661e-05, "loss": 1.1733, "step": 15200 }, { "epoch": 3.06, "grad_norm": 3.0315303802490234, "learning_rate": 9.97084745762712e-06, "loss": 1.1478, "step": 15300 }, { "epoch": 3.08, "grad_norm": 2.41133189201355, "learning_rate": 9.903050847457628e-06, "loss": 1.122, "step": 15400 }, { "epoch": 3.1, "grad_norm": 3.001695394515991, "learning_rate": 9.835254237288136e-06, "loss": 1.1345, "step": 15500 }, { "epoch": 3.1, "eval_loss": 1.2133294343948364, "eval_runtime": 32.2811, "eval_samples_per_second": 30.978, "eval_steps_per_second": 7.744, "step": 15500 }, { "epoch": 3.12, "grad_norm": 5.538024425506592, "learning_rate": 9.767457627118645e-06, "loss": 1.1493, "step": 15600 }, { "epoch": 3.14, "grad_norm": 4.214341640472412, "learning_rate": 9.699661016949153e-06, "loss": 1.1327, "step": 15700 }, { "epoch": 3.16, "grad_norm": 3.586280345916748, "learning_rate": 9.631864406779662e-06, "loss": 1.1381, "step": 15800 }, { "epoch": 3.18, "grad_norm": 4.103856563568115, "learning_rate": 9.56406779661017e-06, "loss": 1.1471, "step": 15900 }, { "epoch": 3.2, "grad_norm": 3.964653491973877, "learning_rate": 9.49627118644068e-06, "loss": 1.1299, "step": 16000 }, { "epoch": 3.2, "eval_loss": 1.2055881023406982, "eval_runtime": 32.3191, "eval_samples_per_second": 30.941, "eval_steps_per_second": 7.735, "step": 16000 }, { "epoch": 3.22, "grad_norm": 2.7966806888580322, "learning_rate": 9.428474576271187e-06, "loss": 1.1144, "step": 16100 }, { "epoch": 3.24, "grad_norm": 3.176314115524292, "learning_rate": 9.360677966101696e-06, "loss": 1.1405, "step": 16200 }, { "epoch": 3.26, "grad_norm": 4.957722187042236, "learning_rate": 9.292881355932204e-06, "loss": 1.1171, "step": 16300 }, { "epoch": 3.28, "grad_norm": 3.398547410964966, "learning_rate": 9.225084745762712e-06, "loss": 1.1289, "step": 16400 }, { "epoch": 3.3, "grad_norm": 3.477339267730713, "learning_rate": 9.15728813559322e-06, "loss": 1.1132, "step": 16500 }, { "epoch": 3.3, "eval_loss": 1.2293468713760376, "eval_runtime": 32.2612, "eval_samples_per_second": 30.997, "eval_steps_per_second": 7.749, "step": 16500 }, { "epoch": 3.32, "grad_norm": 4.367581844329834, "learning_rate": 9.08949152542373e-06, "loss": 1.1241, "step": 16600 }, { "epoch": 3.34, "grad_norm": 3.551278591156006, "learning_rate": 9.021694915254238e-06, "loss": 1.1188, "step": 16700 }, { "epoch": 3.36, "grad_norm": 3.29950213432312, "learning_rate": 8.953898305084746e-06, "loss": 1.1299, "step": 16800 }, { "epoch": 3.38, "grad_norm": 3.1226329803466797, "learning_rate": 8.886101694915255e-06, "loss": 1.1239, "step": 16900 }, { "epoch": 3.4, "grad_norm": 2.9976165294647217, "learning_rate": 8.818305084745764e-06, "loss": 1.1329, "step": 17000 }, { "epoch": 3.4, "eval_loss": 1.1932790279388428, "eval_runtime": 32.2514, "eval_samples_per_second": 31.006, "eval_steps_per_second": 7.752, "step": 17000 }, { "epoch": 3.42, "grad_norm": 2.9511375427246094, "learning_rate": 8.75050847457627e-06, "loss": 1.1306, "step": 17100 }, { "epoch": 3.44, "grad_norm": 3.326470375061035, "learning_rate": 8.68271186440678e-06, "loss": 1.1232, "step": 17200 }, { "epoch": 3.46, "grad_norm": 3.6301770210266113, "learning_rate": 8.614915254237289e-06, "loss": 1.1215, "step": 17300 }, { "epoch": 3.48, "grad_norm": 3.658932685852051, "learning_rate": 8.547118644067798e-06, "loss": 1.115, "step": 17400 }, { "epoch": 3.5, "grad_norm": 2.451982021331787, "learning_rate": 8.479322033898306e-06, "loss": 1.1253, "step": 17500 }, { "epoch": 3.5, "eval_loss": 1.2195427417755127, "eval_runtime": 32.2665, "eval_samples_per_second": 30.992, "eval_steps_per_second": 7.748, "step": 17500 }, { "epoch": 3.52, "grad_norm": 3.728940725326538, "learning_rate": 8.411525423728815e-06, "loss": 1.1191, "step": 17600 }, { "epoch": 3.54, "grad_norm": 4.087761878967285, "learning_rate": 8.343728813559323e-06, "loss": 1.1239, "step": 17700 }, { "epoch": 3.56, "grad_norm": 3.1904852390289307, "learning_rate": 8.275932203389832e-06, "loss": 1.1036, "step": 17800 }, { "epoch": 3.58, "grad_norm": 4.449623107910156, "learning_rate": 8.20813559322034e-06, "loss": 1.1139, "step": 17900 }, { "epoch": 3.6, "grad_norm": 2.611001968383789, "learning_rate": 8.140338983050848e-06, "loss": 1.1096, "step": 18000 }, { "epoch": 3.6, "eval_loss": 1.1555566787719727, "eval_runtime": 32.2418, "eval_samples_per_second": 31.016, "eval_steps_per_second": 7.754, "step": 18000 }, { "epoch": 3.62, "grad_norm": 3.380537271499634, "learning_rate": 8.072542372881357e-06, "loss": 1.1244, "step": 18100 }, { "epoch": 3.64, "grad_norm": 3.485279083251953, "learning_rate": 8.004745762711866e-06, "loss": 1.1103, "step": 18200 }, { "epoch": 3.66, "grad_norm": 3.244032382965088, "learning_rate": 7.93762711864407e-06, "loss": 1.1164, "step": 18300 }, { "epoch": 3.68, "grad_norm": 4.062005996704102, "learning_rate": 7.869830508474577e-06, "loss": 1.0872, "step": 18400 }, { "epoch": 3.7, "grad_norm": 4.482209205627441, "learning_rate": 7.802033898305086e-06, "loss": 1.111, "step": 18500 }, { "epoch": 3.7, "eval_loss": 1.174954891204834, "eval_runtime": 32.2583, "eval_samples_per_second": 31.0, "eval_steps_per_second": 7.75, "step": 18500 }, { "epoch": 3.72, "grad_norm": 3.1390604972839355, "learning_rate": 7.734237288135595e-06, "loss": 1.1059, "step": 18600 }, { "epoch": 3.74, "grad_norm": 3.1146981716156006, "learning_rate": 7.666440677966102e-06, "loss": 1.1409, "step": 18700 }, { "epoch": 3.76, "grad_norm": 4.213539123535156, "learning_rate": 7.598644067796611e-06, "loss": 1.0965, "step": 18800 }, { "epoch": 3.78, "grad_norm": 4.170618057250977, "learning_rate": 7.53084745762712e-06, "loss": 1.1003, "step": 18900 }, { "epoch": 3.8, "grad_norm": 3.52750301361084, "learning_rate": 7.463050847457628e-06, "loss": 1.1183, "step": 19000 }, { "epoch": 3.8, "eval_loss": 1.1892881393432617, "eval_runtime": 32.2511, "eval_samples_per_second": 31.007, "eval_steps_per_second": 7.752, "step": 19000 } ], "logging_steps": 100, "max_steps": 30000, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 500, "total_flos": 1.793641609691136e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }