{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.8,
  "eval_steps": 500,
  "global_step": 19000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02,
      "grad_norm": 5.5528106689453125,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.8395,
      "step": 100
    },
    {
      "epoch": 0.04,
      "grad_norm": 5.654870986938477,
      "learning_rate": 8.000000000000001e-06,
      "loss": 1.6814,
      "step": 200
    },
    {
      "epoch": 0.06,
      "grad_norm": 4.252018928527832,
      "learning_rate": 1.2e-05,
      "loss": 1.6776,
      "step": 300
    },
    {
      "epoch": 0.08,
      "grad_norm": 4.398709774017334,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 1.6404,
      "step": 400
    },
    {
      "epoch": 0.1,
      "grad_norm": 4.0121259689331055,
      "learning_rate": 2e-05,
      "loss": 1.6651,
      "step": 500
    },
    {
      "epoch": 0.1,
      "eval_loss": 1.6371649503707886,
      "eval_runtime": 32.351,
      "eval_samples_per_second": 30.911,
      "eval_steps_per_second": 7.728,
      "step": 500
    },
    {
      "epoch": 0.12,
      "grad_norm": 3.420888900756836,
      "learning_rate": 1.9932203389830512e-05,
      "loss": 1.6651,
      "step": 600
    },
    {
      "epoch": 0.14,
      "grad_norm": 2.4271743297576904,
      "learning_rate": 1.986440677966102e-05,
      "loss": 1.6271,
      "step": 700
    },
    {
      "epoch": 0.16,
      "grad_norm": 5.406766891479492,
      "learning_rate": 1.9796610169491527e-05,
      "loss": 1.6201,
      "step": 800
    },
    {
      "epoch": 0.18,
      "grad_norm": 3.1450321674346924,
      "learning_rate": 1.9728813559322034e-05,
      "loss": 1.6106,
      "step": 900
    },
    {
      "epoch": 0.2,
      "grad_norm": 4.066416263580322,
      "learning_rate": 1.9661016949152545e-05,
      "loss": 1.5944,
      "step": 1000
    },
    {
      "epoch": 0.2,
      "eval_loss": 1.6080243587493896,
      "eval_runtime": 32.2909,
      "eval_samples_per_second": 30.968,
      "eval_steps_per_second": 7.742,
      "step": 1000
    },
    {
      "epoch": 0.22,
      "grad_norm": 3.158970355987549,
      "learning_rate": 1.9593220338983052e-05,
      "loss": 1.5668,
      "step": 1100
    },
    {
      "epoch": 0.24,
      "grad_norm": 4.5038371086120605,
      "learning_rate": 1.9525423728813562e-05,
      "loss": 1.5673,
      "step": 1200
    },
    {
      "epoch": 0.26,
      "grad_norm": 3.8768396377563477,
      "learning_rate": 1.945762711864407e-05,
      "loss": 1.5599,
      "step": 1300
    },
    {
      "epoch": 0.28,
      "grad_norm": 3.831594228744507,
      "learning_rate": 1.938983050847458e-05,
      "loss": 1.5649,
      "step": 1400
    },
    {
      "epoch": 0.3,
      "grad_norm": 3.2771167755126953,
      "learning_rate": 1.9322033898305087e-05,
      "loss": 1.5328,
      "step": 1500
    },
    {
      "epoch": 0.3,
      "eval_loss": 1.532382845878601,
      "eval_runtime": 32.3114,
      "eval_samples_per_second": 30.949,
      "eval_steps_per_second": 7.737,
      "step": 1500
    },
    {
      "epoch": 0.32,
      "grad_norm": 3.2504122257232666,
      "learning_rate": 1.9254237288135595e-05,
      "loss": 1.5412,
      "step": 1600
    },
    {
      "epoch": 0.34,
      "grad_norm": 4.105425834655762,
      "learning_rate": 1.9186440677966102e-05,
      "loss": 1.5223,
      "step": 1700
    },
    {
      "epoch": 0.36,
      "grad_norm": 3.0855212211608887,
      "learning_rate": 1.9118644067796613e-05,
      "loss": 1.5138,
      "step": 1800
    },
    {
      "epoch": 0.38,
      "grad_norm": 3.650761604309082,
      "learning_rate": 1.905084745762712e-05,
      "loss": 1.5128,
      "step": 1900
    },
    {
      "epoch": 0.4,
      "grad_norm": 2.0812368392944336,
      "learning_rate": 1.898305084745763e-05,
      "loss": 1.5115,
      "step": 2000
    },
    {
      "epoch": 0.4,
      "eval_loss": 1.511965036392212,
      "eval_runtime": 32.3915,
      "eval_samples_per_second": 30.872,
      "eval_steps_per_second": 7.718,
      "step": 2000
    },
    {
      "epoch": 0.42,
      "grad_norm": 3.7894039154052734,
      "learning_rate": 1.8915254237288138e-05,
      "loss": 1.5003,
      "step": 2100
    },
    {
      "epoch": 0.44,
      "grad_norm": 2.276301145553589,
      "learning_rate": 1.8847457627118645e-05,
      "loss": 1.4952,
      "step": 2200
    },
    {
      "epoch": 0.46,
      "grad_norm": 3.70339035987854,
      "learning_rate": 1.8779661016949152e-05,
      "loss": 1.495,
      "step": 2300
    },
    {
      "epoch": 0.48,
      "grad_norm": 2.6344492435455322,
      "learning_rate": 1.8711864406779663e-05,
      "loss": 1.4835,
      "step": 2400
    },
    {
      "epoch": 0.5,
      "grad_norm": 3.2948801517486572,
      "learning_rate": 1.8644745762711865e-05,
      "loss": 1.474,
      "step": 2500
    },
    {
      "epoch": 0.5,
      "eval_loss": 1.5233224630355835,
      "eval_runtime": 32.3386,
      "eval_samples_per_second": 30.923,
      "eval_steps_per_second": 7.731,
      "step": 2500
    },
    {
      "epoch": 0.52,
      "grad_norm": 2.458732843399048,
      "learning_rate": 1.857762711864407e-05,
      "loss": 1.4994,
      "step": 2600
    },
    {
      "epoch": 0.54,
      "grad_norm": 2.2232306003570557,
      "learning_rate": 1.850983050847458e-05,
      "loss": 1.4879,
      "step": 2700
    },
    {
      "epoch": 0.56,
      "grad_norm": 3.499060869216919,
      "learning_rate": 1.8442033898305086e-05,
      "loss": 1.4648,
      "step": 2800
    },
    {
      "epoch": 0.58,
      "grad_norm": 3.17518949508667,
      "learning_rate": 1.8374237288135593e-05,
      "loss": 1.4717,
      "step": 2900
    },
    {
      "epoch": 0.6,
      "grad_norm": 4.437788009643555,
      "learning_rate": 1.8306440677966104e-05,
      "loss": 1.4478,
      "step": 3000
    },
    {
      "epoch": 0.6,
      "eval_loss": 1.4587255716323853,
      "eval_runtime": 32.3711,
      "eval_samples_per_second": 30.892,
      "eval_steps_per_second": 7.723,
      "step": 3000
    },
    {
      "epoch": 0.62,
      "grad_norm": 3.0833561420440674,
      "learning_rate": 1.823864406779661e-05,
      "loss": 1.4441,
      "step": 3100
    },
    {
      "epoch": 0.64,
      "grad_norm": 2.600447416305542,
      "learning_rate": 1.817084745762712e-05,
      "loss": 1.4415,
      "step": 3200
    },
    {
      "epoch": 0.66,
      "grad_norm": 3.669921636581421,
      "learning_rate": 1.810305084745763e-05,
      "loss": 1.4458,
      "step": 3300
    },
    {
      "epoch": 0.68,
      "grad_norm": 3.342150926589966,
      "learning_rate": 1.803525423728814e-05,
      "loss": 1.4621,
      "step": 3400
    },
    {
      "epoch": 0.7,
      "grad_norm": 4.060861110687256,
      "learning_rate": 1.7967457627118647e-05,
      "loss": 1.4491,
      "step": 3500
    },
    {
      "epoch": 0.7,
      "eval_loss": 1.4404387474060059,
      "eval_runtime": 32.3788,
      "eval_samples_per_second": 30.884,
      "eval_steps_per_second": 7.721,
      "step": 3500
    },
    {
      "epoch": 0.72,
      "grad_norm": 3.0154595375061035,
      "learning_rate": 1.7899661016949154e-05,
      "loss": 1.4568,
      "step": 3600
    },
    {
      "epoch": 0.74,
      "grad_norm": 2.4156243801116943,
      "learning_rate": 1.783186440677966e-05,
      "loss": 1.4254,
      "step": 3700
    },
    {
      "epoch": 0.76,
      "grad_norm": 3.0124893188476562,
      "learning_rate": 1.776406779661017e-05,
      "loss": 1.4249,
      "step": 3800
    },
    {
      "epoch": 0.78,
      "grad_norm": 3.8340814113616943,
      "learning_rate": 1.769627118644068e-05,
      "loss": 1.4386,
      "step": 3900
    },
    {
      "epoch": 0.8,
      "grad_norm": 4.016916275024414,
      "learning_rate": 1.762847457627119e-05,
      "loss": 1.4098,
      "step": 4000
    },
    {
      "epoch": 0.8,
      "eval_loss": 1.4542571306228638,
      "eval_runtime": 32.2948,
      "eval_samples_per_second": 30.965,
      "eval_steps_per_second": 7.741,
      "step": 4000
    },
    {
      "epoch": 0.82,
      "grad_norm": 4.036525249481201,
      "learning_rate": 1.7560677966101697e-05,
      "loss": 1.4232,
      "step": 4100
    },
    {
      "epoch": 0.84,
      "grad_norm": 2.700068950653076,
      "learning_rate": 1.74935593220339e-05,
      "loss": 1.4081,
      "step": 4200
    },
    {
      "epoch": 0.86,
      "grad_norm": 3.3095715045928955,
      "learning_rate": 1.742576271186441e-05,
      "loss": 1.4065,
      "step": 4300
    },
    {
      "epoch": 0.88,
      "grad_norm": 2.9029970169067383,
      "learning_rate": 1.7357966101694917e-05,
      "loss": 1.4157,
      "step": 4400
    },
    {
      "epoch": 0.9,
      "grad_norm": 3.567429542541504,
      "learning_rate": 1.7290169491525424e-05,
      "loss": 1.3841,
      "step": 4500
    },
    {
      "epoch": 0.9,
      "eval_loss": 1.406548023223877,
      "eval_runtime": 32.2949,
      "eval_samples_per_second": 30.965,
      "eval_steps_per_second": 7.741,
      "step": 4500
    },
    {
      "epoch": 0.92,
      "grad_norm": 3.4792306423187256,
      "learning_rate": 1.722237288135593e-05,
      "loss": 1.393,
      "step": 4600
    },
    {
      "epoch": 0.94,
      "grad_norm": 2.3991451263427734,
      "learning_rate": 1.7154576271186442e-05,
      "loss": 1.4066,
      "step": 4700
    },
    {
      "epoch": 0.96,
      "grad_norm": 2.603165626525879,
      "learning_rate": 1.708677966101695e-05,
      "loss": 1.4169,
      "step": 4800
    },
    {
      "epoch": 0.98,
      "grad_norm": 2.465501070022583,
      "learning_rate": 1.701898305084746e-05,
      "loss": 1.3909,
      "step": 4900
    },
    {
      "epoch": 1.0,
      "grad_norm": 3.7463817596435547,
      "learning_rate": 1.6951186440677967e-05,
      "loss": 1.3661,
      "step": 5000
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.3650578260421753,
      "eval_runtime": 32.2718,
      "eval_samples_per_second": 30.987,
      "eval_steps_per_second": 7.747,
      "step": 5000
    },
    {
      "epoch": 1.02,
      "grad_norm": 2.694695472717285,
      "learning_rate": 1.6883389830508478e-05,
      "loss": 1.3752,
      "step": 5100
    },
    {
      "epoch": 1.04,
      "grad_norm": 2.7569658756256104,
      "learning_rate": 1.6815593220338985e-05,
      "loss": 1.3567,
      "step": 5200
    },
    {
      "epoch": 1.06,
      "grad_norm": 2.8121705055236816,
      "learning_rate": 1.6747796610169492e-05,
      "loss": 1.3727,
      "step": 5300
    },
    {
      "epoch": 1.08,
      "grad_norm": 3.058004140853882,
      "learning_rate": 1.668e-05,
      "loss": 1.359,
      "step": 5400
    },
    {
      "epoch": 1.1,
      "grad_norm": 4.126440525054932,
      "learning_rate": 1.661220338983051e-05,
      "loss": 1.3795,
      "step": 5500
    },
    {
      "epoch": 1.1,
      "eval_loss": 1.4012497663497925,
      "eval_runtime": 32.2512,
      "eval_samples_per_second": 31.007,
      "eval_steps_per_second": 7.752,
      "step": 5500
    },
    {
      "epoch": 1.12,
      "grad_norm": 3.2426562309265137,
      "learning_rate": 1.6544406779661017e-05,
      "loss": 1.3641,
      "step": 5600
    },
    {
      "epoch": 1.14,
      "grad_norm": 2.7895913124084473,
      "learning_rate": 1.6476610169491528e-05,
      "loss": 1.3548,
      "step": 5700
    },
    {
      "epoch": 1.16,
      "grad_norm": 3.1663429737091064,
      "learning_rate": 1.6408813559322035e-05,
      "loss": 1.3569,
      "step": 5800
    },
    {
      "epoch": 1.18,
      "grad_norm": 2.3783955574035645,
      "learning_rate": 1.6341016949152542e-05,
      "loss": 1.34,
      "step": 5900
    },
    {
      "epoch": 1.2,
      "grad_norm": 2.695949077606201,
      "learning_rate": 1.627322033898305e-05,
      "loss": 1.3356,
      "step": 6000
    },
    {
      "epoch": 1.2,
      "eval_loss": 1.3493778705596924,
      "eval_runtime": 32.2453,
      "eval_samples_per_second": 31.012,
      "eval_steps_per_second": 7.753,
      "step": 6000
    },
    {
      "epoch": 1.22,
      "grad_norm": 3.2092180252075195,
      "learning_rate": 1.620542372881356e-05,
      "loss": 1.3288,
      "step": 6100
    },
    {
      "epoch": 1.24,
      "grad_norm": 2.4168381690979004,
      "learning_rate": 1.6137627118644068e-05,
      "loss": 1.3388,
      "step": 6200
    },
    {
      "epoch": 1.26,
      "grad_norm": 3.560577154159546,
      "learning_rate": 1.6069830508474578e-05,
      "loss": 1.3515,
      "step": 6300
    },
    {
      "epoch": 1.28,
      "grad_norm": 2.8059277534484863,
      "learning_rate": 1.600203389830509e-05,
      "loss": 1.3383,
      "step": 6400
    },
    {
      "epoch": 1.3,
      "grad_norm": 3.603806495666504,
      "learning_rate": 1.5934237288135596e-05,
      "loss": 1.3303,
      "step": 6500
    },
    {
      "epoch": 1.3,
      "eval_loss": 1.3483374118804932,
      "eval_runtime": 32.2455,
      "eval_samples_per_second": 31.012,
      "eval_steps_per_second": 7.753,
      "step": 6500
    },
    {
      "epoch": 1.32,
      "grad_norm": 2.532747268676758,
      "learning_rate": 1.5866440677966103e-05,
      "loss": 1.3303,
      "step": 6600
    },
    {
      "epoch": 1.34,
      "grad_norm": 3.174362897872925,
      "learning_rate": 1.579864406779661e-05,
      "loss": 1.3436,
      "step": 6700
    },
    {
      "epoch": 1.36,
      "grad_norm": 2.64054799079895,
      "learning_rate": 1.573084745762712e-05,
      "loss": 1.3435,
      "step": 6800
    },
    {
      "epoch": 1.38,
      "grad_norm": 2.959552526473999,
      "learning_rate": 1.5663050847457628e-05,
      "loss": 1.3477,
      "step": 6900
    },
    {
      "epoch": 1.4,
      "grad_norm": 3.166142225265503,
      "learning_rate": 1.559525423728814e-05,
      "loss": 1.3402,
      "step": 7000
    },
    {
      "epoch": 1.4,
      "eval_loss": 1.357351303100586,
      "eval_runtime": 32.3105,
      "eval_samples_per_second": 30.95,
      "eval_steps_per_second": 7.737,
      "step": 7000
    },
    {
      "epoch": 1.42,
      "grad_norm": 4.034038543701172,
      "learning_rate": 1.5527457627118646e-05,
      "loss": 1.3141,
      "step": 7100
    },
    {
      "epoch": 1.44,
      "grad_norm": 2.478321075439453,
      "learning_rate": 1.5459661016949153e-05,
      "loss": 1.3138,
      "step": 7200
    },
    {
      "epoch": 1.46,
      "grad_norm": 3.06643009185791,
      "learning_rate": 1.539186440677966e-05,
      "loss": 1.3212,
      "step": 7300
    },
    {
      "epoch": 1.48,
      "grad_norm": 2.68947434425354,
      "learning_rate": 1.532406779661017e-05,
      "loss": 1.3197,
      "step": 7400
    },
    {
      "epoch": 1.5,
      "grad_norm": 2.618062973022461,
      "learning_rate": 1.5256271186440678e-05,
      "loss": 1.2973,
      "step": 7500
    },
    {
      "epoch": 1.5,
      "eval_loss": 1.343194842338562,
      "eval_runtime": 32.349,
      "eval_samples_per_second": 30.913,
      "eval_steps_per_second": 7.728,
      "step": 7500
    },
    {
      "epoch": 1.52,
      "grad_norm": 2.5092427730560303,
      "learning_rate": 1.5188474576271189e-05,
      "loss": 1.3291,
      "step": 7600
    },
    {
      "epoch": 1.54,
      "grad_norm": 2.6408796310424805,
      "learning_rate": 1.5120677966101696e-05,
      "loss": 1.3014,
      "step": 7700
    },
    {
      "epoch": 1.56,
      "grad_norm": 3.846283197402954,
      "learning_rate": 1.5052881355932205e-05,
      "loss": 1.2986,
      "step": 7800
    },
    {
      "epoch": 1.58,
      "grad_norm": 2.641146659851074,
      "learning_rate": 1.4985084745762712e-05,
      "loss": 1.2978,
      "step": 7900
    },
    {
      "epoch": 1.6,
      "grad_norm": 2.446991443634033,
      "learning_rate": 1.4917288135593221e-05,
      "loss": 1.3095,
      "step": 8000
    },
    {
      "epoch": 1.6,
      "eval_loss": 1.2667104005813599,
      "eval_runtime": 32.376,
      "eval_samples_per_second": 30.887,
      "eval_steps_per_second": 7.722,
      "step": 8000
    },
    {
      "epoch": 1.62,
      "grad_norm": 3.4024269580841064,
      "learning_rate": 1.4849491525423729e-05,
      "loss": 1.2927,
      "step": 8100
    },
    {
      "epoch": 1.64,
      "grad_norm": 2.4591922760009766,
      "learning_rate": 1.478169491525424e-05,
      "loss": 1.2934,
      "step": 8200
    },
    {
      "epoch": 1.66,
      "grad_norm": 3.165149450302124,
      "learning_rate": 1.4714576271186442e-05,
      "loss": 1.3119,
      "step": 8300
    },
    {
      "epoch": 1.68,
      "grad_norm": 3.2600440979003906,
      "learning_rate": 1.464677966101695e-05,
      "loss": 1.2825,
      "step": 8400
    },
    {
      "epoch": 1.7,
      "grad_norm": 4.034482479095459,
      "learning_rate": 1.457898305084746e-05,
      "loss": 1.2913,
      "step": 8500
    },
    {
      "epoch": 1.7,
      "eval_loss": 1.276153326034546,
      "eval_runtime": 32.3382,
      "eval_samples_per_second": 30.923,
      "eval_steps_per_second": 7.731,
      "step": 8500
    },
    {
      "epoch": 1.72,
      "grad_norm": 4.266259670257568,
      "learning_rate": 1.4511186440677967e-05,
      "loss": 1.3002,
      "step": 8600
    },
    {
      "epoch": 1.74,
      "grad_norm": 3.357360601425171,
      "learning_rate": 1.4443389830508476e-05,
      "loss": 1.2685,
      "step": 8700
    },
    {
      "epoch": 1.76,
      "grad_norm": 3.463027000427246,
      "learning_rate": 1.4375593220338983e-05,
      "loss": 1.2937,
      "step": 8800
    },
    {
      "epoch": 1.78,
      "grad_norm": 2.545639991760254,
      "learning_rate": 1.4307796610169494e-05,
      "loss": 1.2895,
      "step": 8900
    },
    {
      "epoch": 1.8,
      "grad_norm": 3.091081142425537,
      "learning_rate": 1.4240000000000001e-05,
      "loss": 1.2932,
      "step": 9000
    },
    {
      "epoch": 1.8,
      "eval_loss": 1.2490341663360596,
      "eval_runtime": 32.3654,
      "eval_samples_per_second": 30.897,
      "eval_steps_per_second": 7.724,
      "step": 9000
    },
    {
      "epoch": 1.82,
      "grad_norm": 2.9936749935150146,
      "learning_rate": 1.417220338983051e-05,
      "loss": 1.2867,
      "step": 9100
    },
    {
      "epoch": 1.84,
      "grad_norm": 2.4961957931518555,
      "learning_rate": 1.4104406779661017e-05,
      "loss": 1.2899,
      "step": 9200
    },
    {
      "epoch": 1.86,
      "grad_norm": 2.526224136352539,
      "learning_rate": 1.4036610169491528e-05,
      "loss": 1.2855,
      "step": 9300
    },
    {
      "epoch": 1.88,
      "grad_norm": 3.532458543777466,
      "learning_rate": 1.3968813559322035e-05,
      "loss": 1.2566,
      "step": 9400
    },
    {
      "epoch": 1.9,
      "grad_norm": 3.7112512588500977,
      "learning_rate": 1.3901016949152544e-05,
      "loss": 1.2645,
      "step": 9500
    },
    {
      "epoch": 1.9,
      "eval_loss": 1.2843515872955322,
      "eval_runtime": 32.3071,
      "eval_samples_per_second": 30.953,
      "eval_steps_per_second": 7.738,
      "step": 9500
    },
    {
      "epoch": 1.92,
      "grad_norm": 2.6405515670776367,
      "learning_rate": 1.3833220338983051e-05,
      "loss": 1.2673,
      "step": 9600
    },
    {
      "epoch": 1.94,
      "grad_norm": 3.5349998474121094,
      "learning_rate": 1.376542372881356e-05,
      "loss": 1.2811,
      "step": 9700
    },
    {
      "epoch": 1.96,
      "grad_norm": 3.587463140487671,
      "learning_rate": 1.3697627118644067e-05,
      "loss": 1.2501,
      "step": 9800
    },
    {
      "epoch": 1.98,
      "grad_norm": 2.2374660968780518,
      "learning_rate": 1.3629830508474578e-05,
      "loss": 1.2726,
      "step": 9900
    },
    {
      "epoch": 2.0,
      "grad_norm": 2.9396588802337646,
      "learning_rate": 1.3562033898305085e-05,
      "loss": 1.2564,
      "step": 10000
    },
    {
      "epoch": 2.0,
      "eval_loss": 1.3057665824890137,
      "eval_runtime": 32.3132,
      "eval_samples_per_second": 30.947,
      "eval_steps_per_second": 7.737,
      "step": 10000
    },
    {
      "epoch": 2.02,
      "grad_norm": 2.5827131271362305,
      "learning_rate": 1.3494237288135594e-05,
      "loss": 1.2423,
      "step": 10100
    },
    {
      "epoch": 2.04,
      "grad_norm": 3.370476722717285,
      "learning_rate": 1.3426440677966105e-05,
      "loss": 1.2493,
      "step": 10200
    },
    {
      "epoch": 2.06,
      "grad_norm": 3.8779571056365967,
      "learning_rate": 1.3358644067796612e-05,
      "loss": 1.2331,
      "step": 10300
    },
    {
      "epoch": 2.08,
      "grad_norm": 2.460205078125,
      "learning_rate": 1.329084745762712e-05,
      "loss": 1.2425,
      "step": 10400
    },
    {
      "epoch": 2.1,
      "grad_norm": 3.0094106197357178,
      "learning_rate": 1.3223050847457628e-05,
      "loss": 1.2036,
      "step": 10500
    },
    {
      "epoch": 2.1,
      "eval_loss": 1.2363600730895996,
      "eval_runtime": 32.2863,
      "eval_samples_per_second": 30.973,
      "eval_steps_per_second": 7.743,
      "step": 10500
    },
    {
      "epoch": 2.12,
      "grad_norm": 3.705883741378784,
      "learning_rate": 1.3155254237288137e-05,
      "loss": 1.221,
      "step": 10600
    },
    {
      "epoch": 2.14,
      "grad_norm": 4.502602577209473,
      "learning_rate": 1.3087457627118644e-05,
      "loss": 1.2481,
      "step": 10700
    },
    {
      "epoch": 2.16,
      "grad_norm": 3.3677573204040527,
      "learning_rate": 1.3019661016949155e-05,
      "loss": 1.2156,
      "step": 10800
    },
    {
      "epoch": 2.18,
      "grad_norm": 4.022857666015625,
      "learning_rate": 1.2951864406779662e-05,
      "loss": 1.2154,
      "step": 10900
    },
    {
      "epoch": 2.2,
      "grad_norm": 3.1049187183380127,
      "learning_rate": 1.2884067796610171e-05,
      "loss": 1.2385,
      "step": 11000
    },
    {
      "epoch": 2.2,
      "eval_loss": 1.284387469291687,
      "eval_runtime": 32.2655,
      "eval_samples_per_second": 30.993,
      "eval_steps_per_second": 7.748,
      "step": 11000
    },
    {
      "epoch": 2.22,
      "grad_norm": 2.980409622192383,
      "learning_rate": 1.2816271186440678e-05,
      "loss": 1.2451,
      "step": 11100
    },
    {
      "epoch": 2.24,
      "grad_norm": 3.34755539894104,
      "learning_rate": 1.2748474576271189e-05,
      "loss": 1.2361,
      "step": 11200
    },
    {
      "epoch": 2.26,
      "grad_norm": 2.9254653453826904,
      "learning_rate": 1.2680677966101696e-05,
      "loss": 1.2031,
      "step": 11300
    },
    {
      "epoch": 2.28,
      "grad_norm": 4.1410698890686035,
      "learning_rate": 1.2612881355932205e-05,
      "loss": 1.2119,
      "step": 11400
    },
    {
      "epoch": 2.3,
      "grad_norm": 3.1164631843566895,
      "learning_rate": 1.2545084745762712e-05,
      "loss": 1.2355,
      "step": 11500
    },
    {
      "epoch": 2.3,
      "eval_loss": 1.2927731275558472,
      "eval_runtime": 32.2864,
      "eval_samples_per_second": 30.973,
      "eval_steps_per_second": 7.743,
      "step": 11500
    },
    {
      "epoch": 2.32,
      "grad_norm": 3.6003384590148926,
      "learning_rate": 1.2477288135593221e-05,
      "loss": 1.2143,
      "step": 11600
    },
    {
      "epoch": 2.34,
      "grad_norm": 3.513211488723755,
      "learning_rate": 1.240949152542373e-05,
      "loss": 1.2193,
      "step": 11700
    },
    {
      "epoch": 2.36,
      "grad_norm": 4.301449298858643,
      "learning_rate": 1.2341694915254239e-05,
      "loss": 1.2109,
      "step": 11800
    },
    {
      "epoch": 2.38,
      "grad_norm": 2.6304187774658203,
      "learning_rate": 1.2273898305084746e-05,
      "loss": 1.2171,
      "step": 11900
    },
    {
      "epoch": 2.4,
      "grad_norm": 3.6256394386291504,
      "learning_rate": 1.220677966101695e-05,
      "loss": 1.2273,
      "step": 12000
    },
    {
      "epoch": 2.4,
      "eval_loss": 1.2177479267120361,
      "eval_runtime": 32.2863,
      "eval_samples_per_second": 30.973,
      "eval_steps_per_second": 7.743,
      "step": 12000
    },
    {
      "epoch": 2.42,
      "grad_norm": 3.3076181411743164,
      "learning_rate": 1.213898305084746e-05,
      "loss": 1.2202,
      "step": 12100
    },
    {
      "epoch": 2.44,
      "grad_norm": 3.64410400390625,
      "learning_rate": 1.2071864406779664e-05,
      "loss": 1.1953,
      "step": 12200
    },
    {
      "epoch": 2.46,
      "grad_norm": 3.324385643005371,
      "learning_rate": 1.200406779661017e-05,
      "loss": 1.2154,
      "step": 12300
    },
    {
      "epoch": 2.48,
      "grad_norm": 4.0625176429748535,
      "learning_rate": 1.193627118644068e-05,
      "loss": 1.2229,
      "step": 12400
    },
    {
      "epoch": 2.5,
      "grad_norm": 2.672346830368042,
      "learning_rate": 1.1868474576271187e-05,
      "loss": 1.214,
      "step": 12500
    },
    {
      "epoch": 2.5,
      "eval_loss": 1.2213943004608154,
      "eval_runtime": 32.2947,
      "eval_samples_per_second": 30.965,
      "eval_steps_per_second": 7.741,
      "step": 12500
    },
    {
      "epoch": 2.52,
      "grad_norm": 3.8866512775421143,
      "learning_rate": 1.1800677966101698e-05,
      "loss": 1.1915,
      "step": 12600
    },
    {
      "epoch": 2.54,
      "grad_norm": 3.414454460144043,
      "learning_rate": 1.1732881355932205e-05,
      "loss": 1.1973,
      "step": 12700
    },
    {
      "epoch": 2.56,
      "grad_norm": 2.820164442062378,
      "learning_rate": 1.1665084745762714e-05,
      "loss": 1.1943,
      "step": 12800
    },
    {
      "epoch": 2.58,
      "grad_norm": 3.2248144149780273,
      "learning_rate": 1.1597288135593221e-05,
      "loss": 1.2034,
      "step": 12900
    },
    {
      "epoch": 2.6,
      "grad_norm": 2.916104793548584,
      "learning_rate": 1.1530169491525425e-05,
      "loss": 1.2177,
      "step": 13000
    },
    {
      "epoch": 2.6,
      "eval_loss": 1.2506352663040161,
      "eval_runtime": 32.3181,
      "eval_samples_per_second": 30.942,
      "eval_steps_per_second": 7.736,
      "step": 13000
    },
    {
      "epoch": 2.62,
      "grad_norm": 2.8287951946258545,
      "learning_rate": 1.1462372881355932e-05,
      "loss": 1.1992,
      "step": 13100
    },
    {
      "epoch": 2.64,
      "grad_norm": 3.4299583435058594,
      "learning_rate": 1.1394576271186441e-05,
      "loss": 1.2133,
      "step": 13200
    },
    {
      "epoch": 2.66,
      "grad_norm": 2.9985878467559814,
      "learning_rate": 1.132677966101695e-05,
      "loss": 1.1709,
      "step": 13300
    },
    {
      "epoch": 2.68,
      "grad_norm": 3.1843700408935547,
      "learning_rate": 1.125898305084746e-05,
      "loss": 1.2063,
      "step": 13400
    },
    {
      "epoch": 2.7,
      "grad_norm": 3.3279943466186523,
      "learning_rate": 1.1191186440677968e-05,
      "loss": 1.1935,
      "step": 13500
    },
    {
      "epoch": 2.7,
      "eval_loss": 1.2275168895721436,
      "eval_runtime": 32.3349,
      "eval_samples_per_second": 30.926,
      "eval_steps_per_second": 7.732,
      "step": 13500
    },
    {
      "epoch": 2.72,
      "grad_norm": 3.5022013187408447,
      "learning_rate": 1.1123389830508475e-05,
      "loss": 1.2001,
      "step": 13600
    },
    {
      "epoch": 2.74,
      "grad_norm": 4.141532897949219,
      "learning_rate": 1.1055593220338984e-05,
      "loss": 1.1882,
      "step": 13700
    },
    {
      "epoch": 2.76,
      "grad_norm": 3.668473958969116,
      "learning_rate": 1.0987796610169492e-05,
      "loss": 1.2004,
      "step": 13800
    },
    {
      "epoch": 2.78,
      "grad_norm": 2.9693169593811035,
      "learning_rate": 1.0920000000000002e-05,
      "loss": 1.1908,
      "step": 13900
    },
    {
      "epoch": 2.8,
      "grad_norm": 2.173802137374878,
      "learning_rate": 1.085220338983051e-05,
      "loss": 1.1891,
      "step": 14000
    },
    {
      "epoch": 2.8,
      "eval_loss": 1.2043945789337158,
      "eval_runtime": 32.3786,
      "eval_samples_per_second": 30.885,
      "eval_steps_per_second": 7.721,
      "step": 14000
    },
    {
      "epoch": 2.82,
      "grad_norm": 2.77329158782959,
      "learning_rate": 1.0784406779661018e-05,
      "loss": 1.1952,
      "step": 14100
    },
    {
      "epoch": 2.84,
      "grad_norm": 2.3273532390594482,
      "learning_rate": 1.0716610169491526e-05,
      "loss": 1.1894,
      "step": 14200
    },
    {
      "epoch": 2.86,
      "grad_norm": 2.993412971496582,
      "learning_rate": 1.0648813559322036e-05,
      "loss": 1.168,
      "step": 14300
    },
    {
      "epoch": 2.88,
      "grad_norm": 3.5041513442993164,
      "learning_rate": 1.0581016949152543e-05,
      "loss": 1.1891,
      "step": 14400
    },
    {
      "epoch": 2.9,
      "grad_norm": 2.8928792476654053,
      "learning_rate": 1.0513220338983052e-05,
      "loss": 1.1771,
      "step": 14500
    },
    {
      "epoch": 2.9,
      "eval_loss": 1.2012468576431274,
      "eval_runtime": 32.3112,
      "eval_samples_per_second": 30.949,
      "eval_steps_per_second": 7.737,
      "step": 14500
    },
    {
      "epoch": 2.92,
      "grad_norm": 2.976024627685547,
      "learning_rate": 1.044542372881356e-05,
      "loss": 1.1836,
      "step": 14600
    },
    {
      "epoch": 2.94,
      "grad_norm": 3.1303913593292236,
      "learning_rate": 1.0377627118644068e-05,
      "loss": 1.1851,
      "step": 14700
    },
    {
      "epoch": 2.96,
      "grad_norm": 2.8638460636138916,
      "learning_rate": 1.0309830508474576e-05,
      "loss": 1.1732,
      "step": 14800
    },
    {
      "epoch": 2.98,
      "grad_norm": 2.416059732437134,
      "learning_rate": 1.0242033898305086e-05,
      "loss": 1.1905,
      "step": 14900
    },
    {
      "epoch": 3.0,
      "grad_norm": 3.996770143508911,
      "learning_rate": 1.0174237288135594e-05,
      "loss": 1.2044,
      "step": 15000
    },
    {
      "epoch": 3.0,
      "eval_loss": 1.1813915967941284,
      "eval_runtime": 32.2878,
      "eval_samples_per_second": 30.971,
      "eval_steps_per_second": 7.743,
      "step": 15000
    },
    {
      "epoch": 3.02,
      "grad_norm": 2.469172477722168,
      "learning_rate": 1.0106440677966102e-05,
      "loss": 1.148,
      "step": 15100
    },
    {
      "epoch": 3.04,
      "grad_norm": 8.27697467803955,
      "learning_rate": 1.003864406779661e-05,
      "loss": 1.1733,
      "step": 15200
    },
    {
      "epoch": 3.06,
      "grad_norm": 3.0315303802490234,
      "learning_rate": 9.97084745762712e-06,
      "loss": 1.1478,
      "step": 15300
    },
    {
      "epoch": 3.08,
      "grad_norm": 2.41133189201355,
      "learning_rate": 9.903050847457628e-06,
      "loss": 1.122,
      "step": 15400
    },
    {
      "epoch": 3.1,
      "grad_norm": 3.001695394515991,
      "learning_rate": 9.835254237288136e-06,
      "loss": 1.1345,
      "step": 15500
    },
    {
      "epoch": 3.1,
      "eval_loss": 1.2133294343948364,
      "eval_runtime": 32.2811,
      "eval_samples_per_second": 30.978,
      "eval_steps_per_second": 7.744,
      "step": 15500
    },
    {
      "epoch": 3.12,
      "grad_norm": 5.538024425506592,
      "learning_rate": 9.767457627118645e-06,
      "loss": 1.1493,
      "step": 15600
    },
    {
      "epoch": 3.14,
      "grad_norm": 4.214341640472412,
      "learning_rate": 9.699661016949153e-06,
      "loss": 1.1327,
      "step": 15700
    },
    {
      "epoch": 3.16,
      "grad_norm": 3.586280345916748,
      "learning_rate": 9.631864406779662e-06,
      "loss": 1.1381,
      "step": 15800
    },
    {
      "epoch": 3.18,
      "grad_norm": 4.103856563568115,
      "learning_rate": 9.56406779661017e-06,
      "loss": 1.1471,
      "step": 15900
    },
    {
      "epoch": 3.2,
      "grad_norm": 3.964653491973877,
      "learning_rate": 9.49627118644068e-06,
      "loss": 1.1299,
      "step": 16000
    },
    {
      "epoch": 3.2,
      "eval_loss": 1.2055881023406982,
      "eval_runtime": 32.3191,
      "eval_samples_per_second": 30.941,
      "eval_steps_per_second": 7.735,
      "step": 16000
    },
    {
      "epoch": 3.22,
      "grad_norm": 2.7966806888580322,
      "learning_rate": 9.428474576271187e-06,
      "loss": 1.1144,
      "step": 16100
    },
    {
      "epoch": 3.24,
      "grad_norm": 3.176314115524292,
      "learning_rate": 9.360677966101696e-06,
      "loss": 1.1405,
      "step": 16200
    },
    {
      "epoch": 3.26,
      "grad_norm": 4.957722187042236,
      "learning_rate": 9.292881355932204e-06,
      "loss": 1.1171,
      "step": 16300
    },
    {
      "epoch": 3.28,
      "grad_norm": 3.398547410964966,
      "learning_rate": 9.225084745762712e-06,
      "loss": 1.1289,
      "step": 16400
    },
    {
      "epoch": 3.3,
      "grad_norm": 3.477339267730713,
      "learning_rate": 9.15728813559322e-06,
      "loss": 1.1132,
      "step": 16500
    },
    {
      "epoch": 3.3,
      "eval_loss": 1.2293468713760376,
      "eval_runtime": 32.2612,
      "eval_samples_per_second": 30.997,
      "eval_steps_per_second": 7.749,
      "step": 16500
    },
    {
      "epoch": 3.32,
      "grad_norm": 4.367581844329834,
      "learning_rate": 9.08949152542373e-06,
      "loss": 1.1241,
      "step": 16600
    },
    {
      "epoch": 3.34,
      "grad_norm": 3.551278591156006,
      "learning_rate": 9.021694915254238e-06,
      "loss": 1.1188,
      "step": 16700
    },
    {
      "epoch": 3.36,
      "grad_norm": 3.29950213432312,
      "learning_rate": 8.953898305084746e-06,
      "loss": 1.1299,
      "step": 16800
    },
    {
      "epoch": 3.38,
      "grad_norm": 3.1226329803466797,
      "learning_rate": 8.886101694915255e-06,
      "loss": 1.1239,
      "step": 16900
    },
    {
      "epoch": 3.4,
      "grad_norm": 2.9976165294647217,
      "learning_rate": 8.818305084745764e-06,
      "loss": 1.1329,
      "step": 17000
    },
    {
      "epoch": 3.4,
      "eval_loss": 1.1932790279388428,
      "eval_runtime": 32.2514,
      "eval_samples_per_second": 31.006,
      "eval_steps_per_second": 7.752,
      "step": 17000
    },
    {
      "epoch": 3.42,
      "grad_norm": 2.9511375427246094,
      "learning_rate": 8.75050847457627e-06,
      "loss": 1.1306,
      "step": 17100
    },
    {
      "epoch": 3.44,
      "grad_norm": 3.326470375061035,
      "learning_rate": 8.68271186440678e-06,
      "loss": 1.1232,
      "step": 17200
    },
    {
      "epoch": 3.46,
      "grad_norm": 3.6301770210266113,
      "learning_rate": 8.614915254237289e-06,
      "loss": 1.1215,
      "step": 17300
    },
    {
      "epoch": 3.48,
      "grad_norm": 3.658932685852051,
      "learning_rate": 8.547118644067798e-06,
      "loss": 1.115,
      "step": 17400
    },
    {
      "epoch": 3.5,
      "grad_norm": 2.451982021331787,
      "learning_rate": 8.479322033898306e-06,
      "loss": 1.1253,
      "step": 17500
    },
    {
      "epoch": 3.5,
      "eval_loss": 1.2195427417755127,
      "eval_runtime": 32.2665,
      "eval_samples_per_second": 30.992,
      "eval_steps_per_second": 7.748,
      "step": 17500
    },
    {
      "epoch": 3.52,
      "grad_norm": 3.728940725326538,
      "learning_rate": 8.411525423728815e-06,
      "loss": 1.1191,
      "step": 17600
    },
    {
      "epoch": 3.54,
      "grad_norm": 4.087761878967285,
      "learning_rate": 8.343728813559323e-06,
      "loss": 1.1239,
      "step": 17700
    },
    {
      "epoch": 3.56,
      "grad_norm": 3.1904852390289307,
      "learning_rate": 8.275932203389832e-06,
      "loss": 1.1036,
      "step": 17800
    },
    {
      "epoch": 3.58,
      "grad_norm": 4.449623107910156,
      "learning_rate": 8.20813559322034e-06,
      "loss": 1.1139,
      "step": 17900
    },
    {
      "epoch": 3.6,
      "grad_norm": 2.611001968383789,
      "learning_rate": 8.140338983050848e-06,
      "loss": 1.1096,
      "step": 18000
    },
    {
      "epoch": 3.6,
      "eval_loss": 1.1555566787719727,
      "eval_runtime": 32.2418,
      "eval_samples_per_second": 31.016,
      "eval_steps_per_second": 7.754,
      "step": 18000
    },
    {
      "epoch": 3.62,
      "grad_norm": 3.380537271499634,
      "learning_rate": 8.072542372881357e-06,
      "loss": 1.1244,
      "step": 18100
    },
    {
      "epoch": 3.64,
      "grad_norm": 3.485279083251953,
      "learning_rate": 8.004745762711866e-06,
      "loss": 1.1103,
      "step": 18200
    },
    {
      "epoch": 3.66,
      "grad_norm": 3.244032382965088,
      "learning_rate": 7.93762711864407e-06,
      "loss": 1.1164,
      "step": 18300
    },
    {
      "epoch": 3.68,
      "grad_norm": 4.062005996704102,
      "learning_rate": 7.869830508474577e-06,
      "loss": 1.0872,
      "step": 18400
    },
    {
      "epoch": 3.7,
      "grad_norm": 4.482209205627441,
      "learning_rate": 7.802033898305086e-06,
      "loss": 1.111,
      "step": 18500
    },
    {
      "epoch": 3.7,
      "eval_loss": 1.174954891204834,
      "eval_runtime": 32.2583,
      "eval_samples_per_second": 31.0,
      "eval_steps_per_second": 7.75,
      "step": 18500
    },
    {
      "epoch": 3.72,
      "grad_norm": 3.1390604972839355,
      "learning_rate": 7.734237288135595e-06,
      "loss": 1.1059,
      "step": 18600
    },
    {
      "epoch": 3.74,
      "grad_norm": 3.1146981716156006,
      "learning_rate": 7.666440677966102e-06,
      "loss": 1.1409,
      "step": 18700
    },
    {
      "epoch": 3.76,
      "grad_norm": 4.213539123535156,
      "learning_rate": 7.598644067796611e-06,
      "loss": 1.0965,
      "step": 18800
    },
    {
      "epoch": 3.78,
      "grad_norm": 4.170618057250977,
      "learning_rate": 7.53084745762712e-06,
      "loss": 1.1003,
      "step": 18900
    },
    {
      "epoch": 3.8,
      "grad_norm": 3.52750301361084,
      "learning_rate": 7.463050847457628e-06,
      "loss": 1.1183,
      "step": 19000
    },
    {
      "epoch": 3.8,
      "eval_loss": 1.1892881393432617,
      "eval_runtime": 32.2511,
      "eval_samples_per_second": 31.007,
      "eval_steps_per_second": 7.752,
      "step": 19000
    }
  ],
  "logging_steps": 100,
  "max_steps": 30000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 6,
  "save_steps": 500,
  "total_flos": 1.793641609691136e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}