Spaces:
Sleeping
Sleeping
| [ | |
| { | |
| "loss": -0.24362421035766602, | |
| "grad_norm": 29.5, | |
| "learning_rate": 1e-06, | |
| "num_tokens": 3848.0, | |
| "completions/mean_length": 32.0, | |
| "completions/min_length": 32.0, | |
| "completions/max_length": 32.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 32.0, | |
| "completions/min_terminated_length": 32.0, | |
| "completions/max_terminated_length": 32.0, | |
| "rewards/reward_fn/mean": -0.5217045545578003, | |
| "rewards/reward_fn/std": 0.10863915830850601, | |
| "reward": -0.5217045545578003, | |
| "reward_std": 0.10863915830850601, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.32034843415021896, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.747019057991565, | |
| "epoch": 0.0125, | |
| "step": 1 | |
| }, | |
| { | |
| "loss": -0.40457093715667725, | |
| "grad_norm": 31.125, | |
| "learning_rate": 9.916666666666666e-07, | |
| "num_tokens": 8127.0, | |
| "completions/mean_length": 39.875, | |
| "completions/min_length": 39.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/mean_terminated_length": 39.833335876464844, | |
| "completions/min_terminated_length": 39.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 0.8922222256660461, | |
| "rewards/reward_fn/std": 0.8594094514846802, | |
| "reward": 0.8922222256660461, | |
| "reward_std": 0.8594093918800354, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.45131851360201836, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.4345435979957983, | |
| "epoch": 0.025, | |
| "step": 2 | |
| }, | |
| { | |
| "loss": 0.43936318159103394, | |
| "grad_norm": 67.0, | |
| "learning_rate": 9.833333333333332e-07, | |
| "num_tokens": 13847.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.5, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.5597727298736572, | |
| "rewards/reward_fn/std": 0.06792201846837997, | |
| "reward": -0.5597727298736572, | |
| "reward_std": 0.06792200356721878, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.3411939814686775, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.8157157319947146, | |
| "epoch": 0.0375, | |
| "step": 3 | |
| }, | |
| { | |
| "loss": -0.02084091305732727, | |
| "grad_norm": 13.6875, | |
| "learning_rate": 9.75e-07, | |
| "num_tokens": 17575.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 1.352373719215393, | |
| "rewards/reward_fn/std": 0.03594788536429405, | |
| "reward": 1.352373719215393, | |
| "reward_std": 0.03594787418842316, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.5832894779741764, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.3254435310045665, | |
| "epoch": 0.05, | |
| "step": 4 | |
| }, | |
| { | |
| "loss": -0.03099835105240345, | |
| "grad_norm": 21.625, | |
| "learning_rate": 9.666666666666666e-07, | |
| "num_tokens": 21295.0, | |
| "completions/mean_length": 39.0, | |
| "completions/min_length": 39.0, | |
| "completions/max_length": 39.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 39.0, | |
| "completions/min_terminated_length": 39.0, | |
| "completions/max_terminated_length": 39.0, | |
| "rewards/reward_fn/mean": 1.3622727394104004, | |
| "rewards/reward_fn/std": 0.06405245512723923, | |
| "reward": 1.3622727394104004, | |
| "reward_std": 0.06405249983072281, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.5233770348131657, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.3173207020008704, | |
| "epoch": 0.0625, | |
| "step": 5 | |
| }, | |
| { | |
| "loss": -0.34160181879997253, | |
| "grad_norm": 64.5, | |
| "learning_rate": 9.583333333333334e-07, | |
| "num_tokens": 25487.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 0.7122727036476135, | |
| "rewards/reward_fn/std": 0.8870973587036133, | |
| "reward": 0.7122727036476135, | |
| "reward_std": 0.8870972990989685, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.6138700321316719, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.4490324840007816, | |
| "epoch": 0.075, | |
| "step": 6 | |
| }, | |
| { | |
| "loss": -0.3949429988861084, | |
| "grad_norm": 83.5, | |
| "learning_rate": 9.499999999999999e-07, | |
| "num_tokens": 29759.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.10791665315628052, | |
| "rewards/reward_fn/std": 0.7444694638252258, | |
| "reward": -0.10791665315628052, | |
| "reward_std": 0.7444694638252258, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.18287423998117447, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.4863086949990247, | |
| "epoch": 0.0875, | |
| "step": 7 | |
| }, | |
| { | |
| "loss": -0.27955716848373413, | |
| "grad_norm": 42.5, | |
| "learning_rate": 9.416666666666666e-07, | |
| "num_tokens": 33847.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 0.8062247037887573, | |
| "rewards/reward_fn/std": 0.5839599370956421, | |
| "reward": 0.8062247037887573, | |
| "reward_std": 0.5839598774909973, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.6909047141671181, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.4232795009957044, | |
| "epoch": 0.1, | |
| "step": 8 | |
| }, | |
| { | |
| "loss": -0.20937372744083405, | |
| "grad_norm": 24.375, | |
| "learning_rate": 9.333333333333333e-07, | |
| "num_tokens": 37687.0, | |
| "completions/mean_length": 32.0, | |
| "completions/min_length": 32.0, | |
| "completions/max_length": 32.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 32.0, | |
| "completions/min_terminated_length": 32.0, | |
| "completions/max_terminated_length": 32.0, | |
| "rewards/reward_fn/mean": -0.5013636350631714, | |
| "rewards/reward_fn/std": 0.09515166282653809, | |
| "reward": -0.5013636350631714, | |
| "reward_std": 0.09515164792537689, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.25736842304468155, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.1515931880094286, | |
| "epoch": 0.1125, | |
| "step": 9 | |
| }, | |
| { | |
| "loss": -0.308257520198822, | |
| "grad_norm": 51.0, | |
| "learning_rate": 9.25e-07, | |
| "num_tokens": 41714.0, | |
| "completions/mean_length": 39.375, | |
| "completions/min_length": 35.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 39.28571701049805, | |
| "completions/min_terminated_length": 35.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 1.1064772605895996, | |
| "rewards/reward_fn/std": 0.6493226885795593, | |
| "reward": 1.1064772605895996, | |
| "reward_std": 0.6493226885795593, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.661864310503006, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.3802175810051267, | |
| "epoch": 0.125, | |
| "step": 10 | |
| }, | |
| { | |
| "loss": -0.09232643246650696, | |
| "grad_norm": 139.0, | |
| "learning_rate": 9.166666666666665e-07, | |
| "num_tokens": 45626.0, | |
| "completions/mean_length": 39.0, | |
| "completions/min_length": 39.0, | |
| "completions/max_length": 39.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 39.0, | |
| "completions/min_terminated_length": 39.0, | |
| "completions/max_terminated_length": 39.0, | |
| "rewards/reward_fn/mean": -0.5396956205368042, | |
| "rewards/reward_fn/std": 0.12056975066661835, | |
| "reward": -0.5396956205368042, | |
| "reward_std": 0.12056975811719894, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.5227020159363747, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.6614730110013625, | |
| "epoch": 0.1375, | |
| "step": 11 | |
| }, | |
| { | |
| "loss": -0.3088584542274475, | |
| "grad_norm": 95.0, | |
| "learning_rate": 9.083333333333332e-07, | |
| "num_tokens": 49522.0, | |
| "completions/mean_length": 38.0, | |
| "completions/min_length": 38.0, | |
| "completions/max_length": 38.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 38.0, | |
| "completions/min_terminated_length": 38.0, | |
| "completions/max_terminated_length": 38.0, | |
| "rewards/reward_fn/mean": -0.47181814908981323, | |
| "rewards/reward_fn/std": 0.0617111437022686, | |
| "reward": -0.47181814908981323, | |
| "reward_std": 0.061711132526397705, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.4710083231329918, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.329280840996944, | |
| "epoch": 0.15, | |
| "step": 12 | |
| }, | |
| { | |
| "loss": -0.025727063417434692, | |
| "grad_norm": 116.0, | |
| "learning_rate": 9e-07, | |
| "num_tokens": 53434.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.4795833230018616, | |
| "rewards/reward_fn/std": 0.09909811615943909, | |
| "reward": -0.4795833230018616, | |
| "reward_std": 0.09909811615943909, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.503271073102951, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.385940207001113, | |
| "epoch": 0.1625, | |
| "step": 13 | |
| }, | |
| { | |
| "loss": -0.18920525908470154, | |
| "grad_norm": 40.25, | |
| "learning_rate": 8.916666666666667e-07, | |
| "num_tokens": 57298.0, | |
| "completions/mean_length": 34.0, | |
| "completions/min_length": 34.0, | |
| "completions/max_length": 34.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 34.0, | |
| "completions/min_terminated_length": 34.0, | |
| "completions/max_terminated_length": 34.0, | |
| "rewards/reward_fn/mean": -0.5394047498703003, | |
| "rewards/reward_fn/std": 0.1302226483821869, | |
| "reward": -0.5394047498703003, | |
| "reward_std": 0.13022266328334808, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.38320475071668625, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.225233670003945, | |
| "epoch": 0.175, | |
| "step": 14 | |
| }, | |
| { | |
| "loss": -0.28319409489631653, | |
| "grad_norm": 97.0, | |
| "learning_rate": 8.833333333333333e-07, | |
| "num_tokens": 61194.0, | |
| "completions/mean_length": 37.0, | |
| "completions/min_length": 37.0, | |
| "completions/max_length": 37.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/mean_terminated_length": 37.0, | |
| "completions/min_terminated_length": 37.0, | |
| "completions/max_terminated_length": 37.0, | |
| "rewards/reward_fn/mean": -0.5492045879364014, | |
| "rewards/reward_fn/std": 0.13743428885936737, | |
| "reward": -0.5492045879364014, | |
| "reward_std": 0.13743430376052856, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.41958795487880707, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.3236613720000605, | |
| "epoch": 0.1875, | |
| "step": 15 | |
| }, | |
| { | |
| "loss": 0.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.75e-07, | |
| "num_tokens": 65786.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_terminated_length": 0.0, | |
| "completions/max_terminated_length": 0.0, | |
| "rewards/reward_fn/mean": -0.5, | |
| "rewards/reward_fn/std": 0.0, | |
| "reward": -0.5, | |
| "reward_std": 0.0, | |
| "frac_reward_zero_std": 1.0, | |
| "entropy": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.581262234998576, | |
| "epoch": 0.2, | |
| "step": 16 | |
| }, | |
| { | |
| "loss": -0.21241119503974915, | |
| "grad_norm": 83.0, | |
| "learning_rate": 8.666666666666667e-07, | |
| "num_tokens": 69698.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.5100653767585754, | |
| "rewards/reward_fn/std": 0.1113269254565239, | |
| "reward": -0.5100653767585754, | |
| "reward_std": 0.1113269254565239, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.48778772354125977, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.361430439999822, | |
| "epoch": 0.2125, | |
| "step": 17 | |
| }, | |
| { | |
| "loss": -0.46610546112060547, | |
| "grad_norm": 70.0, | |
| "learning_rate": 8.583333333333332e-07, | |
| "num_tokens": 74258.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.5, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.20722222328186035, | |
| "rewards/reward_fn/std": 0.3139681816101074, | |
| "reward": -0.20722222328186035, | |
| "reward_std": 0.3139681816101074, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.3570665195584297, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.5692794189999404, | |
| "epoch": 0.225, | |
| "step": 18 | |
| }, | |
| { | |
| "loss": 0.003845594823360443, | |
| "grad_norm": 86.0, | |
| "learning_rate": 8.499999999999999e-07, | |
| "num_tokens": 78178.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.5035227537155151, | |
| "rewards/reward_fn/std": 0.11439842730760574, | |
| "reward": -0.5035227537155151, | |
| "reward_std": 0.11439841985702515, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.5367328524589539, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.36526739999681, | |
| "epoch": 0.2375, | |
| "step": 19 | |
| }, | |
| { | |
| "loss": 0.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.416666666666666e-07, | |
| "num_tokens": 81994.0, | |
| "completions/mean_length": 29.0, | |
| "completions/min_length": 29.0, | |
| "completions/max_length": 29.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_terminated_length": 0.0, | |
| "completions/max_terminated_length": 0.0, | |
| "rewards/reward_fn/mean": -0.44999998807907104, | |
| "rewards/reward_fn/std": 0.0, | |
| "reward": -0.44999998807907104, | |
| "reward_std": 0.0, | |
| "frac_reward_zero_std": 1.0, | |
| "entropy": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.1127085680018354, | |
| "epoch": 0.25, | |
| "step": 20 | |
| }, | |
| { | |
| "loss": 0.008403703570365906, | |
| "grad_norm": 30.5, | |
| "learning_rate": 8.333333333333333e-07, | |
| "num_tokens": 87594.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.5, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.504485011100769, | |
| "rewards/reward_fn/std": 0.26674723625183105, | |
| "reward": -0.504485011100769, | |
| "reward_std": 0.26674723625183105, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.26860835030674934, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.8157774250066723, | |
| "epoch": 0.2625, | |
| "step": 21 | |
| }, | |
| { | |
| "loss": -0.20221471786499023, | |
| "grad_norm": 13.875, | |
| "learning_rate": 8.249999999999999e-07, | |
| "num_tokens": 92234.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.375, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.35162317752838135, | |
| "rewards/reward_fn/std": 0.39250901341438293, | |
| "reward": -0.35162317752838135, | |
| "reward_std": 0.39250901341438293, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.3838598020374775, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.667156387997238, | |
| "epoch": 0.275, | |
| "step": 22 | |
| }, | |
| { | |
| "loss": -0.05744682252407074, | |
| "grad_norm": 13.25, | |
| "learning_rate": 8.166666666666666e-07, | |
| "num_tokens": 96058.0, | |
| "completions/mean_length": 29.0, | |
| "completions/min_length": 29.0, | |
| "completions/max_length": 29.0, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/mean_terminated_length": 29.0, | |
| "completions/min_terminated_length": 29.0, | |
| "completions/max_terminated_length": 29.0, | |
| "rewards/reward_fn/mean": -0.4466666579246521, | |
| "rewards/reward_fn/std": 0.03616540506482124, | |
| "reward": -0.4466666579246521, | |
| "reward_std": 0.03616539388895035, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.22461054101586342, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.0968586019989743, | |
| "epoch": 0.2875, | |
| "step": 23 | |
| }, | |
| { | |
| "loss": 0.4449924826622009, | |
| "grad_norm": 114.0, | |
| "learning_rate": 8.083333333333334e-07, | |
| "num_tokens": 101818.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.375, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.6441667079925537, | |
| "rewards/reward_fn/std": 0.12139075994491577, | |
| "reward": -0.6441667079925537, | |
| "reward_std": 0.12139075249433517, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.44333361089229584, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.830795905996638, | |
| "epoch": 0.3, | |
| "step": 24 | |
| }, | |
| { | |
| "loss": -0.40479978919029236, | |
| "grad_norm": 100.0, | |
| "learning_rate": 8e-07, | |
| "num_tokens": 105906.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 0.8778408765792847, | |
| "rewards/reward_fn/std": 0.8508397340774536, | |
| "reward": 0.8778408765792847, | |
| "reward_std": 0.8508396744728088, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.5950741171836853, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.4058330659972853, | |
| "epoch": 0.3125, | |
| "step": 25 | |
| }, | |
| { | |
| "loss": -0.18043455481529236, | |
| "grad_norm": 86.0, | |
| "learning_rate": 7.916666666666666e-07, | |
| "num_tokens": 109818.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.5, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.5217658281326294, | |
| "rewards/reward_fn/std": 0.09584333747625351, | |
| "reward": -0.5217658281326294, | |
| "reward_std": 0.09584332257509232, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.28228095918893814, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.391602182000497, | |
| "epoch": 0.325, | |
| "step": 26 | |
| }, | |
| { | |
| "loss": -0.11674918234348297, | |
| "grad_norm": 73.5, | |
| "learning_rate": 7.833333333333333e-07, | |
| "num_tokens": 113794.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 1.0622727870941162, | |
| "rewards/reward_fn/std": 0.2846008241176605, | |
| "reward": 1.0622727870941162, | |
| "reward_std": 0.2846008241176605, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.7084969580173492, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.3755679489986505, | |
| "epoch": 0.3375, | |
| "step": 27 | |
| }, | |
| { | |
| "loss": -0.46756893396377563, | |
| "grad_norm": 92.0, | |
| "learning_rate": 7.75e-07, | |
| "num_tokens": 117914.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.5, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 0.4236363172531128, | |
| "rewards/reward_fn/std": 0.9876008033752441, | |
| "reward": 0.4236363172531128, | |
| "reward_std": 0.9876007437705994, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.3736693635582924, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.506384505999449, | |
| "epoch": 0.35, | |
| "step": 28 | |
| }, | |
| { | |
| "loss": -0.23215201497077942, | |
| "grad_norm": 15.8125, | |
| "learning_rate": 7.666666666666667e-07, | |
| "num_tokens": 122474.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.18716387450695038, | |
| "rewards/reward_fn/std": 0.33529403805732727, | |
| "reward": -0.18716387450695038, | |
| "reward_std": 0.33529403805732727, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.15797100216150284, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.591686038998887, | |
| "epoch": 0.3625, | |
| "step": 29 | |
| }, | |
| { | |
| "loss": -0.30917486548423767, | |
| "grad_norm": 25.25, | |
| "learning_rate": 7.583333333333333e-07, | |
| "num_tokens": 126202.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 1.1155681610107422, | |
| "rewards/reward_fn/std": 0.6530774235725403, | |
| "reward": 1.1155681610107422, | |
| "reward_std": 0.6530774235725403, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.5457391068339348, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.346527051005978, | |
| "epoch": 0.375, | |
| "step": 30 | |
| }, | |
| { | |
| "loss": -0.3087140619754791, | |
| "grad_norm": 63.0, | |
| "learning_rate": 7.5e-07, | |
| "num_tokens": 130066.0, | |
| "completions/mean_length": 34.0, | |
| "completions/min_length": 34.0, | |
| "completions/max_length": 34.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 34.0, | |
| "completions/min_terminated_length": 34.0, | |
| "completions/max_terminated_length": 34.0, | |
| "rewards/reward_fn/mean": -0.4669230580329895, | |
| "rewards/reward_fn/std": 0.04786568880081177, | |
| "reward": -0.4669230580329895, | |
| "reward_std": 0.047865696251392365, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.4225667715072632, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.267208800996741, | |
| "epoch": 0.3875, | |
| "step": 31 | |
| }, | |
| { | |
| "loss": -0.26029127836227417, | |
| "grad_norm": 19.0, | |
| "learning_rate": 7.416666666666666e-07, | |
| "num_tokens": 135602.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.5, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.01060507446527481, | |
| "rewards/reward_fn/std": 0.30294692516326904, | |
| "reward": -0.01060507446527481, | |
| "reward_std": 0.30294692516326904, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.28579793497920036, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 3.156891425001959, | |
| "epoch": 0.4, | |
| "step": 32 | |
| }, | |
| { | |
| "loss": 0.45159193873405457, | |
| "grad_norm": 62.75, | |
| "learning_rate": 7.333333333333332e-07, | |
| "num_tokens": 141282.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.375, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.6371731758117676, | |
| "rewards/reward_fn/std": 0.1138080582022667, | |
| "reward": -0.6371731758117676, | |
| "reward_std": 0.11380806565284729, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.39423390477895737, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.8461303939984646, | |
| "epoch": 0.4125, | |
| "step": 33 | |
| }, | |
| { | |
| "loss": -0.29366767406463623, | |
| "grad_norm": 64.5, | |
| "learning_rate": 7.249999999999999e-07, | |
| "num_tokens": 145290.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 1.042272686958313, | |
| "rewards/reward_fn/std": 0.6563705205917358, | |
| "reward": 1.042272686958313, | |
| "reward_std": 0.6563704609870911, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.714435301721096, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.399032964996877, | |
| "epoch": 0.425, | |
| "step": 34 | |
| }, | |
| { | |
| "loss": -0.23082980513572693, | |
| "grad_norm": 69.5, | |
| "learning_rate": 7.166666666666667e-07, | |
| "num_tokens": 149186.0, | |
| "completions/mean_length": 39.0, | |
| "completions/min_length": 39.0, | |
| "completions/max_length": 39.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 39.0, | |
| "completions/min_terminated_length": 39.0, | |
| "completions/max_terminated_length": 39.0, | |
| "rewards/reward_fn/mean": -0.49791663885116577, | |
| "rewards/reward_fn/std": 0.08947711437940598, | |
| "reward": -0.49791663885116577, | |
| "reward_std": 0.08947710692882538, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.4640081375837326, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.390058962995681, | |
| "epoch": 0.4375, | |
| "step": 35 | |
| }, | |
| { | |
| "loss": -0.10919760167598724, | |
| "grad_norm": 107.0, | |
| "learning_rate": 7.083333333333334e-07, | |
| "num_tokens": 153057.0, | |
| "completions/mean_length": 34.875, | |
| "completions/min_length": 34.0, | |
| "completions/max_length": 35.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 34.85714340209961, | |
| "completions/min_terminated_length": 34.0, | |
| "completions/max_terminated_length": 35.0, | |
| "rewards/reward_fn/mean": -0.5643702149391174, | |
| "rewards/reward_fn/std": 0.1681353896856308, | |
| "reward": -0.5643702149391174, | |
| "reward_std": 0.1681353896856308, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.543829470872879, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.2662610129918903, | |
| "epoch": 0.45, | |
| "step": 36 | |
| }, | |
| { | |
| "loss": -0.40484946966171265, | |
| "grad_norm": 54.5, | |
| "learning_rate": 7e-07, | |
| "num_tokens": 157089.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 0.880227267742157, | |
| "rewards/reward_fn/std": 0.8522089123725891, | |
| "reward": 0.880227267742157, | |
| "reward_std": 0.8522088527679443, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.5011838786303997, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.3901128889956453, | |
| "epoch": 0.4625, | |
| "step": 37 | |
| }, | |
| { | |
| "loss": -0.3632303476333618, | |
| "grad_norm": 79.0, | |
| "learning_rate": 6.916666666666666e-07, | |
| "num_tokens": 162737.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.5, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.11722371727228165, | |
| "rewards/reward_fn/std": 0.3178960382938385, | |
| "reward": -0.11722371727228165, | |
| "reward_std": 0.3178960382938385, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.3668777346611023, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 3.1727475369989406, | |
| "epoch": 0.475, | |
| "step": 38 | |
| }, | |
| { | |
| "loss": -0.10303743183612823, | |
| "grad_norm": 114.0, | |
| "learning_rate": 6.833333333333333e-07, | |
| "num_tokens": 166640.0, | |
| "completions/mean_length": 38.875, | |
| "completions/min_length": 31.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/mean_terminated_length": 38.5, | |
| "completions/min_terminated_length": 31.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.4853571355342865, | |
| "rewards/reward_fn/std": 0.10035142302513123, | |
| "reward": -0.4853571355342865, | |
| "reward_std": 0.10035141557455063, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.41357941925525665, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.4096208860028128, | |
| "epoch": 0.4875, | |
| "step": 39 | |
| }, | |
| { | |
| "loss": -0.29694727063179016, | |
| "grad_norm": 30.25, | |
| "learning_rate": 6.75e-07, | |
| "num_tokens": 171232.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.2764253616333008, | |
| "rewards/reward_fn/std": 0.30904126167297363, | |
| "reward": -0.2764253616333008, | |
| "reward_std": 0.30904126167297363, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.1710418350994587, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.6378187500013155, | |
| "epoch": 0.5, | |
| "step": 40 | |
| }, | |
| { | |
| "loss": -0.3691960871219635, | |
| "grad_norm": 46.0, | |
| "learning_rate": 6.666666666666666e-07, | |
| "num_tokens": 175304.0, | |
| "completions/mean_length": 39.0, | |
| "completions/min_length": 32.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.375, | |
| "completions/mean_terminated_length": 38.400001525878906, | |
| "completions/min_terminated_length": 32.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 0.6830682158470154, | |
| "rewards/reward_fn/std": 0.771815836429596, | |
| "reward": 0.6830682158470154, | |
| "reward_std": 0.771815836429596, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.45744267851114273, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.4562132069950167, | |
| "epoch": 0.5125, | |
| "step": 41 | |
| }, | |
| { | |
| "loss": 1.1920928955078125e-07, | |
| "grad_norm": 50.5, | |
| "learning_rate": 6.583333333333333e-07, | |
| "num_tokens": 179128.0, | |
| "completions/mean_length": 29.0, | |
| "completions/min_length": 29.0, | |
| "completions/max_length": 29.0, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/mean_terminated_length": 29.0, | |
| "completions/min_terminated_length": 29.0, | |
| "completions/max_terminated_length": 29.0, | |
| "rewards/reward_fn/mean": -0.44999998807907104, | |
| "rewards/reward_fn/std": 0.07406561076641083, | |
| "reward": -0.44999998807907104, | |
| "reward_std": 0.07406560331583023, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.19315754994750023, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.1436785939986294, | |
| "epoch": 0.525, | |
| "step": 42 | |
| }, | |
| { | |
| "loss": -0.2808665633201599, | |
| "grad_norm": 79.0, | |
| "learning_rate": 6.5e-07, | |
| "num_tokens": 183008.0, | |
| "completions/mean_length": 35.0, | |
| "completions/min_length": 35.0, | |
| "completions/max_length": 35.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 35.0, | |
| "completions/min_terminated_length": 35.0, | |
| "completions/max_terminated_length": 35.0, | |
| "rewards/reward_fn/mean": -0.45499998331069946, | |
| "rewards/reward_fn/std": 0.08668497204780579, | |
| "reward": -0.45499998331069946, | |
| "reward_std": 0.08668495714664459, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.41308148205280304, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.6254635349978344, | |
| "epoch": 0.5375, | |
| "step": 43 | |
| }, | |
| { | |
| "loss": -0.2648897171020508, | |
| "grad_norm": 40.25, | |
| "learning_rate": 6.416666666666667e-07, | |
| "num_tokens": 186736.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 1.1155681610107422, | |
| "rewards/reward_fn/std": 0.6533666253089905, | |
| "reward": 1.1155681610107422, | |
| "reward_std": 0.6533666253089905, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.5097345933318138, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.3735133270020015, | |
| "epoch": 0.55, | |
| "step": 44 | |
| }, | |
| { | |
| "loss": 0.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.333333333333332e-07, | |
| "num_tokens": 190568.0, | |
| "completions/mean_length": 29.0, | |
| "completions/min_length": 29.0, | |
| "completions/max_length": 29.0, | |
| "completions/clipped_ratio": 0.875, | |
| "completions/mean_terminated_length": 29.0, | |
| "completions/min_terminated_length": 29.0, | |
| "completions/max_terminated_length": 29.0, | |
| "rewards/reward_fn/mean": -0.44999998807907104, | |
| "rewards/reward_fn/std": 0.0, | |
| "reward": -0.44999998807907104, | |
| "reward_std": 0.0, | |
| "frac_reward_zero_std": 1.0, | |
| "entropy": 0.05631678178906441, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.1480028179939836, | |
| "epoch": 0.5625, | |
| "step": 45 | |
| }, | |
| { | |
| "loss": 0.307823121547699, | |
| "grad_norm": 9.875, | |
| "learning_rate": 6.249999999999999e-07, | |
| "num_tokens": 196128.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.875, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.5070832967758179, | |
| "rewards/reward_fn/std": 0.02003469504415989, | |
| "reward": -0.5070832967758179, | |
| "reward_std": 0.020034685730934143, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.04418742656707764, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.881791396997869, | |
| "epoch": 0.575, | |
| "step": 46 | |
| }, | |
| { | |
| "loss": -0.1570288985967636, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 6.166666666666667e-07, | |
| "num_tokens": 201710.0, | |
| "completions/mean_length": 39.75, | |
| "completions/min_length": 38.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/mean_terminated_length": 39.333335876464844, | |
| "completions/min_terminated_length": 38.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.2866935729980469, | |
| "rewards/reward_fn/std": 0.3386446237564087, | |
| "reward": -0.2866935729980469, | |
| "reward_std": 0.3386445939540863, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.17347081750631332, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.9082759960074327, | |
| "epoch": 0.5875, | |
| "step": 47 | |
| }, | |
| { | |
| "loss": -0.4048246145248413, | |
| "grad_norm": 40.25, | |
| "learning_rate": 6.083333333333333e-07, | |
| "num_tokens": 205422.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 0.898863673210144, | |
| "rewards/reward_fn/std": 0.8637701272964478, | |
| "reward": 0.898863673210144, | |
| "reward_std": 0.8637701869010925, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.44472044333815575, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.390659141001379, | |
| "epoch": 0.6, | |
| "step": 48 | |
| }, | |
| { | |
| "loss": 0.24084210395812988, | |
| "grad_norm": 29.25, | |
| "learning_rate": 6e-07, | |
| "num_tokens": 209142.0, | |
| "completions/mean_length": 39.0, | |
| "completions/min_length": 39.0, | |
| "completions/max_length": 39.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 39.0, | |
| "completions/min_terminated_length": 39.0, | |
| "completions/max_terminated_length": 39.0, | |
| "rewards/reward_fn/mean": 1.35715913772583, | |
| "rewards/reward_fn/std": 0.032043490558862686, | |
| "reward": 1.35715913772583, | |
| "reward_std": 0.03204350918531418, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.5551739931106567, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.6876753989999997, | |
| "epoch": 0.6125, | |
| "step": 49 | |
| }, | |
| { | |
| "loss": -0.21526648104190826, | |
| "grad_norm": 21.375, | |
| "learning_rate": 5.916666666666667e-07, | |
| "num_tokens": 213638.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.4264035224914551, | |
| "rewards/reward_fn/std": 0.213578462600708, | |
| "reward": -0.4264035224914551, | |
| "reward_std": 0.213578462600708, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.3001219779253006, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.618189713000902, | |
| "epoch": 0.625, | |
| "step": 50 | |
| }, | |
| { | |
| "loss": -0.4049317240715027, | |
| "grad_norm": 20.75, | |
| "learning_rate": 5.833333333333334e-07, | |
| "num_tokens": 217910.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.1808333396911621, | |
| "rewards/reward_fn/std": 0.5910489559173584, | |
| "reward": -0.1808333396911621, | |
| "reward_std": 0.5910490155220032, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.18206892162561417, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.568286646004708, | |
| "epoch": 0.6375, | |
| "step": 51 | |
| }, | |
| { | |
| "loss": -0.03264091908931732, | |
| "grad_norm": 41.25, | |
| "learning_rate": 5.749999999999999e-07, | |
| "num_tokens": 221654.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 1.3533837795257568, | |
| "rewards/reward_fn/std": 0.026786180213093758, | |
| "reward": 1.3533837795257568, | |
| "reward_std": 0.026786169037222862, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.5506760738790035, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.386201836001419, | |
| "epoch": 0.65, | |
| "step": 52 | |
| }, | |
| { | |
| "loss": -0.13223615288734436, | |
| "grad_norm": 32.0, | |
| "learning_rate": 5.666666666666666e-07, | |
| "num_tokens": 225494.0, | |
| "completions/mean_length": 32.0, | |
| "completions/min_length": 32.0, | |
| "completions/max_length": 32.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/mean_terminated_length": 32.0, | |
| "completions/min_terminated_length": 32.0, | |
| "completions/max_terminated_length": 32.0, | |
| "rewards/reward_fn/mean": -0.5251136422157288, | |
| "rewards/reward_fn/std": 0.12343082576990128, | |
| "reward": -0.5251136422157288, | |
| "reward_std": 0.12343082576990128, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.26780910789966583, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.225144078001904, | |
| "epoch": 0.6625, | |
| "step": 53 | |
| }, | |
| { | |
| "loss": -0.4489619731903076, | |
| "grad_norm": 103.0, | |
| "learning_rate": 5.583333333333333e-07, | |
| "num_tokens": 230030.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.375, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.12071428447961807, | |
| "rewards/reward_fn/std": 0.3167022168636322, | |
| "reward": -0.12071428447961807, | |
| "reward_std": 0.3167022168636322, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.4115525260567665, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.6585675659989647, | |
| "epoch": 0.675, | |
| "step": 54 | |
| }, | |
| { | |
| "loss": -0.1266467571258545, | |
| "grad_norm": 102.5, | |
| "learning_rate": 5.5e-07, | |
| "num_tokens": 233918.0, | |
| "completions/mean_length": 36.0, | |
| "completions/min_length": 36.0, | |
| "completions/max_length": 36.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 36.0, | |
| "completions/min_terminated_length": 36.0, | |
| "completions/max_terminated_length": 36.0, | |
| "rewards/reward_fn/mean": -0.5192856788635254, | |
| "rewards/reward_fn/std": 0.11575444787740707, | |
| "reward": -0.5192856788635254, | |
| "reward_std": 0.11575444787740707, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.5234384797513485, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.3386452439917775, | |
| "epoch": 0.6875, | |
| "step": 55 | |
| }, | |
| { | |
| "loss": -0.131612628698349, | |
| "grad_norm": 85.0, | |
| "learning_rate": 5.416666666666666e-07, | |
| "num_tokens": 237798.0, | |
| "completions/mean_length": 35.0, | |
| "completions/min_length": 35.0, | |
| "completions/max_length": 35.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 35.0, | |
| "completions/min_terminated_length": 35.0, | |
| "completions/max_terminated_length": 35.0, | |
| "rewards/reward_fn/mean": -0.5029292702674866, | |
| "rewards/reward_fn/std": 0.11540570855140686, | |
| "reward": -0.5029292702674866, | |
| "reward_std": 0.11540570855140686, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.5023137032985687, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.614732401998481, | |
| "epoch": 0.7, | |
| "step": 56 | |
| }, | |
| { | |
| "loss": -0.4357653558254242, | |
| "grad_norm": 53.0, | |
| "learning_rate": 5.333333333333333e-07, | |
| "num_tokens": 242182.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.375, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 0.511174201965332, | |
| "rewards/reward_fn/std": 0.8700709342956543, | |
| "reward": 0.511174201965332, | |
| "reward_std": 0.8700709342956543, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.44251416251063347, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.592752026001108, | |
| "epoch": 0.7125, | |
| "step": 57 | |
| }, | |
| { | |
| "loss": -0.06370481848716736, | |
| "grad_norm": 40.5, | |
| "learning_rate": 5.25e-07, | |
| "num_tokens": 246086.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.47402775287628174, | |
| "rewards/reward_fn/std": 0.050861842930316925, | |
| "reward": -0.47402775287628174, | |
| "reward_std": 0.050861842930316925, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.4770357385277748, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.4163402110025345, | |
| "epoch": 0.725, | |
| "step": 58 | |
| }, | |
| { | |
| "loss": 0.09347647428512573, | |
| "grad_norm": 20.625, | |
| "learning_rate": 5.166666666666667e-07, | |
| "num_tokens": 249918.0, | |
| "completions/mean_length": 29.0, | |
| "completions/min_length": 29.0, | |
| "completions/max_length": 29.0, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/mean_terminated_length": 29.0, | |
| "completions/min_terminated_length": 29.0, | |
| "completions/max_terminated_length": 29.0, | |
| "rewards/reward_fn/mean": -0.4599999785423279, | |
| "rewards/reward_fn/std": 0.06676184386014938, | |
| "reward": -0.4599999785423279, | |
| "reward_std": 0.06676182895898819, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.1226105373352766, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.1366954979930597, | |
| "epoch": 0.7375, | |
| "step": 59 | |
| }, | |
| { | |
| "loss": -0.4433984160423279, | |
| "grad_norm": 85.0, | |
| "learning_rate": 5.083333333333333e-07, | |
| "num_tokens": 253950.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 0.11454544961452484, | |
| "rewards/reward_fn/std": 0.8661433458328247, | |
| "reward": 0.11454544961452484, | |
| "reward_std": 0.8661432862281799, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.30511655658483505, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.449583667992556, | |
| "epoch": 0.75, | |
| "step": 60 | |
| }, | |
| { | |
| "loss": -0.025469139218330383, | |
| "grad_norm": 66.5, | |
| "learning_rate": 5e-07, | |
| "num_tokens": 257838.0, | |
| "completions/mean_length": 37.0, | |
| "completions/min_length": 37.0, | |
| "completions/max_length": 37.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 37.0, | |
| "completions/min_terminated_length": 37.0, | |
| "completions/max_terminated_length": 37.0, | |
| "rewards/reward_fn/mean": -0.5622727274894714, | |
| "rewards/reward_fn/std": 0.12705931067466736, | |
| "reward": -0.5622727274894714, | |
| "reward_std": 0.12705931067466736, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.4662681147456169, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.3535230979978223, | |
| "epoch": 0.7625, | |
| "step": 61 | |
| }, | |
| { | |
| "loss": -0.16131633520126343, | |
| "grad_norm": 6.375, | |
| "learning_rate": 4.916666666666666e-07, | |
| "num_tokens": 261966.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 0.08727272599935532, | |
| "rewards/reward_fn/std": 0.8163363933563232, | |
| "reward": 0.08727272599935532, | |
| "reward_std": 0.816336452960968, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.2436109185218811, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.553159698996751, | |
| "epoch": 0.775, | |
| "step": 62 | |
| }, | |
| { | |
| "loss": -0.27563101053237915, | |
| "grad_norm": 119.5, | |
| "learning_rate": 4.833333333333333e-07, | |
| "num_tokens": 265878.0, | |
| "completions/mean_length": 39.0, | |
| "completions/min_length": 39.0, | |
| "completions/max_length": 39.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 39.0, | |
| "completions/min_terminated_length": 39.0, | |
| "completions/max_terminated_length": 39.0, | |
| "rewards/reward_fn/mean": -0.44333332777023315, | |
| "rewards/reward_fn/std": 0.08757762610912323, | |
| "reward": -0.44333332777023315, | |
| "reward_std": 0.08757761865854263, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.49050506949424744, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.412015636993601, | |
| "epoch": 0.7875, | |
| "step": 63 | |
| }, | |
| { | |
| "loss": -0.30919110774993896, | |
| "grad_norm": 33.0, | |
| "learning_rate": 4.7499999999999995e-07, | |
| "num_tokens": 269606.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 1.097386360168457, | |
| "rewards/reward_fn/std": 0.6456923484802246, | |
| "reward": 1.097386360168457, | |
| "reward_std": 0.6456924080848694, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.5607278198003769, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.402838498001074, | |
| "epoch": 0.8, | |
| "step": 64 | |
| }, | |
| { | |
| "loss": -0.1975964903831482, | |
| "grad_norm": 40.75, | |
| "learning_rate": 4.6666666666666666e-07, | |
| "num_tokens": 273478.0, | |
| "completions/mean_length": 35.0, | |
| "completions/min_length": 35.0, | |
| "completions/max_length": 35.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 35.0, | |
| "completions/min_terminated_length": 35.0, | |
| "completions/max_terminated_length": 35.0, | |
| "rewards/reward_fn/mean": -0.4833928346633911, | |
| "rewards/reward_fn/std": 0.0908365324139595, | |
| "reward": -0.4833928346633911, | |
| "reward_std": 0.0908365249633789, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.3946021981537342, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.312598282002, | |
| "epoch": 0.8125, | |
| "step": 65 | |
| }, | |
| { | |
| "loss": -0.4660513401031494, | |
| "grad_norm": 29.5, | |
| "learning_rate": 4.5833333333333327e-07, | |
| "num_tokens": 278022.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.5, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.1836332380771637, | |
| "rewards/reward_fn/std": 0.3393118977546692, | |
| "reward": -0.1836332380771637, | |
| "reward_std": 0.3393118977546692, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.28370077535510063, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.630913541004702, | |
| "epoch": 0.825, | |
| "step": 66 | |
| }, | |
| { | |
| "loss": -0.15911135077476501, | |
| "grad_norm": 95.5, | |
| "learning_rate": 4.5e-07, | |
| "num_tokens": 281894.0, | |
| "completions/mean_length": 34.0, | |
| "completions/min_length": 34.0, | |
| "completions/max_length": 34.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 34.0, | |
| "completions/min_terminated_length": 34.0, | |
| "completions/max_terminated_length": 34.0, | |
| "rewards/reward_fn/mean": -0.48636361956596375, | |
| "rewards/reward_fn/std": 0.09703028947114944, | |
| "reward": -0.48636361956596375, | |
| "reward_std": 0.09703028202056885, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.40127798169851303, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.5582268299949646, | |
| "epoch": 0.8375, | |
| "step": 67 | |
| }, | |
| { | |
| "loss": -0.39602887630462646, | |
| "grad_norm": 50.5, | |
| "learning_rate": 4.4166666666666664e-07, | |
| "num_tokens": 287638.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.36242133378982544, | |
| "rewards/reward_fn/std": 0.3204170763492584, | |
| "reward": -0.36242133378982544, | |
| "reward_std": 0.3204170763492584, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.22999491542577744, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.94327915100439, | |
| "epoch": 0.85, | |
| "step": 68 | |
| }, | |
| { | |
| "loss": -0.308975487947464, | |
| "grad_norm": 123.0, | |
| "learning_rate": 4.3333333333333335e-07, | |
| "num_tokens": 291534.0, | |
| "completions/mean_length": 39.0, | |
| "completions/min_length": 39.0, | |
| "completions/max_length": 39.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 39.0, | |
| "completions/min_terminated_length": 39.0, | |
| "completions/max_terminated_length": 39.0, | |
| "rewards/reward_fn/mean": -0.47846153378486633, | |
| "rewards/reward_fn/std": 0.08050138503313065, | |
| "reward": -0.47846153378486633, | |
| "reward_std": 0.08050138503313065, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.48380351811647415, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.3797209980002663, | |
| "epoch": 0.8625, | |
| "step": 69 | |
| }, | |
| { | |
| "loss": -0.2298320233821869, | |
| "grad_norm": 25.875, | |
| "learning_rate": 4.2499999999999995e-07, | |
| "num_tokens": 295502.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 0.017727263271808624, | |
| "rewards/reward_fn/std": 0.5630583167076111, | |
| "reward": 0.017727263271808624, | |
| "reward_std": 0.5630583167076111, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.6311597526073456, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.4387354840037005, | |
| "epoch": 0.875, | |
| "step": 70 | |
| }, | |
| { | |
| "loss": -0.3377363085746765, | |
| "grad_norm": 44.75, | |
| "learning_rate": 4.1666666666666667e-07, | |
| "num_tokens": 299774.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.5, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 0.5791666507720947, | |
| "rewards/reward_fn/std": 0.9153085350990295, | |
| "reward": 0.5791666507720947, | |
| "reward_std": 0.9153084754943848, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.40923601388931274, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.5313861870054097, | |
| "epoch": 0.8875, | |
| "step": 71 | |
| }, | |
| { | |
| "loss": -0.4048517942428589, | |
| "grad_norm": 54.5, | |
| "learning_rate": 4.083333333333333e-07, | |
| "num_tokens": 303518.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 0.8893181085586548, | |
| "rewards/reward_fn/std": 0.8578178286552429, | |
| "reward": 0.8893181085586548, | |
| "reward_std": 0.8578178286552429, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.48572198301553726, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.367873762999807, | |
| "epoch": 0.9, | |
| "step": 72 | |
| }, | |
| { | |
| "loss": -0.2366514503955841, | |
| "grad_norm": 29.0, | |
| "learning_rate": 4e-07, | |
| "num_tokens": 307998.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.1987352967262268, | |
| "rewards/reward_fn/std": 0.3228420913219452, | |
| "reward": -0.1987352967262268, | |
| "reward_std": 0.3228420913219452, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.1645362675189972, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.6045698630005063, | |
| "epoch": 0.9125, | |
| "step": 73 | |
| }, | |
| { | |
| "loss": -0.45248183608055115, | |
| "grad_norm": 55.75, | |
| "learning_rate": 3.9166666666666664e-07, | |
| "num_tokens": 313854.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.375, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.10006745159626007, | |
| "rewards/reward_fn/std": 0.3313491642475128, | |
| "reward": -0.10006745159626007, | |
| "reward_std": 0.33134913444519043, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.46212853491306305, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.9352203579946945, | |
| "epoch": 0.925, | |
| "step": 74 | |
| }, | |
| { | |
| "loss": -0.3091174364089966, | |
| "grad_norm": 33.5, | |
| "learning_rate": 3.8333333333333335e-07, | |
| "num_tokens": 317566.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 1.1201136112213135, | |
| "rewards/reward_fn/std": 0.6550368666648865, | |
| "reward": 1.1201136112213135, | |
| "reward_std": 0.6550368666648865, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.5967733189463615, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.3943060280034842, | |
| "epoch": 0.9375, | |
| "step": 75 | |
| }, | |
| { | |
| "loss": -0.35074472427368164, | |
| "grad_norm": 25.5, | |
| "learning_rate": 3.75e-07, | |
| "num_tokens": 321918.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 0.42166668176651, | |
| "rewards/reward_fn/std": 0.9853031635284424, | |
| "reward": 0.42166668176651, | |
| "reward_std": 0.9853031635284424, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.24720394611358643, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.535821080004098, | |
| "epoch": 0.95, | |
| "step": 76 | |
| }, | |
| { | |
| "loss": -0.3965933918952942, | |
| "grad_norm": 25.5, | |
| "learning_rate": 3.666666666666666e-07, | |
| "num_tokens": 325942.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.11625000834465027, | |
| "rewards/reward_fn/std": 0.7256118655204773, | |
| "reward": -0.11625000834465027, | |
| "reward_std": 0.7256118059158325, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.18184397369623184, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.4522922530013602, | |
| "epoch": 0.9625, | |
| "step": 77 | |
| }, | |
| { | |
| "loss": -0.2389850914478302, | |
| "grad_norm": 90.0, | |
| "learning_rate": 3.583333333333333e-07, | |
| "num_tokens": 329822.0, | |
| "completions/mean_length": 36.0, | |
| "completions/min_length": 36.0, | |
| "completions/max_length": 36.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 36.0, | |
| "completions/min_terminated_length": 36.0, | |
| "completions/max_terminated_length": 36.0, | |
| "rewards/reward_fn/mean": -0.49060603976249695, | |
| "rewards/reward_fn/std": 0.07629626989364624, | |
| "reward": -0.49060603976249695, | |
| "reward_std": 0.07629625499248505, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.44026637449860573, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.3386678639944876, | |
| "epoch": 0.975, | |
| "step": 78 | |
| }, | |
| { | |
| "loss": -0.3037688434123993, | |
| "grad_norm": 37.75, | |
| "learning_rate": 3.5e-07, | |
| "num_tokens": 334326.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.27223682403564453, | |
| "rewards/reward_fn/std": 0.31447866559028625, | |
| "reward": -0.27223682403564453, | |
| "reward_std": 0.31447869539260864, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.1894683912396431, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.647535642998264, | |
| "epoch": 0.9875, | |
| "step": 79 | |
| }, | |
| { | |
| "loss": -0.19757390022277832, | |
| "grad_norm": 16.125, | |
| "learning_rate": 3.4166666666666664e-07, | |
| "num_tokens": 338718.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.875, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.03916667401790619, | |
| "rewards/reward_fn/std": 0.8534835577011108, | |
| "reward": -0.03916667401790619, | |
| "reward_std": 0.8534834980964661, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.07164555788040161, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.61853914000676, | |
| "epoch": 1.0, | |
| "step": 80 | |
| }, | |
| { | |
| "loss": -0.3919941186904907, | |
| "grad_norm": 26.875, | |
| "learning_rate": 3.333333333333333e-07, | |
| "num_tokens": 342558.0, | |
| "completions/mean_length": 31.0, | |
| "completions/min_length": 31.0, | |
| "completions/max_length": 31.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/mean_terminated_length": 31.0, | |
| "completions/min_terminated_length": 31.0, | |
| "completions/max_terminated_length": 31.0, | |
| "rewards/reward_fn/mean": -0.49666666984558105, | |
| "rewards/reward_fn/std": 0.11469767242670059, | |
| "reward": -0.49666666984558105, | |
| "reward_std": 0.11469767242670059, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.19183612614870071, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.3189109419981833, | |
| "epoch": 1.0125, | |
| "step": 81 | |
| }, | |
| { | |
| "loss": -0.3513515293598175, | |
| "grad_norm": 52.75, | |
| "learning_rate": 3.25e-07, | |
| "num_tokens": 348414.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.375, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.019565992057323456, | |
| "rewards/reward_fn/std": 0.2976280748844147, | |
| "reward": -0.019565992057323456, | |
| "reward_std": 0.2976280748844147, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.45254190266132355, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.9623393389993, | |
| "epoch": 1.025, | |
| "step": 82 | |
| }, | |
| { | |
| "loss": -0.26606613397598267, | |
| "grad_norm": 40.75, | |
| "learning_rate": 3.166666666666666e-07, | |
| "num_tokens": 352142.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 1.1201136112213135, | |
| "rewards/reward_fn/std": 0.6547484397888184, | |
| "reward": 1.1201136112213135, | |
| "reward_std": 0.6547484397888184, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.5292974896728992, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.733958220000204, | |
| "epoch": 1.0375, | |
| "step": 83 | |
| }, | |
| { | |
| "loss": 0.06032121181488037, | |
| "grad_norm": 73.0, | |
| "learning_rate": 3.0833333333333333e-07, | |
| "num_tokens": 356054.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.5698502659797668, | |
| "rewards/reward_fn/std": 0.14464668929576874, | |
| "reward": -0.5698502659797668, | |
| "reward_std": 0.14464668929576874, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.5359103605151176, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.4494938460084086, | |
| "epoch": 1.05, | |
| "step": 84 | |
| }, | |
| { | |
| "loss": -0.284755140542984, | |
| "grad_norm": 57.25, | |
| "learning_rate": 3e-07, | |
| "num_tokens": 360694.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.5, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.2969146966934204, | |
| "rewards/reward_fn/std": 0.3564964234828949, | |
| "reward": -0.2969146966934204, | |
| "reward_std": 0.3564964234828949, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.34993216395378113, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.7467824280029163, | |
| "epoch": 1.0625, | |
| "step": 85 | |
| }, | |
| { | |
| "loss": 0.02751055359840393, | |
| "grad_norm": 17.375, | |
| "learning_rate": 2.916666666666667e-07, | |
| "num_tokens": 365286.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.5, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.5153676271438599, | |
| "rewards/reward_fn/std": 0.2792046368122101, | |
| "reward": -0.5153676271438599, | |
| "reward_std": 0.2792046368122101, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.3163418546319008, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.679692724999768, | |
| "epoch": 1.075, | |
| "step": 86 | |
| }, | |
| { | |
| "loss": -0.17915111780166626, | |
| "grad_norm": 116.5, | |
| "learning_rate": 2.833333333333333e-07, | |
| "num_tokens": 369198.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.4362500011920929, | |
| "rewards/reward_fn/std": 0.04438065364956856, | |
| "reward": -0.4362500011920929, | |
| "reward_std": 0.04438065364956856, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.5028216391801834, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.460423249995074, | |
| "epoch": 1.0875, | |
| "step": 87 | |
| }, | |
| { | |
| "loss": -0.4358249008655548, | |
| "grad_norm": 72.0, | |
| "learning_rate": 2.75e-07, | |
| "num_tokens": 373278.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.375, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 0.43409088253974915, | |
| "rewards/reward_fn/std": 0.8036266565322876, | |
| "reward": 0.43409088253974915, | |
| "reward_std": 0.8036266565322876, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.4655003324151039, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.4649747210023634, | |
| "epoch": 1.1, | |
| "step": 88 | |
| }, | |
| { | |
| "loss": -0.4519768953323364, | |
| "grad_norm": 55.5, | |
| "learning_rate": 2.6666666666666667e-07, | |
| "num_tokens": 377838.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.375, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.11609291285276413, | |
| "rewards/reward_fn/std": 0.3184233009815216, | |
| "reward": -0.11609291285276413, | |
| "reward_std": 0.3184233009815216, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.44290829449892044, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.6480709760071477, | |
| "epoch": 1.1125, | |
| "step": 89 | |
| }, | |
| { | |
| "loss": -0.2852037250995636, | |
| "grad_norm": 42.0, | |
| "learning_rate": 2.5833333333333333e-07, | |
| "num_tokens": 381920.0, | |
| "completions/mean_length": 39.25, | |
| "completions/min_length": 34.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.375, | |
| "completions/mean_terminated_length": 38.79999923706055, | |
| "completions/min_terminated_length": 34.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 0.7518181800842285, | |
| "rewards/reward_fn/std": 0.8103408217430115, | |
| "reward": 0.7518181800842285, | |
| "reward_std": 0.8103407621383667, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.47666456177830696, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.4954591059940867, | |
| "epoch": 1.125, | |
| "step": 90 | |
| }, | |
| { | |
| "loss": -0.4666173756122589, | |
| "grad_norm": 54.25, | |
| "learning_rate": 2.5e-07, | |
| "num_tokens": 387456.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.5, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.18147876858711243, | |
| "rewards/reward_fn/std": 0.3412088453769684, | |
| "reward": -0.18147876858711243, | |
| "reward_std": 0.3412088453769684, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.3644842356443405, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.9168457080049848, | |
| "epoch": 1.1375, | |
| "step": 91 | |
| }, | |
| { | |
| "loss": -0.19452103972434998, | |
| "grad_norm": 35.0, | |
| "learning_rate": 2.4166666666666665e-07, | |
| "num_tokens": 391272.0, | |
| "completions/mean_length": 29.0, | |
| "completions/min_length": 29.0, | |
| "completions/max_length": 29.0, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/mean_terminated_length": 29.0, | |
| "completions/min_terminated_length": 29.0, | |
| "completions/max_terminated_length": 29.0, | |
| "rewards/reward_fn/mean": -0.48826920986175537, | |
| "rewards/reward_fn/std": 0.10049224644899368, | |
| "reward": -0.48826920986175537, | |
| "reward_std": 0.10049223154783249, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.10749578103423119, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.1976117929953034, | |
| "epoch": 1.15, | |
| "step": 92 | |
| }, | |
| { | |
| "loss": -0.32232654094696045, | |
| "grad_norm": 40.0, | |
| "learning_rate": 2.3333333333333333e-07, | |
| "num_tokens": 395304.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.375, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 0.8205681443214417, | |
| "rewards/reward_fn/std": 0.8343429565429688, | |
| "reward": 0.8205681443214417, | |
| "reward_std": 0.8343429565429688, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.4315156787633896, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.468535397998494, | |
| "epoch": 1.1625, | |
| "step": 93 | |
| }, | |
| { | |
| "loss": -0.4523668885231018, | |
| "grad_norm": 84.0, | |
| "learning_rate": 2.25e-07, | |
| "num_tokens": 399896.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.27434343099594116, | |
| "rewards/reward_fn/std": 0.3116720914840698, | |
| "reward": -0.27434343099594116, | |
| "reward_std": 0.31167206168174744, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.2968396469950676, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.6608880270032387, | |
| "epoch": 1.175, | |
| "step": 94 | |
| }, | |
| { | |
| "loss": -0.21401086449623108, | |
| "grad_norm": 125.5, | |
| "learning_rate": 2.1666666666666667e-07, | |
| "num_tokens": 403784.0, | |
| "completions/mean_length": 37.0, | |
| "completions/min_length": 37.0, | |
| "completions/max_length": 37.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 37.0, | |
| "completions/min_terminated_length": 37.0, | |
| "completions/max_terminated_length": 37.0, | |
| "rewards/reward_fn/mean": -0.477613627910614, | |
| "rewards/reward_fn/std": 0.10695964097976685, | |
| "reward": -0.477613627910614, | |
| "reward_std": 0.10695964097976685, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.53290855884552, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.322921327995573, | |
| "epoch": 1.1875, | |
| "step": 95 | |
| }, | |
| { | |
| "loss": -0.4509161114692688, | |
| "grad_norm": 82.5, | |
| "learning_rate": 2.0833333333333333e-07, | |
| "num_tokens": 408328.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.375, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.10536706447601318, | |
| "rewards/reward_fn/std": 0.32809269428253174, | |
| "reward": -0.10536706447601318, | |
| "reward_std": 0.32809266448020935, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.4744805321097374, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.620484232000308, | |
| "epoch": 1.2, | |
| "step": 96 | |
| }, | |
| { | |
| "loss": -0.1650632917881012, | |
| "grad_norm": 14.75, | |
| "learning_rate": 2e-07, | |
| "num_tokens": 413928.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.44751623272895813, | |
| "rewards/reward_fn/std": 0.2383711040019989, | |
| "reward": -0.44751623272895813, | |
| "reward_std": 0.2383711040019989, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.13679387792944908, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.9071075929969084, | |
| "epoch": 1.2125, | |
| "step": 97 | |
| }, | |
| { | |
| "loss": -0.21456944942474365, | |
| "grad_norm": 69.5, | |
| "learning_rate": 1.9166666666666668e-07, | |
| "num_tokens": 417808.0, | |
| "completions/mean_length": 36.0, | |
| "completions/min_length": 36.0, | |
| "completions/max_length": 36.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 36.0, | |
| "completions/min_terminated_length": 36.0, | |
| "completions/max_terminated_length": 36.0, | |
| "rewards/reward_fn/mean": -0.4945833384990692, | |
| "rewards/reward_fn/std": 0.08267238736152649, | |
| "reward": -0.4945833384990692, | |
| "reward_std": 0.08267239481210709, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.47412487864494324, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.633686770001077, | |
| "epoch": 1.225, | |
| "step": 98 | |
| }, | |
| { | |
| "loss": -0.46756917238235474, | |
| "grad_norm": 71.5, | |
| "learning_rate": 1.833333333333333e-07, | |
| "num_tokens": 421552.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.5, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 0.42318177223205566, | |
| "rewards/reward_fn/std": 0.9871143102645874, | |
| "reward": 0.42318177223205566, | |
| "reward_std": 0.9871143102645874, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.3287414312362671, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.3927637800043158, | |
| "epoch": 1.2375, | |
| "step": 99 | |
| }, | |
| { | |
| "loss": -0.261552095413208, | |
| "grad_norm": 52.5, | |
| "learning_rate": 1.75e-07, | |
| "num_tokens": 425408.0, | |
| "completions/mean_length": 34.0, | |
| "completions/min_length": 34.0, | |
| "completions/max_length": 34.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 34.0, | |
| "completions/min_terminated_length": 34.0, | |
| "completions/max_terminated_length": 34.0, | |
| "rewards/reward_fn/mean": -0.5133333206176758, | |
| "rewards/reward_fn/std": 0.12256517261266708, | |
| "reward": -0.5133333206176758, | |
| "reward_std": 0.12256518006324768, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.31978046894073486, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.273552115992061, | |
| "epoch": 1.25, | |
| "step": 100 | |
| }, | |
| { | |
| "loss": 0.11666551977396011, | |
| "grad_norm": 52.25, | |
| "learning_rate": 1.6666666666666665e-07, | |
| "num_tokens": 429488.0, | |
| "completions/mean_length": 39.0, | |
| "completions/min_length": 39.0, | |
| "completions/max_length": 39.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 39.0, | |
| "completions/min_terminated_length": 39.0, | |
| "completions/max_terminated_length": 39.0, | |
| "rewards/reward_fn/mean": 1.089545488357544, | |
| "rewards/reward_fn/std": 0.3140266537666321, | |
| "reward": 1.089545488357544, | |
| "reward_std": 0.3140266537666321, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.6426612436771393, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.3935730929988495, | |
| "epoch": 1.2625, | |
| "step": 101 | |
| }, | |
| { | |
| "loss": -0.2449040561914444, | |
| "grad_norm": 37.75, | |
| "learning_rate": 1.583333333333333e-07, | |
| "num_tokens": 433464.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.375, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 0.9644317626953125, | |
| "rewards/reward_fn/std": 0.6381950378417969, | |
| "reward": 0.9644317626953125, | |
| "reward_std": 0.6381949782371521, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.4328523129224777, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.428149282994127, | |
| "epoch": 1.275, | |
| "step": 102 | |
| }, | |
| { | |
| "loss": 0.30886250734329224, | |
| "grad_norm": 13.25, | |
| "learning_rate": 1.5e-07, | |
| "num_tokens": 439024.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.875, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.5219852924346924, | |
| "rewards/reward_fn/std": 0.06218379735946655, | |
| "reward": -0.5219852924346924, | |
| "reward_std": 0.06218379735946655, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.07704110443592072, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.8439336939991335, | |
| "epoch": 1.2875, | |
| "step": 103 | |
| }, | |
| { | |
| "loss": -0.46633923053741455, | |
| "grad_norm": 40.75, | |
| "learning_rate": 1.4166666666666665e-07, | |
| "num_tokens": 443584.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.5, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.1838294267654419, | |
| "rewards/reward_fn/std": 0.3388920724391937, | |
| "reward": -0.1838294267654419, | |
| "reward_std": 0.3388920724391937, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.3203857019543648, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.5903865929976746, | |
| "epoch": 1.3, | |
| "step": 104 | |
| }, | |
| { | |
| "loss": -0.40494483709335327, | |
| "grad_norm": 47.0, | |
| "learning_rate": 1.3333333333333334e-07, | |
| "num_tokens": 447864.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 0.8963888883590698, | |
| "rewards/reward_fn/std": 0.8619859218597412, | |
| "reward": 0.8963888883590698, | |
| "reward_std": 0.8619858622550964, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.4876931868493557, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.50387984900226, | |
| "epoch": 1.3125, | |
| "step": 105 | |
| }, | |
| { | |
| "loss": -0.40497589111328125, | |
| "grad_norm": 53.0, | |
| "learning_rate": 1.25e-07, | |
| "num_tokens": 452136.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.04333333671092987, | |
| "rewards/reward_fn/std": 0.8456292748451233, | |
| "reward": -0.04333333671092987, | |
| "reward_std": 0.8456292748451233, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.19357310980558395, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.5037597859954985, | |
| "epoch": 1.325, | |
| "step": 106 | |
| }, | |
| { | |
| "loss": -0.29236406087875366, | |
| "grad_norm": 91.5, | |
| "learning_rate": 1.1666666666666667e-07, | |
| "num_tokens": 455992.0, | |
| "completions/mean_length": 34.0, | |
| "completions/min_length": 34.0, | |
| "completions/max_length": 34.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 34.0, | |
| "completions/min_terminated_length": 34.0, | |
| "completions/max_terminated_length": 34.0, | |
| "rewards/reward_fn/mean": -0.4675000011920929, | |
| "rewards/reward_fn/std": 0.08647873997688293, | |
| "reward": -0.4675000011920929, | |
| "reward_std": 0.08647873997688293, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.43787892907857895, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.2227310779999243, | |
| "epoch": 1.3375, | |
| "step": 107 | |
| }, | |
| { | |
| "loss": -0.010123476386070251, | |
| "grad_norm": 10.0, | |
| "learning_rate": 1.0833333333333334e-07, | |
| "num_tokens": 461733.0, | |
| "completions/mean_length": 39.625, | |
| "completions/min_length": 37.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/mean_terminated_length": 38.5, | |
| "completions/min_terminated_length": 37.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.2874075770378113, | |
| "rewards/reward_fn/std": 0.3597066104412079, | |
| "reward": -0.2874075770378113, | |
| "reward_std": 0.3597065508365631, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.14996788650751114, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.931392318005237, | |
| "epoch": 1.35, | |
| "step": 108 | |
| }, | |
| { | |
| "loss": -0.3297921419143677, | |
| "grad_norm": 14.625, | |
| "learning_rate": 1e-07, | |
| "num_tokens": 465861.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.5, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.09431818872690201, | |
| "rewards/reward_fn/std": 0.614956796169281, | |
| "reward": -0.09431818872690201, | |
| "reward_std": 0.6149567365646362, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.3540477678179741, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.5004861100023845, | |
| "epoch": 1.3625, | |
| "step": 109 | |
| }, | |
| { | |
| "loss": -0.1445135623216629, | |
| "grad_norm": 94.5, | |
| "learning_rate": 9.166666666666665e-08, | |
| "num_tokens": 469725.0, | |
| "completions/mean_length": 34.0, | |
| "completions/min_length": 34.0, | |
| "completions/max_length": 34.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 34.0, | |
| "completions/min_terminated_length": 34.0, | |
| "completions/max_terminated_length": 34.0, | |
| "rewards/reward_fn/mean": -0.4719230532646179, | |
| "rewards/reward_fn/std": 0.09804082661867142, | |
| "reward": -0.4719230532646179, | |
| "reward_std": 0.09804081171751022, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.41771702095866203, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.256323601995973, | |
| "epoch": 1.375, | |
| "step": 110 | |
| }, | |
| { | |
| "loss": -0.27893882989883423, | |
| "grad_norm": 12.1875, | |
| "learning_rate": 8.333333333333333e-08, | |
| "num_tokens": 473749.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 0.909318208694458, | |
| "rewards/reward_fn/std": 0.6314533352851868, | |
| "reward": 0.909318208694458, | |
| "reward_std": 0.6314533352851868, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.6610783189535141, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.407529885000258, | |
| "epoch": 1.3875, | |
| "step": 111 | |
| }, | |
| { | |
| "loss": -0.44871169328689575, | |
| "grad_norm": 99.0, | |
| "learning_rate": 7.5e-08, | |
| "num_tokens": 478285.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.375, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.11166666448116302, | |
| "rewards/reward_fn/std": 0.32444027066230774, | |
| "reward": -0.11166666448116302, | |
| "reward_std": 0.32444027066230774, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.4209945723414421, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.6372458310033835, | |
| "epoch": 1.4, | |
| "step": 112 | |
| }, | |
| { | |
| "loss": 0.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.666666666666667e-08, | |
| "num_tokens": 482117.0, | |
| "completions/mean_length": 29.0, | |
| "completions/min_length": 29.0, | |
| "completions/max_length": 29.0, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/mean_terminated_length": 29.0, | |
| "completions/min_terminated_length": 29.0, | |
| "completions/max_terminated_length": 29.0, | |
| "rewards/reward_fn/mean": -0.44999998807907104, | |
| "rewards/reward_fn/std": 0.0, | |
| "reward": -0.44999998807907104, | |
| "reward_std": 0.0, | |
| "frac_reward_zero_std": 1.0, | |
| "entropy": 0.1645583175122738, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.11068881400206, | |
| "epoch": 1.4125, | |
| "step": 113 | |
| }, | |
| { | |
| "loss": -0.30925267934799194, | |
| "grad_norm": 29.125, | |
| "learning_rate": 5.833333333333333e-08, | |
| "num_tokens": 485845.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 1.111022710800171, | |
| "rewards/reward_fn/std": 0.6510758399963379, | |
| "reward": 1.111022710800171, | |
| "reward_std": 0.6510758399963379, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.5098879709839821, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.3513348000051337, | |
| "epoch": 1.425, | |
| "step": 114 | |
| }, | |
| { | |
| "loss": -0.3415352404117584, | |
| "grad_norm": 40.75, | |
| "learning_rate": 5e-08, | |
| "num_tokens": 490197.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.5, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 0.574999988079071, | |
| "rewards/reward_fn/std": 0.9112263321876526, | |
| "reward": 0.574999988079071, | |
| "reward_std": 0.9112262725830078, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.3970959484577179, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.5100457160006044, | |
| "epoch": 1.4375, | |
| "step": 115 | |
| }, | |
| { | |
| "loss": -0.25803476572036743, | |
| "grad_norm": 42.5, | |
| "learning_rate": 4.166666666666666e-08, | |
| "num_tokens": 494389.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": 1.1160227060317993, | |
| "rewards/reward_fn/std": 0.6535513401031494, | |
| "reward": 1.1160227060317993, | |
| "reward_std": 0.6535513401031494, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.5293388590216637, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.4952827420056565, | |
| "epoch": 1.45, | |
| "step": 116 | |
| }, | |
| { | |
| "loss": -0.21081024408340454, | |
| "grad_norm": 113.5, | |
| "learning_rate": 3.3333333333333334e-08, | |
| "num_tokens": 498237.0, | |
| "completions/mean_length": 32.0, | |
| "completions/min_length": 32.0, | |
| "completions/max_length": 32.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 32.0, | |
| "completions/min_terminated_length": 32.0, | |
| "completions/max_terminated_length": 32.0, | |
| "rewards/reward_fn/mean": -0.45499998331069946, | |
| "rewards/reward_fn/std": 0.09180726110935211, | |
| "reward": -0.45499998331069946, | |
| "reward_std": 0.09180724620819092, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.42862727493047714, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.2099126460016123, | |
| "epoch": 1.4625, | |
| "step": 117 | |
| }, | |
| { | |
| "loss": 0.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.5e-08, | |
| "num_tokens": 503997.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_terminated_length": 0.0, | |
| "completions/max_terminated_length": 0.0, | |
| "rewards/reward_fn/mean": -0.519861102104187, | |
| "rewards/reward_fn/std": 0.056175705045461655, | |
| "reward": -0.519861102104187, | |
| "reward_std": 0.05617569014430046, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.898591545996169, | |
| "epoch": 1.475, | |
| "step": 118 | |
| }, | |
| { | |
| "loss": -0.20292121171951294, | |
| "grad_norm": 160.0, | |
| "learning_rate": 1.6666666666666667e-08, | |
| "num_tokens": 507901.0, | |
| "completions/mean_length": 38.0, | |
| "completions/min_length": 38.0, | |
| "completions/max_length": 38.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/mean_terminated_length": 38.0, | |
| "completions/min_terminated_length": 38.0, | |
| "completions/max_terminated_length": 38.0, | |
| "rewards/reward_fn/mean": -0.4890865087509155, | |
| "rewards/reward_fn/std": 0.1267431229352951, | |
| "reward": -0.4890865087509155, | |
| "reward_std": 0.1267431229352951, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.5254447758197784, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.3351195930008544, | |
| "epoch": 1.4875, | |
| "step": 119 | |
| }, | |
| { | |
| "loss": 0.39274802803993225, | |
| "grad_norm": 60.5, | |
| "learning_rate": 8.333333333333334e-09, | |
| "num_tokens": 513581.0, | |
| "completions/mean_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/max_length": 40.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "rewards/reward_fn/mean": -0.6524179577827454, | |
| "rewards/reward_fn/std": 0.09692026674747467, | |
| "reward": -0.6524179577827454, | |
| "reward_std": 0.09692025184631348, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.5660714134573936, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 2.883316584997374, | |
| "epoch": 1.5, | |
| "step": 120 | |
| }, | |
| { | |
| "train_runtime": 413.5097, | |
| "train_samples_per_second": 0.29, | |
| "train_steps_per_second": 0.29, | |
| "total_flos": 0.0, | |
| "train_loss": -0.2094177687074989, | |
| "epoch": 1.5, | |
| "step": 120 | |
| } | |
| ] |