| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.22857142857142856, |
| "eval_steps": 500, |
| "global_step": 200, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.671875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1734.0, |
| "completions/mean_length": 1702.03125, |
| "completions/mean_terminated_length": 993.6190795898438, |
| "completions/min_length": 483.0, |
| "completions/min_terminated_length": 483.0, |
| "epoch": 0.001142857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2544344663619995, |
| "learning_rate": 0.0, |
| "loss": -0.0, |
| "num_tokens": 118418.0, |
| "reward": 0.17899775505065918, |
| "reward_std": 0.7650213241577148, |
| "rewards/cosine_scaled_reward/mean": -0.09800112992525101, |
| "rewards/cosine_scaled_reward/std": 0.37953105568885803, |
| "rewards/format_reward/mean": 0.375, |
| "rewards/format_reward/std": 0.48795005679130554, |
| "step": 1 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1894.0, |
| "completions/mean_length": 1738.90625, |
| "completions/mean_terminated_length": 949.0, |
| "completions/min_length": 435.0, |
| "completions/min_terminated_length": 435.0, |
| "epoch": 0.002285714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24362485110759735, |
| "learning_rate": 5e-08, |
| "loss": -0.0, |
| "num_tokens": 239748.0, |
| "reward": 0.3848632574081421, |
| "reward_std": 0.9111153483390808, |
| "rewards/cosine_scaled_reward/mean": 0.020556632429361343, |
| "rewards/cosine_scaled_reward/std": 0.4492928683757782, |
| "rewards/format_reward/mean": 0.34375, |
| "rewards/format_reward/std": 0.4787135720252991, |
| "step": 2 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.921875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1228.0, |
| "completions/mean_length": 1952.96875, |
| "completions/mean_terminated_length": 831.6000366210938, |
| "completions/min_length": 608.0, |
| "completions/min_terminated_length": 608.0, |
| "epoch": 0.0034285714285714284, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25938913226127625, |
| "learning_rate": 1e-07, |
| "loss": -0.0, |
| "num_tokens": 375210.0, |
| "reward": -0.31737297773361206, |
| "reward_std": 0.40810590982437134, |
| "rewards/cosine_scaled_reward/mean": -0.20556148886680603, |
| "rewards/cosine_scaled_reward/std": 0.2044239044189453, |
| "rewards/format_reward/mean": 0.09375, |
| "rewards/format_reward/std": 0.29378482699394226, |
| "step": 3 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.546875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1889.0, |
| "completions/mean_length": 1567.9375, |
| "completions/mean_terminated_length": 988.5516967773438, |
| "completions/min_length": 500.0, |
| "completions/min_terminated_length": 500.0, |
| "epoch": 0.004571428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.29626700282096863, |
| "learning_rate": 1.5e-07, |
| "loss": 0.0, |
| "num_tokens": 485366.0, |
| "reward": 0.1552329957485199, |
| "reward_std": 0.5780439376831055, |
| "rewards/cosine_scaled_reward/mean": -0.18800850212574005, |
| "rewards/cosine_scaled_reward/std": 0.2348431795835495, |
| "rewards/format_reward/mean": 0.53125, |
| "rewards/format_reward/std": 0.5029674172401428, |
| "step": 4 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.9375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1706.0, |
| "completions/mean_length": 1988.796875, |
| "completions/mean_terminated_length": 1100.75, |
| "completions/min_length": 573.0, |
| "completions/min_terminated_length": 573.0, |
| "epoch": 0.005714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2605815827846527, |
| "learning_rate": 2e-07, |
| "loss": -0.0, |
| "num_tokens": 623465.0, |
| "reward": -0.4418099522590637, |
| "reward_std": 0.3239253759384155, |
| "rewards/cosine_scaled_reward/mean": -0.25215497612953186, |
| "rewards/cosine_scaled_reward/std": 0.1853509098291397, |
| "rewards/format_reward/mean": 0.0625, |
| "rewards/format_reward/std": 0.24397502839565277, |
| "step": 5 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.84375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1899.0, |
| "completions/mean_length": 1864.1875, |
| "completions/mean_terminated_length": 871.6000366210938, |
| "completions/min_length": 561.0, |
| "completions/min_terminated_length": 561.0, |
| "epoch": 0.006857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.262493759393692, |
| "learning_rate": 2.5e-07, |
| "loss": 0.0, |
| "num_tokens": 754421.0, |
| "reward": -0.2906607687473297, |
| "reward_std": 0.34858179092407227, |
| "rewards/cosine_scaled_reward/mean": -0.22345538437366486, |
| "rewards/cosine_scaled_reward/std": 0.16744518280029297, |
| "rewards/format_reward/mean": 0.15625, |
| "rewards/format_reward/std": 0.36596253514289856, |
| "step": 6 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.859375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1947.0, |
| "completions/mean_length": 1931.09375, |
| "completions/mean_terminated_length": 1216.6666259765625, |
| "completions/min_length": 554.0, |
| "completions/min_terminated_length": 554.0, |
| "epoch": 0.008, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23469489812850952, |
| "learning_rate": 3e-07, |
| "loss": 0.0, |
| "num_tokens": 888419.0, |
| "reward": -0.046325311064720154, |
| "reward_std": 0.5296324491500854, |
| "rewards/cosine_scaled_reward/mean": -0.14035014808177948, |
| "rewards/cosine_scaled_reward/std": 0.36545559763908386, |
| "rewards/format_reward/mean": 0.234375, |
| "rewards/format_reward/std": 0.42695629596710205, |
| "step": 7 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.703125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2035.0, |
| "completions/mean_length": 1730.828125, |
| "completions/mean_terminated_length": 979.631591796875, |
| "completions/min_length": 281.0, |
| "completions/min_terminated_length": 281.0, |
| "epoch": 0.009142857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22580823302268982, |
| "learning_rate": 3.5e-07, |
| "loss": -0.0, |
| "num_tokens": 1009608.0, |
| "reward": 0.22049131989479065, |
| "reward_std": 0.6817946434020996, |
| "rewards/cosine_scaled_reward/mean": -0.05381683632731438, |
| "rewards/cosine_scaled_reward/std": 0.44645029306411743, |
| "rewards/format_reward/mean": 0.328125, |
| "rewards/format_reward/std": 0.4732423722743988, |
| "step": 8 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.78125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1802.0, |
| "completions/mean_length": 1868.140625, |
| "completions/mean_terminated_length": 1225.7857666015625, |
| "completions/min_length": 892.0, |
| "completions/min_terminated_length": 892.0, |
| "epoch": 0.010285714285714285, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26566582918167114, |
| "learning_rate": 4e-07, |
| "loss": 0.0, |
| "num_tokens": 1140625.0, |
| "reward": -0.13664060831069946, |
| "reward_std": 0.6131436228752136, |
| "rewards/cosine_scaled_reward/mean": -0.19332030415534973, |
| "rewards/cosine_scaled_reward/std": 0.30607181787490845, |
| "rewards/format_reward/mean": 0.25, |
| "rewards/format_reward/std": 0.4364357888698578, |
| "step": 9 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.703125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1456.0, |
| "completions/mean_length": 1664.890625, |
| "completions/mean_terminated_length": 757.5263061523438, |
| "completions/min_length": 411.0, |
| "completions/min_terminated_length": 411.0, |
| "epoch": 0.011428571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26136595010757446, |
| "learning_rate": 4.5e-07, |
| "loss": -0.0, |
| "num_tokens": 1258010.0, |
| "reward": 0.022913292050361633, |
| "reward_std": 0.545270562171936, |
| "rewards/cosine_scaled_reward/mean": -0.1369808465242386, |
| "rewards/cosine_scaled_reward/std": 0.3200873136520386, |
| "rewards/format_reward/mean": 0.296875, |
| "rewards/format_reward/std": 0.4604927599430084, |
| "step": 10 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.9375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1333.0, |
| "completions/mean_length": 1986.34375, |
| "completions/mean_terminated_length": 1061.5, |
| "completions/min_length": 841.0, |
| "completions/min_terminated_length": 841.0, |
| "epoch": 0.012571428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23980404436588287, |
| "learning_rate": 5e-07, |
| "loss": -0.0, |
| "num_tokens": 1396808.0, |
| "reward": -0.45354267954826355, |
| "reward_std": 0.3950403332710266, |
| "rewards/cosine_scaled_reward/mean": -0.26583385467529297, |
| "rewards/cosine_scaled_reward/std": 0.16946381330490112, |
| "rewards/format_reward/mean": 0.078125, |
| "rewards/format_reward/std": 0.27048972249031067, |
| "step": 11 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.640625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1981.0, |
| "completions/mean_length": 1756.453125, |
| "completions/mean_terminated_length": 1236.7391357421875, |
| "completions/min_length": 528.0, |
| "completions/min_terminated_length": 528.0, |
| "epoch": 0.013714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.277899831533432, |
| "learning_rate": 5.5e-07, |
| "loss": -0.0, |
| "num_tokens": 1520165.0, |
| "reward": 0.1507202684879303, |
| "reward_std": 0.7362544536590576, |
| "rewards/cosine_scaled_reward/mean": -0.14338986575603485, |
| "rewards/cosine_scaled_reward/std": 0.39759454131126404, |
| "rewards/format_reward/mean": 0.4375, |
| "rewards/format_reward/std": 0.5, |
| "step": 12 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.734375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1492.0, |
| "completions/mean_length": 1742.125, |
| "completions/mean_terminated_length": 896.4705810546875, |
| "completions/min_length": 532.0, |
| "completions/min_terminated_length": 532.0, |
| "epoch": 0.014857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25388944149017334, |
| "learning_rate": 6e-07, |
| "loss": -0.0, |
| "num_tokens": 1642701.0, |
| "reward": 0.02508428692817688, |
| "reward_std": 0.5804874300956726, |
| "rewards/cosine_scaled_reward/mean": -0.13589535653591156, |
| "rewards/cosine_scaled_reward/std": 0.3501027524471283, |
| "rewards/format_reward/mean": 0.296875, |
| "rewards/format_reward/std": 0.4604927599430084, |
| "step": 13 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.703125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2023.0, |
| "completions/mean_length": 1791.6875, |
| "completions/mean_terminated_length": 1184.631591796875, |
| "completions/min_length": 396.0, |
| "completions/min_terminated_length": 396.0, |
| "epoch": 0.016, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26855364441871643, |
| "learning_rate": 6.5e-07, |
| "loss": -0.0, |
| "num_tokens": 1767977.0, |
| "reward": 0.027098476886749268, |
| "reward_std": 0.7340880632400513, |
| "rewards/cosine_scaled_reward/mean": -0.14270076155662537, |
| "rewards/cosine_scaled_reward/std": 0.36128607392311096, |
| "rewards/format_reward/mean": 0.3125, |
| "rewards/format_reward/std": 0.467176616191864, |
| "step": 14 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.703125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1875.0, |
| "completions/mean_length": 1707.828125, |
| "completions/mean_terminated_length": 902.1578979492188, |
| "completions/min_length": 369.0, |
| "completions/min_terminated_length": 369.0, |
| "epoch": 0.017142857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2692890167236328, |
| "learning_rate": 7e-07, |
| "loss": 0.0, |
| "num_tokens": 1888198.0, |
| "reward": 0.24003228545188904, |
| "reward_std": 0.5003666281700134, |
| "rewards/cosine_scaled_reward/mean": -0.02842137962579727, |
| "rewards/cosine_scaled_reward/std": 0.43434321880340576, |
| "rewards/format_reward/mean": 0.296875, |
| "rewards/format_reward/std": 0.4604927599430084, |
| "step": 15 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.953125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1028.0, |
| "completions/mean_length": 1985.046875, |
| "completions/mean_terminated_length": 705.0, |
| "completions/min_length": 463.0, |
| "completions/min_terminated_length": 463.0, |
| "epoch": 0.018285714285714287, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24488449096679688, |
| "learning_rate": 7.5e-07, |
| "loss": 0.0, |
| "num_tokens": 2025681.0, |
| "reward": -0.37671107053756714, |
| "reward_std": 0.4366358518600464, |
| "rewards/cosine_scaled_reward/mean": -0.21179303526878357, |
| "rewards/cosine_scaled_reward/std": 0.22632460296154022, |
| "rewards/format_reward/mean": 0.046875, |
| "rewards/format_reward/std": 0.21304203569889069, |
| "step": 16 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.546875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1850.0, |
| "completions/mean_length": 1561.9375, |
| "completions/mean_terminated_length": 975.3103637695312, |
| "completions/min_length": 347.0, |
| "completions/min_terminated_length": 347.0, |
| "epoch": 0.019428571428571427, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3149985373020172, |
| "learning_rate": 8e-07, |
| "loss": 0.0, |
| "num_tokens": 2136165.0, |
| "reward": 0.12802264094352722, |
| "reward_std": 0.6542905569076538, |
| "rewards/cosine_scaled_reward/mean": -0.1703636795282364, |
| "rewards/cosine_scaled_reward/std": 0.3502788841724396, |
| "rewards/format_reward/mean": 0.46875, |
| "rewards/format_reward/std": 0.5029674172401428, |
| "step": 17 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.78125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1964.0, |
| "completions/mean_length": 1837.484375, |
| "completions/mean_terminated_length": 1085.6429443359375, |
| "completions/min_length": 574.0, |
| "completions/min_terminated_length": 574.0, |
| "epoch": 0.02057142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.20620153844356537, |
| "learning_rate": 8.499999999999999e-07, |
| "loss": 0.0, |
| "num_tokens": 2264140.0, |
| "reward": -0.041578881442546844, |
| "reward_std": 0.7910969853401184, |
| "rewards/cosine_scaled_reward/mean": -0.16922692954540253, |
| "rewards/cosine_scaled_reward/std": 0.33054032921791077, |
| "rewards/format_reward/mean": 0.296875, |
| "rewards/format_reward/std": 0.4604927599430084, |
| "step": 18 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.796875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1982.0, |
| "completions/mean_length": 1864.265625, |
| "completions/mean_terminated_length": 1143.4615478515625, |
| "completions/min_length": 605.0, |
| "completions/min_terminated_length": 605.0, |
| "epoch": 0.021714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2673085629940033, |
| "learning_rate": 9e-07, |
| "loss": 0.0, |
| "num_tokens": 2394709.0, |
| "reward": 0.21812058985233307, |
| "reward_std": 0.8157521486282349, |
| "rewards/cosine_scaled_reward/mean": -0.02375221811234951, |
| "rewards/cosine_scaled_reward/std": 0.44612905383110046, |
| "rewards/format_reward/mean": 0.265625, |
| "rewards/format_reward/std": 0.44515693187713623, |
| "step": 19 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1966.0, |
| "completions/mean_length": 1601.125, |
| "completions/mean_terminated_length": 856.3333740234375, |
| "completions/min_length": 437.0, |
| "completions/min_terminated_length": 437.0, |
| "epoch": 0.022857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27571871876716614, |
| "learning_rate": 9.499999999999999e-07, |
| "loss": -0.0, |
| "num_tokens": 2508533.0, |
| "reward": 0.13714352250099182, |
| "reward_std": 0.5742913484573364, |
| "rewards/cosine_scaled_reward/mean": -0.1267407238483429, |
| "rewards/cosine_scaled_reward/std": 0.379833847284317, |
| "rewards/format_reward/mean": 0.390625, |
| "rewards/format_reward/std": 0.4917473793029785, |
| "step": 20 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.703125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1881.0, |
| "completions/mean_length": 1720.75, |
| "completions/mean_terminated_length": 945.6842041015625, |
| "completions/min_length": 260.0, |
| "completions/min_terminated_length": 260.0, |
| "epoch": 0.024, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3101024329662323, |
| "learning_rate": 1e-06, |
| "loss": -0.0, |
| "num_tokens": 2629469.0, |
| "reward": 0.0758291482925415, |
| "reward_std": 0.5849478840827942, |
| "rewards/cosine_scaled_reward/mean": -0.13396042585372925, |
| "rewards/cosine_scaled_reward/std": 0.3641633689403534, |
| "rewards/format_reward/mean": 0.34375, |
| "rewards/format_reward/std": 0.4787135720252991, |
| "step": 21 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.40625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1930.0, |
| "completions/mean_length": 1310.1875, |
| "completions/mean_terminated_length": 805.368408203125, |
| "completions/min_length": 262.0, |
| "completions/min_terminated_length": 262.0, |
| "epoch": 0.025142857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.35955631732940674, |
| "learning_rate": 9.99931462820376e-07, |
| "loss": 0.0, |
| "num_tokens": 2722337.0, |
| "reward": 0.5670604705810547, |
| "reward_std": 0.5711978077888489, |
| "rewards/cosine_scaled_reward/mean": -0.03678226098418236, |
| "rewards/cosine_scaled_reward/std": 0.4319343566894531, |
| "rewards/format_reward/mean": 0.640625, |
| "rewards/format_reward/std": 0.4836103618144989, |
| "step": 22 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.578125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1702.0, |
| "completions/mean_length": 1607.890625, |
| "completions/mean_terminated_length": 1004.7777709960938, |
| "completions/min_length": 408.0, |
| "completions/min_terminated_length": 408.0, |
| "epoch": 0.026285714285714287, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27159449458122253, |
| "learning_rate": 9.997258721585931e-07, |
| "loss": -0.0, |
| "num_tokens": 2836034.0, |
| "reward": 0.20600585639476776, |
| "reward_std": 0.6732993721961975, |
| "rewards/cosine_scaled_reward/mean": -0.13137206435203552, |
| "rewards/cosine_scaled_reward/std": 0.38508084416389465, |
| "rewards/format_reward/mean": 0.46875, |
| "rewards/format_reward/std": 0.5029674172401428, |
| "step": 23 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.6875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1959.0, |
| "completions/mean_length": 1755.890625, |
| "completions/mean_terminated_length": 1113.25, |
| "completions/min_length": 416.0, |
| "completions/min_terminated_length": 416.0, |
| "epoch": 0.027428571428571427, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24094589054584503, |
| "learning_rate": 9.993832906395582e-07, |
| "loss": -0.0, |
| "num_tokens": 2959339.0, |
| "reward": 0.05567874014377594, |
| "reward_std": 0.7204875349998474, |
| "rewards/cosine_scaled_reward/mean": -0.15966063737869263, |
| "rewards/cosine_scaled_reward/std": 0.3462846875190735, |
| "rewards/format_reward/mean": 0.375, |
| "rewards/format_reward/std": 0.48795005679130554, |
| "step": 24 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1870.0, |
| "completions/mean_length": 1761.09375, |
| "completions/mean_terminated_length": 1027.888916015625, |
| "completions/min_length": 562.0, |
| "completions/min_terminated_length": 562.0, |
| "epoch": 0.02857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2641579508781433, |
| "learning_rate": 9.989038226169207e-07, |
| "loss": 0.0, |
| "num_tokens": 3082345.0, |
| "reward": 0.12583430111408234, |
| "reward_std": 0.7026749849319458, |
| "rewards/cosine_scaled_reward/mean": -0.10114534199237823, |
| "rewards/cosine_scaled_reward/std": 0.3608616590499878, |
| "rewards/format_reward/mean": 0.328125, |
| "rewards/format_reward/std": 0.4732423722743988, |
| "step": 25 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.8125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2047.0, |
| "completions/mean_length": 1970.84375, |
| "completions/mean_terminated_length": 1636.5, |
| "completions/min_length": 975.0, |
| "completions/min_terminated_length": 975.0, |
| "epoch": 0.029714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24037978053092957, |
| "learning_rate": 9.982876141412855e-07, |
| "loss": 0.0, |
| "num_tokens": 3219111.0, |
| "reward": 0.21426932513713837, |
| "reward_std": 0.740675687789917, |
| "rewards/cosine_scaled_reward/mean": -0.06474034488201141, |
| "rewards/cosine_scaled_reward/std": 0.3838227093219757, |
| "rewards/format_reward/mean": 0.34375, |
| "rewards/format_reward/std": 0.4787135720252991, |
| "step": 26 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.859375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1897.0, |
| "completions/mean_length": 1945.515625, |
| "completions/mean_terminated_length": 1319.2222900390625, |
| "completions/min_length": 575.0, |
| "completions/min_terminated_length": 575.0, |
| "epoch": 0.030857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24205996096134186, |
| "learning_rate": 9.975348529157229e-07, |
| "loss": -0.0, |
| "num_tokens": 3353912.0, |
| "reward": -0.21838442981243134, |
| "reward_std": 0.619316577911377, |
| "rewards/cosine_scaled_reward/mean": -0.19512970745563507, |
| "rewards/cosine_scaled_reward/std": 0.2882457375526428, |
| "rewards/format_reward/mean": 0.171875, |
| "rewards/format_reward/std": 0.38025420904159546, |
| "step": 27 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.703125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2040.0, |
| "completions/mean_length": 1776.484375, |
| "completions/mean_terminated_length": 1133.4210205078125, |
| "completions/min_length": 519.0, |
| "completions/min_terminated_length": 519.0, |
| "epoch": 0.032, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.261581152677536, |
| "learning_rate": 9.96645768238595e-07, |
| "loss": 0.0, |
| "num_tokens": 3477943.0, |
| "reward": 0.2565116286277771, |
| "reward_std": 0.8822247385978699, |
| "rewards/cosine_scaled_reward/mean": -0.06705668568611145, |
| "rewards/cosine_scaled_reward/std": 0.47824493050575256, |
| "rewards/format_reward/mean": 0.390625, |
| "rewards/format_reward/std": 0.4917473793029785, |
| "step": 28 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1232.0, |
| "completions/mean_length": 1901.6875, |
| "completions/mean_terminated_length": 877.5, |
| "completions/min_length": 621.0, |
| "completions/min_terminated_length": 621.0, |
| "epoch": 0.03314285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2625565528869629, |
| "learning_rate": 9.956206309337066e-07, |
| "loss": -0.0, |
| "num_tokens": 3610123.0, |
| "reward": -0.3446740508079529, |
| "reward_std": 0.3587799668312073, |
| "rewards/cosine_scaled_reward/mean": -0.24264952540397644, |
| "rewards/cosine_scaled_reward/std": 0.16127170622348785, |
| "rewards/format_reward/mean": 0.140625, |
| "rewards/format_reward/std": 0.3503824472427368, |
| "step": 29 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.78125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1920.0, |
| "completions/mean_length": 1845.828125, |
| "completions/mean_terminated_length": 1123.7857666015625, |
| "completions/min_length": 789.0, |
| "completions/min_terminated_length": 789.0, |
| "epoch": 0.03428571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2583629786968231, |
| "learning_rate": 9.944597532678119e-07, |
| "loss": 0.0, |
| "num_tokens": 3738792.0, |
| "reward": -0.13950452208518982, |
| "reward_std": 0.5518099069595337, |
| "rewards/cosine_scaled_reward/mean": -0.1869397610425949, |
| "rewards/cosine_scaled_reward/std": 0.2614031732082367, |
| "rewards/format_reward/mean": 0.234375, |
| "rewards/format_reward/std": 0.42695629596710205, |
| "step": 30 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.796875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1753.0, |
| "completions/mean_length": 1823.40625, |
| "completions/mean_terminated_length": 942.3077392578125, |
| "completions/min_length": 232.0, |
| "completions/min_terminated_length": 232.0, |
| "epoch": 0.03542857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25341325998306274, |
| "learning_rate": 9.931634888554935e-07, |
| "loss": -0.0, |
| "num_tokens": 3865986.0, |
| "reward": -0.20477020740509033, |
| "reward_std": 0.6390085220336914, |
| "rewards/cosine_scaled_reward/mean": -0.20394760370254517, |
| "rewards/cosine_scaled_reward/std": 0.3794066309928894, |
| "rewards/format_reward/mean": 0.203125, |
| "rewards/format_reward/std": 0.40550529956817627, |
| "step": 31 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.78125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1980.0, |
| "completions/mean_length": 1889.484375, |
| "completions/mean_terminated_length": 1323.357177734375, |
| "completions/min_length": 714.0, |
| "completions/min_terminated_length": 714.0, |
| "epoch": 0.036571428571428574, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23001989722251892, |
| "learning_rate": 9.917322325514487e-07, |
| "loss": -0.0, |
| "num_tokens": 3997265.0, |
| "reward": 0.01399039477109909, |
| "reward_std": 0.47122400999069214, |
| "rewards/cosine_scaled_reward/mean": -0.11800480633974075, |
| "rewards/cosine_scaled_reward/std": 0.4542357921600342, |
| "rewards/format_reward/mean": 0.25, |
| "rewards/format_reward/std": 0.4364357888698578, |
| "step": 32 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.84375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1654.0, |
| "completions/mean_length": 1917.125, |
| "completions/mean_terminated_length": 1210.4000244140625, |
| "completions/min_length": 914.0, |
| "completions/min_terminated_length": 914.0, |
| "epoch": 0.037714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.21402673423290253, |
| "learning_rate": 9.901664203302124e-07, |
| "loss": -0.0, |
| "num_tokens": 4131177.0, |
| "reward": -0.43841344118118286, |
| "reward_std": 0.3294987678527832, |
| "rewards/cosine_scaled_reward/mean": -0.29733169078826904, |
| "rewards/cosine_scaled_reward/std": 0.19245299696922302, |
| "rewards/format_reward/mean": 0.15625, |
| "rewards/format_reward/std": 0.36596253514289856, |
| "step": 33 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.5, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2006.0, |
| "completions/mean_length": 1486.53125, |
| "completions/mean_terminated_length": 925.0625, |
| "completions/min_length": 551.0, |
| "completions/min_terminated_length": 551.0, |
| "epoch": 0.038857142857142854, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2830573320388794, |
| "learning_rate": 9.88466529153356e-07, |
| "loss": -0.0, |
| "num_tokens": 4235867.0, |
| "reward": 0.4512444734573364, |
| "reward_std": 0.8406625986099243, |
| "rewards/cosine_scaled_reward/mean": -0.04000277444720268, |
| "rewards/cosine_scaled_reward/std": 0.49787506461143494, |
| "rewards/format_reward/mean": 0.53125, |
| "rewards/format_reward/std": 0.5029674172401428, |
| "step": 34 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.859375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1168.0, |
| "completions/mean_length": 1839.96875, |
| "completions/mean_terminated_length": 568.6666870117188, |
| "completions/min_length": 357.0, |
| "completions/min_terminated_length": 357.0, |
| "epoch": 0.04, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2852117717266083, |
| "learning_rate": 9.866330768241983e-07, |
| "loss": -0.0, |
| "num_tokens": 4365121.0, |
| "reward": -0.1632520854473114, |
| "reward_std": 0.6035048961639404, |
| "rewards/cosine_scaled_reward/mean": -0.1675635278224945, |
| "rewards/cosine_scaled_reward/std": 0.38546639680862427, |
| "rewards/format_reward/mean": 0.171875, |
| "rewards/format_reward/std": 0.38025420904159546, |
| "step": 35 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.859375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1827.0, |
| "completions/mean_length": 1921.5625, |
| "completions/mean_terminated_length": 1148.888916015625, |
| "completions/min_length": 699.0, |
| "completions/min_terminated_length": 699.0, |
| "epoch": 0.04114285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27304011583328247, |
| "learning_rate": 9.846666218300807e-07, |
| "loss": 0.0, |
| "num_tokens": 4499213.0, |
| "reward": -0.2672756016254425, |
| "reward_std": 0.45214492082595825, |
| "rewards/cosine_scaled_reward/mean": -0.20395030081272125, |
| "rewards/cosine_scaled_reward/std": 0.24503158032894135, |
| "rewards/format_reward/mean": 0.140625, |
| "rewards/format_reward/std": 0.3503824472427368, |
| "step": 36 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.859375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1805.0, |
| "completions/mean_length": 1901.515625, |
| "completions/mean_terminated_length": 1006.3333129882812, |
| "completions/min_length": 589.0, |
| "completions/min_terminated_length": 589.0, |
| "epoch": 0.04228571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26615455746650696, |
| "learning_rate": 9.825677631722435e-07, |
| "loss": 0.0, |
| "num_tokens": 4631934.0, |
| "reward": -0.4167596101760864, |
| "reward_std": 0.4093248248100281, |
| "rewards/cosine_scaled_reward/mean": -0.2786923050880432, |
| "rewards/cosine_scaled_reward/std": 0.16612833738327026, |
| "rewards/format_reward/mean": 0.140625, |
| "rewards/format_reward/std": 0.3503824472427368, |
| "step": 37 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.828125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1223.0, |
| "completions/mean_length": 1867.5625, |
| "completions/mean_terminated_length": 998.1818237304688, |
| "completions/min_length": 677.0, |
| "completions/min_terminated_length": 677.0, |
| "epoch": 0.04342857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2513941526412964, |
| "learning_rate": 9.80337140183366e-07, |
| "loss": 0.0, |
| "num_tokens": 4763170.0, |
| "reward": -0.10445012152194977, |
| "reward_std": 0.42142462730407715, |
| "rewards/cosine_scaled_reward/mean": -0.1381625533103943, |
| "rewards/cosine_scaled_reward/std": 0.32096728682518005, |
| "rewards/format_reward/mean": 0.171875, |
| "rewards/format_reward/std": 0.38025420904159546, |
| "step": 38 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.6875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1940.0, |
| "completions/mean_length": 1663.828125, |
| "completions/mean_terminated_length": 818.6500244140625, |
| "completions/min_length": 312.0, |
| "completions/min_terminated_length": 312.0, |
| "epoch": 0.044571428571428574, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2877403199672699, |
| "learning_rate": 9.779754323328192e-07, |
| "loss": -0.0, |
| "num_tokens": 4880439.0, |
| "reward": 0.3092009425163269, |
| "reward_std": 0.5055705308914185, |
| "rewards/cosine_scaled_reward/mean": -0.040712013840675354, |
| "rewards/cosine_scaled_reward/std": 0.3459153175354004, |
| "rewards/format_reward/mean": 0.390625, |
| "rewards/format_reward/std": 0.4917473793029785, |
| "step": 39 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.515625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2027.0, |
| "completions/mean_length": 1534.296875, |
| "completions/mean_terminated_length": 987.4515991210938, |
| "completions/min_length": 401.0, |
| "completions/min_terminated_length": 401.0, |
| "epoch": 0.045714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26293206214904785, |
| "learning_rate": 9.754833590196926e-07, |
| "loss": 0.0, |
| "num_tokens": 4988706.0, |
| "reward": 0.37637200951576233, |
| "reward_std": 0.5428045392036438, |
| "rewards/cosine_scaled_reward/mean": -0.06962649524211884, |
| "rewards/cosine_scaled_reward/std": 0.44194599986076355, |
| "rewards/format_reward/mean": 0.515625, |
| "rewards/format_reward/std": 0.5037065148353577, |
| "step": 40 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.671875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1795.0, |
| "completions/mean_length": 1786.28125, |
| "completions/mean_terminated_length": 1250.3809814453125, |
| "completions/min_length": 738.0, |
| "completions/min_terminated_length": 738.0, |
| "epoch": 0.046857142857142854, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2139737904071808, |
| "learning_rate": 9.728616793536587e-07, |
| "loss": 0.0, |
| "num_tokens": 5114180.0, |
| "reward": 0.01875646412372589, |
| "reward_std": 0.6959635019302368, |
| "rewards/cosine_scaled_reward/mean": -0.17812177538871765, |
| "rewards/cosine_scaled_reward/std": 0.34367337822914124, |
| "rewards/format_reward/mean": 0.375, |
| "rewards/format_reward/std": 0.48795005679130554, |
| "step": 41 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1828.0, |
| "completions/mean_length": 1684.140625, |
| "completions/mean_terminated_length": 592.5625, |
| "completions/min_length": 182.0, |
| "completions/min_terminated_length": 182.0, |
| "epoch": 0.048, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.39831504225730896, |
| "learning_rate": 9.701111919237408e-07, |
| "loss": 0.0, |
| "num_tokens": 5232325.0, |
| "reward": -0.21632033586502075, |
| "reward_std": 0.3267907500267029, |
| "rewards/cosine_scaled_reward/mean": -0.24097268283367157, |
| "rewards/cosine_scaled_reward/std": 0.17323769629001617, |
| "rewards/format_reward/mean": 0.265625, |
| "rewards/format_reward/std": 0.44515693187713623, |
| "step": 42 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.65625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1994.0, |
| "completions/mean_length": 1685.078125, |
| "completions/mean_terminated_length": 992.227294921875, |
| "completions/min_length": 534.0, |
| "completions/min_terminated_length": 534.0, |
| "epoch": 0.04914285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25143784284591675, |
| "learning_rate": 9.672327345550543e-07, |
| "loss": -0.0, |
| "num_tokens": 5351786.0, |
| "reward": 0.16303199529647827, |
| "reward_std": 0.48110607266426086, |
| "rewards/cosine_scaled_reward/mean": -0.09035900980234146, |
| "rewards/cosine_scaled_reward/std": 0.3455837368965149, |
| "rewards/format_reward/mean": 0.34375, |
| "rewards/format_reward/std": 0.4787135720252991, |
| "step": 43 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.59375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1377.0, |
| "completions/mean_length": 1539.421875, |
| "completions/mean_terminated_length": 796.1154174804688, |
| "completions/min_length": 289.0, |
| "completions/min_terminated_length": 289.0, |
| "epoch": 0.05028571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3059767186641693, |
| "learning_rate": 9.64227184053598e-07, |
| "loss": -0.0, |
| "num_tokens": 5461005.0, |
| "reward": 0.5107974410057068, |
| "reward_std": 0.6938745379447937, |
| "rewards/cosine_scaled_reward/mean": 0.04446123540401459, |
| "rewards/cosine_scaled_reward/std": 0.5113232135772705, |
| "rewards/format_reward/mean": 0.421875, |
| "rewards/format_reward/std": 0.49776285886764526, |
| "step": 44 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.90625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1688.0, |
| "completions/mean_length": 1997.90625, |
| "completions/mean_terminated_length": 1513.666748046875, |
| "completions/min_length": 1198.0, |
| "completions/min_terminated_length": 1198.0, |
| "epoch": 0.05142857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.21965721249580383, |
| "learning_rate": 9.610954559391704e-07, |
| "loss": 0.0, |
| "num_tokens": 5600527.0, |
| "reward": -0.0863756537437439, |
| "reward_std": 0.5902912020683289, |
| "rewards/cosine_scaled_reward/mean": -0.12131282687187195, |
| "rewards/cosine_scaled_reward/std": 0.3591388165950775, |
| "rewards/format_reward/mean": 0.15625, |
| "rewards/format_reward/std": 0.36596253514289856, |
| "step": 45 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.6875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1599.0, |
| "completions/mean_length": 1704.421875, |
| "completions/mean_terminated_length": 948.5499877929688, |
| "completions/min_length": 535.0, |
| "completions/min_terminated_length": 535.0, |
| "epoch": 0.052571428571428575, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.30566513538360596, |
| "learning_rate": 9.578385041664925e-07, |
| "loss": 0.0, |
| "num_tokens": 5720778.0, |
| "reward": -0.10181278735399246, |
| "reward_std": 0.5302228927612305, |
| "rewards/cosine_scaled_reward/mean": -0.21496888995170593, |
| "rewards/cosine_scaled_reward/std": 0.2217058539390564, |
| "rewards/format_reward/mean": 0.328125, |
| "rewards/format_reward/std": 0.4732423722743988, |
| "step": 46 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.640625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1997.0, |
| "completions/mean_length": 1668.171875, |
| "completions/mean_terminated_length": 991.0869750976562, |
| "completions/min_length": 368.0, |
| "completions/min_terminated_length": 368.0, |
| "epoch": 0.053714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3012264668941498, |
| "learning_rate": 9.54457320834625e-07, |
| "loss": 0.0, |
| "num_tokens": 5837941.0, |
| "reward": 0.1859496831893921, |
| "reward_std": 0.9643809795379639, |
| "rewards/cosine_scaled_reward/mean": -0.11796265840530396, |
| "rewards/cosine_scaled_reward/std": 0.45073381066322327, |
| "rewards/format_reward/mean": 0.421875, |
| "rewards/format_reward/std": 0.49776285886764526, |
| "step": 47 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.609375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1945.0, |
| "completions/mean_length": 1640.0, |
| "completions/mean_terminated_length": 1003.5199584960938, |
| "completions/min_length": 441.0, |
| "completions/min_terminated_length": 441.0, |
| "epoch": 0.054857142857142854, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23681208491325378, |
| "learning_rate": 9.509529358847654e-07, |
| "loss": 0.0, |
| "num_tokens": 5953445.0, |
| "reward": 0.13163542747497559, |
| "reward_std": 0.6663622856140137, |
| "rewards/cosine_scaled_reward/mean": -0.1294947862625122, |
| "rewards/cosine_scaled_reward/std": 0.3682635426521301, |
| "rewards/format_reward/mean": 0.390625, |
| "rewards/format_reward/std": 0.4917473793029785, |
| "step": 48 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.53125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1542.0, |
| "completions/mean_length": 1461.109375, |
| "completions/mean_terminated_length": 795.9667358398438, |
| "completions/min_length": 242.0, |
| "completions/min_terminated_length": 242.0, |
| "epoch": 0.056, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3055432438850403, |
| "learning_rate": 9.473264167865171e-07, |
| "loss": 0.0, |
| "num_tokens": 6057020.0, |
| "reward": 0.18277686834335327, |
| "reward_std": 0.6457837820053101, |
| "rewards/cosine_scaled_reward/mean": -0.14298656582832336, |
| "rewards/cosine_scaled_reward/std": 0.33668506145477295, |
| "rewards/format_reward/mean": 0.46875, |
| "rewards/format_reward/std": 0.5029674172401428, |
| "step": 49 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.65625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1936.0, |
| "completions/mean_length": 1671.09375, |
| "completions/mean_terminated_length": 951.5454711914062, |
| "completions/min_length": 342.0, |
| "completions/min_terminated_length": 342.0, |
| "epoch": 0.05714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2442624568939209, |
| "learning_rate": 9.43578868212728e-07, |
| "loss": 0.0, |
| "num_tokens": 6174786.0, |
| "reward": 0.2543642520904541, |
| "reward_std": 0.6998432874679565, |
| "rewards/cosine_scaled_reward/mean": -0.08375539630651474, |
| "rewards/cosine_scaled_reward/std": 0.4246826469898224, |
| "rewards/format_reward/mean": 0.421875, |
| "rewards/format_reward/std": 0.49776285886764526, |
| "step": 50 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.421875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1512.0, |
| "completions/mean_length": 1313.65625, |
| "completions/mean_terminated_length": 777.7838134765625, |
| "completions/min_length": 360.0, |
| "completions/min_terminated_length": 360.0, |
| "epoch": 0.05828571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3034026622772217, |
| "learning_rate": 9.397114317029974e-07, |
| "loss": 0.0, |
| "num_tokens": 6269068.0, |
| "reward": 0.17114153504371643, |
| "reward_std": 0.5826554298400879, |
| "rewards/cosine_scaled_reward/mean": -0.21911674737930298, |
| "rewards/cosine_scaled_reward/std": 0.2822759747505188, |
| "rewards/format_reward/mean": 0.609375, |
| "rewards/format_reward/std": 0.4917473793029785, |
| "step": 51 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1453.0, |
| "completions/mean_length": 1678.28125, |
| "completions/mean_terminated_length": 733.4444580078125, |
| "completions/min_length": 235.0, |
| "completions/min_terminated_length": 235.0, |
| "epoch": 0.05942857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2994474470615387, |
| "learning_rate": 9.357252853159505e-07, |
| "loss": -0.0, |
| "num_tokens": 6387830.0, |
| "reward": -0.17168548703193665, |
| "reward_std": 0.49792978167533875, |
| "rewards/cosine_scaled_reward/mean": -0.22646775841712952, |
| "rewards/cosine_scaled_reward/std": 0.34715980291366577, |
| "rewards/format_reward/mean": 0.28125, |
| "rewards/format_reward/std": 0.4531635046005249, |
| "step": 52 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.59375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1524.0, |
| "completions/mean_length": 1553.984375, |
| "completions/mean_terminated_length": 831.9615478515625, |
| "completions/min_length": 496.0, |
| "completions/min_terminated_length": 496.0, |
| "epoch": 0.060571428571428575, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25160443782806396, |
| "learning_rate": 9.316216432703916e-07, |
| "loss": 0.0, |
| "num_tokens": 6498117.0, |
| "reward": 0.1639135181903839, |
| "reward_std": 0.668002724647522, |
| "rewards/cosine_scaled_reward/mean": -0.14460574090480804, |
| "rewards/cosine_scaled_reward/std": 0.32541966438293457, |
| "rewards/format_reward/mean": 0.453125, |
| "rewards/format_reward/std": 0.501733124256134, |
| "step": 53 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.53125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2029.0, |
| "completions/mean_length": 1585.90625, |
| "completions/mean_terminated_length": 1062.2000732421875, |
| "completions/min_length": 375.0, |
| "completions/min_terminated_length": 375.0, |
| "epoch": 0.061714285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2538788914680481, |
| "learning_rate": 9.274017555754407e-07, |
| "loss": -0.0, |
| "num_tokens": 6610759.0, |
| "reward": 0.7184321880340576, |
| "reward_std": 1.0729029178619385, |
| "rewards/cosine_scaled_reward/mean": 0.07015358656644821, |
| "rewards/cosine_scaled_reward/std": 0.5069921016693115, |
| "rewards/format_reward/mean": 0.578125, |
| "rewards/format_reward/std": 0.49776285886764526, |
| "step": 54 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.53125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1815.0, |
| "completions/mean_length": 1486.75, |
| "completions/mean_terminated_length": 850.6666870117188, |
| "completions/min_length": 365.0, |
| "completions/min_terminated_length": 365.0, |
| "epoch": 0.06285714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.28393346071243286, |
| "learning_rate": 9.230669076497687e-07, |
| "loss": 0.0, |
| "num_tokens": 6716199.0, |
| "reward": 0.38126039505004883, |
| "reward_std": 0.6288601160049438, |
| "rewards/cosine_scaled_reward/mean": -0.08280730992555618, |
| "rewards/cosine_scaled_reward/std": 0.434533029794693, |
| "rewards/format_reward/mean": 0.546875, |
| "rewards/format_reward/std": 0.501733124256134, |
| "step": 55 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.578125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1998.0, |
| "completions/mean_length": 1668.0, |
| "completions/mean_terminated_length": 1147.25927734375, |
| "completions/min_length": 570.0, |
| "completions/min_terminated_length": 570.0, |
| "epoch": 0.064, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24122385680675507, |
| "learning_rate": 9.186184199300463e-07, |
| "loss": -0.0, |
| "num_tokens": 6833911.0, |
| "reward": 0.24217453598976135, |
| "reward_std": 0.576280951499939, |
| "rewards/cosine_scaled_reward/mean": -0.11328773200511932, |
| "rewards/cosine_scaled_reward/std": 0.43596696853637695, |
| "rewards/format_reward/mean": 0.46875, |
| "rewards/format_reward/std": 0.5029674172401428, |
| "step": 56 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.609375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2008.0, |
| "completions/mean_length": 1713.359375, |
| "completions/mean_terminated_length": 1191.3199462890625, |
| "completions/min_length": 677.0, |
| "completions/min_terminated_length": 677.0, |
| "epoch": 0.06514285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2275950163602829, |
| "learning_rate": 9.140576474687263e-07, |
| "loss": -0.0, |
| "num_tokens": 6955134.0, |
| "reward": -0.09758470952510834, |
| "reward_std": 0.5638470649719238, |
| "rewards/cosine_scaled_reward/mean": -0.29097986221313477, |
| "rewards/cosine_scaled_reward/std": 0.2019655853509903, |
| "rewards/format_reward/mean": 0.484375, |
| "rewards/format_reward/std": 0.5037065148353577, |
| "step": 57 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.328125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1954.0, |
| "completions/mean_length": 1325.90625, |
| "completions/mean_terminated_length": 973.2557983398438, |
| "completions/min_length": 455.0, |
| "completions/min_terminated_length": 455.0, |
| "epoch": 0.06628571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.270525723695755, |
| "learning_rate": 9.093859795212817e-07, |
| "loss": -0.0, |
| "num_tokens": 7050088.0, |
| "reward": 0.5497192144393921, |
| "reward_std": 0.8806554675102234, |
| "rewards/cosine_scaled_reward/mean": -0.07670287787914276, |
| "rewards/cosine_scaled_reward/std": 0.48966917395591736, |
| "rewards/format_reward/mean": 0.703125, |
| "rewards/format_reward/std": 0.4604927599430084, |
| "step": 58 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.40625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1863.0, |
| "completions/mean_length": 1507.9375, |
| "completions/mean_terminated_length": 1138.4210205078125, |
| "completions/min_length": 678.0, |
| "completions/min_terminated_length": 678.0, |
| "epoch": 0.06742857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24991631507873535, |
| "learning_rate": 9.046048391230247e-07, |
| "loss": 0.0, |
| "num_tokens": 7157060.0, |
| "reward": 0.47330179810523987, |
| "reward_std": 0.6620825529098511, |
| "rewards/cosine_scaled_reward/mean": -0.07584910094738007, |
| "rewards/cosine_scaled_reward/std": 0.39760199189186096, |
| "rewards/format_reward/mean": 0.625, |
| "rewards/format_reward/std": 0.48795005679130554, |
| "step": 59 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.5625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2011.0, |
| "completions/mean_length": 1620.84375, |
| "completions/mean_terminated_length": 1071.6429443359375, |
| "completions/min_length": 504.0, |
| "completions/min_terminated_length": 504.0, |
| "epoch": 0.06857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2627163827419281, |
| "learning_rate": 8.997156826556369e-07, |
| "loss": 0.0, |
| "num_tokens": 7271682.0, |
| "reward": 0.01943434774875641, |
| "reward_std": 0.7573007345199585, |
| "rewards/cosine_scaled_reward/mean": -0.2246578335762024, |
| "rewards/cosine_scaled_reward/std": 0.3148350715637207, |
| "rewards/format_reward/mean": 0.46875, |
| "rewards/format_reward/std": 0.5029674172401428, |
| "step": 60 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.65625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2027.0, |
| "completions/mean_length": 1701.5625, |
| "completions/mean_terminated_length": 1040.181884765625, |
| "completions/min_length": 452.0, |
| "completions/min_terminated_length": 452.0, |
| "epoch": 0.06971428571428571, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.21747122704982758, |
| "learning_rate": 8.9471999940354e-07, |
| "loss": 0.0, |
| "num_tokens": 7392102.0, |
| "reward": 0.26178231835365295, |
| "reward_std": 0.6629467010498047, |
| "rewards/cosine_scaled_reward/mean": -0.10348384082317352, |
| "rewards/cosine_scaled_reward/std": 0.31626051664352417, |
| "rewards/format_reward/mean": 0.46875, |
| "rewards/format_reward/std": 0.5029674172401428, |
| "step": 61 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.234375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1704.0, |
| "completions/mean_length": 1174.796875, |
| "completions/mean_terminated_length": 907.4898071289062, |
| "completions/min_length": 379.0, |
| "completions/min_terminated_length": 379.0, |
| "epoch": 0.07085714285714285, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2614337205886841, |
| "learning_rate": 8.896193111002475e-07, |
| "loss": 0.0, |
| "num_tokens": 7477521.0, |
| "reward": 1.0250537395477295, |
| "reward_std": 0.7894514799118042, |
| "rewards/cosine_scaled_reward/mean": 0.11408931016921997, |
| "rewards/cosine_scaled_reward/std": 0.5407090783119202, |
| "rewards/format_reward/mean": 0.796875, |
| "rewards/format_reward/std": 0.40550529956817627, |
| "step": 62 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.328125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1802.0, |
| "completions/mean_length": 1249.328125, |
| "completions/mean_terminated_length": 859.279052734375, |
| "completions/min_length": 286.0, |
| "completions/min_terminated_length": 286.0, |
| "epoch": 0.072, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.29820746183395386, |
| "learning_rate": 8.844151714648274e-07, |
| "loss": 0.0, |
| "num_tokens": 7567734.0, |
| "reward": 0.7643536329269409, |
| "reward_std": 0.7760990858078003, |
| "rewards/cosine_scaled_reward/mean": 0.007176805287599564, |
| "rewards/cosine_scaled_reward/std": 0.4894968271255493, |
| "rewards/format_reward/mean": 0.75, |
| "rewards/format_reward/std": 0.4364357888698578, |
| "step": 63 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.21875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1999.0, |
| "completions/mean_length": 1257.546875, |
| "completions/mean_terminated_length": 1036.219970703125, |
| "completions/min_length": 544.0, |
| "completions/min_terminated_length": 544.0, |
| "epoch": 0.07314285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27683475613594055, |
| "learning_rate": 8.791091657286267e-07, |
| "loss": -0.0, |
| "num_tokens": 7659169.0, |
| "reward": 0.664791464805603, |
| "reward_std": 0.7692580223083496, |
| "rewards/cosine_scaled_reward/mean": -0.08947926759719849, |
| "rewards/cosine_scaled_reward/std": 0.4077052175998688, |
| "rewards/format_reward/mean": 0.84375, |
| "rewards/format_reward/std": 0.36596253514289856, |
| "step": 64 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.421875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1779.0, |
| "completions/mean_length": 1356.265625, |
| "completions/mean_terminated_length": 851.4865112304688, |
| "completions/min_length": 285.0, |
| "completions/min_terminated_length": 285.0, |
| "epoch": 0.07428571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.31939539313316345, |
| "learning_rate": 8.737029101523929e-07, |
| "loss": -0.0, |
| "num_tokens": 7756674.0, |
| "reward": 0.253554105758667, |
| "reward_std": 0.5708951950073242, |
| "rewards/cosine_scaled_reward/mean": -0.1700979471206665, |
| "rewards/cosine_scaled_reward/std": 0.4101679027080536, |
| "rewards/format_reward/mean": 0.59375, |
| "rewards/format_reward/std": 0.49501484632492065, |
| "step": 65 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1867.0, |
| "completions/mean_length": 924.375, |
| "completions/mean_terminated_length": 763.857177734375, |
| "completions/min_length": 193.0, |
| "completions/min_terminated_length": 193.0, |
| "epoch": 0.07542857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3720225393772125, |
| "learning_rate": 8.681980515339463e-07, |
| "loss": -0.0, |
| "num_tokens": 7826066.0, |
| "reward": 1.2181510925292969, |
| "reward_std": 0.8191297650337219, |
| "rewards/cosine_scaled_reward/mean": 0.15595056116580963, |
| "rewards/cosine_scaled_reward/std": 0.5347589254379272, |
| "rewards/format_reward/mean": 0.90625, |
| "rewards/format_reward/std": 0.29378482699394226, |
| "step": 66 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.65625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1808.0, |
| "completions/mean_length": 1718.703125, |
| "completions/mean_terminated_length": 1090.0455322265625, |
| "completions/min_length": 418.0, |
| "completions/min_terminated_length": 418.0, |
| "epoch": 0.07657142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2455451935529709, |
| "learning_rate": 8.625962667065487e-07, |
| "loss": 0.0, |
| "num_tokens": 7946799.0, |
| "reward": -0.13946212828159332, |
| "reward_std": 0.4192533791065216, |
| "rewards/cosine_scaled_reward/mean": -0.26504355669021606, |
| "rewards/cosine_scaled_reward/std": 0.1596679985523224, |
| "rewards/format_reward/mean": 0.390625, |
| "rewards/format_reward/std": 0.4917473793029785, |
| "step": 67 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.109375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1835.0, |
| "completions/mean_length": 796.03125, |
| "completions/mean_terminated_length": 642.2807006835938, |
| "completions/min_length": 263.0, |
| "completions/min_terminated_length": 263.0, |
| "epoch": 0.07771428571428571, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3650038540363312, |
| "learning_rate": 8.568992620281243e-07, |
| "loss": 0.0, |
| "num_tokens": 8007001.0, |
| "reward": 0.8269755840301514, |
| "reward_std": 0.7205700874328613, |
| "rewards/cosine_scaled_reward/mean": -0.03182470053434372, |
| "rewards/cosine_scaled_reward/std": 0.44225865602493286, |
| "rewards/format_reward/mean": 0.890625, |
| "rewards/format_reward/std": 0.3145764470100403, |
| "step": 68 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.15625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1770.0, |
| "completions/mean_length": 1052.078125, |
| "completions/mean_terminated_length": 867.6481323242188, |
| "completions/min_length": 129.0, |
| "completions/min_terminated_length": 129.0, |
| "epoch": 0.07885714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5175226926803589, |
| "learning_rate": 8.511087728614862e-07, |
| "loss": 0.0, |
| "num_tokens": 8084678.0, |
| "reward": 0.5431624054908752, |
| "reward_std": 0.5567936897277832, |
| "rewards/cosine_scaled_reward/mean": -0.15810629725456238, |
| "rewards/cosine_scaled_reward/std": 0.31712469458580017, |
| "rewards/format_reward/mean": 0.859375, |
| "rewards/format_reward/std": 0.3503824472427368, |
| "step": 69 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.40625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1940.0, |
| "completions/mean_length": 1458.953125, |
| "completions/mean_terminated_length": 1055.9210205078125, |
| "completions/min_length": 447.0, |
| "completions/min_terminated_length": 447.0, |
| "epoch": 0.08, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2705329358577728, |
| "learning_rate": 8.452265630457282e-07, |
| "loss": -0.0, |
| "num_tokens": 8189507.0, |
| "reward": 0.2633436322212219, |
| "reward_std": 0.7909030914306641, |
| "rewards/cosine_scaled_reward/mean": -0.18864068388938904, |
| "rewards/cosine_scaled_reward/std": 0.37829747796058655, |
| "rewards/format_reward/mean": 0.640625, |
| "rewards/format_reward/std": 0.4836103618144989, |
| "step": 70 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.46875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2002.0, |
| "completions/mean_length": 1449.71875, |
| "completions/mean_terminated_length": 921.8235473632812, |
| "completions/min_length": 377.0, |
| "completions/min_terminated_length": 377.0, |
| "epoch": 0.08114285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.30838489532470703, |
| "learning_rate": 8.392544243589427e-07, |
| "loss": -0.0, |
| "num_tokens": 8293009.0, |
| "reward": 0.3965587615966797, |
| "reward_std": 0.6687955856323242, |
| "rewards/cosine_scaled_reward/mean": -0.06734561175107956, |
| "rewards/cosine_scaled_reward/std": 0.4826039671897888, |
| "rewards/format_reward/mean": 0.53125, |
| "rewards/format_reward/std": 0.5029674172401428, |
| "step": 71 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.21875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1638.0, |
| "completions/mean_length": 1165.0625, |
| "completions/mean_terminated_length": 917.8399658203125, |
| "completions/min_length": 330.0, |
| "completions/min_terminated_length": 330.0, |
| "epoch": 0.08228571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.37962082028388977, |
| "learning_rate": 8.331941759724268e-07, |
| "loss": -0.0, |
| "num_tokens": 8377925.0, |
| "reward": 0.48002344369888306, |
| "reward_std": 0.7248474359512329, |
| "rewards/cosine_scaled_reward/mean": -0.15842577815055847, |
| "rewards/cosine_scaled_reward/std": 0.3461473882198334, |
| "rewards/format_reward/mean": 0.796875, |
| "rewards/format_reward/std": 0.40550529956817627, |
| "step": 72 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.359375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1892.0, |
| "completions/mean_length": 1507.953125, |
| "completions/mean_terminated_length": 1205.0, |
| "completions/min_length": 389.0, |
| "completions/min_terminated_length": 389.0, |
| "epoch": 0.08342857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.283483624458313, |
| "learning_rate": 8.270476638965461e-07, |
| "loss": -0.0, |
| "num_tokens": 8485162.0, |
| "reward": 0.12193681299686432, |
| "reward_std": 0.43324506282806396, |
| "rewards/cosine_scaled_reward/mean": -0.28278160095214844, |
| "rewards/cosine_scaled_reward/std": 0.2103184014558792, |
| "rewards/format_reward/mean": 0.6875, |
| "rewards/format_reward/std": 0.467176616191864, |
| "step": 73 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.25, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1872.0, |
| "completions/mean_length": 1235.390625, |
| "completions/mean_terminated_length": 964.5208740234375, |
| "completions/min_length": 522.0, |
| "completions/min_terminated_length": 522.0, |
| "epoch": 0.08457142857142858, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2533995509147644, |
| "learning_rate": 8.208167604184217e-07, |
| "loss": 0.0, |
| "num_tokens": 8574155.0, |
| "reward": 0.711876630783081, |
| "reward_std": 0.598979651927948, |
| "rewards/cosine_scaled_reward/mean": -0.019061744213104248, |
| "rewards/cosine_scaled_reward/std": 0.5070863962173462, |
| "rewards/format_reward/mean": 0.75, |
| "rewards/format_reward/std": 0.4364357888698578, |
| "step": 74 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.25, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2010.0, |
| "completions/mean_length": 1396.0, |
| "completions/mean_terminated_length": 1178.666748046875, |
| "completions/min_length": 526.0, |
| "completions/min_terminated_length": 526.0, |
| "epoch": 0.08571428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.21683472394943237, |
| "learning_rate": 8.145033635316128e-07, |
| "loss": -0.0, |
| "num_tokens": 8674459.0, |
| "reward": 0.8670482635498047, |
| "reward_std": 0.5205744504928589, |
| "rewards/cosine_scaled_reward/mean": 0.011649124324321747, |
| "rewards/cosine_scaled_reward/std": 0.46857622265815735, |
| "rewards/format_reward/mean": 0.84375, |
| "rewards/format_reward/std": 0.36596253514289856, |
| "step": 75 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1737.0, |
| "completions/mean_length": 1208.875, |
| "completions/mean_terminated_length": 827.45458984375, |
| "completions/min_length": 295.0, |
| "completions/min_terminated_length": 295.0, |
| "epoch": 0.08685714285714285, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.29886162281036377, |
| "learning_rate": 8.081093963579707e-07, |
| "loss": 0.0, |
| "num_tokens": 8762227.0, |
| "reward": 0.33387672901153564, |
| "reward_std": 0.5217430591583252, |
| "rewards/cosine_scaled_reward/mean": -0.17681162059307098, |
| "rewards/cosine_scaled_reward/std": 0.33101972937583923, |
| "rewards/format_reward/mean": 0.6875, |
| "rewards/format_reward/std": 0.467176616191864, |
| "step": 76 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.171875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1928.0, |
| "completions/mean_length": 1202.171875, |
| "completions/mean_terminated_length": 1026.6226806640625, |
| "completions/min_length": 617.0, |
| "completions/min_terminated_length": 617.0, |
| "epoch": 0.088, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2510688602924347, |
| "learning_rate": 8.01636806561836e-07, |
| "loss": -0.0, |
| "num_tokens": 8850742.0, |
| "reward": 0.4464802145957947, |
| "reward_std": 0.452653169631958, |
| "rewards/cosine_scaled_reward/mean": -0.19863487780094147, |
| "rewards/cosine_scaled_reward/std": 0.29697054624557495, |
| "rewards/format_reward/mean": 0.84375, |
| "rewards/format_reward/std": 0.36596253514289856, |
| "step": 77 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.46875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2034.0, |
| "completions/mean_length": 1510.171875, |
| "completions/mean_terminated_length": 1035.61767578125, |
| "completions/min_length": 500.0, |
| "completions/min_terminated_length": 500.0, |
| "epoch": 0.08914285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22833088040351868, |
| "learning_rate": 7.950875657567621e-07, |
| "loss": 0.0, |
| "num_tokens": 8958113.0, |
| "reward": 0.47979119420051575, |
| "reward_std": 0.697140097618103, |
| "rewards/cosine_scaled_reward/mean": -0.06479191035032272, |
| "rewards/cosine_scaled_reward/std": 0.45924264192581177, |
| "rewards/format_reward/mean": 0.609375, |
| "rewards/format_reward/std": 0.4917473793029785, |
| "step": 78 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.15625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1590.0, |
| "completions/mean_length": 1005.609375, |
| "completions/mean_terminated_length": 812.5740966796875, |
| "completions/min_length": 206.0, |
| "completions/min_terminated_length": 206.0, |
| "epoch": 0.09028571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.31812578439712524, |
| "learning_rate": 7.884636689049422e-07, |
| "loss": -0.0, |
| "num_tokens": 9032568.0, |
| "reward": 0.5333245992660522, |
| "reward_std": 0.58503657579422, |
| "rewards/cosine_scaled_reward/mean": -0.15521270036697388, |
| "rewards/cosine_scaled_reward/std": 0.39932510256767273, |
| "rewards/format_reward/mean": 0.84375, |
| "rewards/format_reward/std": 0.36596253514289856, |
| "step": 79 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.34375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2018.0, |
| "completions/mean_length": 1430.390625, |
| "completions/mean_terminated_length": 1106.8809814453125, |
| "completions/min_length": 408.0, |
| "completions/min_terminated_length": 408.0, |
| "epoch": 0.09142857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.317196786403656, |
| "learning_rate": 7.817671337095244e-07, |
| "loss": 0.0, |
| "num_tokens": 9134505.0, |
| "reward": 0.3816445469856262, |
| "reward_std": 0.7023000717163086, |
| "rewards/cosine_scaled_reward/mean": -0.16074024140834808, |
| "rewards/cosine_scaled_reward/std": 0.340556800365448, |
| "rewards/format_reward/mean": 0.703125, |
| "rewards/format_reward/std": 0.4604927599430084, |
| "step": 80 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.09375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1936.0, |
| "completions/mean_length": 1082.9375, |
| "completions/mean_terminated_length": 983.1034545898438, |
| "completions/min_length": 252.0, |
| "completions/min_terminated_length": 252.0, |
| "epoch": 0.09257142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3431136906147003, |
| "learning_rate": 7.75e-07, |
| "loss": -0.0, |
| "num_tokens": 9215085.0, |
| "reward": 0.5323719382286072, |
| "reward_std": 0.5667048692703247, |
| "rewards/cosine_scaled_reward/mean": -0.1869390308856964, |
| "rewards/cosine_scaled_reward/std": 0.3326209485530853, |
| "rewards/format_reward/mean": 0.90625, |
| "rewards/format_reward/std": 0.29378482699394226, |
| "step": 81 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.15625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1958.0, |
| "completions/mean_length": 1150.921875, |
| "completions/mean_terminated_length": 984.7963256835938, |
| "completions/min_length": 389.0, |
| "completions/min_terminated_length": 389.0, |
| "epoch": 0.09371428571428571, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.29262369871139526, |
| "learning_rate": 7.681643291108517e-07, |
| "loss": -0.0, |
| "num_tokens": 9299072.0, |
| "reward": 0.882739782333374, |
| "reward_std": 0.48830458521842957, |
| "rewards/cosine_scaled_reward/mean": 0.0038698911666870117, |
| "rewards/cosine_scaled_reward/std": 0.5622411966323853, |
| "rewards/format_reward/mean": 0.875, |
| "rewards/format_reward/std": 0.3333333432674408, |
| "step": 82 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.265625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1688.0, |
| "completions/mean_length": 1255.515625, |
| "completions/mean_terminated_length": 968.872314453125, |
| "completions/min_length": 314.0, |
| "completions/min_terminated_length": 314.0, |
| "epoch": 0.09485714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3286426365375519, |
| "learning_rate": 7.612622032536507e-07, |
| "loss": 0.0, |
| "num_tokens": 9390513.0, |
| "reward": 0.5811824202537537, |
| "reward_std": 0.43479597568511963, |
| "rewards/cosine_scaled_reward/mean": -0.07659629732370377, |
| "rewards/cosine_scaled_reward/std": 0.3988858759403229, |
| "rewards/format_reward/mean": 0.734375, |
| "rewards/format_reward/std": 0.44515693187713623, |
| "step": 83 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.171875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1944.0, |
| "completions/mean_length": 1143.40625, |
| "completions/mean_terminated_length": 955.660400390625, |
| "completions/min_length": 321.0, |
| "completions/min_terminated_length": 321.0, |
| "epoch": 0.096, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.32434922456741333, |
| "learning_rate": 7.54295724882796e-07, |
| "loss": 0.0, |
| "num_tokens": 9474387.0, |
| "reward": 0.599439799785614, |
| "reward_std": 0.6882362961769104, |
| "rewards/cosine_scaled_reward/mean": -0.137780100107193, |
| "rewards/cosine_scaled_reward/std": 0.39472848176956177, |
| "rewards/format_reward/mean": 0.875, |
| "rewards/format_reward/std": 0.3333333432674408, |
| "step": 84 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1987.0, |
| "completions/mean_length": 1309.71875, |
| "completions/mean_terminated_length": 1139.34619140625, |
| "completions/min_length": 707.0, |
| "completions/min_terminated_length": 707.0, |
| "epoch": 0.09714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1989050805568695, |
| "learning_rate": 7.472670160550848e-07, |
| "loss": 0.0, |
| "num_tokens": 9568529.0, |
| "reward": 0.6224732398986816, |
| "reward_std": 0.6126816868782043, |
| "rewards/cosine_scaled_reward/mean": -0.12626340985298157, |
| "rewards/cosine_scaled_reward/std": 0.3711291551589966, |
| "rewards/format_reward/mean": 0.875, |
| "rewards/format_reward/std": 0.3333333432674408, |
| "step": 85 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.25, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1932.0, |
| "completions/mean_length": 1287.6875, |
| "completions/mean_terminated_length": 1034.25, |
| "completions/min_length": 419.0, |
| "completions/min_terminated_length": 419.0, |
| "epoch": 0.09828571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2835865616798401, |
| "learning_rate": 7.401782177833147e-07, |
| "loss": 0.0, |
| "num_tokens": 9661797.0, |
| "reward": 0.6656298637390137, |
| "reward_std": 0.6712964773178101, |
| "rewards/cosine_scaled_reward/mean": -0.05781007558107376, |
| "rewards/cosine_scaled_reward/std": 0.3957676589488983, |
| "rewards/format_reward/mean": 0.78125, |
| "rewards/format_reward/std": 0.4166666865348816, |
| "step": 86 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.15625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1946.0, |
| "completions/mean_length": 1048.0, |
| "completions/mean_terminated_length": 862.8148193359375, |
| "completions/min_length": 429.0, |
| "completions/min_terminated_length": 429.0, |
| "epoch": 0.09942857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.32077720761299133, |
| "learning_rate": 7.330314893841101e-07, |
| "loss": -0.0, |
| "num_tokens": 9738989.0, |
| "reward": 0.5565430521965027, |
| "reward_std": 0.48540347814559937, |
| "rewards/cosine_scaled_reward/mean": -0.15141595900058746, |
| "rewards/cosine_scaled_reward/std": 0.30698010325431824, |
| "rewards/format_reward/mean": 0.859375, |
| "rewards/format_reward/std": 0.3503824472427368, |
| "step": 87 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.078125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1977.0, |
| "completions/mean_length": 1026.359375, |
| "completions/mean_terminated_length": 939.7796630859375, |
| "completions/min_length": 142.0, |
| "completions/min_terminated_length": 142.0, |
| "epoch": 0.10057142857142858, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.31619754433631897, |
| "learning_rate": 7.258290078201731e-07, |
| "loss": -0.0, |
| "num_tokens": 9815188.0, |
| "reward": 1.1399941444396973, |
| "reward_std": 0.672067403793335, |
| "rewards/cosine_scaled_reward/mean": 0.07780956476926804, |
| "rewards/cosine_scaled_reward/std": 0.45696425437927246, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 88 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.140625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1960.0, |
| "completions/mean_length": 1263.859375, |
| "completions/mean_terminated_length": 1135.54541015625, |
| "completions/min_length": 579.0, |
| "completions/min_terminated_length": 579.0, |
| "epoch": 0.10171428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2358577847480774, |
| "learning_rate": 7.185729670371604e-07, |
| "loss": -0.0, |
| "num_tokens": 9907075.0, |
| "reward": 0.6882132291793823, |
| "reward_std": 0.7329428195953369, |
| "rewards/cosine_scaled_reward/mean": -0.10901839286088943, |
| "rewards/cosine_scaled_reward/std": 0.3872850835323334, |
| "rewards/format_reward/mean": 0.90625, |
| "rewards/format_reward/std": 0.29378482699394226, |
| "step": 89 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.140625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1843.0, |
| "completions/mean_length": 1060.53125, |
| "completions/mean_terminated_length": 898.9454345703125, |
| "completions/min_length": 127.0, |
| "completions/min_terminated_length": 127.0, |
| "epoch": 0.10285714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.48015958070755005, |
| "learning_rate": 7.11265577295385e-07, |
| "loss": 0.0, |
| "num_tokens": 9984949.0, |
| "reward": 0.5475899577140808, |
| "reward_std": 0.6522415280342102, |
| "rewards/cosine_scaled_reward/mean": -0.1558925211429596, |
| "rewards/cosine_scaled_reward/std": 0.36059972643852234, |
| "rewards/format_reward/mean": 0.859375, |
| "rewards/format_reward/std": 0.3503824472427368, |
| "step": 90 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.296875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2014.0, |
| "completions/mean_length": 1377.046875, |
| "completions/mean_terminated_length": 1093.755615234375, |
| "completions/min_length": 307.0, |
| "completions/min_terminated_length": 307.0, |
| "epoch": 0.104, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2658543288707733, |
| "learning_rate": 7.039090644965509e-07, |
| "loss": -0.0, |
| "num_tokens": 10083648.0, |
| "reward": 0.5291076302528381, |
| "reward_std": 0.7617174386978149, |
| "rewards/cosine_scaled_reward/mean": -0.11044619232416153, |
| "rewards/cosine_scaled_reward/std": 0.42394205927848816, |
| "rewards/format_reward/mean": 0.75, |
| "rewards/format_reward/std": 0.4364357888698578, |
| "step": 91 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.234375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1595.0, |
| "completions/mean_length": 1180.59375, |
| "completions/mean_terminated_length": 915.0612182617188, |
| "completions/min_length": 486.0, |
| "completions/min_terminated_length": 486.0, |
| "epoch": 0.10514285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2867736518383026, |
| "learning_rate": 6.965056695057204e-07, |
| "loss": 0.0, |
| "num_tokens": 10169198.0, |
| "reward": 0.5447607040405273, |
| "reward_std": 0.686552107334137, |
| "rewards/cosine_scaled_reward/mean": -0.11824464052915573, |
| "rewards/cosine_scaled_reward/std": 0.358975350856781, |
| "rewards/format_reward/mean": 0.78125, |
| "rewards/format_reward/std": 0.4166666865348816, |
| "step": 92 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.34375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2010.0, |
| "completions/mean_length": 1407.75, |
| "completions/mean_terminated_length": 1072.3809814453125, |
| "completions/min_length": 401.0, |
| "completions/min_terminated_length": 401.0, |
| "epoch": 0.10628571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.35998135805130005, |
| "learning_rate": 6.890576474687263e-07, |
| "loss": 0.0, |
| "num_tokens": 10270638.0, |
| "reward": 0.34720176458358765, |
| "reward_std": 0.5368383526802063, |
| "rewards/cosine_scaled_reward/mean": -0.22483661770820618, |
| "rewards/cosine_scaled_reward/std": 0.23887285590171814, |
| "rewards/format_reward/mean": 0.796875, |
| "rewards/format_reward/std": 0.40550529956817627, |
| "step": 93 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.34375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1906.0, |
| "completions/mean_length": 1299.734375, |
| "completions/mean_terminated_length": 907.7857055664062, |
| "completions/min_length": 346.0, |
| "completions/min_terminated_length": 346.0, |
| "epoch": 0.10742857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.37465807795524597, |
| "learning_rate": 6.815672671252315e-07, |
| "loss": 0.0, |
| "num_tokens": 10363589.0, |
| "reward": 0.7590231895446777, |
| "reward_std": 0.7117502689361572, |
| "rewards/cosine_scaled_reward/mean": 0.035761602222919464, |
| "rewards/cosine_scaled_reward/std": 0.48915430903434753, |
| "rewards/format_reward/mean": 0.6875, |
| "rewards/format_reward/std": 0.467176616191864, |
| "step": 94 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.21875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1806.0, |
| "completions/mean_length": 1240.125, |
| "completions/mean_terminated_length": 1013.9199829101562, |
| "completions/min_length": 370.0, |
| "completions/min_terminated_length": 370.0, |
| "epoch": 0.10857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.29062458872795105, |
| "learning_rate": 6.740368101176495e-07, |
| "loss": -0.0, |
| "num_tokens": 10453485.0, |
| "reward": 0.2566743791103363, |
| "reward_std": 0.48482388257980347, |
| "rewards/cosine_scaled_reward/mean": -0.26228782534599304, |
| "rewards/cosine_scaled_reward/std": 0.20987816154956818, |
| "rewards/format_reward/mean": 0.78125, |
| "rewards/format_reward/std": 0.4166666865348816, |
| "step": 95 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.140625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1548.0, |
| "completions/mean_length": 1096.421875, |
| "completions/mean_terminated_length": 940.7090454101562, |
| "completions/min_length": 410.0, |
| "completions/min_terminated_length": 410.0, |
| "epoch": 0.10971428571428571, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3024481236934662, |
| "learning_rate": 6.664685702961344e-07, |
| "loss": 0.0, |
| "num_tokens": 10534792.0, |
| "reward": 0.84682697057724, |
| "reward_std": 0.4051811099052429, |
| "rewards/cosine_scaled_reward/mean": -0.014086522161960602, |
| "rewards/cosine_scaled_reward/std": 0.4226605296134949, |
| "rewards/format_reward/mean": 0.875, |
| "rewards/format_reward/std": 0.3333333432674408, |
| "step": 96 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.15625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1858.0, |
| "completions/mean_length": 1193.0, |
| "completions/mean_terminated_length": 1034.6666259765625, |
| "completions/min_length": 380.0, |
| "completions/min_terminated_length": 380.0, |
| "epoch": 0.11085714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.28268054127693176, |
| "learning_rate": 6.588648530198504e-07, |
| "loss": -0.0, |
| "num_tokens": 10621408.0, |
| "reward": 0.6044580340385437, |
| "reward_std": 0.6774412393569946, |
| "rewards/cosine_scaled_reward/mean": -0.14308346807956696, |
| "rewards/cosine_scaled_reward/std": 0.3645778298377991, |
| "rewards/format_reward/mean": 0.890625, |
| "rewards/format_reward/std": 0.3145764470100403, |
| "step": 97 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.171875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1966.0, |
| "completions/mean_length": 1146.03125, |
| "completions/mean_terminated_length": 958.8302001953125, |
| "completions/min_length": 385.0, |
| "completions/min_terminated_length": 385.0, |
| "epoch": 0.112, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2698092758655548, |
| "learning_rate": 6.512279744547392e-07, |
| "loss": -0.0, |
| "num_tokens": 10705818.0, |
| "reward": 0.8127368092536926, |
| "reward_std": 0.5926570892333984, |
| "rewards/cosine_scaled_reward/mean": -0.031131573021411896, |
| "rewards/cosine_scaled_reward/std": 0.4754258096218109, |
| "rewards/format_reward/mean": 0.875, |
| "rewards/format_reward/std": 0.3333333432674408, |
| "step": 98 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.046875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1928.0, |
| "completions/mean_length": 1057.703125, |
| "completions/mean_terminated_length": 1008.9999389648438, |
| "completions/min_length": 414.0, |
| "completions/min_terminated_length": 414.0, |
| "epoch": 0.11314285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3098371922969818, |
| "learning_rate": 6.435602608679916e-07, |
| "loss": -0.0, |
| "num_tokens": 10784679.0, |
| "reward": 0.9630225300788879, |
| "reward_std": 0.48498910665512085, |
| "rewards/cosine_scaled_reward/mean": 0.01276126503944397, |
| "rewards/cosine_scaled_reward/std": 0.48310962319374084, |
| "rewards/format_reward/mean": 0.9375, |
| "rewards/format_reward/std": 0.24397502839565277, |
| "step": 99 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.078125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1873.0, |
| "completions/mean_length": 1139.109375, |
| "completions/mean_terminated_length": 1062.084716796875, |
| "completions/min_length": 478.0, |
| "completions/min_terminated_length": 478.0, |
| "epoch": 0.11428571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.29804834723472595, |
| "learning_rate": 6.358640479194451e-07, |
| "loss": 0.0, |
| "num_tokens": 10867294.0, |
| "reward": 0.958106279373169, |
| "reward_std": 0.757602334022522, |
| "rewards/cosine_scaled_reward/mean": 0.010303134098649025, |
| "rewards/cosine_scaled_reward/std": 0.4540289342403412, |
| "rewards/format_reward/mean": 0.9375, |
| "rewards/format_reward/std": 0.24397502839565277, |
| "step": 100 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1917.0, |
| "completions/mean_length": 1068.21875, |
| "completions/mean_terminated_length": 1052.666748046875, |
| "completions/min_length": 539.0, |
| "completions/min_terminated_length": 539.0, |
| "epoch": 0.11542857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.30073827505111694, |
| "learning_rate": 6.281416799501187e-07, |
| "loss": 0.0, |
| "num_tokens": 10947092.0, |
| "reward": 0.7335419058799744, |
| "reward_std": 0.48280423879623413, |
| "rewards/cosine_scaled_reward/mean": -0.12541653215885162, |
| "rewards/cosine_scaled_reward/std": 0.34734830260276794, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 101 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.109375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2029.0, |
| "completions/mean_length": 1032.203125, |
| "completions/mean_terminated_length": 907.4561767578125, |
| "completions/min_length": 304.0, |
| "completions/min_terminated_length": 304.0, |
| "epoch": 0.11657142857142858, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.34232160449028015, |
| "learning_rate": 6.203955092681039e-07, |
| "loss": 0.0, |
| "num_tokens": 11023305.0, |
| "reward": 0.5640091300010681, |
| "reward_std": 0.6805330514907837, |
| "rewards/cosine_scaled_reward/mean": -0.16330792009830475, |
| "rewards/cosine_scaled_reward/std": 0.3974398970603943, |
| "rewards/format_reward/mean": 0.890625, |
| "rewards/format_reward/std": 0.3145764470100403, |
| "step": 102 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2037.0, |
| "completions/mean_length": 904.328125, |
| "completions/mean_terminated_length": 886.1746826171875, |
| "completions/min_length": 163.0, |
| "completions/min_terminated_length": 163.0, |
| "epoch": 0.11771428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.43588608503341675, |
| "learning_rate": 6.126278954320294e-07, |
| "loss": -0.0, |
| "num_tokens": 11091534.0, |
| "reward": 0.7056660056114197, |
| "reward_std": 0.5587431788444519, |
| "rewards/cosine_scaled_reward/mean": -0.13935449719429016, |
| "rewards/cosine_scaled_reward/std": 0.32663995027542114, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 103 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.203125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1530.0, |
| "completions/mean_length": 1109.765625, |
| "completions/mean_terminated_length": 870.6078491210938, |
| "completions/min_length": 272.0, |
| "completions/min_terminated_length": 272.0, |
| "epoch": 0.11885714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2997737228870392, |
| "learning_rate": 6.048412045323164e-07, |
| "loss": -0.0, |
| "num_tokens": 11173023.0, |
| "reward": 0.5046586990356445, |
| "reward_std": 0.5760527849197388, |
| "rewards/cosine_scaled_reward/mean": -0.14610813558101654, |
| "rewards/cosine_scaled_reward/std": 0.366825133562088, |
| "rewards/format_reward/mean": 0.796875, |
| "rewards/format_reward/std": 0.40550529956817627, |
| "step": 104 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.234375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1959.0, |
| "completions/mean_length": 1184.390625, |
| "completions/mean_terminated_length": 920.0203857421875, |
| "completions/min_length": 440.0, |
| "completions/min_terminated_length": 440.0, |
| "epoch": 0.12, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2738684117794037, |
| "learning_rate": 5.97037808470444e-07, |
| "loss": 0.0, |
| "num_tokens": 11260112.0, |
| "reward": 0.934418797492981, |
| "reward_std": 0.6995881795883179, |
| "rewards/cosine_scaled_reward/mean": 0.05314689874649048, |
| "rewards/cosine_scaled_reward/std": 0.5037528872489929, |
| "rewards/format_reward/mean": 0.828125, |
| "rewards/format_reward/std": 0.38025420904159546, |
| "step": 105 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.09375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1980.0, |
| "completions/mean_length": 984.8125, |
| "completions/mean_terminated_length": 874.8275756835938, |
| "completions/min_length": 258.0, |
| "completions/min_terminated_length": 258.0, |
| "epoch": 0.12114285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.30534854531288147, |
| "learning_rate": 5.892200842364462e-07, |
| "loss": 0.0, |
| "num_tokens": 11333972.0, |
| "reward": 1.1282094717025757, |
| "reward_std": 0.701350748538971, |
| "rewards/cosine_scaled_reward/mean": 0.10316723585128784, |
| "rewards/cosine_scaled_reward/std": 0.4879910945892334, |
| "rewards/format_reward/mean": 0.921875, |
| "rewards/format_reward/std": 0.27048972249031067, |
| "step": 106 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.15625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1693.0, |
| "completions/mean_length": 1115.203125, |
| "completions/mean_terminated_length": 942.4629516601562, |
| "completions/min_length": 353.0, |
| "completions/min_terminated_length": 353.0, |
| "epoch": 0.12228571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3157402276992798, |
| "learning_rate": 5.813904131848564e-07, |
| "loss": 0.0, |
| "num_tokens": 11416497.0, |
| "reward": 0.6792718768119812, |
| "reward_std": 0.6421718597412109, |
| "rewards/cosine_scaled_reward/mean": -0.08223908394575119, |
| "rewards/cosine_scaled_reward/std": 0.4252789616584778, |
| "rewards/format_reward/mean": 0.84375, |
| "rewards/format_reward/std": 0.36596253514289856, |
| "step": 107 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.234375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1895.0, |
| "completions/mean_length": 1252.359375, |
| "completions/mean_terminated_length": 1008.7958984375, |
| "completions/min_length": 280.0, |
| "completions/min_terminated_length": 280.0, |
| "epoch": 0.12342857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.32090920209884644, |
| "learning_rate": 5.735511803093248e-07, |
| "loss": -0.0, |
| "num_tokens": 11507008.0, |
| "reward": 0.5321451425552368, |
| "reward_std": 0.7731401324272156, |
| "rewards/cosine_scaled_reward/mean": -0.124552421271801, |
| "rewards/cosine_scaled_reward/std": 0.39858752489089966, |
| "rewards/format_reward/mean": 0.78125, |
| "rewards/format_reward/std": 0.4166666865348816, |
| "step": 108 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.109375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2041.0, |
| "completions/mean_length": 1091.1875, |
| "completions/mean_terminated_length": 973.6842041015625, |
| "completions/min_length": 552.0, |
| "completions/min_terminated_length": 552.0, |
| "epoch": 0.12457142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27098071575164795, |
| "learning_rate": 5.657047735161255e-07, |
| "loss": -0.0, |
| "num_tokens": 11588092.0, |
| "reward": 0.8329494595527649, |
| "reward_std": 0.48314613103866577, |
| "rewards/cosine_scaled_reward/mean": -0.028837747871875763, |
| "rewards/cosine_scaled_reward/std": 0.4350675046443939, |
| "rewards/format_reward/mean": 0.890625, |
| "rewards/format_reward/std": 0.3145764470100403, |
| "step": 109 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.15625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2037.0, |
| "completions/mean_length": 1152.53125, |
| "completions/mean_terminated_length": 986.7037353515625, |
| "completions/min_length": 265.0, |
| "completions/min_terminated_length": 265.0, |
| "epoch": 0.12571428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4122578799724579, |
| "learning_rate": 5.578535828967777e-07, |
| "loss": 0.0, |
| "num_tokens": 11672630.0, |
| "reward": 0.5694496631622314, |
| "reward_std": 0.6106870174407959, |
| "rewards/cosine_scaled_reward/mean": -0.14496266841888428, |
| "rewards/cosine_scaled_reward/std": 0.3454693555831909, |
| "rewards/format_reward/mean": 0.859375, |
| "rewards/format_reward/std": 0.3503824472427368, |
| "step": 110 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.046875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1783.0, |
| "completions/mean_length": 971.0, |
| "completions/mean_terminated_length": 918.03271484375, |
| "completions/min_length": 375.0, |
| "completions/min_terminated_length": 375.0, |
| "epoch": 0.12685714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2827076315879822, |
| "learning_rate": 5.5e-07, |
| "loss": -0.0, |
| "num_tokens": 11744894.0, |
| "reward": 0.7054448127746582, |
| "reward_std": 0.5191388130187988, |
| "rewards/cosine_scaled_reward/mean": -0.1238400787115097, |
| "rewards/cosine_scaled_reward/std": 0.2727503180503845, |
| "rewards/format_reward/mean": 0.953125, |
| "rewards/format_reward/std": 0.21304203569889069, |
| "step": 111 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1988.0, |
| "completions/mean_length": 1287.234375, |
| "completions/mean_terminated_length": 1111.673095703125, |
| "completions/min_length": 605.0, |
| "completions/min_terminated_length": 605.0, |
| "epoch": 0.128, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26578545570373535, |
| "learning_rate": 5.421464171032224e-07, |
| "loss": -0.0, |
| "num_tokens": 11838373.0, |
| "reward": 0.8936529159545898, |
| "reward_std": 0.677398681640625, |
| "rewards/cosine_scaled_reward/mean": 0.032763972878456116, |
| "rewards/cosine_scaled_reward/std": 0.47608643770217896, |
| "rewards/format_reward/mean": 0.828125, |
| "rewards/format_reward/std": 0.38025420904159546, |
| "step": 112 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1969.0, |
| "completions/mean_length": 907.5, |
| "completions/mean_terminated_length": 870.7096557617188, |
| "completions/min_length": 314.0, |
| "completions/min_terminated_length": 314.0, |
| "epoch": 0.12914285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.36232173442840576, |
| "learning_rate": 5.342952264838747e-07, |
| "loss": 0.0, |
| "num_tokens": 11906749.0, |
| "reward": 0.9407690763473511, |
| "reward_std": 0.6962294578552246, |
| "rewards/cosine_scaled_reward/mean": -0.013990461826324463, |
| "rewards/cosine_scaled_reward/std": 0.4877306818962097, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 113 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1781.0, |
| "completions/mean_length": 946.78125, |
| "completions/mean_terminated_length": 873.36669921875, |
| "completions/min_length": 250.0, |
| "completions/min_terminated_length": 250.0, |
| "epoch": 0.13028571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3120001256465912, |
| "learning_rate": 5.264488196906752e-07, |
| "loss": 0.0, |
| "num_tokens": 11977191.0, |
| "reward": 0.61952805519104, |
| "reward_std": 0.609375536441803, |
| "rewards/cosine_scaled_reward/mean": -0.16679848730564117, |
| "rewards/cosine_scaled_reward/std": 0.37943220138549805, |
| "rewards/format_reward/mean": 0.953125, |
| "rewards/format_reward/std": 0.21304203569889069, |
| "step": 114 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1900.0, |
| "completions/mean_length": 1104.734375, |
| "completions/mean_terminated_length": 969.982177734375, |
| "completions/min_length": 370.0, |
| "completions/min_terminated_length": 370.0, |
| "epoch": 0.13142857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.33438733220100403, |
| "learning_rate": 5.186095868151436e-07, |
| "loss": 0.0, |
| "num_tokens": 12059110.0, |
| "reward": 0.5532407760620117, |
| "reward_std": 0.576167643070221, |
| "rewards/cosine_scaled_reward/mean": -0.16087961196899414, |
| "rewards/cosine_scaled_reward/std": 0.40955987572669983, |
| "rewards/format_reward/mean": 0.875, |
| "rewards/format_reward/std": 0.3333333432674408, |
| "step": 115 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.203125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2022.0, |
| "completions/mean_length": 1260.59375, |
| "completions/mean_terminated_length": 1059.8824462890625, |
| "completions/min_length": 361.0, |
| "completions/min_terminated_length": 361.0, |
| "epoch": 0.13257142857142856, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27838730812072754, |
| "learning_rate": 5.107799157635538e-07, |
| "loss": -0.0, |
| "num_tokens": 12151172.0, |
| "reward": 0.7721755504608154, |
| "reward_std": 0.7768255472183228, |
| "rewards/cosine_scaled_reward/mean": -0.027974726632237434, |
| "rewards/cosine_scaled_reward/std": 0.42550837993621826, |
| "rewards/format_reward/mean": 0.828125, |
| "rewards/format_reward/std": 0.38025420904159546, |
| "step": 116 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.078125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1938.0, |
| "completions/mean_length": 1062.296875, |
| "completions/mean_terminated_length": 978.7626953125, |
| "completions/min_length": 440.0, |
| "completions/min_terminated_length": 440.0, |
| "epoch": 0.1337142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2676664888858795, |
| "learning_rate": 5.02962191529556e-07, |
| "loss": -0.0, |
| "num_tokens": 12230183.0, |
| "reward": 0.7667758464813232, |
| "reward_std": 0.6048427820205688, |
| "rewards/cosine_scaled_reward/mean": -0.10879956185817719, |
| "rewards/cosine_scaled_reward/std": 0.4106774926185608, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 117 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1925.0, |
| "completions/mean_length": 1205.96875, |
| "completions/mean_terminated_length": 1011.6538696289062, |
| "completions/min_length": 275.0, |
| "completions/min_terminated_length": 275.0, |
| "epoch": 0.13485714285714287, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.33676981925964355, |
| "learning_rate": 4.951587954676837e-07, |
| "loss": -0.0, |
| "num_tokens": 12317901.0, |
| "reward": 0.7116703987121582, |
| "reward_std": 0.7047961950302124, |
| "rewards/cosine_scaled_reward/mean": -0.05041477829217911, |
| "rewards/cosine_scaled_reward/std": 0.4400728642940521, |
| "rewards/format_reward/mean": 0.8125, |
| "rewards/format_reward/std": 0.39339789748191833, |
| "step": 118 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.09375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1943.0, |
| "completions/mean_length": 963.578125, |
| "completions/mean_terminated_length": 851.3965454101562, |
| "completions/min_length": 162.0, |
| "completions/min_terminated_length": 162.0, |
| "epoch": 0.136, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6137372851371765, |
| "learning_rate": 4.873721045679706e-07, |
| "loss": 0.0, |
| "num_tokens": 12389978.0, |
| "reward": 0.884113073348999, |
| "reward_std": 0.64817214012146, |
| "rewards/cosine_scaled_reward/mean": -0.011068470776081085, |
| "rewards/cosine_scaled_reward/std": 0.43141883611679077, |
| "rewards/format_reward/mean": 0.90625, |
| "rewards/format_reward/std": 0.29378482699394226, |
| "step": 119 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1543.0, |
| "completions/mean_length": 804.75, |
| "completions/mean_terminated_length": 764.6451416015625, |
| "completions/min_length": 170.0, |
| "completions/min_terminated_length": 170.0, |
| "epoch": 0.13714285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.39063799381256104, |
| "learning_rate": 4.79604490731896e-07, |
| "loss": -0.0, |
| "num_tokens": 12451938.0, |
| "reward": 0.7966957092285156, |
| "reward_std": 0.5527613759040833, |
| "rewards/cosine_scaled_reward/mean": -0.093839630484581, |
| "rewards/cosine_scaled_reward/std": 0.43766382336616516, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 120 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1688.0, |
| "completions/mean_length": 711.921875, |
| "completions/mean_terminated_length": 690.71435546875, |
| "completions/min_length": 225.0, |
| "completions/min_terminated_length": 225.0, |
| "epoch": 0.1382857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4063321352005005, |
| "learning_rate": 4.7185832004988133e-07, |
| "loss": 0.0, |
| "num_tokens": 12507197.0, |
| "reward": 1.1037222146987915, |
| "reward_std": 0.5403161644935608, |
| "rewards/cosine_scaled_reward/mean": 0.059673577547073364, |
| "rewards/cosine_scaled_reward/std": 0.4637821912765503, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 121 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1967.0, |
| "completions/mean_length": 1144.90625, |
| "completions/mean_terminated_length": 1015.8928833007812, |
| "completions/min_length": 346.0, |
| "completions/min_terminated_length": 346.0, |
| "epoch": 0.13942857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3278358280658722, |
| "learning_rate": 4.641359520805548e-07, |
| "loss": 0.0, |
| "num_tokens": 12592031.0, |
| "reward": 0.9771745800971985, |
| "reward_std": 0.6282449960708618, |
| "rewards/cosine_scaled_reward/mean": 0.00421229749917984, |
| "rewards/cosine_scaled_reward/std": 0.4215405285358429, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 122 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.09375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1899.0, |
| "completions/mean_length": 1109.59375, |
| "completions/mean_terminated_length": 1012.5172119140625, |
| "completions/min_length": 360.0, |
| "completions/min_terminated_length": 360.0, |
| "epoch": 0.14057142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2767045497894287, |
| "learning_rate": 4.5643973913200837e-07, |
| "loss": -0.0, |
| "num_tokens": 12673277.0, |
| "reward": 0.650445818901062, |
| "reward_std": 0.5948874950408936, |
| "rewards/cosine_scaled_reward/mean": -0.1435271054506302, |
| "rewards/cosine_scaled_reward/std": 0.39249518513679504, |
| "rewards/format_reward/mean": 0.9375, |
| "rewards/format_reward/std": 0.24397502839565277, |
| "step": 123 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.109375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1785.0, |
| "completions/mean_length": 1095.515625, |
| "completions/mean_terminated_length": 978.5438842773438, |
| "completions/min_length": 381.0, |
| "completions/min_terminated_length": 381.0, |
| "epoch": 0.1417142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2732603847980499, |
| "learning_rate": 4.4877202554526084e-07, |
| "loss": 0.0, |
| "num_tokens": 12754342.0, |
| "reward": 0.8451436758041382, |
| "reward_std": 0.7594490051269531, |
| "rewards/cosine_scaled_reward/mean": -0.0383656844496727, |
| "rewards/cosine_scaled_reward/std": 0.45196905732154846, |
| "rewards/format_reward/mean": 0.921875, |
| "rewards/format_reward/std": 0.27048972249031067, |
| "step": 124 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1931.0, |
| "completions/mean_length": 1066.4375, |
| "completions/mean_terminated_length": 1034.774169921875, |
| "completions/min_length": 395.0, |
| "completions/min_terminated_length": 395.0, |
| "epoch": 0.14285714285714285, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.28073129057884216, |
| "learning_rate": 4.4113514698014953e-07, |
| "loss": -0.0, |
| "num_tokens": 12833522.0, |
| "reward": 1.1210604906082153, |
| "reward_std": 0.480854868888855, |
| "rewards/cosine_scaled_reward/mean": 0.06834279000759125, |
| "rewards/cosine_scaled_reward/std": 0.5010288953781128, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 125 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.046875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1960.0, |
| "completions/mean_length": 1028.75, |
| "completions/mean_terminated_length": 978.6229248046875, |
| "completions/min_length": 575.0, |
| "completions/min_terminated_length": 575.0, |
| "epoch": 0.144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2878725826740265, |
| "learning_rate": 4.3353142970386557e-07, |
| "loss": -0.0, |
| "num_tokens": 12910658.0, |
| "reward": 1.0619797706604004, |
| "reward_std": 0.7742013335227966, |
| "rewards/cosine_scaled_reward/mean": 0.04661493003368378, |
| "rewards/cosine_scaled_reward/std": 0.4794113337993622, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 126 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1780.0, |
| "completions/mean_length": 1057.734375, |
| "completions/mean_terminated_length": 991.7167358398438, |
| "completions/min_length": 528.0, |
| "completions/min_terminated_length": 528.0, |
| "epoch": 0.14514285714285713, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.314113050699234, |
| "learning_rate": 4.2596318988235037e-07, |
| "loss": -0.0, |
| "num_tokens": 12989449.0, |
| "reward": 0.5687937140464783, |
| "reward_std": 0.5123973488807678, |
| "rewards/cosine_scaled_reward/mean": -0.19997814297676086, |
| "rewards/cosine_scaled_reward/std": 0.29952365159988403, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 127 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.046875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1836.0, |
| "completions/mean_length": 970.15625, |
| "completions/mean_terminated_length": 917.1474609375, |
| "completions/min_length": 275.0, |
| "completions/min_terminated_length": 275.0, |
| "epoch": 0.1462857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3059556186199188, |
| "learning_rate": 4.1843273287476854e-07, |
| "loss": -0.0, |
| "num_tokens": 13061891.0, |
| "reward": 0.986152708530426, |
| "reward_std": 0.6476150751113892, |
| "rewards/cosine_scaled_reward/mean": 0.0008888617157936096, |
| "rewards/cosine_scaled_reward/std": 0.47057685256004333, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 128 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.109375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2005.0, |
| "completions/mean_length": 1270.71875, |
| "completions/mean_terminated_length": 1175.26318359375, |
| "completions/min_length": 420.0, |
| "completions/min_terminated_length": 420.0, |
| "epoch": 0.14742857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26219478249549866, |
| "learning_rate": 4.1094235253127374e-07, |
| "loss": 0.0, |
| "num_tokens": 13153921.0, |
| "reward": 0.7368666529655457, |
| "reward_std": 0.6195722818374634, |
| "rewards/cosine_scaled_reward/mean": -0.08469165861606598, |
| "rewards/cosine_scaled_reward/std": 0.4251137375831604, |
| "rewards/format_reward/mean": 0.90625, |
| "rewards/format_reward/std": 0.29378482699394226, |
| "step": 129 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.171875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1985.0, |
| "completions/mean_length": 1131.328125, |
| "completions/mean_terminated_length": 941.0755004882812, |
| "completions/min_length": 64.0, |
| "completions/min_terminated_length": 64.0, |
| "epoch": 0.14857142857142858, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.31731143593788147, |
| "learning_rate": 4.034943304942796e-07, |
| "loss": -0.0, |
| "num_tokens": 13236830.0, |
| "reward": 0.6148363351821899, |
| "reward_std": 0.5969675183296204, |
| "rewards/cosine_scaled_reward/mean": -0.13008181750774384, |
| "rewards/cosine_scaled_reward/std": 0.34181615710258484, |
| "rewards/format_reward/mean": 0.875, |
| "rewards/format_reward/std": 0.3333333432674408, |
| "step": 130 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.09375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2037.0, |
| "completions/mean_length": 941.3125, |
| "completions/mean_terminated_length": 826.8275756835938, |
| "completions/min_length": 327.0, |
| "completions/min_terminated_length": 327.0, |
| "epoch": 0.14971428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3140406608581543, |
| "learning_rate": 3.9609093550344907e-07, |
| "loss": 0.0, |
| "num_tokens": 13306810.0, |
| "reward": 1.0727087259292603, |
| "reward_std": 0.5550357699394226, |
| "rewards/cosine_scaled_reward/mean": 0.07541687786579132, |
| "rewards/cosine_scaled_reward/std": 0.4260079562664032, |
| "rewards/format_reward/mean": 0.921875, |
| "rewards/format_reward/std": 0.27048972249031067, |
| "step": 131 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.421875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1545.0, |
| "completions/mean_length": 1367.53125, |
| "completions/mean_terminated_length": 870.9730224609375, |
| "completions/min_length": 340.0, |
| "completions/min_terminated_length": 340.0, |
| "epoch": 0.15085714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2837361693382263, |
| "learning_rate": 3.8873442270461485e-07, |
| "loss": -0.0, |
| "num_tokens": 13405396.0, |
| "reward": 0.5292781591415405, |
| "reward_std": 0.6481244564056396, |
| "rewards/cosine_scaled_reward/mean": -0.04786092787981033, |
| "rewards/cosine_scaled_reward/std": 0.4380926489830017, |
| "rewards/format_reward/mean": 0.625, |
| "rewards/format_reward/std": 0.48795005679130554, |
| "step": 132 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.203125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1924.0, |
| "completions/mean_length": 1180.875, |
| "completions/mean_terminated_length": 959.8432006835938, |
| "completions/min_length": 252.0, |
| "completions/min_terminated_length": 252.0, |
| "epoch": 0.152, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.36373820900917053, |
| "learning_rate": 3.8142703296283953e-07, |
| "loss": -0.0, |
| "num_tokens": 13492196.0, |
| "reward": 0.5728870630264282, |
| "reward_std": 0.6083178520202637, |
| "rewards/cosine_scaled_reward/mean": -0.1276189684867859, |
| "rewards/cosine_scaled_reward/std": 0.3563939332962036, |
| "rewards/format_reward/mean": 0.828125, |
| "rewards/format_reward/std": 0.38025420904159546, |
| "step": 133 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.046875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1936.0, |
| "completions/mean_length": 1008.140625, |
| "completions/mean_terminated_length": 956.9999389648438, |
| "completions/min_length": 452.0, |
| "completions/min_terminated_length": 452.0, |
| "epoch": 0.15314285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.33836495876312256, |
| "learning_rate": 3.7417099217982686e-07, |
| "loss": 0.0, |
| "num_tokens": 13567285.0, |
| "reward": 1.2238911390304565, |
| "reward_std": 0.4800982177257538, |
| "rewards/cosine_scaled_reward/mean": 0.11194555461406708, |
| "rewards/cosine_scaled_reward/std": 0.5184221267700195, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 134 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1599.0, |
| "completions/mean_length": 723.609375, |
| "completions/mean_terminated_length": 680.8870849609375, |
| "completions/min_length": 276.0, |
| "completions/min_terminated_length": 276.0, |
| "epoch": 0.15428571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3624517619609833, |
| "learning_rate": 3.6696851061588994e-07, |
| "loss": -0.0, |
| "num_tokens": 13624068.0, |
| "reward": 1.5145277976989746, |
| "reward_std": 0.5155797004699707, |
| "rewards/cosine_scaled_reward/mean": 0.2650764584541321, |
| "rewards/cosine_scaled_reward/std": 0.4171845614910126, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 135 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.09375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1915.0, |
| "completions/mean_length": 1110.328125, |
| "completions/mean_terminated_length": 1013.3275756835938, |
| "completions/min_length": 452.0, |
| "completions/min_terminated_length": 452.0, |
| "epoch": 0.15542857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.32373178005218506, |
| "learning_rate": 3.5982178221668533e-07, |
| "loss": -0.0, |
| "num_tokens": 13705801.0, |
| "reward": 1.1360313892364502, |
| "reward_std": 0.668129563331604, |
| "rewards/cosine_scaled_reward/mean": 0.06801574677228928, |
| "rewards/cosine_scaled_reward/std": 0.5162939429283142, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 136 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2020.0, |
| "completions/mean_length": 1127.09375, |
| "completions/mean_terminated_length": 1112.4761962890625, |
| "completions/min_length": 411.0, |
| "completions/min_terminated_length": 411.0, |
| "epoch": 0.15657142857142858, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.30264437198638916, |
| "learning_rate": 3.5273298394491515e-07, |
| "loss": 0.0, |
| "num_tokens": 13788543.0, |
| "reward": 0.6311769485473633, |
| "reward_std": 0.5873199701309204, |
| "rewards/cosine_scaled_reward/mean": -0.18441152572631836, |
| "rewards/cosine_scaled_reward/std": 0.3330920338630676, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 137 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.078125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1794.0, |
| "completions/mean_length": 1032.8125, |
| "completions/mean_terminated_length": 946.7796630859375, |
| "completions/min_length": 400.0, |
| "completions/min_terminated_length": 400.0, |
| "epoch": 0.15771428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2963666617870331, |
| "learning_rate": 3.45704275117204e-07, |
| "loss": -0.0, |
| "num_tokens": 13865955.0, |
| "reward": 0.5941890478134155, |
| "reward_std": 0.553059995174408, |
| "rewards/cosine_scaled_reward/mean": -0.19509297609329224, |
| "rewards/cosine_scaled_reward/std": 0.34852251410484314, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 138 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.09375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2044.0, |
| "completions/mean_length": 1111.96875, |
| "completions/mean_terminated_length": 1015.137939453125, |
| "completions/min_length": 178.0, |
| "completions/min_terminated_length": 178.0, |
| "epoch": 0.15885714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3229401707649231, |
| "learning_rate": 3.387377967463493e-07, |
| "loss": 0.0, |
| "num_tokens": 13947761.0, |
| "reward": 0.6673665046691895, |
| "reward_std": 0.5299196243286133, |
| "rewards/cosine_scaled_reward/mean": -0.16631674766540527, |
| "rewards/cosine_scaled_reward/std": 0.38788118958473206, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 139 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2034.0, |
| "completions/mean_length": 980.09375, |
| "completions/mean_terminated_length": 945.6451416015625, |
| "completions/min_length": 270.0, |
| "completions/min_terminated_length": 270.0, |
| "epoch": 0.16, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 0.34156280755996704, |
| "learning_rate": 3.3183567088914833e-07, |
| "loss": 0.0, |
| "num_tokens": 14020711.0, |
| "reward": 0.8693222999572754, |
| "reward_std": 0.5208969116210938, |
| "rewards/cosine_scaled_reward/mean": -0.0653388649225235, |
| "rewards/cosine_scaled_reward/std": 0.5035129189491272, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 140 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.078125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1492.0, |
| "completions/mean_length": 1007.890625, |
| "completions/mean_terminated_length": 919.7457885742188, |
| "completions/min_length": 446.0, |
| "completions/min_terminated_length": 446.0, |
| "epoch": 0.16114285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3044126331806183, |
| "learning_rate": 3.250000000000001e-07, |
| "loss": 0.0, |
| "num_tokens": 14095776.0, |
| "reward": 1.0021867752075195, |
| "reward_std": 0.6079363822937012, |
| "rewards/cosine_scaled_reward/mean": 0.0010933950543403625, |
| "rewards/cosine_scaled_reward/std": 0.4816957116127014, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 141 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.078125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2017.0, |
| "completions/mean_length": 1149.90625, |
| "completions/mean_terminated_length": 1073.796630859375, |
| "completions/min_length": 375.0, |
| "completions/min_terminated_length": 375.0, |
| "epoch": 0.16228571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.30035167932510376, |
| "learning_rate": 3.182328662904756e-07, |
| "loss": 0.0, |
| "num_tokens": 14179874.0, |
| "reward": 0.6333685517311096, |
| "reward_std": 0.41481128334999084, |
| "rewards/cosine_scaled_reward/mean": -0.1755032241344452, |
| "rewards/cosine_scaled_reward/std": 0.2467116117477417, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 142 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1816.0, |
| "completions/mean_length": 906.671875, |
| "completions/mean_terminated_length": 888.5556030273438, |
| "completions/min_length": 422.0, |
| "completions/min_terminated_length": 422.0, |
| "epoch": 0.16342857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.357653945684433, |
| "learning_rate": 3.115363310950578e-07, |
| "loss": -0.0, |
| "num_tokens": 14248717.0, |
| "reward": 0.6540926098823547, |
| "reward_std": 0.46782517433166504, |
| "rewards/cosine_scaled_reward/mean": -0.16514119505882263, |
| "rewards/cosine_scaled_reward/std": 0.28250446915626526, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 143 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.109375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1934.0, |
| "completions/mean_length": 1020.765625, |
| "completions/mean_terminated_length": 894.614013671875, |
| "completions/min_length": 294.0, |
| "completions/min_terminated_length": 294.0, |
| "epoch": 0.16457142857142856, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.35631585121154785, |
| "learning_rate": 3.0491243424323783e-07, |
| "loss": 0.0, |
| "num_tokens": 14325534.0, |
| "reward": 1.0688426494598389, |
| "reward_std": 0.7873537540435791, |
| "rewards/cosine_scaled_reward/mean": 0.08129630982875824, |
| "rewards/cosine_scaled_reward/std": 0.5166342258453369, |
| "rewards/format_reward/mean": 0.90625, |
| "rewards/format_reward/std": 0.29378482699394226, |
| "step": 144 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1719.0, |
| "completions/mean_length": 842.203125, |
| "completions/mean_terminated_length": 823.0635375976562, |
| "completions/min_length": 186.0, |
| "completions/min_terminated_length": 186.0, |
| "epoch": 0.1657142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.40442949533462524, |
| "learning_rate": 2.9836319343816397e-07, |
| "loss": 0.0, |
| "num_tokens": 14389379.0, |
| "reward": 0.8424907922744751, |
| "reward_std": 0.612415611743927, |
| "rewards/cosine_scaled_reward/mean": -0.07094208896160126, |
| "rewards/cosine_scaled_reward/std": 0.4410366714000702, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 145 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.046875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1974.0, |
| "completions/mean_length": 1026.828125, |
| "completions/mean_terminated_length": 976.6065063476562, |
| "completions/min_length": 440.0, |
| "completions/min_terminated_length": 440.0, |
| "epoch": 0.16685714285714287, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.28185784816741943, |
| "learning_rate": 2.918906036420294e-07, |
| "loss": 0.0, |
| "num_tokens": 14465712.0, |
| "reward": 0.560012698173523, |
| "reward_std": 0.4264100193977356, |
| "rewards/cosine_scaled_reward/mean": -0.21999366581439972, |
| "rewards/cosine_scaled_reward/std": 0.2619490623474121, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 146 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.109375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1906.0, |
| "completions/mean_length": 1220.9375, |
| "completions/mean_terminated_length": 1119.368408203125, |
| "completions/min_length": 621.0, |
| "completions/min_terminated_length": 621.0, |
| "epoch": 0.168, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2767592966556549, |
| "learning_rate": 2.854966364683872e-07, |
| "loss": 0.0, |
| "num_tokens": 14554636.0, |
| "reward": 0.8378211259841919, |
| "reward_std": 0.6600607633590698, |
| "rewards/cosine_scaled_reward/mean": -0.07327694445848465, |
| "rewards/cosine_scaled_reward/std": 0.4367770254611969, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 147 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.109375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2039.0, |
| "completions/mean_length": 1039.28125, |
| "completions/mean_terminated_length": 915.4035034179688, |
| "completions/min_length": 317.0, |
| "completions/min_terminated_length": 317.0, |
| "epoch": 0.16914285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.31175175309181213, |
| "learning_rate": 2.791832395815782e-07, |
| "loss": 0.0, |
| "num_tokens": 14632334.0, |
| "reward": 0.749801754951477, |
| "reward_std": 0.5025944709777832, |
| "rewards/cosine_scaled_reward/mean": -0.10166161507368088, |
| "rewards/cosine_scaled_reward/std": 0.4026789367198944, |
| "rewards/format_reward/mean": 0.953125, |
| "rewards/format_reward/std": 0.21304203569889069, |
| "step": 148 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1947.0, |
| "completions/mean_length": 1362.421875, |
| "completions/mean_terminated_length": 1050.7955322265625, |
| "completions/min_length": 427.0, |
| "completions/min_terminated_length": 427.0, |
| "epoch": 0.1702857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2945536971092224, |
| "learning_rate": 2.729523361034538e-07, |
| "loss": -0.0, |
| "num_tokens": 14731425.0, |
| "reward": 0.5095837116241455, |
| "reward_std": 0.9072202444076538, |
| "rewards/cosine_scaled_reward/mean": -0.12020813673734665, |
| "rewards/cosine_scaled_reward/std": 0.44057199358940125, |
| "rewards/format_reward/mean": 0.75, |
| "rewards/format_reward/std": 0.4364357888698578, |
| "step": 149 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2002.0, |
| "completions/mean_length": 1051.125, |
| "completions/mean_terminated_length": 984.666748046875, |
| "completions/min_length": 422.0, |
| "completions/min_terminated_length": 422.0, |
| "epoch": 0.17142857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.31511664390563965, |
| "learning_rate": 2.6680582402757324e-07, |
| "loss": 0.0, |
| "num_tokens": 14809201.0, |
| "reward": 0.8506758213043213, |
| "reward_std": 0.6328262686729431, |
| "rewards/cosine_scaled_reward/mean": -0.06684959679841995, |
| "rewards/cosine_scaled_reward/std": 0.4310523569583893, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 150 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2013.0, |
| "completions/mean_length": 1043.53125, |
| "completions/mean_terminated_length": 900.0357666015625, |
| "completions/min_length": 352.0, |
| "completions/min_terminated_length": 352.0, |
| "epoch": 0.17257142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.317804217338562, |
| "learning_rate": 2.6074557564105724e-07, |
| "loss": 0.0, |
| "num_tokens": 14886667.0, |
| "reward": 1.1822679042816162, |
| "reward_std": 0.8676217794418335, |
| "rewards/cosine_scaled_reward/mean": 0.13019640743732452, |
| "rewards/cosine_scaled_reward/std": 0.5323836207389832, |
| "rewards/format_reward/mean": 0.921875, |
| "rewards/format_reward/std": 0.27048972249031067, |
| "step": 151 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.046875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1948.0, |
| "completions/mean_length": 1029.5, |
| "completions/mean_terminated_length": 979.4097900390625, |
| "completions/min_length": 177.0, |
| "completions/min_terminated_length": 177.0, |
| "epoch": 0.1737142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.41914206743240356, |
| "learning_rate": 2.547734369542718e-07, |
| "loss": 0.0, |
| "num_tokens": 14963219.0, |
| "reward": 0.8113790154457092, |
| "reward_std": 0.7262269258499146, |
| "rewards/cosine_scaled_reward/mean": -0.07868549972772598, |
| "rewards/cosine_scaled_reward/std": 0.46254217624664307, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 152 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.140625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1920.0, |
| "completions/mean_length": 1182.75, |
| "completions/mean_terminated_length": 1041.16357421875, |
| "completions/min_length": 407.0, |
| "completions/min_terminated_length": 407.0, |
| "epoch": 0.17485714285714285, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.30266106128692627, |
| "learning_rate": 2.488912271385139e-07, |
| "loss": -0.0, |
| "num_tokens": 15050059.0, |
| "reward": 0.5501826405525208, |
| "reward_std": 0.3770068287849426, |
| "rewards/cosine_scaled_reward/mean": -0.19365867972373962, |
| "rewards/cosine_scaled_reward/std": 0.18398644030094147, |
| "rewards/format_reward/mean": 0.9375, |
| "rewards/format_reward/std": 0.24397502839565277, |
| "step": 153 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.25, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1711.0, |
| "completions/mean_length": 1236.765625, |
| "completions/mean_terminated_length": 966.3541870117188, |
| "completions/min_length": 473.0, |
| "completions/min_terminated_length": 473.0, |
| "epoch": 0.176, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.31099703907966614, |
| "learning_rate": 2.4310073797187573e-07, |
| "loss": -0.0, |
| "num_tokens": 15140276.0, |
| "reward": 0.590886116027832, |
| "reward_std": 0.6541597843170166, |
| "rewards/cosine_scaled_reward/mean": -0.1342444270849228, |
| "rewards/cosine_scaled_reward/std": 0.36679577827453613, |
| "rewards/format_reward/mean": 0.859375, |
| "rewards/format_reward/std": 0.3503824472427368, |
| "step": 154 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.09375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1770.0, |
| "completions/mean_length": 910.109375, |
| "completions/mean_terminated_length": 792.3965454101562, |
| "completions/min_length": 320.0, |
| "completions/min_terminated_length": 320.0, |
| "epoch": 0.17714285714285713, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.35653260350227356, |
| "learning_rate": 2.374037332934512e-07, |
| "loss": 0.0, |
| "num_tokens": 15209147.0, |
| "reward": 0.8104115724563599, |
| "reward_std": 0.6592832803726196, |
| "rewards/cosine_scaled_reward/mean": -0.06354419887065887, |
| "rewards/cosine_scaled_reward/std": 0.4711204171180725, |
| "rewards/format_reward/mean": 0.9375, |
| "rewards/format_reward/std": 0.24397502839565277, |
| "step": 155 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.21875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2011.0, |
| "completions/mean_length": 1213.96875, |
| "completions/mean_terminated_length": 980.4400024414062, |
| "completions/min_length": 393.0, |
| "completions/min_terminated_length": 393.0, |
| "epoch": 0.1782857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.29153549671173096, |
| "learning_rate": 2.3180194846605364e-07, |
| "loss": 0.0, |
| "num_tokens": 15296945.0, |
| "reward": 0.6139351725578308, |
| "reward_std": 0.6581733226776123, |
| "rewards/cosine_scaled_reward/mean": -0.1070949137210846, |
| "rewards/cosine_scaled_reward/std": 0.4280206561088562, |
| "rewards/format_reward/mean": 0.828125, |
| "rewards/format_reward/std": 0.38025420904159546, |
| "step": 156 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.046875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1971.0, |
| "completions/mean_length": 1071.921875, |
| "completions/mean_terminated_length": 1023.91796875, |
| "completions/min_length": 455.0, |
| "completions/min_terminated_length": 455.0, |
| "epoch": 0.17942857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3182876706123352, |
| "learning_rate": 2.2629708984760706e-07, |
| "loss": -0.0, |
| "num_tokens": 15375508.0, |
| "reward": 0.6302845478057861, |
| "reward_std": 0.7132326364517212, |
| "rewards/cosine_scaled_reward/mean": -0.16142022609710693, |
| "rewards/cosine_scaled_reward/std": 0.37519919872283936, |
| "rewards/format_reward/mean": 0.953125, |
| "rewards/format_reward/std": 0.21304203569889069, |
| "step": 157 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2043.0, |
| "completions/mean_length": 943.375, |
| "completions/mean_terminated_length": 907.7418823242188, |
| "completions/min_length": 136.0, |
| "completions/min_terminated_length": 136.0, |
| "epoch": 0.18057142857142858, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 0.2808084189891815, |
| "learning_rate": 2.2089083427137329e-07, |
| "loss": 0.0, |
| "num_tokens": 15446004.0, |
| "reward": 0.9674867391586304, |
| "reward_std": 0.5017939805984497, |
| "rewards/cosine_scaled_reward/mean": -0.008444137871265411, |
| "rewards/cosine_scaled_reward/std": 0.49054765701293945, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 158 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1937.0, |
| "completions/mean_length": 1154.34375, |
| "completions/mean_terminated_length": 1026.6785888671875, |
| "completions/min_length": 237.0, |
| "completions/min_terminated_length": 237.0, |
| "epoch": 0.18171428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2708660066127777, |
| "learning_rate": 2.1558482853517253e-07, |
| "loss": 0.0, |
| "num_tokens": 15531562.0, |
| "reward": 0.5022876262664795, |
| "reward_std": 0.5563845038414001, |
| "rewards/cosine_scaled_reward/mean": -0.20979365706443787, |
| "rewards/cosine_scaled_reward/std": 0.2771652638912201, |
| "rewards/format_reward/mean": 0.921875, |
| "rewards/format_reward/std": 0.27048972249031067, |
| "step": 159 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 2000.0, |
| "completions/max_terminated_length": 2000.0, |
| "completions/mean_length": 900.078125, |
| "completions/mean_terminated_length": 900.078125, |
| "completions/min_length": 384.0, |
| "completions/min_terminated_length": 384.0, |
| "epoch": 0.18285714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3298548460006714, |
| "learning_rate": 2.1038068889975259e-07, |
| "loss": -0.0, |
| "num_tokens": 15600751.0, |
| "reward": 1.3369240760803223, |
| "reward_std": 0.6572985053062439, |
| "rewards/cosine_scaled_reward/mean": 0.16846203804016113, |
| "rewards/cosine_scaled_reward/std": 0.5345699787139893, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 160 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1737.0, |
| "completions/mean_length": 1066.46875, |
| "completions/mean_terminated_length": 926.2500610351562, |
| "completions/min_length": 326.0, |
| "completions/min_terminated_length": 326.0, |
| "epoch": 0.184, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.31214678287506104, |
| "learning_rate": 2.0528000059645995e-07, |
| "loss": 0.0, |
| "num_tokens": 15678997.0, |
| "reward": 0.7608721256256104, |
| "reward_std": 0.6540825366973877, |
| "rewards/cosine_scaled_reward/mean": -0.08831392973661423, |
| "rewards/cosine_scaled_reward/std": 0.35966333746910095, |
| "rewards/format_reward/mean": 0.9375, |
| "rewards/format_reward/std": 0.24397502839565277, |
| "step": 161 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1973.0, |
| "completions/mean_length": 1089.203125, |
| "completions/mean_terminated_length": 952.232177734375, |
| "completions/min_length": 470.0, |
| "completions/min_terminated_length": 470.0, |
| "epoch": 0.18514285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3335266411304474, |
| "learning_rate": 2.0028431734436308e-07, |
| "loss": 0.0, |
| "num_tokens": 15759010.0, |
| "reward": 0.6903920769691467, |
| "reward_std": 0.522528886795044, |
| "rewards/cosine_scaled_reward/mean": -0.10792896896600723, |
| "rewards/cosine_scaled_reward/std": 0.35296061635017395, |
| "rewards/format_reward/mean": 0.90625, |
| "rewards/format_reward/std": 0.29378482699394226, |
| "step": 162 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1494.0, |
| "completions/mean_length": 1043.9375, |
| "completions/mean_terminated_length": 812.2307739257812, |
| "completions/min_length": 405.0, |
| "completions/min_terminated_length": 405.0, |
| "epoch": 0.18628571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.33791008591651917, |
| "learning_rate": 1.9539516087697517e-07, |
| "loss": -0.0, |
| "num_tokens": 15837006.0, |
| "reward": 1.0535857677459717, |
| "reward_std": 0.7004721164703369, |
| "rewards/cosine_scaled_reward/mean": 0.12054289877414703, |
| "rewards/cosine_scaled_reward/std": 0.5006844997406006, |
| "rewards/format_reward/mean": 0.8125, |
| "rewards/format_reward/std": 0.39339789748191833, |
| "step": 163 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1648.0, |
| "completions/mean_length": 933.0, |
| "completions/mean_terminated_length": 858.6666870117188, |
| "completions/min_length": 340.0, |
| "completions/min_terminated_length": 340.0, |
| "epoch": 0.18742857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3110557794570923, |
| "learning_rate": 1.9061402047871833e-07, |
| "loss": 0.0, |
| "num_tokens": 15907246.0, |
| "reward": 1.0376479625701904, |
| "reward_std": 0.5748878121376038, |
| "rewards/cosine_scaled_reward/mean": 0.04226145148277283, |
| "rewards/cosine_scaled_reward/std": 0.48099249601364136, |
| "rewards/format_reward/mean": 0.953125, |
| "rewards/format_reward/std": 0.21304203569889069, |
| "step": 164 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.109375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1855.0, |
| "completions/mean_length": 1060.765625, |
| "completions/mean_terminated_length": 939.5263061523438, |
| "completions/min_length": 329.0, |
| "completions/min_terminated_length": 329.0, |
| "epoch": 0.18857142857142858, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3239347040653229, |
| "learning_rate": 1.8594235253127372e-07, |
| "loss": 0.0, |
| "num_tokens": 15986551.0, |
| "reward": 0.8562759160995483, |
| "reward_std": 0.5603832602500916, |
| "rewards/cosine_scaled_reward/mean": -0.06404951959848404, |
| "rewards/cosine_scaled_reward/std": 0.4168683588504791, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 165 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1649.0, |
| "completions/mean_length": 1088.78125, |
| "completions/mean_terminated_length": 951.7500610351562, |
| "completions/min_length": 439.0, |
| "completions/min_terminated_length": 439.0, |
| "epoch": 0.18971428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2883976697921753, |
| "learning_rate": 1.8138158006995363e-07, |
| "loss": -0.0, |
| "num_tokens": 16067809.0, |
| "reward": 0.7584704160690308, |
| "reward_std": 0.6604156494140625, |
| "rewards/cosine_scaled_reward/mean": -0.05826478824019432, |
| "rewards/cosine_scaled_reward/std": 0.3981381356716156, |
| "rewards/format_reward/mean": 0.875, |
| "rewards/format_reward/std": 0.3333333432674408, |
| "step": 166 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1756.0, |
| "completions/mean_length": 939.125, |
| "completions/mean_terminated_length": 903.3547973632812, |
| "completions/min_length": 487.0, |
| "completions/min_terminated_length": 487.0, |
| "epoch": 0.19085714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2965018153190613, |
| "learning_rate": 1.7693309235023127e-07, |
| "loss": 0.0, |
| "num_tokens": 16139457.0, |
| "reward": 0.7410329580307007, |
| "reward_std": 0.5566695928573608, |
| "rewards/cosine_scaled_reward/mean": -0.11385852843523026, |
| "rewards/cosine_scaled_reward/std": 0.3546922504901886, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 167 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1825.0, |
| "completions/mean_length": 1022.125, |
| "completions/mean_terminated_length": 1005.84130859375, |
| "completions/min_length": 382.0, |
| "completions/min_terminated_length": 382.0, |
| "epoch": 0.192, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2879563868045807, |
| "learning_rate": 1.7259824442455923e-07, |
| "loss": -0.0, |
| "num_tokens": 16215713.0, |
| "reward": 0.8576459288597107, |
| "reward_std": 0.6195322275161743, |
| "rewards/cosine_scaled_reward/mean": -0.06336455047130585, |
| "rewards/cosine_scaled_reward/std": 0.4510643184185028, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 168 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1484.0, |
| "completions/mean_length": 926.8125, |
| "completions/mean_terminated_length": 909.0159301757812, |
| "completions/min_length": 529.0, |
| "completions/min_terminated_length": 529.0, |
| "epoch": 0.19314285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27474501729011536, |
| "learning_rate": 1.6837835672960831e-07, |
| "loss": -0.0, |
| "num_tokens": 16285653.0, |
| "reward": 1.4680557250976562, |
| "reward_std": 0.7384843826293945, |
| "rewards/cosine_scaled_reward/mean": 0.24184036254882812, |
| "rewards/cosine_scaled_reward/std": 0.5405412316322327, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 169 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1897.0, |
| "completions/mean_length": 1003.953125, |
| "completions/mean_terminated_length": 854.8035888671875, |
| "completions/min_length": 354.0, |
| "completions/min_terminated_length": 354.0, |
| "epoch": 0.19428571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.30747127532958984, |
| "learning_rate": 1.6427471468404952e-07, |
| "loss": 0.0, |
| "num_tokens": 16359690.0, |
| "reward": 0.9851430654525757, |
| "reward_std": 0.3564821481704712, |
| "rewards/cosine_scaled_reward/mean": 0.055071547627449036, |
| "rewards/cosine_scaled_reward/std": 0.4447442889213562, |
| "rewards/format_reward/mean": 0.875, |
| "rewards/format_reward/std": 0.3333333432674408, |
| "step": 170 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.265625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2011.0, |
| "completions/mean_length": 1212.328125, |
| "completions/mean_terminated_length": 910.0637817382812, |
| "completions/min_length": 542.0, |
| "completions/min_terminated_length": 542.0, |
| "epoch": 0.19542857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3194407820701599, |
| "learning_rate": 1.6028856829700258e-07, |
| "loss": -0.0, |
| "num_tokens": 16447671.0, |
| "reward": 0.8521295785903931, |
| "reward_std": 0.6044571399688721, |
| "rewards/cosine_scaled_reward/mean": 0.04325229674577713, |
| "rewards/cosine_scaled_reward/std": 0.4702494442462921, |
| "rewards/format_reward/mean": 0.765625, |
| "rewards/format_reward/std": 0.42695629596710205, |
| "step": 171 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.078125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1785.0, |
| "completions/mean_length": 973.09375, |
| "completions/mean_terminated_length": 882.0, |
| "completions/min_length": 240.0, |
| "completions/min_terminated_length": 240.0, |
| "epoch": 0.19657142857142856, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3858748972415924, |
| "learning_rate": 1.5642113178727193e-07, |
| "loss": -0.0, |
| "num_tokens": 16520565.0, |
| "reward": 1.4210284948349, |
| "reward_std": 0.6327061057090759, |
| "rewards/cosine_scaled_reward/mean": 0.21832676231861115, |
| "rewards/cosine_scaled_reward/std": 0.5338378548622131, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 172 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.046875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2041.0, |
| "completions/mean_length": 874.0, |
| "completions/mean_terminated_length": 816.2622680664062, |
| "completions/min_length": 277.0, |
| "completions/min_terminated_length": 277.0, |
| "epoch": 0.1977142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.34975388646125793, |
| "learning_rate": 1.5267358321348285e-07, |
| "loss": 0.0, |
| "num_tokens": 16587813.0, |
| "reward": 0.9026652574539185, |
| "reward_std": 0.7158900499343872, |
| "rewards/cosine_scaled_reward/mean": -0.025229886174201965, |
| "rewards/cosine_scaled_reward/std": 0.411268025636673, |
| "rewards/format_reward/mean": 0.953125, |
| "rewards/format_reward/std": 0.21304203569889069, |
| "step": 173 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1908.0, |
| "completions/mean_length": 946.546875, |
| "completions/mean_terminated_length": 911.01611328125, |
| "completions/min_length": 452.0, |
| "completions/min_terminated_length": 452.0, |
| "epoch": 0.19885714285714284, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3235276937484741, |
| "learning_rate": 1.4904706411523448e-07, |
| "loss": 0.0, |
| "num_tokens": 16658728.0, |
| "reward": 0.9661835432052612, |
| "reward_std": 0.6674793362617493, |
| "rewards/cosine_scaled_reward/mean": -0.009095773100852966, |
| "rewards/cosine_scaled_reward/std": 0.4818039536476135, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 174 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1785.0, |
| "completions/mean_length": 946.375, |
| "completions/mean_terminated_length": 910.8386840820312, |
| "completions/min_length": 442.0, |
| "completions/min_terminated_length": 442.0, |
| "epoch": 0.2, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3162003755569458, |
| "learning_rate": 1.4554267916537495e-07, |
| "loss": 0.0, |
| "num_tokens": 16730120.0, |
| "reward": 0.9043581485748291, |
| "reward_std": 0.41858798265457153, |
| "rewards/cosine_scaled_reward/mean": -0.040008433163166046, |
| "rewards/cosine_scaled_reward/std": 0.4500538408756256, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 175 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.09375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1941.0, |
| "completions/mean_length": 1066.109375, |
| "completions/mean_terminated_length": 964.5344848632812, |
| "completions/min_length": 299.0, |
| "completions/min_terminated_length": 299.0, |
| "epoch": 0.20114285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.33956244587898254, |
| "learning_rate": 1.4216149583350755e-07, |
| "loss": -0.0, |
| "num_tokens": 16809519.0, |
| "reward": 0.7081954479217529, |
| "reward_std": 0.5614209771156311, |
| "rewards/cosine_scaled_reward/mean": -0.11465225368738174, |
| "rewards/cosine_scaled_reward/std": 0.34775587916374207, |
| "rewards/format_reward/mean": 0.9375, |
| "rewards/format_reward/std": 0.24397502839565277, |
| "step": 176 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1808.0, |
| "completions/mean_length": 917.015625, |
| "completions/mean_terminated_length": 899.0635375976562, |
| "completions/min_length": 449.0, |
| "completions/min_terminated_length": 449.0, |
| "epoch": 0.2022857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.32397690415382385, |
| "learning_rate": 1.3890454406082956e-07, |
| "loss": 0.0, |
| "num_tokens": 16878432.0, |
| "reward": 1.0032364130020142, |
| "reward_std": 0.7183334827423096, |
| "rewards/cosine_scaled_reward/mean": 0.009430669248104095, |
| "rewards/cosine_scaled_reward/std": 0.4540334641933441, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 177 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.109375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1671.0, |
| "completions/mean_length": 1051.28125, |
| "completions/mean_terminated_length": 928.877197265625, |
| "completions/min_length": 384.0, |
| "completions/min_terminated_length": 384.0, |
| "epoch": 0.20342857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.32052552700042725, |
| "learning_rate": 1.3577281594640182e-07, |
| "loss": -0.0, |
| "num_tokens": 16957250.0, |
| "reward": 0.7599722146987915, |
| "reward_std": 0.612259566783905, |
| "rewards/cosine_scaled_reward/mean": -0.08095138520002365, |
| "rewards/cosine_scaled_reward/std": 0.3940528631210327, |
| "rewards/format_reward/mean": 0.921875, |
| "rewards/format_reward/std": 0.27048972249031067, |
| "step": 178 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.203125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1793.0, |
| "completions/mean_length": 1150.203125, |
| "completions/mean_terminated_length": 921.3529663085938, |
| "completions/min_length": 367.0, |
| "completions/min_terminated_length": 367.0, |
| "epoch": 0.20457142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.36251163482666016, |
| "learning_rate": 1.3276726544494571e-07, |
| "loss": 0.0, |
| "num_tokens": 17041695.0, |
| "reward": 0.5754084587097168, |
| "reward_std": 0.5908599495887756, |
| "rewards/cosine_scaled_reward/mean": -0.11854580044746399, |
| "rewards/cosine_scaled_reward/std": 0.32444170117378235, |
| "rewards/format_reward/mean": 0.8125, |
| "rewards/format_reward/std": 0.39339789748191833, |
| "step": 179 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.09375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1981.0, |
| "completions/mean_length": 1004.453125, |
| "completions/mean_terminated_length": 896.5, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 256.0, |
| "epoch": 0.2057142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3620760142803192, |
| "learning_rate": 1.2988880807625927e-07, |
| "loss": -0.0, |
| "num_tokens": 17117156.0, |
| "reward": 1.403580904006958, |
| "reward_std": 0.9170527458190918, |
| "rewards/cosine_scaled_reward/mean": 0.2096029818058014, |
| "rewards/cosine_scaled_reward/std": 0.5256889462471008, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 180 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1716.0, |
| "completions/mean_length": 1117.15625, |
| "completions/mean_terminated_length": 984.1785888671875, |
| "completions/min_length": 423.0, |
| "completions/min_terminated_length": 423.0, |
| "epoch": 0.20685714285714285, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2816322147846222, |
| "learning_rate": 1.2713832064634125e-07, |
| "loss": -0.0, |
| "num_tokens": 17200126.0, |
| "reward": 0.8361604809761047, |
| "reward_std": 0.6540721654891968, |
| "rewards/cosine_scaled_reward/mean": -0.050669748336076736, |
| "rewards/cosine_scaled_reward/std": 0.44142434000968933, |
| "rewards/format_reward/mean": 0.9375, |
| "rewards/format_reward/std": 0.24397502839565277, |
| "step": 181 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.25, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2025.0, |
| "completions/mean_length": 1210.0, |
| "completions/mean_terminated_length": 930.6666870117188, |
| "completions/min_length": 287.0, |
| "completions/min_terminated_length": 287.0, |
| "epoch": 0.208, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2651442885398865, |
| "learning_rate": 1.2451664098030743e-07, |
| "loss": 0.0, |
| "num_tokens": 17287358.0, |
| "reward": 0.62415611743927, |
| "reward_std": 0.6586728096008301, |
| "rewards/cosine_scaled_reward/mean": -0.10198444128036499, |
| "rewards/cosine_scaled_reward/std": 0.3847215175628662, |
| "rewards/format_reward/mean": 0.828125, |
| "rewards/format_reward/std": 0.38025420904159546, |
| "step": 182 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1794.0, |
| "completions/mean_length": 861.359375, |
| "completions/mean_terminated_length": 823.0806274414062, |
| "completions/min_length": 362.0, |
| "completions/min_terminated_length": 362.0, |
| "epoch": 0.20914285714285713, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3385181427001953, |
| "learning_rate": 1.220245676671809e-07, |
| "loss": -0.0, |
| "num_tokens": 17353101.0, |
| "reward": 1.0283212661743164, |
| "reward_std": 0.6364034414291382, |
| "rewards/cosine_scaled_reward/mean": 0.029785610735416412, |
| "rewards/cosine_scaled_reward/std": 0.42320308089256287, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 183 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.078125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1726.0, |
| "completions/mean_length": 1004.765625, |
| "completions/mean_terminated_length": 916.35595703125, |
| "completions/min_length": 174.0, |
| "completions/min_terminated_length": 174.0, |
| "epoch": 0.2102857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.30591627955436707, |
| "learning_rate": 1.1966285981663407e-07, |
| "loss": -0.0, |
| "num_tokens": 17428758.0, |
| "reward": 0.7365655899047852, |
| "reward_std": 0.4397754371166229, |
| "rewards/cosine_scaled_reward/mean": -0.12390469759702682, |
| "rewards/cosine_scaled_reward/std": 0.3771846890449524, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 184 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2025.0, |
| "completions/mean_length": 976.359375, |
| "completions/mean_terminated_length": 959.3492431640625, |
| "completions/min_length": 319.0, |
| "completions/min_terminated_length": 319.0, |
| "epoch": 0.21142857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3458598852157593, |
| "learning_rate": 1.1743223682775649e-07, |
| "loss": -0.0, |
| "num_tokens": 17501429.0, |
| "reward": 0.9240812063217163, |
| "reward_std": 0.5495443344116211, |
| "rewards/cosine_scaled_reward/mean": -0.03795938193798065, |
| "rewards/cosine_scaled_reward/std": 0.46237269043922424, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 185 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.15625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1951.0, |
| "completions/mean_length": 1122.5, |
| "completions/mean_terminated_length": 951.1111450195312, |
| "completions/min_length": 469.0, |
| "completions/min_terminated_length": 469.0, |
| "epoch": 0.21257142857142858, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.31067562103271484, |
| "learning_rate": 1.1533337816991931e-07, |
| "loss": -0.0, |
| "num_tokens": 17583965.0, |
| "reward": 0.8422703742980957, |
| "reward_std": 0.6076713800430298, |
| "rewards/cosine_scaled_reward/mean": -0.01636481285095215, |
| "rewards/cosine_scaled_reward/std": 0.4099017381668091, |
| "rewards/format_reward/mean": 0.875, |
| "rewards/format_reward/std": 0.3333333432674408, |
| "step": 186 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1802.0, |
| "completions/mean_length": 1007.21875, |
| "completions/mean_terminated_length": 937.8333740234375, |
| "completions/min_length": 433.0, |
| "completions/min_terminated_length": 433.0, |
| "epoch": 0.21371428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.32321879267692566, |
| "learning_rate": 1.1336692317580158e-07, |
| "loss": -0.0, |
| "num_tokens": 17658851.0, |
| "reward": 0.7644214630126953, |
| "reward_std": 0.4659081697463989, |
| "rewards/cosine_scaled_reward/mean": -0.08653924614191055, |
| "rewards/cosine_scaled_reward/std": 0.38629019260406494, |
| "rewards/format_reward/mean": 0.9375, |
| "rewards/format_reward/std": 0.24397502839565277, |
| "step": 187 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1995.0, |
| "completions/mean_length": 992.703125, |
| "completions/mean_terminated_length": 958.6612548828125, |
| "completions/min_length": 242.0, |
| "completions/min_terminated_length": 242.0, |
| "epoch": 0.21485714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.36401256918907166, |
| "learning_rate": 1.1153347084664419e-07, |
| "loss": -0.0, |
| "num_tokens": 17734184.0, |
| "reward": 0.5271173715591431, |
| "reward_std": 0.5396482348442078, |
| "rewards/cosine_scaled_reward/mean": -0.23644131422042847, |
| "rewards/cosine_scaled_reward/std": 0.34559664130210876, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 188 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1957.0, |
| "completions/mean_length": 834.171875, |
| "completions/mean_terminated_length": 753.2500610351562, |
| "completions/min_length": 179.0, |
| "completions/min_terminated_length": 179.0, |
| "epoch": 0.216, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.356674462556839, |
| "learning_rate": 1.0983357966978745e-07, |
| "loss": -0.0, |
| "num_tokens": 17796859.0, |
| "reward": 0.8302590847015381, |
| "reward_std": 0.6041134595870972, |
| "rewards/cosine_scaled_reward/mean": -0.07705795764923096, |
| "rewards/cosine_scaled_reward/std": 0.43526527285575867, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 189 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2012.0, |
| "completions/mean_length": 974.890625, |
| "completions/mean_terminated_length": 940.274169921875, |
| "completions/min_length": 449.0, |
| "completions/min_terminated_length": 449.0, |
| "epoch": 0.21714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.29584038257598877, |
| "learning_rate": 1.0826776744855121e-07, |
| "loss": 0.0, |
| "num_tokens": 17869020.0, |
| "reward": 1.0407956838607788, |
| "reward_std": 0.5199205875396729, |
| "rewards/cosine_scaled_reward/mean": 0.020397864282131195, |
| "rewards/cosine_scaled_reward/std": 0.4723619520664215, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 190 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1847.0, |
| "completions/mean_length": 910.59375, |
| "completions/mean_terminated_length": 892.5397338867188, |
| "completions/min_length": 259.0, |
| "completions/min_terminated_length": 259.0, |
| "epoch": 0.21828571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.33857786655426025, |
| "learning_rate": 1.068365111445064e-07, |
| "loss": -0.0, |
| "num_tokens": 17937586.0, |
| "reward": 1.044985055923462, |
| "reward_std": 0.7015856504440308, |
| "rewards/cosine_scaled_reward/mean": 0.030305005609989166, |
| "rewards/cosine_scaled_reward/std": 0.4603172242641449, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 191 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.203125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2025.0, |
| "completions/mean_length": 1249.640625, |
| "completions/mean_terminated_length": 1046.1373291015625, |
| "completions/min_length": 404.0, |
| "completions/min_terminated_length": 404.0, |
| "epoch": 0.21942857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2704383134841919, |
| "learning_rate": 1.0554024673218806e-07, |
| "loss": -0.0, |
| "num_tokens": 18028859.0, |
| "reward": 0.5498086810112, |
| "reward_std": 0.5540546178817749, |
| "rewards/cosine_scaled_reward/mean": -0.17040817439556122, |
| "rewards/cosine_scaled_reward/std": 0.2906738519668579, |
| "rewards/format_reward/mean": 0.890625, |
| "rewards/format_reward/std": 0.3145764470100403, |
| "step": 192 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.171875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1859.0, |
| "completions/mean_length": 1235.921875, |
| "completions/mean_terminated_length": 1067.3773193359375, |
| "completions/min_length": 351.0, |
| "completions/min_terminated_length": 351.0, |
| "epoch": 0.22057142857142858, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3072021007537842, |
| "learning_rate": 1.0437936906629334e-07, |
| "loss": 0.0, |
| "num_tokens": 18119102.0, |
| "reward": 0.8473511338233948, |
| "reward_std": 0.7724316716194153, |
| "rewards/cosine_scaled_reward/mean": -0.021636933088302612, |
| "rewards/cosine_scaled_reward/std": 0.4808884263038635, |
| "rewards/format_reward/mean": 0.890625, |
| "rewards/format_reward/std": 0.3145764470100403, |
| "step": 193 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2023.0, |
| "completions/mean_length": 1306.71875, |
| "completions/mean_terminated_length": 1135.6539306640625, |
| "completions/min_length": 559.0, |
| "completions/min_terminated_length": 559.0, |
| "epoch": 0.22171428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2672087252140045, |
| "learning_rate": 1.0335423176140511e-07, |
| "loss": 0.0, |
| "num_tokens": 18214092.0, |
| "reward": 0.8827314972877502, |
| "reward_std": 0.7681792974472046, |
| "rewards/cosine_scaled_reward/mean": 0.011678241193294525, |
| "rewards/cosine_scaled_reward/std": 0.48625898361206055, |
| "rewards/format_reward/mean": 0.859375, |
| "rewards/format_reward/std": 0.3503824472427368, |
| "step": 194 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.046875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1920.0, |
| "completions/mean_length": 1073.125, |
| "completions/mean_terminated_length": 1025.1802978515625, |
| "completions/min_length": 494.0, |
| "completions/min_terminated_length": 494.0, |
| "epoch": 0.22285714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2599698603153229, |
| "learning_rate": 1.0246514708427701e-07, |
| "loss": -0.0, |
| "num_tokens": 18293916.0, |
| "reward": 0.9670987129211426, |
| "reward_std": 0.7788794040679932, |
| "rewards/cosine_scaled_reward/mean": 0.006986856460571289, |
| "rewards/cosine_scaled_reward/std": 0.5052981972694397, |
| "rewards/format_reward/mean": 0.953125, |
| "rewards/format_reward/std": 0.21304203569889069, |
| "step": 195 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.09375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1871.0, |
| "completions/mean_length": 1129.515625, |
| "completions/mean_terminated_length": 1034.5, |
| "completions/min_length": 356.0, |
| "completions/min_terminated_length": 356.0, |
| "epoch": 0.224, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.329662024974823, |
| "learning_rate": 1.017123858587145e-07, |
| "loss": -0.0, |
| "num_tokens": 18377797.0, |
| "reward": 0.9397312998771667, |
| "reward_std": 0.7938928604125977, |
| "rewards/cosine_scaled_reward/mean": 0.016740664839744568, |
| "rewards/cosine_scaled_reward/std": 0.4878515601158142, |
| "rewards/format_reward/mean": 0.90625, |
| "rewards/format_reward/std": 0.29378482699394226, |
| "step": 196 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1673.0, |
| "completions/mean_length": 845.515625, |
| "completions/mean_terminated_length": 806.7257690429688, |
| "completions/min_length": 359.0, |
| "completions/min_terminated_length": 359.0, |
| "epoch": 0.22514285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.35329359769821167, |
| "learning_rate": 1.0109617738307911e-07, |
| "loss": -0.0, |
| "num_tokens": 18442278.0, |
| "reward": 1.326704740524292, |
| "reward_std": 0.6592847108840942, |
| "rewards/cosine_scaled_reward/mean": 0.17116491496562958, |
| "rewards/cosine_scaled_reward/std": 0.5057182908058167, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 197 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.109375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1691.0, |
| "completions/mean_length": 1010.03125, |
| "completions/mean_terminated_length": 882.5614013671875, |
| "completions/min_length": 399.0, |
| "completions/min_terminated_length": 399.0, |
| "epoch": 0.22628571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.32196998596191406, |
| "learning_rate": 1.0061670936044178e-07, |
| "loss": 0.0, |
| "num_tokens": 18518424.0, |
| "reward": 0.9607409238815308, |
| "reward_std": 0.6629819869995117, |
| "rewards/cosine_scaled_reward/mean": -0.004004567861557007, |
| "rewards/cosine_scaled_reward/std": 0.4992019534111023, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 198 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.109375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1985.0, |
| "completions/mean_length": 1223.03125, |
| "completions/mean_terminated_length": 1121.7193603515625, |
| "completions/min_length": 346.0, |
| "completions/min_terminated_length": 346.0, |
| "epoch": 0.22742857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2712598741054535, |
| "learning_rate": 1.002741278414069e-07, |
| "loss": 0.0, |
| "num_tokens": 18608202.0, |
| "reward": 0.7551803588867188, |
| "reward_std": 0.5998207330703735, |
| "rewards/cosine_scaled_reward/mean": -0.11459730565547943, |
| "rewards/cosine_scaled_reward/std": 0.3166539669036865, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 199 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.046875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1882.0, |
| "completions/mean_length": 974.171875, |
| "completions/mean_terminated_length": 921.360595703125, |
| "completions/min_length": 423.0, |
| "completions/min_terminated_length": 423.0, |
| "epoch": 0.22857142857142856, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.285355806350708, |
| "learning_rate": 1.0006853717962393e-07, |
| "loss": -0.0, |
| "num_tokens": 18680669.0, |
| "reward": 1.1672099828720093, |
| "reward_std": 0.7903769016265869, |
| "rewards/cosine_scaled_reward/mean": 0.09922999888658524, |
| "rewards/cosine_scaled_reward/std": 0.5049266219139099, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.22857142857142856, |
| "step": 200, |
| "total_flos": 0.0, |
| "train_loss": -1.9138678908348085e-09, |
| "train_runtime": 10259.1504, |
| "train_samples_per_second": 1.248, |
| "train_steps_per_second": 0.019 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 200, |
| "num_input_tokens_seen": 18680669, |
| "num_train_epochs": 1, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|