| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.05714285714285714, |
| "eval_steps": 500, |
| "global_step": 50, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.671875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1734.0, |
| "completions/mean_length": 1702.03125, |
| "completions/mean_terminated_length": 993.6190795898438, |
| "completions/min_length": 483.0, |
| "completions/min_terminated_length": 483.0, |
| "epoch": 0.001142857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2837817668914795, |
| "learning_rate": 0.0, |
| "loss": -0.0, |
| "num_tokens": 118418.0, |
| "reward": -0.09800112247467041, |
| "reward_std": 0.3028089702129364, |
| "rewards/cosine_scaled_reward/mean": -0.09800112992525101, |
| "rewards/cosine_scaled_reward/std": 0.37953105568885803, |
| "step": 1 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1894.0, |
| "completions/mean_length": 1738.90625, |
| "completions/mean_terminated_length": 949.0, |
| "completions/min_length": 435.0, |
| "completions/min_terminated_length": 435.0, |
| "epoch": 0.002285714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2421981245279312, |
| "learning_rate": 2e-08, |
| "loss": -0.0, |
| "num_tokens": 239748.0, |
| "reward": 0.020556632429361343, |
| "reward_std": 0.3545936942100525, |
| "rewards/cosine_scaled_reward/mean": 0.020556632429361343, |
| "rewards/cosine_scaled_reward/std": 0.4492928683757782, |
| "step": 2 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.921875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 953.0, |
| "completions/mean_length": 1952.234375, |
| "completions/mean_terminated_length": 822.2000122070312, |
| "completions/min_length": 703.0, |
| "completions/min_terminated_length": 703.0, |
| "epoch": 0.0034285714285714284, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24851329624652863, |
| "learning_rate": 4e-08, |
| "loss": -0.0, |
| "num_tokens": 375163.0, |
| "reward": -0.22721199691295624, |
| "reward_std": 0.14563649892807007, |
| "rewards/cosine_scaled_reward/mean": -0.22721199691295624, |
| "rewards/cosine_scaled_reward/std": 0.1709199845790863, |
| "step": 3 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.546875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1685.0, |
| "completions/mean_length": 1554.109375, |
| "completions/mean_terminated_length": 958.0344848632812, |
| "completions/min_length": 504.0, |
| "completions/min_terminated_length": 504.0, |
| "epoch": 0.004571428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.29272863268852234, |
| "learning_rate": 6e-08, |
| "loss": -0.0, |
| "num_tokens": 484434.0, |
| "reward": -0.17542189359664917, |
| "reward_std": 0.18219107389450073, |
| "rewards/cosine_scaled_reward/mean": -0.17542189359664917, |
| "rewards/cosine_scaled_reward/std": 0.27975013852119446, |
| "step": 4 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.890625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1930.0, |
| "completions/mean_length": 1943.0625, |
| "completions/mean_terminated_length": 1088.571533203125, |
| "completions/min_length": 344.0, |
| "completions/min_terminated_length": 344.0, |
| "epoch": 0.005714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2773251533508301, |
| "learning_rate": 8e-08, |
| "loss": 0.0, |
| "num_tokens": 619606.0, |
| "reward": -0.2648562788963318, |
| "reward_std": 0.21638144552707672, |
| "rewards/cosine_scaled_reward/mean": -0.2648562788963318, |
| "rewards/cosine_scaled_reward/std": 0.23959198594093323, |
| "step": 5 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.828125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1824.0, |
| "completions/mean_length": 1854.21875, |
| "completions/mean_terminated_length": 920.5454711914062, |
| "completions/min_length": 548.0, |
| "completions/min_terminated_length": 548.0, |
| "epoch": 0.006857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27399909496307373, |
| "learning_rate": 1e-07, |
| "loss": -0.0, |
| "num_tokens": 749924.0, |
| "reward": -0.19292885065078735, |
| "reward_std": 0.2666770815849304, |
| "rewards/cosine_scaled_reward/mean": -0.19292885065078735, |
| "rewards/cosine_scaled_reward/std": 0.295730322599411, |
| "step": 6 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.890625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1589.0, |
| "completions/mean_length": 1940.5625, |
| "completions/mean_terminated_length": 1065.71435546875, |
| "completions/min_length": 773.0, |
| "completions/min_terminated_length": 773.0, |
| "epoch": 0.008, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23362359404563904, |
| "learning_rate": 1.2e-07, |
| "loss": 0.0, |
| "num_tokens": 884528.0, |
| "reward": -0.18198424577713013, |
| "reward_std": 0.18540163338184357, |
| "rewards/cosine_scaled_reward/mean": -0.18198424577713013, |
| "rewards/cosine_scaled_reward/std": 0.32407456636428833, |
| "step": 7 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.671875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2048.0, |
| "completions/mean_length": 1708.5625, |
| "completions/mean_terminated_length": 1013.5238037109375, |
| "completions/min_length": 317.0, |
| "completions/min_terminated_length": 317.0, |
| "epoch": 0.009142857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24677562713623047, |
| "learning_rate": 1.4e-07, |
| "loss": -0.0, |
| "num_tokens": 1004292.0, |
| "reward": -0.09573853015899658, |
| "reward_std": 0.22485454380512238, |
| "rewards/cosine_scaled_reward/mean": -0.09573852270841599, |
| "rewards/cosine_scaled_reward/std": 0.449250191450119, |
| "step": 8 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.9375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1221.0, |
| "completions/mean_length": 1979.359375, |
| "completions/mean_terminated_length": 949.75, |
| "completions/min_length": 569.0, |
| "completions/min_terminated_length": 569.0, |
| "epoch": 0.010285714285714285, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26966309547424316, |
| "learning_rate": 1.6e-07, |
| "loss": 0.0, |
| "num_tokens": 1142427.0, |
| "reward": -0.19992578029632568, |
| "reward_std": 0.20190927386283875, |
| "rewards/cosine_scaled_reward/mean": -0.19992581009864807, |
| "rewards/cosine_scaled_reward/std": 0.23785534501075745, |
| "step": 9 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.65625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1918.0, |
| "completions/mean_length": 1652.59375, |
| "completions/mean_terminated_length": 897.727294921875, |
| "completions/min_length": 286.0, |
| "completions/min_terminated_length": 286.0, |
| "epoch": 0.011428571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3011312484741211, |
| "learning_rate": 1.8e-07, |
| "loss": 0.0, |
| "num_tokens": 1259025.0, |
| "reward": -0.11706389486789703, |
| "reward_std": 0.2934548258781433, |
| "rewards/cosine_scaled_reward/mean": -0.11706390231847763, |
| "rewards/cosine_scaled_reward/std": 0.3601698577404022, |
| "step": 10 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.90625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1333.0, |
| "completions/mean_length": 1946.6875, |
| "completions/mean_terminated_length": 967.3333740234375, |
| "completions/min_length": 599.0, |
| "completions/min_terminated_length": 599.0, |
| "epoch": 0.012571428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2451399564743042, |
| "learning_rate": 2e-07, |
| "loss": -0.0, |
| "num_tokens": 1395285.0, |
| "reward": -0.2866281270980835, |
| "reward_std": 0.12184012681245804, |
| "rewards/cosine_scaled_reward/mean": -0.2866281270980835, |
| "rewards/cosine_scaled_reward/std": 0.15141677856445312, |
| "step": 11 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.546875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2032.0, |
| "completions/mean_length": 1659.28125, |
| "completions/mean_terminated_length": 1190.137939453125, |
| "completions/min_length": 535.0, |
| "completions/min_terminated_length": 535.0, |
| "epoch": 0.013714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2733561396598816, |
| "learning_rate": 2.1999999999999998e-07, |
| "loss": 0.0, |
| "num_tokens": 1512423.0, |
| "reward": -0.13816070556640625, |
| "reward_std": 0.2968980073928833, |
| "rewards/cosine_scaled_reward/mean": -0.13816070556640625, |
| "rewards/cosine_scaled_reward/std": 0.3597467839717865, |
| "step": 12 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.765625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1770.0, |
| "completions/mean_length": 1807.796875, |
| "completions/mean_terminated_length": 1023.1333618164062, |
| "completions/min_length": 697.0, |
| "completions/min_terminated_length": 697.0, |
| "epoch": 0.014857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25238803029060364, |
| "learning_rate": 2.4e-07, |
| "loss": 0.0, |
| "num_tokens": 1639162.0, |
| "reward": -0.13488636910915375, |
| "reward_std": 0.2661236524581909, |
| "rewards/cosine_scaled_reward/mean": -0.13488635420799255, |
| "rewards/cosine_scaled_reward/std": 0.3444243371486664, |
| "step": 13 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1866.0, |
| "completions/mean_length": 1846.921875, |
| "completions/mean_terminated_length": 1243.6875, |
| "completions/min_length": 698.0, |
| "completions/min_terminated_length": 698.0, |
| "epoch": 0.016, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2201598882675171, |
| "learning_rate": 2.6e-07, |
| "loss": -0.0, |
| "num_tokens": 1767973.0, |
| "reward": -0.20591925084590912, |
| "reward_std": 0.21505361795425415, |
| "rewards/cosine_scaled_reward/mean": -0.20591923594474792, |
| "rewards/cosine_scaled_reward/std": 0.323749840259552, |
| "step": 14 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1713.0, |
| "completions/mean_length": 1710.421875, |
| "completions/mean_terminated_length": 847.7222290039062, |
| "completions/min_length": 450.0, |
| "completions/min_terminated_length": 450.0, |
| "epoch": 0.017142857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2665213644504547, |
| "learning_rate": 2.8e-07, |
| "loss": 0.0, |
| "num_tokens": 1888360.0, |
| "reward": -0.0778750479221344, |
| "reward_std": 0.17502948641777039, |
| "rewards/cosine_scaled_reward/mean": -0.0778750628232956, |
| "rewards/cosine_scaled_reward/std": 0.47343766689300537, |
| "step": 15 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.984375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 962.0, |
| "completions/mean_length": 2031.03125, |
| "completions/mean_terminated_length": 962.0, |
| "completions/min_length": 962.0, |
| "completions/min_terminated_length": 962.0, |
| "epoch": 0.018285714285714287, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23009927570819855, |
| "learning_rate": 3e-07, |
| "loss": -0.0, |
| "num_tokens": 2028786.0, |
| "reward": -0.2619968056678772, |
| "reward_std": 0.16954168677330017, |
| "rewards/cosine_scaled_reward/mean": -0.2619968056678772, |
| "rewards/cosine_scaled_reward/std": 0.18357795476913452, |
| "step": 16 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.59375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1918.0, |
| "completions/mean_length": 1533.15625, |
| "completions/mean_terminated_length": 780.6923217773438, |
| "completions/min_length": 380.0, |
| "completions/min_terminated_length": 380.0, |
| "epoch": 0.019428571428571427, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3392995297908783, |
| "learning_rate": 3.2e-07, |
| "loss": -0.0, |
| "num_tokens": 2137428.0, |
| "reward": -0.11706461012363434, |
| "reward_std": 0.3096129894256592, |
| "rewards/cosine_scaled_reward/mean": -0.11706460267305374, |
| "rewards/cosine_scaled_reward/std": 0.3810974657535553, |
| "step": 17 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.734375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1626.0, |
| "completions/mean_length": 1774.46875, |
| "completions/mean_terminated_length": 1018.2352905273438, |
| "completions/min_length": 516.0, |
| "completions/min_terminated_length": 516.0, |
| "epoch": 0.02057142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23254038393497467, |
| "learning_rate": 3.4000000000000003e-07, |
| "loss": 0.0, |
| "num_tokens": 2261370.0, |
| "reward": -0.18709540367126465, |
| "reward_std": 0.2795025110244751, |
| "rewards/cosine_scaled_reward/mean": -0.18709540367126465, |
| "rewards/cosine_scaled_reward/std": 0.3359416127204895, |
| "step": 18 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.6875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1859.0, |
| "completions/mean_length": 1719.0, |
| "completions/mean_terminated_length": 995.2000122070312, |
| "completions/min_length": 577.0, |
| "completions/min_terminated_length": 577.0, |
| "epoch": 0.021714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.262045681476593, |
| "learning_rate": 3.6e-07, |
| "loss": -0.0, |
| "num_tokens": 2382642.0, |
| "reward": -0.02329203486442566, |
| "reward_std": 0.34684932231903076, |
| "rewards/cosine_scaled_reward/mean": -0.02329203486442566, |
| "rewards/cosine_scaled_reward/std": 0.47637447714805603, |
| "step": 19 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1988.0, |
| "completions/mean_length": 1630.90625, |
| "completions/mean_terminated_length": 935.75, |
| "completions/min_length": 425.0, |
| "completions/min_terminated_length": 425.0, |
| "epoch": 0.022857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.250532329082489, |
| "learning_rate": 3.7999999999999996e-07, |
| "loss": 0.0, |
| "num_tokens": 2498372.0, |
| "reward": -0.06319350004196167, |
| "reward_std": 0.2394939512014389, |
| "rewards/cosine_scaled_reward/mean": -0.06319350004196167, |
| "rewards/cosine_scaled_reward/std": 0.3889789879322052, |
| "step": 20 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.65625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1818.0, |
| "completions/mean_length": 1735.96875, |
| "completions/mean_terminated_length": 1140.272705078125, |
| "completions/min_length": 428.0, |
| "completions/min_terminated_length": 428.0, |
| "epoch": 0.024, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2773231565952301, |
| "learning_rate": 4e-07, |
| "loss": 0.0, |
| "num_tokens": 2620282.0, |
| "reward": -0.20884393155574799, |
| "reward_std": 0.20233216881752014, |
| "rewards/cosine_scaled_reward/mean": -0.20884393155574799, |
| "rewards/cosine_scaled_reward/std": 0.28432920575141907, |
| "step": 21 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1790.0, |
| "completions/mean_length": 1342.953125, |
| "completions/mean_terminated_length": 919.9249877929688, |
| "completions/min_length": 286.0, |
| "completions/min_terminated_length": 286.0, |
| "epoch": 0.025142857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.34627005457878113, |
| "learning_rate": 4.1999999999999995e-07, |
| "loss": 0.0, |
| "num_tokens": 2715247.0, |
| "reward": -0.09092864394187927, |
| "reward_std": 0.21042926609516144, |
| "rewards/cosine_scaled_reward/mean": -0.09092865139245987, |
| "rewards/cosine_scaled_reward/std": 0.43559205532073975, |
| "step": 22 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.578125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2038.0, |
| "completions/mean_length": 1661.9375, |
| "completions/mean_terminated_length": 1132.888916015625, |
| "completions/min_length": 455.0, |
| "completions/min_terminated_length": 455.0, |
| "epoch": 0.026285714285714287, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2705242335796356, |
| "learning_rate": 4.3999999999999997e-07, |
| "loss": 0.0, |
| "num_tokens": 2832403.0, |
| "reward": -0.13339249789714813, |
| "reward_std": 0.2433384656906128, |
| "rewards/cosine_scaled_reward/mean": -0.13339248299598694, |
| "rewards/cosine_scaled_reward/std": 0.3815627098083496, |
| "step": 23 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2020.0, |
| "completions/mean_length": 1802.296875, |
| "completions/mean_terminated_length": 1065.1875, |
| "completions/min_length": 572.0, |
| "completions/min_terminated_length": 572.0, |
| "epoch": 0.027428571428571427, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24961258471012115, |
| "learning_rate": 4.6e-07, |
| "loss": 0.0, |
| "num_tokens": 2958678.0, |
| "reward": -0.18733163177967072, |
| "reward_std": 0.2773033380508423, |
| "rewards/cosine_scaled_reward/mean": -0.1873316466808319, |
| "rewards/cosine_scaled_reward/std": 0.37051624059677124, |
| "step": 24 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.703125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1848.0, |
| "completions/mean_length": 1731.53125, |
| "completions/mean_terminated_length": 982.0, |
| "completions/min_length": 406.0, |
| "completions/min_terminated_length": 406.0, |
| "epoch": 0.02857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2662124037742615, |
| "learning_rate": 4.8e-07, |
| "loss": 0.0, |
| "num_tokens": 3079792.0, |
| "reward": -0.12407588213682175, |
| "reward_std": 0.25581949949264526, |
| "rewards/cosine_scaled_reward/mean": -0.12407589703798294, |
| "rewards/cosine_scaled_reward/std": 0.39043793082237244, |
| "step": 25 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.828125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2017.0, |
| "completions/mean_length": 1965.46875, |
| "completions/mean_terminated_length": 1567.8182373046875, |
| "completions/min_length": 1006.0, |
| "completions/min_terminated_length": 1006.0, |
| "epoch": 0.029714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23202598094940186, |
| "learning_rate": 5e-07, |
| "loss": 0.0, |
| "num_tokens": 3216214.0, |
| "reward": -0.0963105633854866, |
| "reward_std": 0.30887559056282043, |
| "rewards/cosine_scaled_reward/mean": -0.0963105633854866, |
| "rewards/cosine_scaled_reward/std": 0.39396020770072937, |
| "step": 26 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.828125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2023.0, |
| "completions/mean_length": 1886.96875, |
| "completions/mean_terminated_length": 1111.0909423828125, |
| "completions/min_length": 498.0, |
| "completions/min_terminated_length": 498.0, |
| "epoch": 0.030857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2878379225730896, |
| "learning_rate": 5.2e-07, |
| "loss": -0.0, |
| "num_tokens": 3347268.0, |
| "reward": -0.1645491123199463, |
| "reward_std": 0.28629785776138306, |
| "rewards/cosine_scaled_reward/mean": -0.1645491123199463, |
| "rewards/cosine_scaled_reward/std": 0.35050687193870544, |
| "step": 27 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1995.0, |
| "completions/mean_length": 1843.640625, |
| "completions/mean_terminated_length": 1230.5625, |
| "completions/min_length": 444.0, |
| "completions/min_terminated_length": 444.0, |
| "epoch": 0.032, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24996496737003326, |
| "learning_rate": 5.4e-07, |
| "loss": 0.0, |
| "num_tokens": 3475597.0, |
| "reward": -0.06605555862188339, |
| "reward_std": 0.2643629312515259, |
| "rewards/cosine_scaled_reward/mean": -0.06605555862188339, |
| "rewards/cosine_scaled_reward/std": 0.438128799200058, |
| "step": 28 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.9375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2005.0, |
| "completions/mean_length": 2020.5, |
| "completions/mean_terminated_length": 1608.0, |
| "completions/min_length": 516.0, |
| "completions/min_terminated_length": 516.0, |
| "epoch": 0.03314285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23316837847232819, |
| "learning_rate": 5.6e-07, |
| "loss": -0.0, |
| "num_tokens": 3615381.0, |
| "reward": -0.2015206664800644, |
| "reward_std": 0.15312039852142334, |
| "rewards/cosine_scaled_reward/mean": -0.2015206664800644, |
| "rewards/cosine_scaled_reward/std": 0.1648881882429123, |
| "step": 29 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.796875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1839.0, |
| "completions/mean_length": 1826.046875, |
| "completions/mean_terminated_length": 955.3077392578125, |
| "completions/min_length": 364.0, |
| "completions/min_terminated_length": 364.0, |
| "epoch": 0.03428571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2410832792520523, |
| "learning_rate": 5.8e-07, |
| "loss": -0.0, |
| "num_tokens": 3742784.0, |
| "reward": -0.17509159445762634, |
| "reward_std": 0.18994277715682983, |
| "rewards/cosine_scaled_reward/mean": -0.17509159445762634, |
| "rewards/cosine_scaled_reward/std": 0.22516494989395142, |
| "step": 30 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.765625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1678.0, |
| "completions/mean_length": 1781.4375, |
| "completions/mean_terminated_length": 910.6666870117188, |
| "completions/min_length": 313.0, |
| "completions/min_terminated_length": 313.0, |
| "epoch": 0.03542857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2693414092063904, |
| "learning_rate": 6e-07, |
| "loss": 0.0, |
| "num_tokens": 3867292.0, |
| "reward": -0.24513831734657288, |
| "reward_std": 0.28315529227256775, |
| "rewards/cosine_scaled_reward/mean": -0.24513831734657288, |
| "rewards/cosine_scaled_reward/std": 0.3480584919452667, |
| "step": 31 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.859375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1975.0, |
| "completions/mean_length": 1969.28125, |
| "completions/mean_terminated_length": 1488.2222900390625, |
| "completions/min_length": 1088.0, |
| "completions/min_terminated_length": 1088.0, |
| "epoch": 0.036571428571428574, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24202018976211548, |
| "learning_rate": 6.2e-07, |
| "loss": 0.0, |
| "num_tokens": 4003678.0, |
| "reward": -0.18968716263771057, |
| "reward_std": 0.28299200534820557, |
| "rewards/cosine_scaled_reward/mean": -0.18968716263771057, |
| "rewards/cosine_scaled_reward/std": 0.3119950294494629, |
| "step": 32 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 2048.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 2048.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.037714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22288212180137634, |
| "learning_rate": 6.4e-07, |
| "loss": 0.0, |
| "num_tokens": 4145966.0, |
| "reward": -0.2955162525177002, |
| "reward_std": 0.17793573439121246, |
| "rewards/cosine_scaled_reward/mean": -0.2955162525177002, |
| "rewards/cosine_scaled_reward/std": 0.22786569595336914, |
| "step": 33 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.546875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1809.0, |
| "completions/mean_length": 1589.640625, |
| "completions/mean_terminated_length": 1036.4482421875, |
| "completions/min_length": 515.0, |
| "completions/min_terminated_length": 515.0, |
| "epoch": 0.038857142857142854, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.31030499935150146, |
| "learning_rate": 6.6e-07, |
| "loss": 0.0, |
| "num_tokens": 4257255.0, |
| "reward": 0.008002171292901039, |
| "reward_std": 0.3413254916667938, |
| "rewards/cosine_scaled_reward/mean": 0.008002176880836487, |
| "rewards/cosine_scaled_reward/std": 0.4431404769420624, |
| "step": 34 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.796875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1987.0, |
| "completions/mean_length": 1785.921875, |
| "completions/mean_terminated_length": 757.769287109375, |
| "completions/min_length": 385.0, |
| "completions/min_terminated_length": 385.0, |
| "epoch": 0.04, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3145958483219147, |
| "learning_rate": 6.800000000000001e-07, |
| "loss": -0.0, |
| "num_tokens": 4383050.0, |
| "reward": -0.16386553645133972, |
| "reward_std": 0.2818174958229065, |
| "rewards/cosine_scaled_reward/mean": -0.16386555135250092, |
| "rewards/cosine_scaled_reward/std": 0.3242056965827942, |
| "step": 35 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.953125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1195.0, |
| "completions/mean_length": 2000.421875, |
| "completions/mean_terminated_length": 1033.0, |
| "completions/min_length": 863.0, |
| "completions/min_terminated_length": 863.0, |
| "epoch": 0.04114285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25796815752983093, |
| "learning_rate": 7e-07, |
| "loss": 0.0, |
| "num_tokens": 4522189.0, |
| "reward": -0.2470606118440628, |
| "reward_std": 0.15509279072284698, |
| "rewards/cosine_scaled_reward/mean": -0.2470606118440628, |
| "rewards/cosine_scaled_reward/std": 0.16412879526615143, |
| "step": 36 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.890625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2043.0, |
| "completions/mean_length": 1964.46875, |
| "completions/mean_terminated_length": 1284.2857666015625, |
| "completions/min_length": 931.0, |
| "completions/min_terminated_length": 931.0, |
| "epoch": 0.04228571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22452199459075928, |
| "learning_rate": 7.2e-07, |
| "loss": 0.0, |
| "num_tokens": 4658939.0, |
| "reward": -0.24706938862800598, |
| "reward_std": 0.18499845266342163, |
| "rewards/cosine_scaled_reward/mean": -0.24706941843032837, |
| "rewards/cosine_scaled_reward/std": 0.21092188358306885, |
| "step": 37 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.859375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1840.0, |
| "completions/mean_length": 1925.234375, |
| "completions/mean_terminated_length": 1175.0, |
| "completions/min_length": 916.0, |
| "completions/min_terminated_length": 916.0, |
| "epoch": 0.04342857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23703666031360626, |
| "learning_rate": 7.4e-07, |
| "loss": -0.0, |
| "num_tokens": 4793866.0, |
| "reward": -0.11504355818033218, |
| "reward_std": 0.20660358667373657, |
| "rewards/cosine_scaled_reward/mean": -0.11504356563091278, |
| "rewards/cosine_scaled_reward/std": 0.3190351724624634, |
| "step": 38 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.78125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1412.0, |
| "completions/mean_length": 1740.546875, |
| "completions/mean_terminated_length": 642.5, |
| "completions/min_length": 339.0, |
| "completions/min_terminated_length": 339.0, |
| "epoch": 0.044571428571428574, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23829001188278198, |
| "learning_rate": 7.599999999999999e-07, |
| "loss": 0.0, |
| "num_tokens": 4916045.0, |
| "reward": -0.12095541507005692, |
| "reward_std": 0.1958026885986328, |
| "rewards/cosine_scaled_reward/mean": -0.12095542997121811, |
| "rewards/cosine_scaled_reward/std": 0.340241402387619, |
| "step": 39 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.703125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1918.0, |
| "completions/mean_length": 1713.203125, |
| "completions/mean_terminated_length": 920.26318359375, |
| "completions/min_length": 451.0, |
| "completions/min_terminated_length": 451.0, |
| "epoch": 0.045714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24145744740962982, |
| "learning_rate": 7.799999999999999e-07, |
| "loss": -0.0, |
| "num_tokens": 5035762.0, |
| "reward": -0.10936243832111359, |
| "reward_std": 0.14468500018119812, |
| "rewards/cosine_scaled_reward/mean": -0.10936242341995239, |
| "rewards/cosine_scaled_reward/std": 0.4288744330406189, |
| "step": 40 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.796875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1801.0, |
| "completions/mean_length": 1909.71875, |
| "completions/mean_terminated_length": 1367.2308349609375, |
| "completions/min_length": 1138.0, |
| "completions/min_terminated_length": 1138.0, |
| "epoch": 0.046857142857142854, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22317881882190704, |
| "learning_rate": 8e-07, |
| "loss": 0.0, |
| "num_tokens": 5169136.0, |
| "reward": -0.2058967649936676, |
| "reward_std": 0.2325170338153839, |
| "rewards/cosine_scaled_reward/mean": -0.20589673519134521, |
| "rewards/cosine_scaled_reward/std": 0.28897321224212646, |
| "step": 41 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.78125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1752.0, |
| "completions/mean_length": 1727.71875, |
| "completions/mean_terminated_length": 583.857177734375, |
| "completions/min_length": 159.0, |
| "completions/min_terminated_length": 159.0, |
| "epoch": 0.048, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.44688937067985535, |
| "learning_rate": 8.199999999999999e-07, |
| "loss": 0.0, |
| "num_tokens": 5290070.0, |
| "reward": -0.2254919707775116, |
| "reward_std": 0.1687203049659729, |
| "rewards/cosine_scaled_reward/mean": -0.2254919707775116, |
| "rewards/cosine_scaled_reward/std": 0.18203677237033844, |
| "step": 42 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.84375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1082.0, |
| "completions/mean_length": 1855.328125, |
| "completions/mean_terminated_length": 814.9000244140625, |
| "completions/min_length": 588.0, |
| "completions/min_terminated_length": 588.0, |
| "epoch": 0.04914285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2430828958749771, |
| "learning_rate": 8.399999999999999e-07, |
| "loss": 0.0, |
| "num_tokens": 5420427.0, |
| "reward": -0.09104865789413452, |
| "reward_std": 0.18217626214027405, |
| "rewards/cosine_scaled_reward/mean": -0.09104865789413452, |
| "rewards/cosine_scaled_reward/std": 0.3521345257759094, |
| "step": 43 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1675.0, |
| "completions/mean_length": 1727.9375, |
| "completions/mean_terminated_length": 767.75, |
| "completions/min_length": 407.0, |
| "completions/min_terminated_length": 407.0, |
| "epoch": 0.05028571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.32065215706825256, |
| "learning_rate": 8.599999999999999e-07, |
| "loss": 0.0, |
| "num_tokens": 5541711.0, |
| "reward": -0.17701950669288635, |
| "reward_std": 0.2957555055618286, |
| "rewards/cosine_scaled_reward/mean": -0.17701953649520874, |
| "rewards/cosine_scaled_reward/std": 0.38460060954093933, |
| "step": 44 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.953125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2032.0, |
| "completions/mean_length": 2013.9375, |
| "completions/mean_terminated_length": 1321.3333740234375, |
| "completions/min_length": 740.0, |
| "completions/min_terminated_length": 740.0, |
| "epoch": 0.05142857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22363637387752533, |
| "learning_rate": 8.799999999999999e-07, |
| "loss": 0.0, |
| "num_tokens": 5682259.0, |
| "reward": -0.20341511070728302, |
| "reward_std": 0.23104795813560486, |
| "rewards/cosine_scaled_reward/mean": -0.20341511070728302, |
| "rewards/cosine_scaled_reward/std": 0.3092363774776459, |
| "step": 45 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1224.0, |
| "completions/mean_length": 1909.0, |
| "completions/mean_terminated_length": 936.0, |
| "completions/min_length": 525.0, |
| "completions/min_terminated_length": 525.0, |
| "epoch": 0.052571428571428575, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26306217908859253, |
| "learning_rate": 9e-07, |
| "loss": 0.0, |
| "num_tokens": 5815603.0, |
| "reward": -0.26145532727241516, |
| "reward_std": 0.17108051478862762, |
| "rewards/cosine_scaled_reward/mean": -0.2614552974700928, |
| "rewards/cosine_scaled_reward/std": 0.18312901258468628, |
| "step": 46 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1668.0, |
| "completions/mean_length": 1757.1875, |
| "completions/mean_terminated_length": 884.75, |
| "completions/min_length": 477.0, |
| "completions/min_terminated_length": 477.0, |
| "epoch": 0.053714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2856813371181488, |
| "learning_rate": 9.2e-07, |
| "loss": 0.0, |
| "num_tokens": 5938463.0, |
| "reward": -0.20879247784614563, |
| "reward_std": 0.23861759901046753, |
| "rewards/cosine_scaled_reward/mean": -0.20879246294498444, |
| "rewards/cosine_scaled_reward/std": 0.39607998728752136, |
| "step": 47 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1708.0, |
| "completions/mean_length": 1756.5, |
| "completions/mean_terminated_length": 1011.5555419921875, |
| "completions/min_length": 487.0, |
| "completions/min_terminated_length": 487.0, |
| "epoch": 0.054857142857142854, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27563413977622986, |
| "learning_rate": 9.399999999999999e-07, |
| "loss": -0.0, |
| "num_tokens": 6061423.0, |
| "reward": -0.16147920489311218, |
| "reward_std": 0.24055320024490356, |
| "rewards/cosine_scaled_reward/mean": -0.16147920489311218, |
| "rewards/cosine_scaled_reward/std": 0.3948959410190582, |
| "step": 48 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.578125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1458.0, |
| "completions/mean_length": 1538.078125, |
| "completions/mean_terminated_length": 839.2963256835938, |
| "completions/min_length": 284.0, |
| "completions/min_terminated_length": 284.0, |
| "epoch": 0.056, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27617642283439636, |
| "learning_rate": 9.6e-07, |
| "loss": -0.0, |
| "num_tokens": 6169924.0, |
| "reward": -0.18436825275421143, |
| "reward_std": 0.27141550183296204, |
| "rewards/cosine_scaled_reward/mean": -0.18436823785305023, |
| "rewards/cosine_scaled_reward/std": 0.3920196294784546, |
| "step": 49 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.765625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1938.0, |
| "completions/mean_length": 1749.0625, |
| "completions/mean_terminated_length": 772.5333862304688, |
| "completions/min_length": 235.0, |
| "completions/min_terminated_length": 235.0, |
| "epoch": 0.05714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23394836485385895, |
| "learning_rate": 9.8e-07, |
| "loss": 0.0, |
| "num_tokens": 6292680.0, |
| "reward": -0.10770958662033081, |
| "reward_std": 0.22513547539710999, |
| "rewards/cosine_scaled_reward/mean": -0.10770957916975021, |
| "rewards/cosine_scaled_reward/std": 0.421062707901001, |
| "step": 50 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 500, |
| "num_input_tokens_seen": 6292680, |
| "num_train_epochs": 1, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|