{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.05714285714285714, "eval_steps": 500, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1734.0, "completions/mean_length": 1702.03125, "completions/mean_terminated_length": 993.6190795898438, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 0.001142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.2837817668914795, "learning_rate": 0.0, "loss": -0.0, "num_tokens": 118418.0, "reward": -0.09800112247467041, "reward_std": 0.3028089702129364, "rewards/cosine_scaled_reward/mean": -0.09800112992525101, "rewards/cosine_scaled_reward/std": 0.37953105568885803, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1894.0, "completions/mean_length": 1738.90625, "completions/mean_terminated_length": 949.0, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.002285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.2421981245279312, "learning_rate": 2e-08, "loss": -0.0, "num_tokens": 239748.0, "reward": 0.020556632429361343, "reward_std": 0.3545936942100525, "rewards/cosine_scaled_reward/mean": 0.020556632429361343, "rewards/cosine_scaled_reward/std": 0.4492928683757782, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 1952.234375, "completions/mean_terminated_length": 822.2000122070312, "completions/min_length": 703.0, "completions/min_terminated_length": 703.0, "epoch": 0.0034285714285714284, "frac_reward_zero_std": 0.0, "grad_norm": 0.24851329624652863, "learning_rate": 4e-08, "loss": -0.0, "num_tokens": 375163.0, "reward": -0.22721199691295624, "reward_std": 0.14563649892807007, "rewards/cosine_scaled_reward/mean": -0.22721199691295624, "rewards/cosine_scaled_reward/std": 0.1709199845790863, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1685.0, "completions/mean_length": 1554.109375, "completions/mean_terminated_length": 958.0344848632812, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 0.004571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.29272863268852234, "learning_rate": 6e-08, "loss": -0.0, "num_tokens": 484434.0, "reward": -0.17542189359664917, "reward_std": 0.18219107389450073, "rewards/cosine_scaled_reward/mean": -0.17542189359664917, "rewards/cosine_scaled_reward/std": 0.27975013852119446, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1930.0, "completions/mean_length": 1943.0625, "completions/mean_terminated_length": 1088.571533203125, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.005714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2773251533508301, "learning_rate": 8e-08, "loss": 0.0, "num_tokens": 619606.0, "reward": -0.2648562788963318, "reward_std": 0.21638144552707672, "rewards/cosine_scaled_reward/mean": -0.2648562788963318, "rewards/cosine_scaled_reward/std": 0.23959198594093323, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1824.0, "completions/mean_length": 1854.21875, "completions/mean_terminated_length": 920.5454711914062, "completions/min_length": 548.0, "completions/min_terminated_length": 548.0, "epoch": 0.006857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.27399909496307373, "learning_rate": 1e-07, "loss": -0.0, "num_tokens": 749924.0, "reward": -0.19292885065078735, "reward_std": 0.2666770815849304, "rewards/cosine_scaled_reward/mean": -0.19292885065078735, "rewards/cosine_scaled_reward/std": 0.295730322599411, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1589.0, "completions/mean_length": 1940.5625, "completions/mean_terminated_length": 1065.71435546875, "completions/min_length": 773.0, "completions/min_terminated_length": 773.0, "epoch": 0.008, "frac_reward_zero_std": 0.0, "grad_norm": 0.23362359404563904, "learning_rate": 1.2e-07, "loss": 0.0, "num_tokens": 884528.0, "reward": -0.18198424577713013, "reward_std": 0.18540163338184357, "rewards/cosine_scaled_reward/mean": -0.18198424577713013, "rewards/cosine_scaled_reward/std": 0.32407456636428833, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1708.5625, "completions/mean_terminated_length": 1013.5238037109375, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.009142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.24677562713623047, "learning_rate": 1.4e-07, "loss": -0.0, "num_tokens": 1004292.0, "reward": -0.09573853015899658, "reward_std": 0.22485454380512238, "rewards/cosine_scaled_reward/mean": -0.09573852270841599, "rewards/cosine_scaled_reward/std": 0.449250191450119, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1221.0, "completions/mean_length": 1979.359375, "completions/mean_terminated_length": 949.75, "completions/min_length": 569.0, "completions/min_terminated_length": 569.0, "epoch": 0.010285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.26966309547424316, "learning_rate": 1.6e-07, "loss": 0.0, "num_tokens": 1142427.0, "reward": -0.19992578029632568, "reward_std": 0.20190927386283875, "rewards/cosine_scaled_reward/mean": -0.19992581009864807, "rewards/cosine_scaled_reward/std": 0.23785534501075745, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1918.0, "completions/mean_length": 1652.59375, "completions/mean_terminated_length": 897.727294921875, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.011428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.3011312484741211, "learning_rate": 1.8e-07, "loss": 0.0, "num_tokens": 1259025.0, "reward": -0.11706389486789703, "reward_std": 0.2934548258781433, "rewards/cosine_scaled_reward/mean": -0.11706390231847763, "rewards/cosine_scaled_reward/std": 0.3601698577404022, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1333.0, "completions/mean_length": 1946.6875, "completions/mean_terminated_length": 967.3333740234375, "completions/min_length": 599.0, "completions/min_terminated_length": 599.0, "epoch": 0.012571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.2451399564743042, "learning_rate": 2e-07, "loss": -0.0, "num_tokens": 1395285.0, "reward": -0.2866281270980835, "reward_std": 0.12184012681245804, "rewards/cosine_scaled_reward/mean": -0.2866281270980835, "rewards/cosine_scaled_reward/std": 0.15141677856445312, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1659.28125, "completions/mean_terminated_length": 1190.137939453125, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "epoch": 0.013714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2733561396598816, "learning_rate": 2.1999999999999998e-07, "loss": 0.0, "num_tokens": 1512423.0, "reward": -0.13816070556640625, "reward_std": 0.2968980073928833, "rewards/cosine_scaled_reward/mean": -0.13816070556640625, "rewards/cosine_scaled_reward/std": 0.3597467839717865, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1770.0, "completions/mean_length": 1807.796875, "completions/mean_terminated_length": 1023.1333618164062, "completions/min_length": 697.0, "completions/min_terminated_length": 697.0, "epoch": 0.014857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.25238803029060364, "learning_rate": 2.4e-07, "loss": 0.0, "num_tokens": 1639162.0, "reward": -0.13488636910915375, "reward_std": 0.2661236524581909, "rewards/cosine_scaled_reward/mean": -0.13488635420799255, "rewards/cosine_scaled_reward/std": 0.3444243371486664, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1866.0, "completions/mean_length": 1846.921875, "completions/mean_terminated_length": 1243.6875, "completions/min_length": 698.0, "completions/min_terminated_length": 698.0, "epoch": 0.016, "frac_reward_zero_std": 0.0, "grad_norm": 0.2201598882675171, "learning_rate": 2.6e-07, "loss": -0.0, "num_tokens": 1767973.0, "reward": -0.20591925084590912, "reward_std": 0.21505361795425415, "rewards/cosine_scaled_reward/mean": -0.20591923594474792, "rewards/cosine_scaled_reward/std": 0.323749840259552, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1713.0, "completions/mean_length": 1710.421875, "completions/mean_terminated_length": 847.7222290039062, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 0.017142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.2665213644504547, "learning_rate": 2.8e-07, "loss": 0.0, "num_tokens": 1888360.0, "reward": -0.0778750479221344, "reward_std": 0.17502948641777039, "rewards/cosine_scaled_reward/mean": -0.0778750628232956, "rewards/cosine_scaled_reward/std": 0.47343766689300537, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 2031.03125, "completions/mean_terminated_length": 962.0, "completions/min_length": 962.0, "completions/min_terminated_length": 962.0, "epoch": 0.018285714285714287, "frac_reward_zero_std": 0.0, "grad_norm": 0.23009927570819855, "learning_rate": 3e-07, "loss": -0.0, "num_tokens": 2028786.0, "reward": -0.2619968056678772, "reward_std": 0.16954168677330017, "rewards/cosine_scaled_reward/mean": -0.2619968056678772, "rewards/cosine_scaled_reward/std": 0.18357795476913452, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1918.0, "completions/mean_length": 1533.15625, "completions/mean_terminated_length": 780.6923217773438, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.019428571428571427, "frac_reward_zero_std": 0.0, "grad_norm": 0.3392995297908783, "learning_rate": 3.2e-07, "loss": -0.0, "num_tokens": 2137428.0, "reward": -0.11706461012363434, "reward_std": 0.3096129894256592, "rewards/cosine_scaled_reward/mean": -0.11706460267305374, "rewards/cosine_scaled_reward/std": 0.3810974657535553, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1626.0, "completions/mean_length": 1774.46875, "completions/mean_terminated_length": 1018.2352905273438, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 0.02057142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.23254038393497467, "learning_rate": 3.4000000000000003e-07, "loss": 0.0, "num_tokens": 2261370.0, "reward": -0.18709540367126465, "reward_std": 0.2795025110244751, "rewards/cosine_scaled_reward/mean": -0.18709540367126465, "rewards/cosine_scaled_reward/std": 0.3359416127204895, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1859.0, "completions/mean_length": 1719.0, "completions/mean_terminated_length": 995.2000122070312, "completions/min_length": 577.0, "completions/min_terminated_length": 577.0, "epoch": 0.021714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.262045681476593, "learning_rate": 3.6e-07, "loss": -0.0, "num_tokens": 2382642.0, "reward": -0.02329203486442566, "reward_std": 0.34684932231903076, "rewards/cosine_scaled_reward/mean": -0.02329203486442566, "rewards/cosine_scaled_reward/std": 0.47637447714805603, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 1630.90625, "completions/mean_terminated_length": 935.75, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.022857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.250532329082489, "learning_rate": 3.7999999999999996e-07, "loss": 0.0, "num_tokens": 2498372.0, "reward": -0.06319350004196167, "reward_std": 0.2394939512014389, "rewards/cosine_scaled_reward/mean": -0.06319350004196167, "rewards/cosine_scaled_reward/std": 0.3889789879322052, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1818.0, "completions/mean_length": 1735.96875, "completions/mean_terminated_length": 1140.272705078125, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 0.024, "frac_reward_zero_std": 0.0, "grad_norm": 0.2773231565952301, "learning_rate": 4e-07, "loss": 0.0, "num_tokens": 2620282.0, "reward": -0.20884393155574799, "reward_std": 0.20233216881752014, "rewards/cosine_scaled_reward/mean": -0.20884393155574799, "rewards/cosine_scaled_reward/std": 0.28432920575141907, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1790.0, "completions/mean_length": 1342.953125, "completions/mean_terminated_length": 919.9249877929688, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.025142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.34627005457878113, "learning_rate": 4.1999999999999995e-07, "loss": 0.0, "num_tokens": 2715247.0, "reward": -0.09092864394187927, "reward_std": 0.21042926609516144, "rewards/cosine_scaled_reward/mean": -0.09092865139245987, "rewards/cosine_scaled_reward/std": 0.43559205532073975, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1661.9375, "completions/mean_terminated_length": 1132.888916015625, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 0.026285714285714287, "frac_reward_zero_std": 0.0, "grad_norm": 0.2705242335796356, "learning_rate": 4.3999999999999997e-07, "loss": 0.0, "num_tokens": 2832403.0, "reward": -0.13339249789714813, "reward_std": 0.2433384656906128, "rewards/cosine_scaled_reward/mean": -0.13339248299598694, "rewards/cosine_scaled_reward/std": 0.3815627098083496, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1802.296875, "completions/mean_terminated_length": 1065.1875, "completions/min_length": 572.0, "completions/min_terminated_length": 572.0, "epoch": 0.027428571428571427, "frac_reward_zero_std": 0.0, "grad_norm": 0.24961258471012115, "learning_rate": 4.6e-07, "loss": 0.0, "num_tokens": 2958678.0, "reward": -0.18733163177967072, "reward_std": 0.2773033380508423, "rewards/cosine_scaled_reward/mean": -0.1873316466808319, "rewards/cosine_scaled_reward/std": 0.37051624059677124, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1848.0, "completions/mean_length": 1731.53125, "completions/mean_terminated_length": 982.0, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 0.02857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2662124037742615, "learning_rate": 4.8e-07, "loss": 0.0, "num_tokens": 3079792.0, "reward": -0.12407588213682175, "reward_std": 0.25581949949264526, "rewards/cosine_scaled_reward/mean": -0.12407589703798294, "rewards/cosine_scaled_reward/std": 0.39043793082237244, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1965.46875, "completions/mean_terminated_length": 1567.8182373046875, "completions/min_length": 1006.0, "completions/min_terminated_length": 1006.0, "epoch": 0.029714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.23202598094940186, "learning_rate": 5e-07, "loss": 0.0, "num_tokens": 3216214.0, "reward": -0.0963105633854866, "reward_std": 0.30887559056282043, "rewards/cosine_scaled_reward/mean": -0.0963105633854866, "rewards/cosine_scaled_reward/std": 0.39396020770072937, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1886.96875, "completions/mean_terminated_length": 1111.0909423828125, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "epoch": 0.030857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2878379225730896, "learning_rate": 5.2e-07, "loss": -0.0, "num_tokens": 3347268.0, "reward": -0.1645491123199463, "reward_std": 0.28629785776138306, "rewards/cosine_scaled_reward/mean": -0.1645491123199463, "rewards/cosine_scaled_reward/std": 0.35050687193870544, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 1843.640625, "completions/mean_terminated_length": 1230.5625, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 0.032, "frac_reward_zero_std": 0.0, "grad_norm": 0.24996496737003326, "learning_rate": 5.4e-07, "loss": 0.0, "num_tokens": 3475597.0, "reward": -0.06605555862188339, "reward_std": 0.2643629312515259, "rewards/cosine_scaled_reward/mean": -0.06605555862188339, "rewards/cosine_scaled_reward/std": 0.438128799200058, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 2020.5, "completions/mean_terminated_length": 1608.0, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 0.03314285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.23316837847232819, "learning_rate": 5.6e-07, "loss": -0.0, "num_tokens": 3615381.0, "reward": -0.2015206664800644, "reward_std": 0.15312039852142334, "rewards/cosine_scaled_reward/mean": -0.2015206664800644, "rewards/cosine_scaled_reward/std": 0.1648881882429123, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1839.0, "completions/mean_length": 1826.046875, "completions/mean_terminated_length": 955.3077392578125, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.03428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.2410832792520523, "learning_rate": 5.8e-07, "loss": -0.0, "num_tokens": 3742784.0, "reward": -0.17509159445762634, "reward_std": 0.18994277715682983, "rewards/cosine_scaled_reward/mean": -0.17509159445762634, "rewards/cosine_scaled_reward/std": 0.22516494989395142, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1678.0, "completions/mean_length": 1781.4375, "completions/mean_terminated_length": 910.6666870117188, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.03542857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.2693414092063904, "learning_rate": 6e-07, "loss": 0.0, "num_tokens": 3867292.0, "reward": -0.24513831734657288, "reward_std": 0.28315529227256775, "rewards/cosine_scaled_reward/mean": -0.24513831734657288, "rewards/cosine_scaled_reward/std": 0.3480584919452667, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 1969.28125, "completions/mean_terminated_length": 1488.2222900390625, "completions/min_length": 1088.0, "completions/min_terminated_length": 1088.0, "epoch": 0.036571428571428574, "frac_reward_zero_std": 0.0, "grad_norm": 0.24202018976211548, "learning_rate": 6.2e-07, "loss": 0.0, "num_tokens": 4003678.0, "reward": -0.18968716263771057, "reward_std": 0.28299200534820557, "rewards/cosine_scaled_reward/mean": -0.18968716263771057, "rewards/cosine_scaled_reward/std": 0.3119950294494629, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.037714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.22288212180137634, "learning_rate": 6.4e-07, "loss": 0.0, "num_tokens": 4145966.0, "reward": -0.2955162525177002, "reward_std": 0.17793573439121246, "rewards/cosine_scaled_reward/mean": -0.2955162525177002, "rewards/cosine_scaled_reward/std": 0.22786569595336914, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1809.0, "completions/mean_length": 1589.640625, "completions/mean_terminated_length": 1036.4482421875, "completions/min_length": 515.0, "completions/min_terminated_length": 515.0, "epoch": 0.038857142857142854, "frac_reward_zero_std": 0.0, "grad_norm": 0.31030499935150146, "learning_rate": 6.6e-07, "loss": 0.0, "num_tokens": 4257255.0, "reward": 0.008002171292901039, "reward_std": 0.3413254916667938, "rewards/cosine_scaled_reward/mean": 0.008002176880836487, "rewards/cosine_scaled_reward/std": 0.4431404769420624, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 1785.921875, "completions/mean_terminated_length": 757.769287109375, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.04, "frac_reward_zero_std": 0.0, "grad_norm": 0.3145958483219147, "learning_rate": 6.800000000000001e-07, "loss": -0.0, "num_tokens": 4383050.0, "reward": -0.16386553645133972, "reward_std": 0.2818174958229065, "rewards/cosine_scaled_reward/mean": -0.16386555135250092, "rewards/cosine_scaled_reward/std": 0.3242056965827942, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1195.0, "completions/mean_length": 2000.421875, "completions/mean_terminated_length": 1033.0, "completions/min_length": 863.0, "completions/min_terminated_length": 863.0, "epoch": 0.04114285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.25796815752983093, "learning_rate": 7e-07, "loss": 0.0, "num_tokens": 4522189.0, "reward": -0.2470606118440628, "reward_std": 0.15509279072284698, "rewards/cosine_scaled_reward/mean": -0.2470606118440628, "rewards/cosine_scaled_reward/std": 0.16412879526615143, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1964.46875, "completions/mean_terminated_length": 1284.2857666015625, "completions/min_length": 931.0, "completions/min_terminated_length": 931.0, "epoch": 0.04228571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.22452199459075928, "learning_rate": 7.2e-07, "loss": 0.0, "num_tokens": 4658939.0, "reward": -0.24706938862800598, "reward_std": 0.18499845266342163, "rewards/cosine_scaled_reward/mean": -0.24706941843032837, "rewards/cosine_scaled_reward/std": 0.21092188358306885, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1840.0, "completions/mean_length": 1925.234375, "completions/mean_terminated_length": 1175.0, "completions/min_length": 916.0, "completions/min_terminated_length": 916.0, "epoch": 0.04342857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.23703666031360626, "learning_rate": 7.4e-07, "loss": -0.0, "num_tokens": 4793866.0, "reward": -0.11504355818033218, "reward_std": 0.20660358667373657, "rewards/cosine_scaled_reward/mean": -0.11504356563091278, "rewards/cosine_scaled_reward/std": 0.3190351724624634, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1412.0, "completions/mean_length": 1740.546875, "completions/mean_terminated_length": 642.5, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.044571428571428574, "frac_reward_zero_std": 0.0, "grad_norm": 0.23829001188278198, "learning_rate": 7.599999999999999e-07, "loss": 0.0, "num_tokens": 4916045.0, "reward": -0.12095541507005692, "reward_std": 0.1958026885986328, "rewards/cosine_scaled_reward/mean": -0.12095542997121811, "rewards/cosine_scaled_reward/std": 0.340241402387619, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1918.0, "completions/mean_length": 1713.203125, "completions/mean_terminated_length": 920.26318359375, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 0.045714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.24145744740962982, "learning_rate": 7.799999999999999e-07, "loss": -0.0, "num_tokens": 5035762.0, "reward": -0.10936243832111359, "reward_std": 0.14468500018119812, "rewards/cosine_scaled_reward/mean": -0.10936242341995239, "rewards/cosine_scaled_reward/std": 0.4288744330406189, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1801.0, "completions/mean_length": 1909.71875, "completions/mean_terminated_length": 1367.2308349609375, "completions/min_length": 1138.0, "completions/min_terminated_length": 1138.0, "epoch": 0.046857142857142854, "frac_reward_zero_std": 0.0, "grad_norm": 0.22317881882190704, "learning_rate": 8e-07, "loss": 0.0, "num_tokens": 5169136.0, "reward": -0.2058967649936676, "reward_std": 0.2325170338153839, "rewards/cosine_scaled_reward/mean": -0.20589673519134521, "rewards/cosine_scaled_reward/std": 0.28897321224212646, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1752.0, "completions/mean_length": 1727.71875, "completions/mean_terminated_length": 583.857177734375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.048, "frac_reward_zero_std": 0.0, "grad_norm": 0.44688937067985535, "learning_rate": 8.199999999999999e-07, "loss": 0.0, "num_tokens": 5290070.0, "reward": -0.2254919707775116, "reward_std": 0.1687203049659729, "rewards/cosine_scaled_reward/mean": -0.2254919707775116, "rewards/cosine_scaled_reward/std": 0.18203677237033844, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1082.0, "completions/mean_length": 1855.328125, "completions/mean_terminated_length": 814.9000244140625, "completions/min_length": 588.0, "completions/min_terminated_length": 588.0, "epoch": 0.04914285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2430828958749771, "learning_rate": 8.399999999999999e-07, "loss": 0.0, "num_tokens": 5420427.0, "reward": -0.09104865789413452, "reward_std": 0.18217626214027405, "rewards/cosine_scaled_reward/mean": -0.09104865789413452, "rewards/cosine_scaled_reward/std": 0.3521345257759094, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1675.0, "completions/mean_length": 1727.9375, "completions/mean_terminated_length": 767.75, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.05028571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.32065215706825256, "learning_rate": 8.599999999999999e-07, "loss": 0.0, "num_tokens": 5541711.0, "reward": -0.17701950669288635, "reward_std": 0.2957555055618286, "rewards/cosine_scaled_reward/mean": -0.17701953649520874, "rewards/cosine_scaled_reward/std": 0.38460060954093933, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 2013.9375, "completions/mean_terminated_length": 1321.3333740234375, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 0.05142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.22363637387752533, "learning_rate": 8.799999999999999e-07, "loss": 0.0, "num_tokens": 5682259.0, "reward": -0.20341511070728302, "reward_std": 0.23104795813560486, "rewards/cosine_scaled_reward/mean": -0.20341511070728302, "rewards/cosine_scaled_reward/std": 0.3092363774776459, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1224.0, "completions/mean_length": 1909.0, "completions/mean_terminated_length": 936.0, "completions/min_length": 525.0, "completions/min_terminated_length": 525.0, "epoch": 0.052571428571428575, "frac_reward_zero_std": 0.0, "grad_norm": 0.26306217908859253, "learning_rate": 9e-07, "loss": 0.0, "num_tokens": 5815603.0, "reward": -0.26145532727241516, "reward_std": 0.17108051478862762, "rewards/cosine_scaled_reward/mean": -0.2614552974700928, "rewards/cosine_scaled_reward/std": 0.18312901258468628, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1668.0, "completions/mean_length": 1757.1875, "completions/mean_terminated_length": 884.75, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 0.053714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2856813371181488, "learning_rate": 9.2e-07, "loss": 0.0, "num_tokens": 5938463.0, "reward": -0.20879247784614563, "reward_std": 0.23861759901046753, "rewards/cosine_scaled_reward/mean": -0.20879246294498444, "rewards/cosine_scaled_reward/std": 0.39607998728752136, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1708.0, "completions/mean_length": 1756.5, "completions/mean_terminated_length": 1011.5555419921875, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 0.054857142857142854, "frac_reward_zero_std": 0.0, "grad_norm": 0.27563413977622986, "learning_rate": 9.399999999999999e-07, "loss": -0.0, "num_tokens": 6061423.0, "reward": -0.16147920489311218, "reward_std": 0.24055320024490356, "rewards/cosine_scaled_reward/mean": -0.16147920489311218, "rewards/cosine_scaled_reward/std": 0.3948959410190582, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1458.0, "completions/mean_length": 1538.078125, "completions/mean_terminated_length": 839.2963256835938, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.056, "frac_reward_zero_std": 0.0, "grad_norm": 0.27617642283439636, "learning_rate": 9.6e-07, "loss": -0.0, "num_tokens": 6169924.0, "reward": -0.18436825275421143, "reward_std": 0.27141550183296204, "rewards/cosine_scaled_reward/mean": -0.18436823785305023, "rewards/cosine_scaled_reward/std": 0.3920196294784546, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1938.0, "completions/mean_length": 1749.0625, "completions/mean_terminated_length": 772.5333862304688, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.05714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.23394836485385895, "learning_rate": 9.8e-07, "loss": 0.0, "num_tokens": 6292680.0, "reward": -0.10770958662033081, "reward_std": 0.22513547539710999, "rewards/cosine_scaled_reward/mean": -0.10770957916975021, "rewards/cosine_scaled_reward/std": 0.421062707901001, "step": 50 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 6292680, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }