| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.17142857142857143, |
| "eval_steps": 500, |
| "global_step": 150, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.671875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1734.0, |
| "completions/mean_length": 1702.03125, |
| "completions/mean_terminated_length": 993.6190795898438, |
| "completions/min_length": 483.0, |
| "completions/min_terminated_length": 483.0, |
| "epoch": 0.001142857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.28377610445022583, |
| "learning_rate": 0.0, |
| "loss": -0.0, |
| "num_tokens": 118418.0, |
| "reward": -0.09800112247467041, |
| "reward_std": 0.3028089702129364, |
| "rewards/cosine_scaled_reward/mean": -0.09800112992525101, |
| "rewards/cosine_scaled_reward/std": 0.37953105568885803, |
| "step": 1 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1894.0, |
| "completions/mean_length": 1738.90625, |
| "completions/mean_terminated_length": 949.0, |
| "completions/min_length": 435.0, |
| "completions/min_terminated_length": 435.0, |
| "epoch": 0.002285714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24221572279930115, |
| "learning_rate": 2e-08, |
| "loss": -0.0, |
| "num_tokens": 239748.0, |
| "reward": 0.020556632429361343, |
| "reward_std": 0.3545936942100525, |
| "rewards/cosine_scaled_reward/mean": 0.020556632429361343, |
| "rewards/cosine_scaled_reward/std": 0.4492928683757782, |
| "step": 2 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.921875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1542.0, |
| "completions/mean_length": 1964.078125, |
| "completions/mean_terminated_length": 973.7999877929688, |
| "completions/min_length": 733.0, |
| "completions/min_terminated_length": 733.0, |
| "epoch": 0.0034285714285714284, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2472974807024002, |
| "learning_rate": 4e-08, |
| "loss": 0.0, |
| "num_tokens": 375921.0, |
| "reward": -0.20954538881778717, |
| "reward_std": 0.13813795149326324, |
| "rewards/cosine_scaled_reward/mean": -0.20954540371894836, |
| "rewards/cosine_scaled_reward/std": 0.16814909875392914, |
| "step": 3 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.484375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2010.0, |
| "completions/mean_length": 1555.6875, |
| "completions/mean_terminated_length": 1093.212158203125, |
| "completions/min_length": 502.0, |
| "completions/min_terminated_length": 502.0, |
| "epoch": 0.004571428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2868657112121582, |
| "learning_rate": 6e-08, |
| "loss": -0.0, |
| "num_tokens": 485293.0, |
| "reward": -0.12192361056804657, |
| "reward_std": 0.31710442900657654, |
| "rewards/cosine_scaled_reward/mean": -0.12192361056804657, |
| "rewards/cosine_scaled_reward/std": 0.35428565740585327, |
| "step": 4 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1758.0, |
| "completions/mean_length": 1958.5625, |
| "completions/mean_terminated_length": 1332.5, |
| "completions/min_length": 932.0, |
| "completions/min_terminated_length": 932.0, |
| "epoch": 0.005714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2460148185491562, |
| "learning_rate": 8e-08, |
| "loss": -0.0, |
| "num_tokens": 621457.0, |
| "reward": -0.21145480871200562, |
| "reward_std": 0.14890719950199127, |
| "rewards/cosine_scaled_reward/mean": -0.21145479381084442, |
| "rewards/cosine_scaled_reward/std": 0.20399661362171173, |
| "step": 5 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1670.0, |
| "completions/mean_length": 1908.375, |
| "completions/mean_terminated_length": 931.0, |
| "completions/min_length": 593.0, |
| "completions/min_terminated_length": 593.0, |
| "epoch": 0.006857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26549720764160156, |
| "learning_rate": 1e-07, |
| "loss": -0.0, |
| "num_tokens": 755241.0, |
| "reward": -0.2408866286277771, |
| "reward_std": 0.16572487354278564, |
| "rewards/cosine_scaled_reward/mean": -0.2408866286277771, |
| "rewards/cosine_scaled_reward/std": 0.17492830753326416, |
| "step": 6 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.8125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1955.0, |
| "completions/mean_length": 1889.296875, |
| "completions/mean_terminated_length": 1201.5833740234375, |
| "completions/min_length": 396.0, |
| "completions/min_terminated_length": 396.0, |
| "epoch": 0.008, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23518230020999908, |
| "learning_rate": 1.2e-07, |
| "loss": 0.0, |
| "num_tokens": 886564.0, |
| "reward": -0.16087877750396729, |
| "reward_std": 0.24579641222953796, |
| "rewards/cosine_scaled_reward/mean": -0.16087877750396729, |
| "rewards/cosine_scaled_reward/std": 0.37339961528778076, |
| "step": 7 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1920.0, |
| "completions/mean_length": 1751.578125, |
| "completions/mean_terminated_length": 994.0555419921875, |
| "completions/min_length": 330.0, |
| "completions/min_terminated_length": 330.0, |
| "epoch": 0.009142857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2354528158903122, |
| "learning_rate": 1.4e-07, |
| "loss": 0.0, |
| "num_tokens": 1009081.0, |
| "reward": -0.023812226951122284, |
| "reward_std": 0.2823081314563751, |
| "rewards/cosine_scaled_reward/mean": -0.02381223440170288, |
| "rewards/cosine_scaled_reward/std": 0.4484662115573883, |
| "step": 8 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.9375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1889.0, |
| "completions/mean_length": 2000.59375, |
| "completions/mean_terminated_length": 1289.5, |
| "completions/min_length": 903.0, |
| "completions/min_terminated_length": 903.0, |
| "epoch": 0.010285714285714285, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24302220344543457, |
| "learning_rate": 1.6e-07, |
| "loss": 0.0, |
| "num_tokens": 1148575.0, |
| "reward": -0.2453702688217163, |
| "reward_std": 0.18811637163162231, |
| "rewards/cosine_scaled_reward/mean": -0.2453702688217163, |
| "rewards/cosine_scaled_reward/std": 0.22203005850315094, |
| "step": 9 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.703125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1739.0, |
| "completions/mean_length": 1701.140625, |
| "completions/mean_terminated_length": 879.631591796875, |
| "completions/min_length": 484.0, |
| "completions/min_terminated_length": 484.0, |
| "epoch": 0.011428571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25642141699790955, |
| "learning_rate": 1.8e-07, |
| "loss": -0.0, |
| "num_tokens": 1268280.0, |
| "reward": -0.15177705883979797, |
| "reward_std": 0.2125300019979477, |
| "rewards/cosine_scaled_reward/mean": -0.15177705883979797, |
| "rewards/cosine_scaled_reward/std": 0.3240113854408264, |
| "step": 10 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.890625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1683.0, |
| "completions/mean_length": 1950.609375, |
| "completions/mean_terminated_length": 1157.571533203125, |
| "completions/min_length": 584.0, |
| "completions/min_terminated_length": 584.0, |
| "epoch": 0.012571428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24372951686382294, |
| "learning_rate": 2e-07, |
| "loss": 0.0, |
| "num_tokens": 1404791.0, |
| "reward": -0.23502977192401886, |
| "reward_std": 0.18896539509296417, |
| "rewards/cosine_scaled_reward/mean": -0.23502977192401886, |
| "rewards/cosine_scaled_reward/std": 0.24224351346492767, |
| "step": 11 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.640625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1944.0, |
| "completions/mean_length": 1751.03125, |
| "completions/mean_terminated_length": 1221.6522216796875, |
| "completions/min_length": 489.0, |
| "completions/min_terminated_length": 489.0, |
| "epoch": 0.013714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.28422027826309204, |
| "learning_rate": 2.1999999999999998e-07, |
| "loss": -0.0, |
| "num_tokens": 1527801.0, |
| "reward": -0.14280016720294952, |
| "reward_std": 0.32843896746635437, |
| "rewards/cosine_scaled_reward/mean": -0.14280015230178833, |
| "rewards/cosine_scaled_reward/std": 0.41895967721939087, |
| "step": 12 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1793.0, |
| "completions/mean_length": 1834.453125, |
| "completions/mean_terminated_length": 1193.8125, |
| "completions/min_length": 783.0, |
| "completions/min_terminated_length": 783.0, |
| "epoch": 0.014857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24033738672733307, |
| "learning_rate": 2.4e-07, |
| "loss": 0.0, |
| "num_tokens": 1656246.0, |
| "reward": -0.17057427763938904, |
| "reward_std": 0.24429668486118317, |
| "rewards/cosine_scaled_reward/mean": -0.17057427763938904, |
| "rewards/cosine_scaled_reward/std": 0.27816399931907654, |
| "step": 13 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.734375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1474.0, |
| "completions/mean_length": 1800.65625, |
| "completions/mean_terminated_length": 1116.823486328125, |
| "completions/min_length": 495.0, |
| "completions/min_terminated_length": 495.0, |
| "epoch": 0.016, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2312558889389038, |
| "learning_rate": 2.6e-07, |
| "loss": 0.0, |
| "num_tokens": 1782096.0, |
| "reward": -0.11817245185375214, |
| "reward_std": 0.24491220712661743, |
| "rewards/cosine_scaled_reward/mean": -0.11817245930433273, |
| "rewards/cosine_scaled_reward/std": 0.3942086696624756, |
| "step": 14 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1636.0, |
| "completions/mean_length": 1692.828125, |
| "completions/mean_terminated_length": 785.1666870117188, |
| "completions/min_length": 438.0, |
| "completions/min_terminated_length": 438.0, |
| "epoch": 0.017142857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2563658654689789, |
| "learning_rate": 2.8e-07, |
| "loss": -0.0, |
| "num_tokens": 1901357.0, |
| "reward": -0.027107469737529755, |
| "reward_std": 0.1853453516960144, |
| "rewards/cosine_scaled_reward/mean": -0.027107462286949158, |
| "rewards/cosine_scaled_reward/std": 0.4734213352203369, |
| "step": 15 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 2048.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 2048.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.018285714285714287, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24149107933044434, |
| "learning_rate": 3e-07, |
| "loss": -0.0, |
| "num_tokens": 2042869.0, |
| "reward": -0.2542623281478882, |
| "reward_std": 0.14302438497543335, |
| "rewards/cosine_scaled_reward/mean": -0.2542623281478882, |
| "rewards/cosine_scaled_reward/std": 0.160969540476799, |
| "step": 16 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.578125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1827.0, |
| "completions/mean_length": 1548.75, |
| "completions/mean_terminated_length": 864.5925903320312, |
| "completions/min_length": 357.0, |
| "completions/min_terminated_length": 357.0, |
| "epoch": 0.019428571428571427, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.31088724732398987, |
| "learning_rate": 3.2e-07, |
| "loss": 0.0, |
| "num_tokens": 2152509.0, |
| "reward": -0.12113451957702637, |
| "reward_std": 0.284165620803833, |
| "rewards/cosine_scaled_reward/mean": -0.12113452702760696, |
| "rewards/cosine_scaled_reward/std": 0.4259316623210907, |
| "step": 17 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1899.0, |
| "completions/mean_length": 1793.03125, |
| "completions/mean_terminated_length": 1028.125, |
| "completions/min_length": 531.0, |
| "completions/min_terminated_length": 531.0, |
| "epoch": 0.02057142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2451843023300171, |
| "learning_rate": 3.4000000000000003e-07, |
| "loss": 0.0, |
| "num_tokens": 2277639.0, |
| "reward": -0.18317042291164398, |
| "reward_std": 0.20634235441684723, |
| "rewards/cosine_scaled_reward/mean": -0.18317043781280518, |
| "rewards/cosine_scaled_reward/std": 0.27781662344932556, |
| "step": 18 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.703125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1633.0, |
| "completions/mean_length": 1735.984375, |
| "completions/mean_terminated_length": 997.0, |
| "completions/min_length": 462.0, |
| "completions/min_terminated_length": 462.0, |
| "epoch": 0.021714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24677637219429016, |
| "learning_rate": 3.6e-07, |
| "loss": 0.0, |
| "num_tokens": 2399998.0, |
| "reward": -0.04996331408619881, |
| "reward_std": 0.2841629385948181, |
| "rewards/cosine_scaled_reward/mean": -0.04996330291032791, |
| "rewards/cosine_scaled_reward/std": 0.4186851680278778, |
| "step": 19 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.640625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1643.0, |
| "completions/mean_length": 1614.890625, |
| "completions/mean_terminated_length": 842.8261108398438, |
| "completions/min_length": 411.0, |
| "completions/min_terminated_length": 411.0, |
| "epoch": 0.022857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2543003559112549, |
| "learning_rate": 3.7999999999999996e-07, |
| "loss": -0.0, |
| "num_tokens": 2514703.0, |
| "reward": -0.09282197058200836, |
| "reward_std": 0.2568933367729187, |
| "rewards/cosine_scaled_reward/mean": -0.09282197058200836, |
| "rewards/cosine_scaled_reward/std": 0.4104878604412079, |
| "step": 20 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1988.0, |
| "completions/mean_length": 1786.734375, |
| "completions/mean_terminated_length": 1119.0555419921875, |
| "completions/min_length": 348.0, |
| "completions/min_terminated_length": 348.0, |
| "epoch": 0.024, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3147278130054474, |
| "learning_rate": 4e-07, |
| "loss": 0.0, |
| "num_tokens": 2639862.0, |
| "reward": -0.16029146313667297, |
| "reward_std": 0.2322564721107483, |
| "rewards/cosine_scaled_reward/mean": -0.16029146313667297, |
| "rewards/cosine_scaled_reward/std": 0.36191171407699585, |
| "step": 21 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.40625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1833.0, |
| "completions/mean_length": 1300.484375, |
| "completions/mean_terminated_length": 789.0263061523438, |
| "completions/min_length": 287.0, |
| "completions/min_terminated_length": 287.0, |
| "epoch": 0.025142857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.32522445917129517, |
| "learning_rate": 4.1999999999999995e-07, |
| "loss": 0.0, |
| "num_tokens": 2732109.0, |
| "reward": 0.0033364146947860718, |
| "reward_std": 0.18878400325775146, |
| "rewards/cosine_scaled_reward/mean": 0.0033364109694957733, |
| "rewards/cosine_scaled_reward/std": 0.45390966534614563, |
| "step": 22 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.59375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1920.0, |
| "completions/mean_length": 1641.03125, |
| "completions/mean_terminated_length": 1046.2308349609375, |
| "completions/min_length": 422.0, |
| "completions/min_terminated_length": 422.0, |
| "epoch": 0.026285714285714287, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.28244850039482117, |
| "learning_rate": 4.3999999999999997e-07, |
| "loss": 0.0, |
| "num_tokens": 2847927.0, |
| "reward": -0.21077856421470642, |
| "reward_std": 0.24399788677692413, |
| "rewards/cosine_scaled_reward/mean": -0.21077856421470642, |
| "rewards/cosine_scaled_reward/std": 0.2925592362880707, |
| "step": 23 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1965.0, |
| "completions/mean_length": 1789.59375, |
| "completions/mean_terminated_length": 1129.2222900390625, |
| "completions/min_length": 560.0, |
| "completions/min_terminated_length": 560.0, |
| "epoch": 0.027428571428571427, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24896308779716492, |
| "learning_rate": 4.6e-07, |
| "loss": -0.0, |
| "num_tokens": 2973389.0, |
| "reward": -0.1665852814912796, |
| "reward_std": 0.307574987411499, |
| "rewards/cosine_scaled_reward/mean": -0.1665852665901184, |
| "rewards/cosine_scaled_reward/std": 0.4072873294353485, |
| "step": 24 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.65625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1851.0, |
| "completions/mean_length": 1696.40625, |
| "completions/mean_terminated_length": 1025.181884765625, |
| "completions/min_length": 434.0, |
| "completions/min_terminated_length": 434.0, |
| "epoch": 0.02857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.262716144323349, |
| "learning_rate": 4.8e-07, |
| "loss": 0.0, |
| "num_tokens": 3092255.0, |
| "reward": -0.14361324906349182, |
| "reward_std": 0.3466429114341736, |
| "rewards/cosine_scaled_reward/mean": -0.14361326396465302, |
| "rewards/cosine_scaled_reward/std": 0.3933021128177643, |
| "step": 25 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1975.0, |
| "completions/mean_length": 1973.046875, |
| "completions/mean_terminated_length": 1448.375, |
| "completions/min_length": 1035.0, |
| "completions/min_terminated_length": 1035.0, |
| "epoch": 0.029714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2365841567516327, |
| "learning_rate": 5e-07, |
| "loss": -0.0, |
| "num_tokens": 3229162.0, |
| "reward": -0.050574399530887604, |
| "reward_std": 0.22459164261817932, |
| "rewards/cosine_scaled_reward/mean": -0.050574399530887604, |
| "rewards/cosine_scaled_reward/std": 0.37290775775909424, |
| "step": 26 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.796875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1982.0, |
| "completions/mean_length": 1878.53125, |
| "completions/mean_terminated_length": 1213.6923828125, |
| "completions/min_length": 498.0, |
| "completions/min_terminated_length": 498.0, |
| "epoch": 0.030857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2821083068847656, |
| "learning_rate": 5.2e-07, |
| "loss": 0.0, |
| "num_tokens": 3359676.0, |
| "reward": -0.13096781075000763, |
| "reward_std": 0.26249831914901733, |
| "rewards/cosine_scaled_reward/mean": -0.13096781075000763, |
| "rewards/cosine_scaled_reward/std": 0.3478032350540161, |
| "step": 27 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.78125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1933.0, |
| "completions/mean_length": 1827.453125, |
| "completions/mean_terminated_length": 1039.7857666015625, |
| "completions/min_length": 419.0, |
| "completions/min_terminated_length": 419.0, |
| "epoch": 0.032, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2539210915565491, |
| "learning_rate": 5.4e-07, |
| "loss": 0.0, |
| "num_tokens": 3486969.0, |
| "reward": -0.11822876334190369, |
| "reward_std": 0.2370690554380417, |
| "rewards/cosine_scaled_reward/mean": -0.11822875589132309, |
| "rewards/cosine_scaled_reward/std": 0.4236762225627899, |
| "step": 28 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.9375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2047.0, |
| "completions/mean_length": 2020.5, |
| "completions/mean_terminated_length": 1608.0, |
| "completions/min_length": 887.0, |
| "completions/min_terminated_length": 887.0, |
| "epoch": 0.03314285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23259545862674713, |
| "learning_rate": 5.6e-07, |
| "loss": -0.0, |
| "num_tokens": 3626753.0, |
| "reward": -0.20220182836055756, |
| "reward_std": 0.15910759568214417, |
| "rewards/cosine_scaled_reward/mean": -0.20220182836055756, |
| "rewards/cosine_scaled_reward/std": 0.20781411230564117, |
| "step": 29 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.828125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1662.0, |
| "completions/mean_length": 1903.703125, |
| "completions/mean_terminated_length": 1208.45458984375, |
| "completions/min_length": 961.0, |
| "completions/min_terminated_length": 961.0, |
| "epoch": 0.03428571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24027252197265625, |
| "learning_rate": 5.8e-07, |
| "loss": 0.0, |
| "num_tokens": 3759126.0, |
| "reward": -0.19193249940872192, |
| "reward_std": 0.24584847688674927, |
| "rewards/cosine_scaled_reward/mean": -0.19193249940872192, |
| "rewards/cosine_scaled_reward/std": 0.28378522396087646, |
| "step": 30 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.796875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1922.0, |
| "completions/mean_length": 1847.34375, |
| "completions/mean_terminated_length": 1060.1539306640625, |
| "completions/min_length": 311.0, |
| "completions/min_terminated_length": 311.0, |
| "epoch": 0.03542857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2703397274017334, |
| "learning_rate": 6e-07, |
| "loss": -0.0, |
| "num_tokens": 3887852.0, |
| "reward": -0.25379180908203125, |
| "reward_std": 0.24661941826343536, |
| "rewards/cosine_scaled_reward/mean": -0.25379180908203125, |
| "rewards/cosine_scaled_reward/std": 0.29188498854637146, |
| "step": 31 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.828125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1995.0, |
| "completions/mean_length": 1950.3125, |
| "completions/mean_terminated_length": 1479.6363525390625, |
| "completions/min_length": 766.0, |
| "completions/min_terminated_length": 766.0, |
| "epoch": 0.036571428571428574, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.21763876080513, |
| "learning_rate": 6.2e-07, |
| "loss": -0.0, |
| "num_tokens": 4023024.0, |
| "reward": -0.16017228364944458, |
| "reward_std": 0.2255343496799469, |
| "rewards/cosine_scaled_reward/mean": -0.16017228364944458, |
| "rewards/cosine_scaled_reward/std": 0.3709539771080017, |
| "step": 32 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1957.0, |
| "completions/mean_length": 1996.28125, |
| "completions/mean_terminated_length": 1634.25, |
| "completions/min_length": 1237.0, |
| "completions/min_terminated_length": 1237.0, |
| "epoch": 0.037714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22758260369300842, |
| "learning_rate": 6.4e-07, |
| "loss": -0.0, |
| "num_tokens": 4162002.0, |
| "reward": -0.20318198204040527, |
| "reward_std": 0.18396919965744019, |
| "rewards/cosine_scaled_reward/mean": -0.20318198204040527, |
| "rewards/cosine_scaled_reward/std": 0.34913352131843567, |
| "step": 33 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.578125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1850.0, |
| "completions/mean_length": 1703.265625, |
| "completions/mean_terminated_length": 1230.851806640625, |
| "completions/min_length": 651.0, |
| "completions/min_terminated_length": 651.0, |
| "epoch": 0.038857142857142854, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.31658875942230225, |
| "learning_rate": 6.6e-07, |
| "loss": -0.0, |
| "num_tokens": 4280563.0, |
| "reward": -0.05977274850010872, |
| "reward_std": 0.30437377095222473, |
| "rewards/cosine_scaled_reward/mean": -0.059772733598947525, |
| "rewards/cosine_scaled_reward/std": 0.4424094259738922, |
| "step": 34 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.8125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1846.0, |
| "completions/mean_length": 1807.546875, |
| "completions/mean_terminated_length": 765.5833740234375, |
| "completions/min_length": 419.0, |
| "completions/min_terminated_length": 419.0, |
| "epoch": 0.04, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2792847156524658, |
| "learning_rate": 6.800000000000001e-07, |
| "loss": -0.0, |
| "num_tokens": 4407742.0, |
| "reward": -0.18658886849880219, |
| "reward_std": 0.2910658121109009, |
| "rewards/cosine_scaled_reward/mean": -0.18658888339996338, |
| "rewards/cosine_scaled_reward/std": 0.34802255034446716, |
| "step": 35 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.921875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1912.0, |
| "completions/mean_length": 1995.65625, |
| "completions/mean_terminated_length": 1378.0, |
| "completions/min_length": 1090.0, |
| "completions/min_terminated_length": 1090.0, |
| "epoch": 0.04114285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23547738790512085, |
| "learning_rate": 7e-07, |
| "loss": 0.0, |
| "num_tokens": 4546576.0, |
| "reward": -0.23918019235134125, |
| "reward_std": 0.19598917663097382, |
| "rewards/cosine_scaled_reward/mean": -0.23918019235134125, |
| "rewards/cosine_scaled_reward/std": 0.2425125539302826, |
| "step": 36 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.90625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2015.0, |
| "completions/mean_length": 1994.75, |
| "completions/mean_terminated_length": 1480.0, |
| "completions/min_length": 545.0, |
| "completions/min_terminated_length": 545.0, |
| "epoch": 0.04228571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22962674498558044, |
| "learning_rate": 7.2e-07, |
| "loss": -0.0, |
| "num_tokens": 4685264.0, |
| "reward": -0.25335729122161865, |
| "reward_std": 0.15323391556739807, |
| "rewards/cosine_scaled_reward/mean": -0.25335729122161865, |
| "rewards/cosine_scaled_reward/std": 0.17556406557559967, |
| "step": 37 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.890625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1934.0, |
| "completions/mean_length": 1957.484375, |
| "completions/mean_terminated_length": 1220.4285888671875, |
| "completions/min_length": 965.0, |
| "completions/min_terminated_length": 965.0, |
| "epoch": 0.04342857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24781912565231323, |
| "learning_rate": 7.4e-07, |
| "loss": -0.0, |
| "num_tokens": 4822255.0, |
| "reward": -0.13536512851715088, |
| "reward_std": 0.19208545982837677, |
| "rewards/cosine_scaled_reward/mean": -0.13536511361598969, |
| "rewards/cosine_scaled_reward/std": 0.30052343010902405, |
| "step": 38 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1824.0, |
| "completions/mean_length": 1744.421875, |
| "completions/mean_terminated_length": 833.6875, |
| "completions/min_length": 317.0, |
| "completions/min_terminated_length": 317.0, |
| "epoch": 0.044571428571428574, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2562144994735718, |
| "learning_rate": 7.599999999999999e-07, |
| "loss": -0.0, |
| "num_tokens": 4944682.0, |
| "reward": -0.041110455989837646, |
| "reward_std": 0.21381449699401855, |
| "rewards/cosine_scaled_reward/mean": -0.04111045226454735, |
| "rewards/cosine_scaled_reward/std": 0.35980772972106934, |
| "step": 39 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.734375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2043.0, |
| "completions/mean_length": 1774.359375, |
| "completions/mean_terminated_length": 1017.8235473632812, |
| "completions/min_length": 445.0, |
| "completions/min_terminated_length": 445.0, |
| "epoch": 0.045714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25478634238243103, |
| "learning_rate": 7.799999999999999e-07, |
| "loss": 0.0, |
| "num_tokens": 5068313.0, |
| "reward": -0.12165145576000214, |
| "reward_std": 0.17204006016254425, |
| "rewards/cosine_scaled_reward/mean": -0.12165144830942154, |
| "rewards/cosine_scaled_reward/std": 0.4099982678890228, |
| "step": 40 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.640625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1991.0, |
| "completions/mean_length": 1814.375, |
| "completions/mean_terminated_length": 1397.9130859375, |
| "completions/min_length": 968.0, |
| "completions/min_terminated_length": 968.0, |
| "epoch": 0.046857142857142854, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.21750310063362122, |
| "learning_rate": 8e-07, |
| "loss": 0.0, |
| "num_tokens": 5195585.0, |
| "reward": -0.25668060779571533, |
| "reward_std": 0.2832298278808594, |
| "rewards/cosine_scaled_reward/mean": -0.25668060779571533, |
| "rewards/cosine_scaled_reward/std": 0.3347759544849396, |
| "step": 41 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.765625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1764.0, |
| "completions/mean_length": 1714.59375, |
| "completions/mean_terminated_length": 625.4666748046875, |
| "completions/min_length": 186.0, |
| "completions/min_terminated_length": 186.0, |
| "epoch": 0.048, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.34486907720565796, |
| "learning_rate": 8.199999999999999e-07, |
| "loss": -0.0, |
| "num_tokens": 5315679.0, |
| "reward": -0.2253742218017578, |
| "reward_std": 0.1778060495853424, |
| "rewards/cosine_scaled_reward/mean": -0.22537420690059662, |
| "rewards/cosine_scaled_reward/std": 0.19647939503192902, |
| "step": 42 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.828125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1638.0, |
| "completions/mean_length": 1863.78125, |
| "completions/mean_terminated_length": 976.1818237304688, |
| "completions/min_length": 669.0, |
| "completions/min_terminated_length": 669.0, |
| "epoch": 0.04914285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23907455801963806, |
| "learning_rate": 8.399999999999999e-07, |
| "loss": 0.0, |
| "num_tokens": 5446577.0, |
| "reward": -0.1142776757478714, |
| "reward_std": 0.21804723143577576, |
| "rewards/cosine_scaled_reward/mean": -0.1142776757478714, |
| "rewards/cosine_scaled_reward/std": 0.3637608587741852, |
| "step": 43 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1920.0, |
| "completions/mean_length": 1771.125, |
| "completions/mean_terminated_length": 940.5, |
| "completions/min_length": 344.0, |
| "completions/min_terminated_length": 344.0, |
| "epoch": 0.05028571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2888188362121582, |
| "learning_rate": 8.599999999999999e-07, |
| "loss": 0.0, |
| "num_tokens": 5570625.0, |
| "reward": -0.11845305562019348, |
| "reward_std": 0.2729855477809906, |
| "rewards/cosine_scaled_reward/mean": -0.11845306307077408, |
| "rewards/cosine_scaled_reward/std": 0.4279690086841583, |
| "step": 44 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.96875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1296.0, |
| "completions/mean_length": 2020.859375, |
| "completions/mean_terminated_length": 1179.5, |
| "completions/min_length": 1063.0, |
| "completions/min_terminated_length": 1063.0, |
| "epoch": 0.05142857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2232045829296112, |
| "learning_rate": 8.799999999999999e-07, |
| "loss": 0.0, |
| "num_tokens": 5711616.0, |
| "reward": -0.1830526441335678, |
| "reward_std": 0.20074567198753357, |
| "rewards/cosine_scaled_reward/mean": -0.1830526441335678, |
| "rewards/cosine_scaled_reward/std": 0.3221423327922821, |
| "step": 45 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.828125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1121.0, |
| "completions/mean_length": 1843.328125, |
| "completions/mean_terminated_length": 857.1818237304688, |
| "completions/min_length": 608.0, |
| "completions/min_terminated_length": 608.0, |
| "epoch": 0.052571428571428575, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2569328844547272, |
| "learning_rate": 9e-07, |
| "loss": 0.0, |
| "num_tokens": 5840757.0, |
| "reward": -0.21247822046279907, |
| "reward_std": 0.17188501358032227, |
| "rewards/cosine_scaled_reward/mean": -0.21247822046279907, |
| "rewards/cosine_scaled_reward/std": 0.183182492852211, |
| "step": 46 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.734375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2047.0, |
| "completions/mean_length": 1772.984375, |
| "completions/mean_terminated_length": 1012.6470336914062, |
| "completions/min_length": 461.0, |
| "completions/min_terminated_length": 461.0, |
| "epoch": 0.053714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2800576090812683, |
| "learning_rate": 9.2e-07, |
| "loss": -0.0, |
| "num_tokens": 5964628.0, |
| "reward": -0.1755329668521881, |
| "reward_std": 0.19662824273109436, |
| "rewards/cosine_scaled_reward/mean": -0.1755329668521881, |
| "rewards/cosine_scaled_reward/std": 0.3987559974193573, |
| "step": 47 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1949.0, |
| "completions/mean_length": 1787.046875, |
| "completions/mean_terminated_length": 1120.1666259765625, |
| "completions/min_length": 630.0, |
| "completions/min_terminated_length": 630.0, |
| "epoch": 0.054857142857142854, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2499135434627533, |
| "learning_rate": 9.399999999999999e-07, |
| "loss": -0.0, |
| "num_tokens": 6089543.0, |
| "reward": -0.07469595968723297, |
| "reward_std": 0.2802818715572357, |
| "rewards/cosine_scaled_reward/mean": -0.07469595968723297, |
| "rewards/cosine_scaled_reward/std": 0.39331451058387756, |
| "step": 48 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.578125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1818.0, |
| "completions/mean_length": 1611.65625, |
| "completions/mean_terminated_length": 1013.7037353515625, |
| "completions/min_length": 298.0, |
| "completions/min_terminated_length": 298.0, |
| "epoch": 0.056, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2976716160774231, |
| "learning_rate": 9.6e-07, |
| "loss": -0.0, |
| "num_tokens": 6202753.0, |
| "reward": -0.14219576120376587, |
| "reward_std": 0.3252427875995636, |
| "rewards/cosine_scaled_reward/mean": -0.14219576120376587, |
| "rewards/cosine_scaled_reward/std": 0.41946855187416077, |
| "step": 49 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.828125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1916.0, |
| "completions/mean_length": 1826.90625, |
| "completions/mean_terminated_length": 761.6364135742188, |
| "completions/min_length": 341.0, |
| "completions/min_terminated_length": 341.0, |
| "epoch": 0.05714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2344626933336258, |
| "learning_rate": 9.8e-07, |
| "loss": -0.0, |
| "num_tokens": 6330491.0, |
| "reward": -0.098542720079422, |
| "reward_std": 0.20483215153217316, |
| "rewards/cosine_scaled_reward/mean": -0.0985427126288414, |
| "rewards/cosine_scaled_reward/std": 0.396296888589859, |
| "step": 50 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.53125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2034.0, |
| "completions/mean_length": 1520.1875, |
| "completions/mean_terminated_length": 922.0000610351562, |
| "completions/min_length": 450.0, |
| "completions/min_terminated_length": 450.0, |
| "epoch": 0.05828571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.30348992347717285, |
| "learning_rate": 1e-06, |
| "loss": 0.0, |
| "num_tokens": 6437991.0, |
| "reward": -0.12996003031730652, |
| "reward_std": 0.2803010940551758, |
| "rewards/cosine_scaled_reward/mean": -0.12996003031730652, |
| "rewards/cosine_scaled_reward/std": 0.3464147746562958, |
| "step": 51 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.734375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1930.0, |
| "completions/mean_length": 1726.71875, |
| "completions/mean_terminated_length": 838.4705810546875, |
| "completions/min_length": 315.0, |
| "completions/min_terminated_length": 315.0, |
| "epoch": 0.05942857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2905585765838623, |
| "learning_rate": 9.999890338174275e-07, |
| "loss": -0.0, |
| "num_tokens": 6559853.0, |
| "reward": -0.2443142831325531, |
| "reward_std": 0.21010473370552063, |
| "rewards/cosine_scaled_reward/mean": -0.2443142831325531, |
| "rewards/cosine_scaled_reward/std": 0.32864055037498474, |
| "step": 52 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.734375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1585.0, |
| "completions/mean_length": 1757.015625, |
| "completions/mean_terminated_length": 952.5294189453125, |
| "completions/min_length": 463.0, |
| "completions/min_terminated_length": 463.0, |
| "epoch": 0.060571428571428575, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2699633538722992, |
| "learning_rate": 9.999561358041868e-07, |
| "loss": 0.0, |
| "num_tokens": 6683134.0, |
| "reward": -0.18116676807403564, |
| "reward_std": 0.2308851182460785, |
| "rewards/cosine_scaled_reward/mean": -0.18116676807403564, |
| "rewards/cosine_scaled_reward/std": 0.27486056089401245, |
| "step": 53 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.765625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2018.0, |
| "completions/mean_length": 1850.65625, |
| "completions/mean_terminated_length": 1206.0001220703125, |
| "completions/min_length": 695.0, |
| "completions/min_terminated_length": 695.0, |
| "epoch": 0.061714285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23383355140686035, |
| "learning_rate": 9.999013075636804e-07, |
| "loss": -0.0, |
| "num_tokens": 6812720.0, |
| "reward": -0.14257444441318512, |
| "reward_std": 0.29668545722961426, |
| "rewards/cosine_scaled_reward/mean": -0.14257442951202393, |
| "rewards/cosine_scaled_reward/std": 0.4257228672504425, |
| "step": 54 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1449.0, |
| "completions/mean_length": 1754.640625, |
| "completions/mean_terminated_length": 874.5625, |
| "completions/min_length": 581.0, |
| "completions/min_terminated_length": 581.0, |
| "epoch": 0.06285714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23320119082927704, |
| "learning_rate": 9.998245517681593e-07, |
| "loss": -0.0, |
| "num_tokens": 6935305.0, |
| "reward": -0.14078931510448456, |
| "reward_std": 0.17466726899147034, |
| "rewards/cosine_scaled_reward/mean": -0.14078931510448456, |
| "rewards/cosine_scaled_reward/std": 0.3331747353076935, |
| "step": 55 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.828125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1918.0, |
| "completions/mean_length": 1853.78125, |
| "completions/mean_terminated_length": 918.0, |
| "completions/min_length": 571.0, |
| "completions/min_terminated_length": 571.0, |
| "epoch": 0.064, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23405365645885468, |
| "learning_rate": 9.997258721585931e-07, |
| "loss": -0.0, |
| "num_tokens": 7064907.0, |
| "reward": -0.11611534655094147, |
| "reward_std": 0.19285616278648376, |
| "rewards/cosine_scaled_reward/mean": -0.11611534655094147, |
| "rewards/cosine_scaled_reward/std": 0.47406119108200073, |
| "step": 56 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1851.0, |
| "completions/mean_length": 1971.640625, |
| "completions/mean_terminated_length": 1437.125, |
| "completions/min_length": 1009.0, |
| "completions/min_terminated_length": 1009.0, |
| "epoch": 0.06514285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.20449356734752655, |
| "learning_rate": 9.996052735444862e-07, |
| "loss": 0.0, |
| "num_tokens": 7202660.0, |
| "reward": -0.27627938985824585, |
| "reward_std": 0.2080146074295044, |
| "rewards/cosine_scaled_reward/mean": -0.27627938985824585, |
| "rewards/cosine_scaled_reward/std": 0.2397139072418213, |
| "step": 57 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.65625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1813.0, |
| "completions/mean_length": 1678.09375, |
| "completions/mean_terminated_length": 971.9091186523438, |
| "completions/min_length": 540.0, |
| "completions/min_terminated_length": 540.0, |
| "epoch": 0.06628571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.251164048910141, |
| "learning_rate": 9.994627618036452e-07, |
| "loss": -0.0, |
| "num_tokens": 7320154.0, |
| "reward": -0.1333095282316208, |
| "reward_std": 0.27265745401382446, |
| "rewards/cosine_scaled_reward/mean": -0.1333095282316208, |
| "rewards/cosine_scaled_reward/std": 0.3821713328361511, |
| "step": 58 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.734375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1453.0, |
| "completions/mean_length": 1732.171875, |
| "completions/mean_terminated_length": 859.0, |
| "completions/min_length": 531.0, |
| "completions/min_terminated_length": 531.0, |
| "epoch": 0.06742857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22478283941745758, |
| "learning_rate": 9.992983438818915e-07, |
| "loss": -0.0, |
| "num_tokens": 7441477.0, |
| "reward": -0.18278491497039795, |
| "reward_std": 0.2154037207365036, |
| "rewards/cosine_scaled_reward/mean": -0.18278491497039795, |
| "rewards/cosine_scaled_reward/std": 0.3414745628833771, |
| "step": 59 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.765625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1935.0, |
| "completions/mean_length": 1798.375, |
| "completions/mean_terminated_length": 982.9334106445312, |
| "completions/min_length": 613.0, |
| "completions/min_terminated_length": 613.0, |
| "epoch": 0.06857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22602440416812897, |
| "learning_rate": 9.991120277927223e-07, |
| "loss": -0.0, |
| "num_tokens": 7567461.0, |
| "reward": -0.265900194644928, |
| "reward_std": 0.1530904918909073, |
| "rewards/cosine_scaled_reward/mean": -0.265900194644928, |
| "rewards/cosine_scaled_reward/std": 0.18254056572914124, |
| "step": 60 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.84375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1974.0, |
| "completions/mean_length": 1950.578125, |
| "completions/mean_terminated_length": 1424.5, |
| "completions/min_length": 808.0, |
| "completions/min_terminated_length": 808.0, |
| "epoch": 0.06971428571428571, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22684067487716675, |
| "learning_rate": 9.989038226169207e-07, |
| "loss": -0.0, |
| "num_tokens": 7703818.0, |
| "reward": -0.05269922316074371, |
| "reward_std": 0.3038993775844574, |
| "rewards/cosine_scaled_reward/mean": -0.052699219435453415, |
| "rewards/cosine_scaled_reward/std": 0.36445698142051697, |
| "step": 61 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.671875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1987.0, |
| "completions/mean_length": 1717.703125, |
| "completions/mean_terminated_length": 1041.3809814453125, |
| "completions/min_length": 432.0, |
| "completions/min_terminated_length": 432.0, |
| "epoch": 0.07085714285714285, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23552638292312622, |
| "learning_rate": 9.98673738502114e-07, |
| "loss": 0.0, |
| "num_tokens": 7823983.0, |
| "reward": -0.07779724895954132, |
| "reward_std": 0.2913648784160614, |
| "rewards/cosine_scaled_reward/mean": -0.07779725641012192, |
| "rewards/cosine_scaled_reward/std": 0.4099881649017334, |
| "step": 62 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.484375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1751.0, |
| "completions/mean_length": 1600.625, |
| "completions/mean_terminated_length": 1180.3636474609375, |
| "completions/min_length": 420.0, |
| "completions/min_terminated_length": 420.0, |
| "epoch": 0.072, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.28230276703834534, |
| "learning_rate": 9.98421786662277e-07, |
| "loss": 0.0, |
| "num_tokens": 7936679.0, |
| "reward": -0.02632874622941017, |
| "reward_std": 0.25066205859184265, |
| "rewards/cosine_scaled_reward/mean": -0.02632874995470047, |
| "rewards/cosine_scaled_reward/std": 0.4263686537742615, |
| "step": 63 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.6875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2047.0, |
| "completions/mean_length": 1743.578125, |
| "completions/mean_terminated_length": 1073.8499755859375, |
| "completions/min_length": 496.0, |
| "completions/min_terminated_length": 496.0, |
| "epoch": 0.07314285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.266590416431427, |
| "learning_rate": 9.981479793771866e-07, |
| "loss": 0.0, |
| "num_tokens": 8059220.0, |
| "reward": -0.10920079052448273, |
| "reward_std": 0.3089619576931, |
| "rewards/cosine_scaled_reward/mean": -0.10920079052448273, |
| "rewards/cosine_scaled_reward/std": 0.43342384696006775, |
| "step": 64 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1127.0, |
| "completions/mean_length": 1690.609375, |
| "completions/mean_terminated_length": 618.4375, |
| "completions/min_length": 331.0, |
| "completions/min_terminated_length": 331.0, |
| "epoch": 0.07428571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2891872525215149, |
| "learning_rate": 9.97852329991824e-07, |
| "loss": 0.0, |
| "num_tokens": 8178123.0, |
| "reward": -0.2091352641582489, |
| "reward_std": 0.18792679905891418, |
| "rewards/cosine_scaled_reward/mean": -0.2091352641582489, |
| "rewards/cosine_scaled_reward/std": 0.40636762976646423, |
| "step": 65 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.46875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1529.0, |
| "completions/mean_length": 1320.453125, |
| "completions/mean_terminated_length": 678.5, |
| "completions/min_length": 219.0, |
| "completions/min_terminated_length": 219.0, |
| "epoch": 0.07542857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.30139341950416565, |
| "learning_rate": 9.975348529157229e-07, |
| "loss": -0.0, |
| "num_tokens": 8272864.0, |
| "reward": -0.012375831604003906, |
| "reward_std": 0.2539718747138977, |
| "rewards/cosine_scaled_reward/mean": -0.01237582415342331, |
| "rewards/cosine_scaled_reward/std": 0.45652061700820923, |
| "step": 66 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.953125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1368.0, |
| "completions/mean_length": 2001.21875, |
| "completions/mean_terminated_length": 1050.0, |
| "completions/min_length": 817.0, |
| "completions/min_terminated_length": 817.0, |
| "epoch": 0.07657142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.21435414254665375, |
| "learning_rate": 9.971955636222684e-07, |
| "loss": 0.0, |
| "num_tokens": 8411678.0, |
| "reward": -0.27966073155403137, |
| "reward_std": 0.14496129751205444, |
| "rewards/cosine_scaled_reward/mean": -0.27966073155403137, |
| "rewards/cosine_scaled_reward/std": 0.1733873188495636, |
| "step": 67 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.453125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1963.0, |
| "completions/mean_length": 1359.65625, |
| "completions/mean_terminated_length": 789.3142700195312, |
| "completions/min_length": 347.0, |
| "completions/min_terminated_length": 347.0, |
| "epoch": 0.07771428571428571, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3244759440422058, |
| "learning_rate": 9.968344786479415e-07, |
| "loss": -0.0, |
| "num_tokens": 8507952.0, |
| "reward": -0.06231251358985901, |
| "reward_std": 0.31347835063934326, |
| "rewards/cosine_scaled_reward/mean": -0.062312521040439606, |
| "rewards/cosine_scaled_reward/std": 0.40184450149536133, |
| "step": 68 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.609375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1859.0, |
| "completions/mean_length": 1572.78125, |
| "completions/mean_terminated_length": 831.4400024414062, |
| "completions/min_length": 358.0, |
| "completions/min_terminated_length": 358.0, |
| "epoch": 0.07885714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3429071605205536, |
| "learning_rate": 9.964516155915151e-07, |
| "loss": 0.0, |
| "num_tokens": 8618954.0, |
| "reward": -0.24097035825252533, |
| "reward_std": 0.22784993052482605, |
| "rewards/cosine_scaled_reward/mean": -0.24097035825252533, |
| "rewards/cosine_scaled_reward/std": 0.2594495415687561, |
| "step": 69 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.828125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1921.0, |
| "completions/mean_length": 1859.578125, |
| "completions/mean_terminated_length": 951.727294921875, |
| "completions/min_length": 532.0, |
| "completions/min_terminated_length": 532.0, |
| "epoch": 0.08, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.221941277384758, |
| "learning_rate": 9.960469931131936e-07, |
| "loss": -0.0, |
| "num_tokens": 8749423.0, |
| "reward": -0.27105003595352173, |
| "reward_std": 0.16835230588912964, |
| "rewards/cosine_scaled_reward/mean": -0.27105003595352173, |
| "rewards/cosine_scaled_reward/std": 0.21196867525577545, |
| "step": 70 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.6875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1629.0, |
| "completions/mean_length": 1668.265625, |
| "completions/mean_terminated_length": 832.8500366210938, |
| "completions/min_length": 489.0, |
| "completions/min_terminated_length": 489.0, |
| "epoch": 0.08114285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2909034192562103, |
| "learning_rate": 9.956206309337066e-07, |
| "loss": -0.0, |
| "num_tokens": 8866912.0, |
| "reward": -0.09497882425785065, |
| "reward_std": 0.2813299000263214, |
| "rewards/cosine_scaled_reward/mean": -0.09497880935668945, |
| "rewards/cosine_scaled_reward/std": 0.4832696318626404, |
| "step": 71 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.6875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1903.0, |
| "completions/mean_length": 1697.671875, |
| "completions/mean_terminated_length": 926.9500122070312, |
| "completions/min_length": 275.0, |
| "completions/min_terminated_length": 275.0, |
| "epoch": 0.08228571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3016415238380432, |
| "learning_rate": 9.951725498333448e-07, |
| "loss": -0.0, |
| "num_tokens": 8985915.0, |
| "reward": -0.22967606782913208, |
| "reward_std": 0.18875859677791595, |
| "rewards/cosine_scaled_reward/mean": -0.2296760529279709, |
| "rewards/cosine_scaled_reward/std": 0.22012120485305786, |
| "step": 72 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.953125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1957.0, |
| "completions/mean_length": 2020.703125, |
| "completions/mean_terminated_length": 1465.666748046875, |
| "completions/min_length": 1143.0, |
| "completions/min_terminated_length": 1143.0, |
| "epoch": 0.08342857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.21586637198925018, |
| "learning_rate": 9.947027716509488e-07, |
| "loss": 0.0, |
| "num_tokens": 9125968.0, |
| "reward": -0.24284613132476807, |
| "reward_std": 0.22862236201763153, |
| "rewards/cosine_scaled_reward/mean": -0.24284613132476807, |
| "rewards/cosine_scaled_reward/std": 0.24740919470787048, |
| "step": 73 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.890625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1955.0, |
| "completions/mean_length": 1975.09375, |
| "completions/mean_terminated_length": 1381.4285888671875, |
| "completions/min_length": 532.0, |
| "completions/min_terminated_length": 532.0, |
| "epoch": 0.08457142857142858, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.21680164337158203, |
| "learning_rate": 9.942113192828444e-07, |
| "loss": 0.0, |
| "num_tokens": 9262302.0, |
| "reward": -0.1543380469083786, |
| "reward_std": 0.24083258211612701, |
| "rewards/cosine_scaled_reward/mean": -0.1543380618095398, |
| "rewards/cosine_scaled_reward/std": 0.3356986939907074, |
| "step": 74 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2038.0, |
| "completions/mean_length": 1836.3125, |
| "completions/mean_terminated_length": 1295.3333740234375, |
| "completions/min_length": 653.0, |
| "completions/min_terminated_length": 653.0, |
| "epoch": 0.08571428571428572, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 0.1845395565032959, |
| "learning_rate": 9.93698216681727e-07, |
| "loss": 0.0, |
| "num_tokens": 9390786.0, |
| "reward": -0.12792138755321503, |
| "reward_std": 0.10224759578704834, |
| "rewards/cosine_scaled_reward/mean": -0.12792138755321503, |
| "rewards/cosine_scaled_reward/std": 0.4530969560146332, |
| "step": 75 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.765625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1422.0, |
| "completions/mean_length": 1764.109375, |
| "completions/mean_terminated_length": 836.7333984375, |
| "completions/min_length": 320.0, |
| "completions/min_terminated_length": 320.0, |
| "epoch": 0.08685714285714285, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26535236835479736, |
| "learning_rate": 9.931634888554935e-07, |
| "loss": -0.0, |
| "num_tokens": 9514089.0, |
| "reward": -0.27717918157577515, |
| "reward_std": 0.19932743906974792, |
| "rewards/cosine_scaled_reward/mean": -0.27717918157577515, |
| "rewards/cosine_scaled_reward/std": 0.20844916999340057, |
| "step": 76 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1864.0, |
| "completions/mean_length": 1945.109375, |
| "completions/mean_terminated_length": 1224.875, |
| "completions/min_length": 702.0, |
| "completions/min_terminated_length": 702.0, |
| "epoch": 0.088, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2047174870967865, |
| "learning_rate": 9.926071618660237e-07, |
| "loss": -0.0, |
| "num_tokens": 9650152.0, |
| "reward": -0.09873012453317642, |
| "reward_std": 0.22244854271411896, |
| "rewards/cosine_scaled_reward/mean": -0.09873010218143463, |
| "rewards/cosine_scaled_reward/std": 0.34491515159606934, |
| "step": 77 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.859375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1976.0, |
| "completions/mean_length": 1928.703125, |
| "completions/mean_terminated_length": 1199.6666259765625, |
| "completions/min_length": 722.0, |
| "completions/min_terminated_length": 722.0, |
| "epoch": 0.08914285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22559019923210144, |
| "learning_rate": 9.9202926282791e-07, |
| "loss": 0.0, |
| "num_tokens": 9784309.0, |
| "reward": -0.09572747349739075, |
| "reward_std": 0.23068635165691376, |
| "rewards/cosine_scaled_reward/mean": -0.09572747349739075, |
| "rewards/cosine_scaled_reward/std": 0.38660773634910583, |
| "step": 78 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.5625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1864.0, |
| "completions/mean_length": 1508.40625, |
| "completions/mean_terminated_length": 814.6428833007812, |
| "completions/min_length": 339.0, |
| "completions/min_terminated_length": 339.0, |
| "epoch": 0.09028571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24668477475643158, |
| "learning_rate": 9.91429819907136e-07, |
| "loss": 0.0, |
| "num_tokens": 9890943.0, |
| "reward": -0.1618795394897461, |
| "reward_std": 0.22540031373500824, |
| "rewards/cosine_scaled_reward/mean": -0.1618795245885849, |
| "rewards/cosine_scaled_reward/std": 0.3233039081096649, |
| "step": 79 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.890625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1996.0, |
| "completions/mean_length": 2012.671875, |
| "completions/mean_terminated_length": 1725.0001220703125, |
| "completions/min_length": 1283.0, |
| "completions/min_terminated_length": 1283.0, |
| "epoch": 0.09142857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24282054603099823, |
| "learning_rate": 9.908088623197048e-07, |
| "loss": -0.0, |
| "num_tokens": 10030146.0, |
| "reward": -0.25591158866882324, |
| "reward_std": 0.15104801952838898, |
| "rewards/cosine_scaled_reward/mean": -0.25591158866882324, |
| "rewards/cosine_scaled_reward/std": 0.18741995096206665, |
| "step": 80 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.796875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1878.0, |
| "completions/mean_length": 1821.921875, |
| "completions/mean_terminated_length": 935.0000610351562, |
| "completions/min_length": 580.0, |
| "completions/min_terminated_length": 580.0, |
| "epoch": 0.09257142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3027254641056061, |
| "learning_rate": 9.901664203302124e-07, |
| "loss": 0.0, |
| "num_tokens": 10158021.0, |
| "reward": -0.15331333875656128, |
| "reward_std": 0.18424856662750244, |
| "rewards/cosine_scaled_reward/mean": -0.15331333875656128, |
| "rewards/cosine_scaled_reward/std": 0.24023762345314026, |
| "step": 81 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.703125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1852.0, |
| "completions/mean_length": 1734.28125, |
| "completions/mean_terminated_length": 991.26318359375, |
| "completions/min_length": 477.0, |
| "completions/min_terminated_length": 477.0, |
| "epoch": 0.09371428571428571, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2466808557510376, |
| "learning_rate": 9.895025252503755e-07, |
| "loss": -0.0, |
| "num_tokens": 10279343.0, |
| "reward": -0.07192108780145645, |
| "reward_std": 0.2587333917617798, |
| "rewards/cosine_scaled_reward/mean": -0.07192108780145645, |
| "rewards/cosine_scaled_reward/std": 0.46087121963500977, |
| "step": 82 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.671875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1571.0, |
| "completions/mean_length": 1688.78125, |
| "completions/mean_terminated_length": 953.2380981445312, |
| "completions/min_length": 495.0, |
| "completions/min_terminated_length": 495.0, |
| "epoch": 0.09485714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2600877285003662, |
| "learning_rate": 9.888172094375033e-07, |
| "loss": 0.0, |
| "num_tokens": 10398513.0, |
| "reward": -0.1718086451292038, |
| "reward_std": 0.2223512828350067, |
| "rewards/cosine_scaled_reward/mean": -0.1718086451292038, |
| "rewards/cosine_scaled_reward/std": 0.2828122675418854, |
| "step": 83 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.84375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1666.0, |
| "completions/mean_length": 1838.203125, |
| "completions/mean_terminated_length": 705.2999877929688, |
| "completions/min_length": 435.0, |
| "completions/min_terminated_length": 435.0, |
| "epoch": 0.096, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22531215846538544, |
| "learning_rate": 9.881105062929221e-07, |
| "loss": 0.0, |
| "num_tokens": 10526854.0, |
| "reward": -0.2154863476753235, |
| "reward_std": 0.261901319026947, |
| "rewards/cosine_scaled_reward/mean": -0.2154863476753235, |
| "rewards/cosine_scaled_reward/std": 0.29268571734428406, |
| "step": 84 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.8125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2026.0, |
| "completions/mean_length": 1926.34375, |
| "completions/mean_terminated_length": 1399.166748046875, |
| "completions/min_length": 880.0, |
| "completions/min_terminated_length": 880.0, |
| "epoch": 0.09714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.19031891226768494, |
| "learning_rate": 9.873824502603459e-07, |
| "loss": -0.0, |
| "num_tokens": 10660460.0, |
| "reward": -0.21009978652000427, |
| "reward_std": 0.19575349986553192, |
| "rewards/cosine_scaled_reward/mean": -0.21009978652000427, |
| "rewards/cosine_scaled_reward/std": 0.2456056773662567, |
| "step": 85 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.734375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1920.0, |
| "completions/mean_length": 1713.21875, |
| "completions/mean_terminated_length": 787.6470336914062, |
| "completions/min_length": 547.0, |
| "completions/min_terminated_length": 547.0, |
| "epoch": 0.09828571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.258359432220459, |
| "learning_rate": 9.866330768241983e-07, |
| "loss": -0.0, |
| "num_tokens": 10780962.0, |
| "reward": -0.1955144852399826, |
| "reward_std": 0.24323132634162903, |
| "rewards/cosine_scaled_reward/mean": -0.1955144852399826, |
| "rewards/cosine_scaled_reward/std": 0.3071554899215698, |
| "step": 86 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1815.0, |
| "completions/mean_length": 1656.0, |
| "completions/mean_terminated_length": 1002.6666870117188, |
| "completions/min_length": 532.0, |
| "completions/min_terminated_length": 532.0, |
| "epoch": 0.09942857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2636864185333252, |
| "learning_rate": 9.85862422507884e-07, |
| "loss": 0.0, |
| "num_tokens": 10897066.0, |
| "reward": -0.1988150179386139, |
| "reward_std": 0.24088150262832642, |
| "rewards/cosine_scaled_reward/mean": -0.1988150179386139, |
| "rewards/cosine_scaled_reward/std": 0.2925129532814026, |
| "step": 87 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.703125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1802.0, |
| "completions/mean_length": 1755.234375, |
| "completions/mean_terminated_length": 1061.8421630859375, |
| "completions/min_length": 500.0, |
| "completions/min_terminated_length": 500.0, |
| "epoch": 0.10057142857142858, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.29112017154693604, |
| "learning_rate": 9.850705248720068e-07, |
| "loss": 0.0, |
| "num_tokens": 11019913.0, |
| "reward": -0.02967459335923195, |
| "reward_std": 0.3240855932235718, |
| "rewards/cosine_scaled_reward/mean": -0.029674597084522247, |
| "rewards/cosine_scaled_reward/std": 0.3718070983886719, |
| "step": 88 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.671875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1768.0, |
| "completions/mean_length": 1752.78125, |
| "completions/mean_terminated_length": 1148.2857666015625, |
| "completions/min_length": 619.0, |
| "completions/min_terminated_length": 619.0, |
| "epoch": 0.10171428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2357943207025528, |
| "learning_rate": 9.8425742251254e-07, |
| "loss": -0.0, |
| "num_tokens": 11143091.0, |
| "reward": -0.1188301220536232, |
| "reward_std": 0.296513170003891, |
| "rewards/cosine_scaled_reward/mean": -0.1188301220536232, |
| "rewards/cosine_scaled_reward/std": 0.3878798484802246, |
| "step": 89 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.5625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1693.0, |
| "completions/mean_length": 1633.84375, |
| "completions/mean_terminated_length": 1101.357177734375, |
| "completions/min_length": 568.0, |
| "completions/min_terminated_length": 568.0, |
| "epoch": 0.10285714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.32384219765663147, |
| "learning_rate": 9.83423155058946e-07, |
| "loss": -0.0, |
| "num_tokens": 11257657.0, |
| "reward": -0.22837099432945251, |
| "reward_std": 0.18625205755233765, |
| "rewards/cosine_scaled_reward/mean": -0.22837099432945251, |
| "rewards/cosine_scaled_reward/std": 0.23636196553707123, |
| "step": 90 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1833.0, |
| "completions/mean_length": 1847.21875, |
| "completions/mean_terminated_length": 1244.875, |
| "completions/min_length": 716.0, |
| "completions/min_terminated_length": 716.0, |
| "epoch": 0.104, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24563109874725342, |
| "learning_rate": 9.825677631722435e-07, |
| "loss": -0.0, |
| "num_tokens": 11386447.0, |
| "reward": -0.11780542880296707, |
| "reward_std": 0.3100074827671051, |
| "rewards/cosine_scaled_reward/mean": -0.11780542135238647, |
| "rewards/cosine_scaled_reward/std": 0.39149248600006104, |
| "step": 91 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.609375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1462.0, |
| "completions/mean_length": 1595.125, |
| "completions/mean_terminated_length": 888.6399536132812, |
| "completions/min_length": 464.0, |
| "completions/min_terminated_length": 464.0, |
| "epoch": 0.10514285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2472057044506073, |
| "learning_rate": 9.816912885430258e-07, |
| "loss": -0.0, |
| "num_tokens": 11498527.0, |
| "reward": -0.2128506749868393, |
| "reward_std": 0.20926561951637268, |
| "rewards/cosine_scaled_reward/mean": -0.2128506898880005, |
| "rewards/cosine_scaled_reward/std": 0.23348061740398407, |
| "step": 92 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.9375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1221.0, |
| "completions/mean_length": 1979.953125, |
| "completions/mean_terminated_length": 959.25, |
| "completions/min_length": 822.0, |
| "completions/min_terminated_length": 822.0, |
| "epoch": 0.10628571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2550150454044342, |
| "learning_rate": 9.807937738894303e-07, |
| "loss": -0.0, |
| "num_tokens": 11636588.0, |
| "reward": -0.2922024428844452, |
| "reward_std": 0.1515069603919983, |
| "rewards/cosine_scaled_reward/mean": -0.2922024726867676, |
| "rewards/cosine_scaled_reward/std": 0.18899379670619965, |
| "step": 93 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.796875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1544.0, |
| "completions/mean_length": 1830.609375, |
| "completions/mean_terminated_length": 977.769287109375, |
| "completions/min_length": 533.0, |
| "completions/min_terminated_length": 533.0, |
| "epoch": 0.10742857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27164825797080994, |
| "learning_rate": 9.798752629550546e-07, |
| "loss": -0.0, |
| "num_tokens": 11763515.0, |
| "reward": -0.18001651763916016, |
| "reward_std": 0.18973413109779358, |
| "rewards/cosine_scaled_reward/mean": -0.18001650273799896, |
| "rewards/cosine_scaled_reward/std": 0.4316568076610565, |
| "step": 94 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.921875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2034.0, |
| "completions/mean_length": 2004.671875, |
| "completions/mean_terminated_length": 1493.4000244140625, |
| "completions/min_length": 960.0, |
| "completions/min_terminated_length": 960.0, |
| "epoch": 0.10857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.20861269533634186, |
| "learning_rate": 9.78935800506826e-07, |
| "loss": -0.0, |
| "num_tokens": 11902342.0, |
| "reward": -0.24148261547088623, |
| "reward_std": 0.18629083037376404, |
| "rewards/cosine_scaled_reward/mean": -0.24148263037204742, |
| "rewards/cosine_scaled_reward/std": 0.23122739791870117, |
| "step": 95 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.6875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1298.0, |
| "completions/mean_length": 1703.359375, |
| "completions/mean_terminated_length": 945.1500244140625, |
| "completions/min_length": 455.0, |
| "completions/min_terminated_length": 455.0, |
| "epoch": 0.10971428571428571, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2585296928882599, |
| "learning_rate": 9.779754323328192e-07, |
| "loss": 0.0, |
| "num_tokens": 12022493.0, |
| "reward": -0.11465626955032349, |
| "reward_std": 0.24939197301864624, |
| "rewards/cosine_scaled_reward/mean": -0.11465626955032349, |
| "rewards/cosine_scaled_reward/std": 0.4384477138519287, |
| "step": 96 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1809.0, |
| "completions/mean_length": 1819.921875, |
| "completions/mean_terminated_length": 1135.6875, |
| "completions/min_length": 425.0, |
| "completions/min_terminated_length": 425.0, |
| "epoch": 0.11085714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3019813299179077, |
| "learning_rate": 9.769942052400235e-07, |
| "loss": 0.0, |
| "num_tokens": 12149232.0, |
| "reward": -0.18846748769283295, |
| "reward_std": 0.2666187584400177, |
| "rewards/cosine_scaled_reward/mean": -0.18846750259399414, |
| "rewards/cosine_scaled_reward/std": 0.3043021559715271, |
| "step": 97 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.609375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2003.0, |
| "completions/mean_length": 1677.296875, |
| "completions/mean_terminated_length": 1099.0, |
| "completions/min_length": 315.0, |
| "completions/min_terminated_length": 315.0, |
| "epoch": 0.112, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2722402513027191, |
| "learning_rate": 9.759921670520634e-07, |
| "loss": 0.0, |
| "num_tokens": 12267643.0, |
| "reward": -0.09557384252548218, |
| "reward_std": 0.2643275558948517, |
| "rewards/cosine_scaled_reward/mean": -0.09557383507490158, |
| "rewards/cosine_scaled_reward/std": 0.3361329138278961, |
| "step": 98 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.765625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1141.0, |
| "completions/mean_length": 1716.59375, |
| "completions/mean_terminated_length": 634.0000610351562, |
| "completions/min_length": 393.0, |
| "completions/min_terminated_length": 393.0, |
| "epoch": 0.11314285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2789485454559326, |
| "learning_rate": 9.749693666068663e-07, |
| "loss": -0.0, |
| "num_tokens": 12388673.0, |
| "reward": -0.11132554709911346, |
| "reward_std": 0.1736970841884613, |
| "rewards/cosine_scaled_reward/mean": -0.11132554709911346, |
| "rewards/cosine_scaled_reward/std": 0.38663193583488464, |
| "step": 99 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1906.0, |
| "completions/mean_length": 1627.78125, |
| "completions/mean_terminated_length": 927.4166870117188, |
| "completions/min_length": 426.0, |
| "completions/min_terminated_length": 426.0, |
| "epoch": 0.11428571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2479974329471588, |
| "learning_rate": 9.739258537542835e-07, |
| "loss": 0.0, |
| "num_tokens": 12502563.0, |
| "reward": 0.05247430503368378, |
| "reward_std": 0.2633323669433594, |
| "rewards/cosine_scaled_reward/mean": 0.05247429758310318, |
| "rewards/cosine_scaled_reward/std": 0.44700634479522705, |
| "step": 100 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.640625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1780.0, |
| "completions/mean_length": 1684.75, |
| "completions/mean_terminated_length": 1037.2174072265625, |
| "completions/min_length": 555.0, |
| "completions/min_terminated_length": 555.0, |
| "epoch": 0.11542857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2880499064922333, |
| "learning_rate": 9.728616793536587e-07, |
| "loss": -0.0, |
| "num_tokens": 12621819.0, |
| "reward": -0.09590694308280945, |
| "reward_std": 0.21176990866661072, |
| "rewards/cosine_scaled_reward/mean": -0.09590694308280945, |
| "rewards/cosine_scaled_reward/std": 0.426421195268631, |
| "step": 101 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.421875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1585.0, |
| "completions/mean_length": 1361.265625, |
| "completions/mean_terminated_length": 860.1351318359375, |
| "completions/min_length": 416.0, |
| "completions/min_terminated_length": 416.0, |
| "epoch": 0.11657142857142858, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2874862551689148, |
| "learning_rate": 9.717768952713511e-07, |
| "loss": -0.0, |
| "num_tokens": 12719092.0, |
| "reward": -0.19330359995365143, |
| "reward_std": 0.1932550072669983, |
| "rewards/cosine_scaled_reward/mean": -0.19330358505249023, |
| "rewards/cosine_scaled_reward/std": 0.34549427032470703, |
| "step": 102 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1384.0, |
| "completions/mean_length": 1687.90625, |
| "completions/mean_terminated_length": 607.625, |
| "completions/min_length": 221.0, |
| "completions/min_terminated_length": 221.0, |
| "epoch": 0.11771428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.29745906591415405, |
| "learning_rate": 9.706715543782064e-07, |
| "loss": -0.0, |
| "num_tokens": 12837470.0, |
| "reward": -0.2588111162185669, |
| "reward_std": 0.26013171672821045, |
| "rewards/cosine_scaled_reward/mean": -0.2588111162185669, |
| "rewards/cosine_scaled_reward/std": 0.32377612590789795, |
| "step": 103 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.671875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1627.0, |
| "completions/mean_length": 1679.59375, |
| "completions/mean_terminated_length": 925.2380981445312, |
| "completions/min_length": 585.0, |
| "completions/min_terminated_length": 585.0, |
| "epoch": 0.11885714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27166086435317993, |
| "learning_rate": 9.695457105469804e-07, |
| "loss": -0.0, |
| "num_tokens": 12955428.0, |
| "reward": -0.17275363206863403, |
| "reward_std": 0.20137225091457367, |
| "rewards/cosine_scaled_reward/mean": -0.17275363206863403, |
| "rewards/cosine_scaled_reward/std": 0.2731510400772095, |
| "step": 104 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.609375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1993.0, |
| "completions/mean_length": 1568.203125, |
| "completions/mean_terminated_length": 819.719970703125, |
| "completions/min_length": 510.0, |
| "completions/min_terminated_length": 510.0, |
| "epoch": 0.12, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26759475469589233, |
| "learning_rate": 9.683994186497132e-07, |
| "loss": -0.0, |
| "num_tokens": 13067081.0, |
| "reward": -0.1266355961561203, |
| "reward_std": 0.3027850389480591, |
| "rewards/cosine_scaled_reward/mean": -0.1266355961561203, |
| "rewards/cosine_scaled_reward/std": 0.4276663362979889, |
| "step": 105 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.5, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1958.0, |
| "completions/mean_length": 1432.09375, |
| "completions/mean_terminated_length": 816.1875, |
| "completions/min_length": 219.0, |
| "completions/min_terminated_length": 219.0, |
| "epoch": 0.12114285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2912415862083435, |
| "learning_rate": 9.672327345550543e-07, |
| "loss": 0.0, |
| "num_tokens": 13169567.0, |
| "reward": 0.052130524069070816, |
| "reward_std": 0.30294427275657654, |
| "rewards/cosine_scaled_reward/mean": 0.052130527794361115, |
| "rewards/cosine_scaled_reward/std": 0.43769362568855286, |
| "step": 106 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.65625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2024.0, |
| "completions/mean_length": 1721.28125, |
| "completions/mean_terminated_length": 1097.5455322265625, |
| "completions/min_length": 425.0, |
| "completions/min_terminated_length": 425.0, |
| "epoch": 0.12228571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26628872752189636, |
| "learning_rate": 9.66045715125541e-07, |
| "loss": 0.0, |
| "num_tokens": 13290881.0, |
| "reward": -0.18292994797229767, |
| "reward_std": 0.25176504254341125, |
| "rewards/cosine_scaled_reward/mean": -0.18292994797229767, |
| "rewards/cosine_scaled_reward/std": 0.33385229110717773, |
| "step": 107 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.5, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1916.0, |
| "completions/mean_length": 1518.6875, |
| "completions/mean_terminated_length": 989.375, |
| "completions/min_length": 430.0, |
| "completions/min_terminated_length": 430.0, |
| "epoch": 0.12342857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25796031951904297, |
| "learning_rate": 9.648384182148252e-07, |
| "loss": -0.0, |
| "num_tokens": 13398437.0, |
| "reward": -0.17732736468315125, |
| "reward_std": 0.32095974683761597, |
| "rewards/cosine_scaled_reward/mean": -0.17732736468315125, |
| "rewards/cosine_scaled_reward/std": 0.3682377338409424, |
| "step": 108 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.8125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1963.0, |
| "completions/mean_length": 1871.890625, |
| "completions/mean_terminated_length": 1108.75, |
| "completions/min_length": 673.0, |
| "completions/min_terminated_length": 673.0, |
| "epoch": 0.12457142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2274676412343979, |
| "learning_rate": 9.636109026648554e-07, |
| "loss": 0.0, |
| "num_tokens": 13529486.0, |
| "reward": -0.13115660846233368, |
| "reward_std": 0.15383467078208923, |
| "rewards/cosine_scaled_reward/mean": -0.13115662336349487, |
| "rewards/cosine_scaled_reward/std": 0.4183727204799652, |
| "step": 109 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1941.0, |
| "completions/mean_length": 1584.125, |
| "completions/mean_terminated_length": 811.0, |
| "completions/min_length": 397.0, |
| "completions/min_terminated_length": 397.0, |
| "epoch": 0.12571428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2768951952457428, |
| "learning_rate": 9.623632283030077e-07, |
| "loss": -0.0, |
| "num_tokens": 13641646.0, |
| "reward": -0.27792292833328247, |
| "reward_std": 0.18945851922035217, |
| "rewards/cosine_scaled_reward/mean": -0.27792292833328247, |
| "rewards/cosine_scaled_reward/std": 0.20238204300403595, |
| "step": 110 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.65625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2032.0, |
| "completions/mean_length": 1709.1875, |
| "completions/mean_terminated_length": 1062.3636474609375, |
| "completions/min_length": 485.0, |
| "completions/min_terminated_length": 485.0, |
| "epoch": 0.12685714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24532362818717957, |
| "learning_rate": 9.610954559391704e-07, |
| "loss": -0.0, |
| "num_tokens": 13761154.0, |
| "reward": -0.0890636295080185, |
| "reward_std": 0.33067381381988525, |
| "rewards/cosine_scaled_reward/mean": -0.0890636295080185, |
| "rewards/cosine_scaled_reward/std": 0.40376362204551697, |
| "step": 111 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1987.0, |
| "completions/mean_length": 1895.234375, |
| "completions/mean_terminated_length": 1436.9375, |
| "completions/min_length": 783.0, |
| "completions/min_terminated_length": 783.0, |
| "epoch": 0.128, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22462251782417297, |
| "learning_rate": 9.598076473627796e-07, |
| "loss": -0.0, |
| "num_tokens": 13893545.0, |
| "reward": -0.1325383186340332, |
| "reward_std": 0.330952525138855, |
| "rewards/cosine_scaled_reward/mean": -0.1325383186340332, |
| "rewards/cosine_scaled_reward/std": 0.4280668795108795, |
| "step": 112 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1695.0, |
| "completions/mean_length": 1606.890625, |
| "completions/mean_terminated_length": 871.7083740234375, |
| "completions/min_length": 284.0, |
| "completions/min_terminated_length": 284.0, |
| "epoch": 0.12914285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3009057939052582, |
| "learning_rate": 9.58499865339809e-07, |
| "loss": 0.0, |
| "num_tokens": 14006682.0, |
| "reward": -0.05043189600110054, |
| "reward_std": 0.300018846988678, |
| "rewards/cosine_scaled_reward/mean": -0.050431910902261734, |
| "rewards/cosine_scaled_reward/std": 0.43634143471717834, |
| "step": 113 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1898.0, |
| "completions/mean_length": 1562.515625, |
| "completions/mean_terminated_length": 753.375, |
| "completions/min_length": 121.0, |
| "completions/min_terminated_length": 121.0, |
| "epoch": 0.13028571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.37847185134887695, |
| "learning_rate": 9.571721736097088e-07, |
| "loss": 0.0, |
| "num_tokens": 14116531.0, |
| "reward": -0.27539706230163574, |
| "reward_std": 0.18451666831970215, |
| "rewards/cosine_scaled_reward/mean": -0.27539709210395813, |
| "rewards/cosine_scaled_reward/std": 0.23580753803253174, |
| "step": 114 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.640625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2041.0, |
| "completions/mean_length": 1656.484375, |
| "completions/mean_terminated_length": 958.5652465820312, |
| "completions/min_length": 351.0, |
| "completions/min_terminated_length": 351.0, |
| "epoch": 0.13142857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26879096031188965, |
| "learning_rate": 9.55824636882301e-07, |
| "loss": 0.0, |
| "num_tokens": 14233762.0, |
| "reward": -0.058682698756456375, |
| "reward_std": 0.2945008873939514, |
| "rewards/cosine_scaled_reward/mean": -0.05868269130587578, |
| "rewards/cosine_scaled_reward/std": 0.40092962980270386, |
| "step": 115 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.90625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1413.0, |
| "completions/mean_length": 1924.84375, |
| "completions/mean_terminated_length": 734.3333740234375, |
| "completions/min_length": 426.0, |
| "completions/min_terminated_length": 426.0, |
| "epoch": 0.13257142857142856, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2654048800468445, |
| "learning_rate": 9.54457320834625e-07, |
| "loss": 0.0, |
| "num_tokens": 14368336.0, |
| "reward": -0.2030428647994995, |
| "reward_std": 0.18692326545715332, |
| "rewards/cosine_scaled_reward/mean": -0.2030428647994995, |
| "rewards/cosine_scaled_reward/std": 0.2246093899011612, |
| "step": 116 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1420.0, |
| "completions/mean_length": 1785.484375, |
| "completions/mean_terminated_length": 997.9375, |
| "completions/min_length": 549.0, |
| "completions/min_terminated_length": 549.0, |
| "epoch": 0.1337142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26004910469055176, |
| "learning_rate": 9.530702921077358e-07, |
| "loss": -0.0, |
| "num_tokens": 14493631.0, |
| "reward": -0.19770082831382751, |
| "reward_std": 0.25534579157829285, |
| "rewards/cosine_scaled_reward/mean": -0.19770082831382751, |
| "rewards/cosine_scaled_reward/std": 0.33773326873779297, |
| "step": 117 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1906.0, |
| "completions/mean_length": 1802.84375, |
| "completions/mean_terminated_length": 1067.375, |
| "completions/min_length": 554.0, |
| "completions/min_terminated_length": 554.0, |
| "epoch": 0.13485714285714287, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22992977499961853, |
| "learning_rate": 9.516636183034564e-07, |
| "loss": 0.0, |
| "num_tokens": 14619549.0, |
| "reward": -0.011579632759094238, |
| "reward_std": 0.3697226643562317, |
| "rewards/cosine_scaled_reward/mean": -0.011579625308513641, |
| "rewards/cosine_scaled_reward/std": 0.4647332727909088, |
| "step": 118 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1982.0, |
| "completions/mean_length": 1343.578125, |
| "completions/mean_terminated_length": 920.9249877929688, |
| "completions/min_length": 317.0, |
| "completions/min_terminated_length": 317.0, |
| "epoch": 0.136, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3279743492603302, |
| "learning_rate": 9.502373679810839e-07, |
| "loss": -0.0, |
| "num_tokens": 14715946.0, |
| "reward": -0.0004618987441062927, |
| "reward_std": 0.27856603264808655, |
| "rewards/cosine_scaled_reward/mean": -0.0004618987441062927, |
| "rewards/cosine_scaled_reward/std": 0.45174649357795715, |
| "step": 119 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.359375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1546.0, |
| "completions/mean_length": 1286.75, |
| "completions/mean_terminated_length": 859.707275390625, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 256.0, |
| "epoch": 0.13714285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3185117244720459, |
| "learning_rate": 9.487916106540465e-07, |
| "loss": -0.0, |
| "num_tokens": 14808754.0, |
| "reward": -0.06128609925508499, |
| "reward_std": 0.3139324188232422, |
| "rewards/cosine_scaled_reward/mean": -0.06128609925508499, |
| "rewards/cosine_scaled_reward/std": 0.46217504143714905, |
| "step": 120 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.25, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2039.0, |
| "completions/mean_length": 1103.90625, |
| "completions/mean_terminated_length": 789.2083740234375, |
| "completions/min_length": 312.0, |
| "completions/min_terminated_length": 312.0, |
| "epoch": 0.1382857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3791055381298065, |
| "learning_rate": 9.473264167865171e-07, |
| "loss": 0.0, |
| "num_tokens": 14889100.0, |
| "reward": -0.012373358011245728, |
| "reward_std": 0.3332873284816742, |
| "rewards/cosine_scaled_reward/mean": -0.012373358011245728, |
| "rewards/cosine_scaled_reward/std": 0.4969451427459717, |
| "step": 121 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2029.0, |
| "completions/mean_length": 1765.0625, |
| "completions/mean_terminated_length": 1042.0, |
| "completions/min_length": 330.0, |
| "completions/min_terminated_length": 330.0, |
| "epoch": 0.13942857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27713218331336975, |
| "learning_rate": 9.458418577899774e-07, |
| "loss": -0.0, |
| "num_tokens": 15013624.0, |
| "reward": -0.1387348771095276, |
| "reward_std": 0.25947195291519165, |
| "rewards/cosine_scaled_reward/mean": -0.1387348771095276, |
| "rewards/cosine_scaled_reward/std": 0.3304338753223419, |
| "step": 122 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.546875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2043.0, |
| "completions/mean_length": 1576.28125, |
| "completions/mean_terminated_length": 1006.9655151367188, |
| "completions/min_length": 502.0, |
| "completions/min_terminated_length": 502.0, |
| "epoch": 0.14057142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2664856016635895, |
| "learning_rate": 9.443380060197385e-07, |
| "loss": 0.0, |
| "num_tokens": 15124738.0, |
| "reward": -0.18317654728889465, |
| "reward_std": 0.16592136025428772, |
| "rewards/cosine_scaled_reward/mean": -0.18317654728889465, |
| "rewards/cosine_scaled_reward/std": 0.33475980162620544, |
| "step": 123 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.4375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1814.0, |
| "completions/mean_length": 1395.78125, |
| "completions/mean_terminated_length": 888.5, |
| "completions/min_length": 455.0, |
| "completions/min_terminated_length": 455.0, |
| "epoch": 0.1417142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2889535427093506, |
| "learning_rate": 9.428149347714143e-07, |
| "loss": 0.0, |
| "num_tokens": 15225020.0, |
| "reward": -0.12295320630073547, |
| "reward_std": 0.30637824535369873, |
| "rewards/cosine_scaled_reward/mean": -0.12295320630073547, |
| "rewards/cosine_scaled_reward/std": 0.4125574827194214, |
| "step": 124 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2027.0, |
| "completions/mean_length": 1622.8125, |
| "completions/mean_terminated_length": 914.1666870117188, |
| "completions/min_length": 379.0, |
| "completions/min_terminated_length": 379.0, |
| "epoch": 0.14285714285714285, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24003510177135468, |
| "learning_rate": 9.412727182773486e-07, |
| "loss": -0.0, |
| "num_tokens": 15339808.0, |
| "reward": -0.06917156279087067, |
| "reward_std": 0.19467812776565552, |
| "rewards/cosine_scaled_reward/mean": -0.06917153298854828, |
| "rewards/cosine_scaled_reward/std": 0.44139373302459717, |
| "step": 125 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.671875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2043.0, |
| "completions/mean_length": 1735.96875, |
| "completions/mean_terminated_length": 1097.047607421875, |
| "completions/min_length": 610.0, |
| "completions/min_terminated_length": 610.0, |
| "epoch": 0.144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23693455755710602, |
| "learning_rate": 9.397114317029974e-07, |
| "loss": -0.0, |
| "num_tokens": 15462206.0, |
| "reward": -0.15823431313037872, |
| "reward_std": 0.26196378469467163, |
| "rewards/cosine_scaled_reward/mean": -0.15823431313037872, |
| "rewards/cosine_scaled_reward/std": 0.3110467195510864, |
| "step": 126 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.640625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1990.0, |
| "completions/mean_length": 1729.421875, |
| "completions/mean_terminated_length": 1161.521728515625, |
| "completions/min_length": 655.0, |
| "completions/min_terminated_length": 655.0, |
| "epoch": 0.14514285714285713, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23715488612651825, |
| "learning_rate": 9.381311511432658e-07, |
| "loss": -0.0, |
| "num_tokens": 15583985.0, |
| "reward": -0.2520313262939453, |
| "reward_std": 0.1912405639886856, |
| "rewards/cosine_scaled_reward/mean": -0.2520313262939453, |
| "rewards/cosine_scaled_reward/std": 0.276276558637619, |
| "step": 127 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.546875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2025.0, |
| "completions/mean_length": 1614.125, |
| "completions/mean_terminated_length": 1090.4827880859375, |
| "completions/min_length": 392.0, |
| "completions/min_terminated_length": 392.0, |
| "epoch": 0.1462857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25245338678359985, |
| "learning_rate": 9.36531953618799e-07, |
| "loss": -0.0, |
| "num_tokens": 15697641.0, |
| "reward": 0.029929369688034058, |
| "reward_std": 0.2960119843482971, |
| "rewards/cosine_scaled_reward/mean": 0.029929369688034058, |
| "rewards/cosine_scaled_reward/std": 0.40772902965545654, |
| "step": 128 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.859375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1810.0, |
| "completions/mean_length": 1892.953125, |
| "completions/mean_terminated_length": 945.4444580078125, |
| "completions/min_length": 490.0, |
| "completions/min_terminated_length": 490.0, |
| "epoch": 0.14742857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22934643924236298, |
| "learning_rate": 9.34913917072228e-07, |
| "loss": -0.0, |
| "num_tokens": 15829494.0, |
| "reward": -0.27538371086120605, |
| "reward_std": 0.2161153256893158, |
| "rewards/cosine_scaled_reward/mean": -0.27538371086120605, |
| "rewards/cosine_scaled_reward/std": 0.25140947103500366, |
| "step": 129 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.640625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1384.0, |
| "completions/mean_length": 1631.5625, |
| "completions/mean_terminated_length": 889.2174072265625, |
| "completions/min_length": 545.0, |
| "completions/min_terminated_length": 545.0, |
| "epoch": 0.14857142857142858, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2312338650226593, |
| "learning_rate": 9.332771203643714e-07, |
| "loss": -0.0, |
| "num_tokens": 15944418.0, |
| "reward": -0.16326984763145447, |
| "reward_std": 0.22974258661270142, |
| "rewards/cosine_scaled_reward/mean": -0.16326983273029327, |
| "rewards/cosine_scaled_reward/std": 0.3127349317073822, |
| "step": 130 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.59375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1709.0, |
| "completions/mean_length": 1549.453125, |
| "completions/mean_terminated_length": 820.8077392578125, |
| "completions/min_length": 280.0, |
| "completions/min_terminated_length": 280.0, |
| "epoch": 0.14971428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.28737154603004456, |
| "learning_rate": 9.316216432703916e-07, |
| "loss": 0.0, |
| "num_tokens": 16053319.0, |
| "reward": -0.060378547757864, |
| "reward_std": 0.23251818120479584, |
| "rewards/cosine_scaled_reward/mean": -0.060378558933734894, |
| "rewards/cosine_scaled_reward/std": 0.4743967354297638, |
| "step": 131 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.53125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1667.0, |
| "completions/mean_length": 1536.859375, |
| "completions/mean_terminated_length": 957.5667114257812, |
| "completions/min_length": 370.0, |
| "completions/min_terminated_length": 370.0, |
| "epoch": 0.15085714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24873872101306915, |
| "learning_rate": 9.299475664759068e-07, |
| "loss": 0.0, |
| "num_tokens": 16162742.0, |
| "reward": -0.10933490097522736, |
| "reward_std": 0.2869688868522644, |
| "rewards/cosine_scaled_reward/mean": -0.10933491587638855, |
| "rewards/cosine_scaled_reward/std": 0.45436573028564453, |
| "step": 132 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1964.0, |
| "completions/mean_length": 1817.453125, |
| "completions/mean_terminated_length": 1125.8125, |
| "completions/min_length": 526.0, |
| "completions/min_terminated_length": 526.0, |
| "epoch": 0.152, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2753625810146332, |
| "learning_rate": 9.282549715730579e-07, |
| "loss": 0.0, |
| "num_tokens": 16290283.0, |
| "reward": -0.1931842416524887, |
| "reward_std": 0.2315790057182312, |
| "rewards/cosine_scaled_reward/mean": -0.1931842565536499, |
| "rewards/cosine_scaled_reward/std": 0.26366862654685974, |
| "step": 133 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.609375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1907.0, |
| "completions/mean_length": 1685.390625, |
| "completions/mean_terminated_length": 1119.719970703125, |
| "completions/min_length": 660.0, |
| "completions/min_terminated_length": 660.0, |
| "epoch": 0.15314285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25077056884765625, |
| "learning_rate": 9.265439410565328e-07, |
| "loss": -0.0, |
| "num_tokens": 16408716.0, |
| "reward": -0.1305551677942276, |
| "reward_std": 0.15626969933509827, |
| "rewards/cosine_scaled_reward/mean": -0.1305551677942276, |
| "rewards/cosine_scaled_reward/std": 0.35703787207603455, |
| "step": 134 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.328125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1869.0, |
| "completions/mean_length": 1111.578125, |
| "completions/mean_terminated_length": 654.2557983398438, |
| "completions/min_length": 259.0, |
| "completions/min_terminated_length": 259.0, |
| "epoch": 0.15428571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3456169366836548, |
| "learning_rate": 9.248145583195447e-07, |
| "loss": -0.0, |
| "num_tokens": 16490329.0, |
| "reward": 0.08614158630371094, |
| "reward_std": 0.3152117133140564, |
| "rewards/cosine_scaled_reward/mean": 0.08614158630371094, |
| "rewards/cosine_scaled_reward/std": 0.5073397159576416, |
| "step": 135 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.53125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1600.0, |
| "completions/mean_length": 1485.703125, |
| "completions/mean_terminated_length": 848.433349609375, |
| "completions/min_length": 401.0, |
| "completions/min_terminated_length": 401.0, |
| "epoch": 0.15542857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.28029024600982666, |
| "learning_rate": 9.230669076497687e-07, |
| "loss": -0.0, |
| "num_tokens": 16596086.0, |
| "reward": 0.01799224689602852, |
| "reward_std": 0.28087177872657776, |
| "rewards/cosine_scaled_reward/mean": 0.017992250621318817, |
| "rewards/cosine_scaled_reward/std": 0.5039587020874023, |
| "step": 136 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1632.0, |
| "completions/mean_length": 1765.421875, |
| "completions/mean_terminated_length": 1043.27783203125, |
| "completions/min_length": 659.0, |
| "completions/min_terminated_length": 659.0, |
| "epoch": 0.15657142857142858, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.21782204508781433, |
| "learning_rate": 9.213010742252327e-07, |
| "loss": 0.0, |
| "num_tokens": 16719681.0, |
| "reward": -0.2635670304298401, |
| "reward_std": 0.16446365416049957, |
| "rewards/cosine_scaled_reward/mean": -0.2635670304298401, |
| "rewards/cosine_scaled_reward/std": 0.1840340793132782, |
| "step": 137 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.46875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1888.0, |
| "completions/mean_length": 1529.734375, |
| "completions/mean_terminated_length": 1072.441162109375, |
| "completions/min_length": 362.0, |
| "completions/min_terminated_length": 362.0, |
| "epoch": 0.15771428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26588714122772217, |
| "learning_rate": 9.195171441101668e-07, |
| "loss": -0.0, |
| "num_tokens": 16828896.0, |
| "reward": -0.08665560930967331, |
| "reward_std": 0.23063711822032928, |
| "rewards/cosine_scaled_reward/mean": -0.08665560930967331, |
| "rewards/cosine_scaled_reward/std": 0.44113171100616455, |
| "step": 138 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.640625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2036.0, |
| "completions/mean_length": 1667.8125, |
| "completions/mean_terminated_length": 990.0869750976562, |
| "completions/min_length": 306.0, |
| "completions/min_terminated_length": 306.0, |
| "epoch": 0.15885714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2917172610759735, |
| "learning_rate": 9.177152042508077e-07, |
| "loss": -0.0, |
| "num_tokens": 16946276.0, |
| "reward": -0.19403964281082153, |
| "reward_std": 0.2673150300979614, |
| "rewards/cosine_scaled_reward/mean": -0.19403962790966034, |
| "rewards/cosine_scaled_reward/std": 0.32773110270500183, |
| "step": 139 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.421875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1934.0, |
| "completions/mean_length": 1412.796875, |
| "completions/mean_terminated_length": 949.270263671875, |
| "completions/min_length": 384.0, |
| "completions/min_terminated_length": 384.0, |
| "epoch": 0.16, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 0.28324976563453674, |
| "learning_rate": 9.158953424711624e-07, |
| "loss": -0.0, |
| "num_tokens": 17046919.0, |
| "reward": -0.13130062818527222, |
| "reward_std": 0.13907812535762787, |
| "rewards/cosine_scaled_reward/mean": -0.13130061328411102, |
| "rewards/cosine_scaled_reward/std": 0.46400320529937744, |
| "step": 140 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.328125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1615.0, |
| "completions/mean_length": 1272.25, |
| "completions/mean_terminated_length": 893.3953247070312, |
| "completions/min_length": 518.0, |
| "completions/min_terminated_length": 518.0, |
| "epoch": 0.16114285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.28660058975219727, |
| "learning_rate": 9.140576474687263e-07, |
| "loss": 0.0, |
| "num_tokens": 17138903.0, |
| "reward": -0.044462256133556366, |
| "reward_std": 0.3412697911262512, |
| "rewards/cosine_scaled_reward/mean": -0.04446224868297577, |
| "rewards/cosine_scaled_reward/std": 0.4661441445350647, |
| "step": 141 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.53125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1959.0, |
| "completions/mean_length": 1662.734375, |
| "completions/mean_terminated_length": 1226.10009765625, |
| "completions/min_length": 344.0, |
| "completions/min_terminated_length": 344.0, |
| "epoch": 0.16228571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3415294587612152, |
| "learning_rate": 9.122022088101613e-07, |
| "loss": -0.0, |
| "num_tokens": 17255822.0, |
| "reward": -0.15457069873809814, |
| "reward_std": 0.31260305643081665, |
| "rewards/cosine_scaled_reward/mean": -0.15457069873809814, |
| "rewards/cosine_scaled_reward/std": 0.3450033664703369, |
| "step": 142 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.421875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1791.0, |
| "completions/mean_length": 1441.203125, |
| "completions/mean_terminated_length": 998.4054565429688, |
| "completions/min_length": 462.0, |
| "completions/min_terminated_length": 462.0, |
| "epoch": 0.16342857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2900330424308777, |
| "learning_rate": 9.103291169269299e-07, |
| "loss": 0.0, |
| "num_tokens": 17358875.0, |
| "reward": -0.1936979442834854, |
| "reward_std": 0.26940327882766724, |
| "rewards/cosine_scaled_reward/mean": -0.1936979442834854, |
| "rewards/cosine_scaled_reward/std": 0.31407564878463745, |
| "step": 143 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.53125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2043.0, |
| "completions/mean_length": 1560.703125, |
| "completions/mean_terminated_length": 1008.4334106445312, |
| "completions/min_length": 318.0, |
| "completions/min_terminated_length": 318.0, |
| "epoch": 0.16457142857142856, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.29284507036209106, |
| "learning_rate": 9.084384631108882e-07, |
| "loss": -0.0, |
| "num_tokens": 17470248.0, |
| "reward": -0.14136260747909546, |
| "reward_std": 0.2985552251338959, |
| "rewards/cosine_scaled_reward/mean": -0.14136262238025665, |
| "rewards/cosine_scaled_reward/std": 0.4261241853237152, |
| "step": 144 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1864.0, |
| "completions/mean_length": 1226.0, |
| "completions/mean_terminated_length": 852.3636474609375, |
| "completions/min_length": 316.0, |
| "completions/min_terminated_length": 316.0, |
| "epoch": 0.1657142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.30853384733200073, |
| "learning_rate": 9.065303395098358e-07, |
| "loss": -0.0, |
| "num_tokens": 17558656.0, |
| "reward": -0.011180020868778229, |
| "reward_std": 0.3104313910007477, |
| "rewards/cosine_scaled_reward/mean": -0.011180016212165356, |
| "rewards/cosine_scaled_reward/std": 0.502927303314209, |
| "step": 145 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.5, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1440.0, |
| "completions/mean_length": 1468.8125, |
| "completions/mean_terminated_length": 889.625, |
| "completions/min_length": 537.0, |
| "completions/min_terminated_length": 537.0, |
| "epoch": 0.16685714285714287, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25645971298217773, |
| "learning_rate": 9.046048391230247e-07, |
| "loss": -0.0, |
| "num_tokens": 17663276.0, |
| "reward": -0.1956520974636078, |
| "reward_std": 0.24750414490699768, |
| "rewards/cosine_scaled_reward/mean": -0.1956520974636078, |
| "rewards/cosine_scaled_reward/std": 0.30754002928733826, |
| "step": 146 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.453125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2011.0, |
| "completions/mean_length": 1517.84375, |
| "completions/mean_terminated_length": 1078.5714111328125, |
| "completions/min_length": 615.0, |
| "completions/min_terminated_length": 615.0, |
| "epoch": 0.168, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.28331542015075684, |
| "learning_rate": 9.026620557966279e-07, |
| "loss": -0.0, |
| "num_tokens": 17771202.0, |
| "reward": -0.14546620845794678, |
| "reward_std": 0.307411253452301, |
| "rewards/cosine_scaled_reward/mean": -0.14546619355678558, |
| "rewards/cosine_scaled_reward/std": 0.3964070975780487, |
| "step": 147 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1735.0, |
| "completions/mean_length": 1319.75, |
| "completions/mean_terminated_length": 882.7999877929688, |
| "completions/min_length": 412.0, |
| "completions/min_terminated_length": 412.0, |
| "epoch": 0.16914285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24973155558109283, |
| "learning_rate": 9.007020842191634e-07, |
| "loss": -0.0, |
| "num_tokens": 17866850.0, |
| "reward": -0.05917578190565109, |
| "reward_std": 0.24221420288085938, |
| "rewards/cosine_scaled_reward/mean": -0.05917578190565109, |
| "rewards/cosine_scaled_reward/std": 0.39783161878585815, |
| "step": 148 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.609375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1718.0, |
| "completions/mean_length": 1641.578125, |
| "completions/mean_terminated_length": 1007.5599975585938, |
| "completions/min_length": 624.0, |
| "completions/min_terminated_length": 624.0, |
| "epoch": 0.1702857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23923377692699432, |
| "learning_rate": 8.987250199168808e-07, |
| "loss": 0.0, |
| "num_tokens": 17983807.0, |
| "reward": -0.16958971321582794, |
| "reward_std": 0.3115168809890747, |
| "rewards/cosine_scaled_reward/mean": -0.16958969831466675, |
| "rewards/cosine_scaled_reward/std": 0.4009650945663452, |
| "step": 149 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.296875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1699.0, |
| "completions/mean_length": 1294.734375, |
| "completions/mean_terminated_length": 976.6889038085938, |
| "completions/min_length": 359.0, |
| "completions/min_terminated_length": 359.0, |
| "epoch": 0.17142857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2846779525279999, |
| "learning_rate": 8.967309592491052e-07, |
| "loss": 0.0, |
| "num_tokens": 18077174.0, |
| "reward": -0.16757264733314514, |
| "reward_std": 0.26536184549331665, |
| "rewards/cosine_scaled_reward/mean": -0.16757264733314514, |
| "rewards/cosine_scaled_reward/std": 0.32911255955696106, |
| "step": 150 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 500, |
| "num_input_tokens_seen": 18077174, |
| "num_train_epochs": 1, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|