| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.22857142857142856, |
| "eval_steps": 500, |
| "global_step": 200, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.671875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1734.0, |
| "completions/mean_length": 1702.03125, |
| "completions/mean_terminated_length": 993.6190795898438, |
| "completions/min_length": 483.0, |
| "completions/min_terminated_length": 483.0, |
| "epoch": 0.001142857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.28377610445022583, |
| "learning_rate": 0.0, |
| "loss": -0.0, |
| "num_tokens": 118418.0, |
| "reward": -0.09800112247467041, |
| "reward_std": 0.3028089702129364, |
| "rewards/cosine_scaled_reward/mean": -0.09800112992525101, |
| "rewards/cosine_scaled_reward/std": 0.37953105568885803, |
| "step": 1 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1894.0, |
| "completions/mean_length": 1738.90625, |
| "completions/mean_terminated_length": 949.0, |
| "completions/min_length": 435.0, |
| "completions/min_terminated_length": 435.0, |
| "epoch": 0.002285714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24221572279930115, |
| "learning_rate": 2e-08, |
| "loss": -0.0, |
| "num_tokens": 239748.0, |
| "reward": 0.020556632429361343, |
| "reward_std": 0.3545936942100525, |
| "rewards/cosine_scaled_reward/mean": 0.020556632429361343, |
| "rewards/cosine_scaled_reward/std": 0.4492928683757782, |
| "step": 2 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.921875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1542.0, |
| "completions/mean_length": 1964.078125, |
| "completions/mean_terminated_length": 973.7999877929688, |
| "completions/min_length": 733.0, |
| "completions/min_terminated_length": 733.0, |
| "epoch": 0.0034285714285714284, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2472974807024002, |
| "learning_rate": 4e-08, |
| "loss": 0.0, |
| "num_tokens": 375921.0, |
| "reward": -0.20954538881778717, |
| "reward_std": 0.13813795149326324, |
| "rewards/cosine_scaled_reward/mean": -0.20954540371894836, |
| "rewards/cosine_scaled_reward/std": 0.16814909875392914, |
| "step": 3 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.484375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2010.0, |
| "completions/mean_length": 1555.6875, |
| "completions/mean_terminated_length": 1093.212158203125, |
| "completions/min_length": 502.0, |
| "completions/min_terminated_length": 502.0, |
| "epoch": 0.004571428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2868657112121582, |
| "learning_rate": 6e-08, |
| "loss": -0.0, |
| "num_tokens": 485293.0, |
| "reward": -0.12192361056804657, |
| "reward_std": 0.31710442900657654, |
| "rewards/cosine_scaled_reward/mean": -0.12192361056804657, |
| "rewards/cosine_scaled_reward/std": 0.35428565740585327, |
| "step": 4 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1758.0, |
| "completions/mean_length": 1958.5625, |
| "completions/mean_terminated_length": 1332.5, |
| "completions/min_length": 932.0, |
| "completions/min_terminated_length": 932.0, |
| "epoch": 0.005714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2460148185491562, |
| "learning_rate": 8e-08, |
| "loss": -0.0, |
| "num_tokens": 621457.0, |
| "reward": -0.21145480871200562, |
| "reward_std": 0.14890719950199127, |
| "rewards/cosine_scaled_reward/mean": -0.21145479381084442, |
| "rewards/cosine_scaled_reward/std": 0.20399661362171173, |
| "step": 5 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1670.0, |
| "completions/mean_length": 1908.375, |
| "completions/mean_terminated_length": 931.0, |
| "completions/min_length": 593.0, |
| "completions/min_terminated_length": 593.0, |
| "epoch": 0.006857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26549720764160156, |
| "learning_rate": 1e-07, |
| "loss": -0.0, |
| "num_tokens": 755241.0, |
| "reward": -0.2408866286277771, |
| "reward_std": 0.16572487354278564, |
| "rewards/cosine_scaled_reward/mean": -0.2408866286277771, |
| "rewards/cosine_scaled_reward/std": 0.17492830753326416, |
| "step": 6 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.8125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1955.0, |
| "completions/mean_length": 1889.296875, |
| "completions/mean_terminated_length": 1201.5833740234375, |
| "completions/min_length": 396.0, |
| "completions/min_terminated_length": 396.0, |
| "epoch": 0.008, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23518230020999908, |
| "learning_rate": 1.2e-07, |
| "loss": 0.0, |
| "num_tokens": 886564.0, |
| "reward": -0.16087877750396729, |
| "reward_std": 0.24579641222953796, |
| "rewards/cosine_scaled_reward/mean": -0.16087877750396729, |
| "rewards/cosine_scaled_reward/std": 0.37339961528778076, |
| "step": 7 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1920.0, |
| "completions/mean_length": 1751.578125, |
| "completions/mean_terminated_length": 994.0555419921875, |
| "completions/min_length": 330.0, |
| "completions/min_terminated_length": 330.0, |
| "epoch": 0.009142857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2354528158903122, |
| "learning_rate": 1.4e-07, |
| "loss": 0.0, |
| "num_tokens": 1009081.0, |
| "reward": -0.023812226951122284, |
| "reward_std": 0.2823081314563751, |
| "rewards/cosine_scaled_reward/mean": -0.02381223440170288, |
| "rewards/cosine_scaled_reward/std": 0.4484662115573883, |
| "step": 8 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.9375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1889.0, |
| "completions/mean_length": 2000.59375, |
| "completions/mean_terminated_length": 1289.5, |
| "completions/min_length": 903.0, |
| "completions/min_terminated_length": 903.0, |
| "epoch": 0.010285714285714285, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24302220344543457, |
| "learning_rate": 1.6e-07, |
| "loss": 0.0, |
| "num_tokens": 1148575.0, |
| "reward": -0.2453702688217163, |
| "reward_std": 0.18811637163162231, |
| "rewards/cosine_scaled_reward/mean": -0.2453702688217163, |
| "rewards/cosine_scaled_reward/std": 0.22203005850315094, |
| "step": 9 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.703125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1739.0, |
| "completions/mean_length": 1701.140625, |
| "completions/mean_terminated_length": 879.631591796875, |
| "completions/min_length": 484.0, |
| "completions/min_terminated_length": 484.0, |
| "epoch": 0.011428571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25642141699790955, |
| "learning_rate": 1.8e-07, |
| "loss": -0.0, |
| "num_tokens": 1268280.0, |
| "reward": -0.15177705883979797, |
| "reward_std": 0.2125300019979477, |
| "rewards/cosine_scaled_reward/mean": -0.15177705883979797, |
| "rewards/cosine_scaled_reward/std": 0.3240113854408264, |
| "step": 10 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.890625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1683.0, |
| "completions/mean_length": 1950.609375, |
| "completions/mean_terminated_length": 1157.571533203125, |
| "completions/min_length": 584.0, |
| "completions/min_terminated_length": 584.0, |
| "epoch": 0.012571428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24372951686382294, |
| "learning_rate": 2e-07, |
| "loss": 0.0, |
| "num_tokens": 1404791.0, |
| "reward": -0.23502977192401886, |
| "reward_std": 0.18896539509296417, |
| "rewards/cosine_scaled_reward/mean": -0.23502977192401886, |
| "rewards/cosine_scaled_reward/std": 0.24224351346492767, |
| "step": 11 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.640625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1944.0, |
| "completions/mean_length": 1751.03125, |
| "completions/mean_terminated_length": 1221.6522216796875, |
| "completions/min_length": 489.0, |
| "completions/min_terminated_length": 489.0, |
| "epoch": 0.013714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.28422027826309204, |
| "learning_rate": 2.1999999999999998e-07, |
| "loss": -0.0, |
| "num_tokens": 1527801.0, |
| "reward": -0.14280016720294952, |
| "reward_std": 0.32843896746635437, |
| "rewards/cosine_scaled_reward/mean": -0.14280015230178833, |
| "rewards/cosine_scaled_reward/std": 0.41895967721939087, |
| "step": 12 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1793.0, |
| "completions/mean_length": 1834.453125, |
| "completions/mean_terminated_length": 1193.8125, |
| "completions/min_length": 783.0, |
| "completions/min_terminated_length": 783.0, |
| "epoch": 0.014857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24033738672733307, |
| "learning_rate": 2.4e-07, |
| "loss": 0.0, |
| "num_tokens": 1656246.0, |
| "reward": -0.17057427763938904, |
| "reward_std": 0.24429668486118317, |
| "rewards/cosine_scaled_reward/mean": -0.17057427763938904, |
| "rewards/cosine_scaled_reward/std": 0.27816399931907654, |
| "step": 13 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.734375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1474.0, |
| "completions/mean_length": 1800.65625, |
| "completions/mean_terminated_length": 1116.823486328125, |
| "completions/min_length": 495.0, |
| "completions/min_terminated_length": 495.0, |
| "epoch": 0.016, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2312558889389038, |
| "learning_rate": 2.6e-07, |
| "loss": 0.0, |
| "num_tokens": 1782096.0, |
| "reward": -0.11817245185375214, |
| "reward_std": 0.24491220712661743, |
| "rewards/cosine_scaled_reward/mean": -0.11817245930433273, |
| "rewards/cosine_scaled_reward/std": 0.3942086696624756, |
| "step": 14 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1636.0, |
| "completions/mean_length": 1692.828125, |
| "completions/mean_terminated_length": 785.1666870117188, |
| "completions/min_length": 438.0, |
| "completions/min_terminated_length": 438.0, |
| "epoch": 0.017142857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2563658654689789, |
| "learning_rate": 2.8e-07, |
| "loss": -0.0, |
| "num_tokens": 1901357.0, |
| "reward": -0.027107469737529755, |
| "reward_std": 0.1853453516960144, |
| "rewards/cosine_scaled_reward/mean": -0.027107462286949158, |
| "rewards/cosine_scaled_reward/std": 0.4734213352203369, |
| "step": 15 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 2048.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 2048.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.018285714285714287, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24149107933044434, |
| "learning_rate": 3e-07, |
| "loss": -0.0, |
| "num_tokens": 2042869.0, |
| "reward": -0.2542623281478882, |
| "reward_std": 0.14302438497543335, |
| "rewards/cosine_scaled_reward/mean": -0.2542623281478882, |
| "rewards/cosine_scaled_reward/std": 0.160969540476799, |
| "step": 16 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.578125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1827.0, |
| "completions/mean_length": 1548.75, |
| "completions/mean_terminated_length": 864.5925903320312, |
| "completions/min_length": 357.0, |
| "completions/min_terminated_length": 357.0, |
| "epoch": 0.019428571428571427, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.31088724732398987, |
| "learning_rate": 3.2e-07, |
| "loss": 0.0, |
| "num_tokens": 2152509.0, |
| "reward": -0.12113451957702637, |
| "reward_std": 0.284165620803833, |
| "rewards/cosine_scaled_reward/mean": -0.12113452702760696, |
| "rewards/cosine_scaled_reward/std": 0.4259316623210907, |
| "step": 17 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1899.0, |
| "completions/mean_length": 1793.03125, |
| "completions/mean_terminated_length": 1028.125, |
| "completions/min_length": 531.0, |
| "completions/min_terminated_length": 531.0, |
| "epoch": 0.02057142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2451843023300171, |
| "learning_rate": 3.4000000000000003e-07, |
| "loss": 0.0, |
| "num_tokens": 2277639.0, |
| "reward": -0.18317042291164398, |
| "reward_std": 0.20634235441684723, |
| "rewards/cosine_scaled_reward/mean": -0.18317043781280518, |
| "rewards/cosine_scaled_reward/std": 0.27781662344932556, |
| "step": 18 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.703125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1633.0, |
| "completions/mean_length": 1735.984375, |
| "completions/mean_terminated_length": 997.0, |
| "completions/min_length": 462.0, |
| "completions/min_terminated_length": 462.0, |
| "epoch": 0.021714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24677637219429016, |
| "learning_rate": 3.6e-07, |
| "loss": 0.0, |
| "num_tokens": 2399998.0, |
| "reward": -0.04996331408619881, |
| "reward_std": 0.2841629385948181, |
| "rewards/cosine_scaled_reward/mean": -0.04996330291032791, |
| "rewards/cosine_scaled_reward/std": 0.4186851680278778, |
| "step": 19 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.640625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1643.0, |
| "completions/mean_length": 1614.890625, |
| "completions/mean_terminated_length": 842.8261108398438, |
| "completions/min_length": 411.0, |
| "completions/min_terminated_length": 411.0, |
| "epoch": 0.022857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2543003559112549, |
| "learning_rate": 3.7999999999999996e-07, |
| "loss": -0.0, |
| "num_tokens": 2514703.0, |
| "reward": -0.09282197058200836, |
| "reward_std": 0.2568933367729187, |
| "rewards/cosine_scaled_reward/mean": -0.09282197058200836, |
| "rewards/cosine_scaled_reward/std": 0.4104878604412079, |
| "step": 20 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1988.0, |
| "completions/mean_length": 1786.734375, |
| "completions/mean_terminated_length": 1119.0555419921875, |
| "completions/min_length": 348.0, |
| "completions/min_terminated_length": 348.0, |
| "epoch": 0.024, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3147278130054474, |
| "learning_rate": 4e-07, |
| "loss": 0.0, |
| "num_tokens": 2639862.0, |
| "reward": -0.16029146313667297, |
| "reward_std": 0.2322564721107483, |
| "rewards/cosine_scaled_reward/mean": -0.16029146313667297, |
| "rewards/cosine_scaled_reward/std": 0.36191171407699585, |
| "step": 21 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.40625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1833.0, |
| "completions/mean_length": 1300.484375, |
| "completions/mean_terminated_length": 789.0263061523438, |
| "completions/min_length": 287.0, |
| "completions/min_terminated_length": 287.0, |
| "epoch": 0.025142857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.32522445917129517, |
| "learning_rate": 4.1999999999999995e-07, |
| "loss": 0.0, |
| "num_tokens": 2732109.0, |
| "reward": 0.0033364146947860718, |
| "reward_std": 0.18878400325775146, |
| "rewards/cosine_scaled_reward/mean": 0.0033364109694957733, |
| "rewards/cosine_scaled_reward/std": 0.45390966534614563, |
| "step": 22 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.59375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1920.0, |
| "completions/mean_length": 1641.03125, |
| "completions/mean_terminated_length": 1046.2308349609375, |
| "completions/min_length": 422.0, |
| "completions/min_terminated_length": 422.0, |
| "epoch": 0.026285714285714287, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.28244850039482117, |
| "learning_rate": 4.3999999999999997e-07, |
| "loss": 0.0, |
| "num_tokens": 2847927.0, |
| "reward": -0.21077856421470642, |
| "reward_std": 0.24399788677692413, |
| "rewards/cosine_scaled_reward/mean": -0.21077856421470642, |
| "rewards/cosine_scaled_reward/std": 0.2925592362880707, |
| "step": 23 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1965.0, |
| "completions/mean_length": 1789.59375, |
| "completions/mean_terminated_length": 1129.2222900390625, |
| "completions/min_length": 560.0, |
| "completions/min_terminated_length": 560.0, |
| "epoch": 0.027428571428571427, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24896308779716492, |
| "learning_rate": 4.6e-07, |
| "loss": -0.0, |
| "num_tokens": 2973389.0, |
| "reward": -0.1665852814912796, |
| "reward_std": 0.307574987411499, |
| "rewards/cosine_scaled_reward/mean": -0.1665852665901184, |
| "rewards/cosine_scaled_reward/std": 0.4072873294353485, |
| "step": 24 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.65625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1851.0, |
| "completions/mean_length": 1696.40625, |
| "completions/mean_terminated_length": 1025.181884765625, |
| "completions/min_length": 434.0, |
| "completions/min_terminated_length": 434.0, |
| "epoch": 0.02857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.262716144323349, |
| "learning_rate": 4.8e-07, |
| "loss": 0.0, |
| "num_tokens": 3092255.0, |
| "reward": -0.14361324906349182, |
| "reward_std": 0.3466429114341736, |
| "rewards/cosine_scaled_reward/mean": -0.14361326396465302, |
| "rewards/cosine_scaled_reward/std": 0.3933021128177643, |
| "step": 25 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1975.0, |
| "completions/mean_length": 1973.046875, |
| "completions/mean_terminated_length": 1448.375, |
| "completions/min_length": 1035.0, |
| "completions/min_terminated_length": 1035.0, |
| "epoch": 0.029714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2365841567516327, |
| "learning_rate": 5e-07, |
| "loss": -0.0, |
| "num_tokens": 3229162.0, |
| "reward": -0.050574399530887604, |
| "reward_std": 0.22459164261817932, |
| "rewards/cosine_scaled_reward/mean": -0.050574399530887604, |
| "rewards/cosine_scaled_reward/std": 0.37290775775909424, |
| "step": 26 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.796875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1982.0, |
| "completions/mean_length": 1878.53125, |
| "completions/mean_terminated_length": 1213.6923828125, |
| "completions/min_length": 498.0, |
| "completions/min_terminated_length": 498.0, |
| "epoch": 0.030857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2821083068847656, |
| "learning_rate": 5.2e-07, |
| "loss": 0.0, |
| "num_tokens": 3359676.0, |
| "reward": -0.13096781075000763, |
| "reward_std": 0.26249831914901733, |
| "rewards/cosine_scaled_reward/mean": -0.13096781075000763, |
| "rewards/cosine_scaled_reward/std": 0.3478032350540161, |
| "step": 27 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.78125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1933.0, |
| "completions/mean_length": 1827.453125, |
| "completions/mean_terminated_length": 1039.7857666015625, |
| "completions/min_length": 419.0, |
| "completions/min_terminated_length": 419.0, |
| "epoch": 0.032, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2539210915565491, |
| "learning_rate": 5.4e-07, |
| "loss": 0.0, |
| "num_tokens": 3486969.0, |
| "reward": -0.11822876334190369, |
| "reward_std": 0.2370690554380417, |
| "rewards/cosine_scaled_reward/mean": -0.11822875589132309, |
| "rewards/cosine_scaled_reward/std": 0.4236762225627899, |
| "step": 28 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.9375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2047.0, |
| "completions/mean_length": 2020.5, |
| "completions/mean_terminated_length": 1608.0, |
| "completions/min_length": 887.0, |
| "completions/min_terminated_length": 887.0, |
| "epoch": 0.03314285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23259545862674713, |
| "learning_rate": 5.6e-07, |
| "loss": -0.0, |
| "num_tokens": 3626753.0, |
| "reward": -0.20220182836055756, |
| "reward_std": 0.15910759568214417, |
| "rewards/cosine_scaled_reward/mean": -0.20220182836055756, |
| "rewards/cosine_scaled_reward/std": 0.20781411230564117, |
| "step": 29 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.828125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1662.0, |
| "completions/mean_length": 1903.703125, |
| "completions/mean_terminated_length": 1208.45458984375, |
| "completions/min_length": 961.0, |
| "completions/min_terminated_length": 961.0, |
| "epoch": 0.03428571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24027252197265625, |
| "learning_rate": 5.8e-07, |
| "loss": 0.0, |
| "num_tokens": 3759126.0, |
| "reward": -0.19193249940872192, |
| "reward_std": 0.24584847688674927, |
| "rewards/cosine_scaled_reward/mean": -0.19193249940872192, |
| "rewards/cosine_scaled_reward/std": 0.28378522396087646, |
| "step": 30 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.796875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1922.0, |
| "completions/mean_length": 1847.34375, |
| "completions/mean_terminated_length": 1060.1539306640625, |
| "completions/min_length": 311.0, |
| "completions/min_terminated_length": 311.0, |
| "epoch": 0.03542857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2703397274017334, |
| "learning_rate": 6e-07, |
| "loss": -0.0, |
| "num_tokens": 3887852.0, |
| "reward": -0.25379180908203125, |
| "reward_std": 0.24661941826343536, |
| "rewards/cosine_scaled_reward/mean": -0.25379180908203125, |
| "rewards/cosine_scaled_reward/std": 0.29188498854637146, |
| "step": 31 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.828125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1995.0, |
| "completions/mean_length": 1950.3125, |
| "completions/mean_terminated_length": 1479.6363525390625, |
| "completions/min_length": 766.0, |
| "completions/min_terminated_length": 766.0, |
| "epoch": 0.036571428571428574, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.21763876080513, |
| "learning_rate": 6.2e-07, |
| "loss": -0.0, |
| "num_tokens": 4023024.0, |
| "reward": -0.16017228364944458, |
| "reward_std": 0.2255343496799469, |
| "rewards/cosine_scaled_reward/mean": -0.16017228364944458, |
| "rewards/cosine_scaled_reward/std": 0.3709539771080017, |
| "step": 32 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1957.0, |
| "completions/mean_length": 1996.28125, |
| "completions/mean_terminated_length": 1634.25, |
| "completions/min_length": 1237.0, |
| "completions/min_terminated_length": 1237.0, |
| "epoch": 0.037714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22758260369300842, |
| "learning_rate": 6.4e-07, |
| "loss": -0.0, |
| "num_tokens": 4162002.0, |
| "reward": -0.20318198204040527, |
| "reward_std": 0.18396919965744019, |
| "rewards/cosine_scaled_reward/mean": -0.20318198204040527, |
| "rewards/cosine_scaled_reward/std": 0.34913352131843567, |
| "step": 33 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.578125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1850.0, |
| "completions/mean_length": 1703.265625, |
| "completions/mean_terminated_length": 1230.851806640625, |
| "completions/min_length": 651.0, |
| "completions/min_terminated_length": 651.0, |
| "epoch": 0.038857142857142854, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.31658875942230225, |
| "learning_rate": 6.6e-07, |
| "loss": -0.0, |
| "num_tokens": 4280563.0, |
| "reward": -0.05977274850010872, |
| "reward_std": 0.30437377095222473, |
| "rewards/cosine_scaled_reward/mean": -0.059772733598947525, |
| "rewards/cosine_scaled_reward/std": 0.4424094259738922, |
| "step": 34 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.8125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1846.0, |
| "completions/mean_length": 1807.546875, |
| "completions/mean_terminated_length": 765.5833740234375, |
| "completions/min_length": 419.0, |
| "completions/min_terminated_length": 419.0, |
| "epoch": 0.04, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2792847156524658, |
| "learning_rate": 6.800000000000001e-07, |
| "loss": -0.0, |
| "num_tokens": 4407742.0, |
| "reward": -0.18658886849880219, |
| "reward_std": 0.2910658121109009, |
| "rewards/cosine_scaled_reward/mean": -0.18658888339996338, |
| "rewards/cosine_scaled_reward/std": 0.34802255034446716, |
| "step": 35 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.921875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1912.0, |
| "completions/mean_length": 1995.65625, |
| "completions/mean_terminated_length": 1378.0, |
| "completions/min_length": 1090.0, |
| "completions/min_terminated_length": 1090.0, |
| "epoch": 0.04114285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23547738790512085, |
| "learning_rate": 7e-07, |
| "loss": 0.0, |
| "num_tokens": 4546576.0, |
| "reward": -0.23918019235134125, |
| "reward_std": 0.19598917663097382, |
| "rewards/cosine_scaled_reward/mean": -0.23918019235134125, |
| "rewards/cosine_scaled_reward/std": 0.2425125539302826, |
| "step": 36 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.90625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2015.0, |
| "completions/mean_length": 1994.75, |
| "completions/mean_terminated_length": 1480.0, |
| "completions/min_length": 545.0, |
| "completions/min_terminated_length": 545.0, |
| "epoch": 0.04228571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22962674498558044, |
| "learning_rate": 7.2e-07, |
| "loss": -0.0, |
| "num_tokens": 4685264.0, |
| "reward": -0.25335729122161865, |
| "reward_std": 0.15323391556739807, |
| "rewards/cosine_scaled_reward/mean": -0.25335729122161865, |
| "rewards/cosine_scaled_reward/std": 0.17556406557559967, |
| "step": 37 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.890625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1934.0, |
| "completions/mean_length": 1957.484375, |
| "completions/mean_terminated_length": 1220.4285888671875, |
| "completions/min_length": 965.0, |
| "completions/min_terminated_length": 965.0, |
| "epoch": 0.04342857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24781912565231323, |
| "learning_rate": 7.4e-07, |
| "loss": -0.0, |
| "num_tokens": 4822255.0, |
| "reward": -0.13536512851715088, |
| "reward_std": 0.19208545982837677, |
| "rewards/cosine_scaled_reward/mean": -0.13536511361598969, |
| "rewards/cosine_scaled_reward/std": 0.30052343010902405, |
| "step": 38 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1824.0, |
| "completions/mean_length": 1744.421875, |
| "completions/mean_terminated_length": 833.6875, |
| "completions/min_length": 317.0, |
| "completions/min_terminated_length": 317.0, |
| "epoch": 0.044571428571428574, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2562144994735718, |
| "learning_rate": 7.599999999999999e-07, |
| "loss": -0.0, |
| "num_tokens": 4944682.0, |
| "reward": -0.041110455989837646, |
| "reward_std": 0.21381449699401855, |
| "rewards/cosine_scaled_reward/mean": -0.04111045226454735, |
| "rewards/cosine_scaled_reward/std": 0.35980772972106934, |
| "step": 39 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.734375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2043.0, |
| "completions/mean_length": 1774.359375, |
| "completions/mean_terminated_length": 1017.8235473632812, |
| "completions/min_length": 445.0, |
| "completions/min_terminated_length": 445.0, |
| "epoch": 0.045714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25478634238243103, |
| "learning_rate": 7.799999999999999e-07, |
| "loss": 0.0, |
| "num_tokens": 5068313.0, |
| "reward": -0.12165145576000214, |
| "reward_std": 0.17204006016254425, |
| "rewards/cosine_scaled_reward/mean": -0.12165144830942154, |
| "rewards/cosine_scaled_reward/std": 0.4099982678890228, |
| "step": 40 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.640625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1991.0, |
| "completions/mean_length": 1814.375, |
| "completions/mean_terminated_length": 1397.9130859375, |
| "completions/min_length": 968.0, |
| "completions/min_terminated_length": 968.0, |
| "epoch": 0.046857142857142854, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.21750310063362122, |
| "learning_rate": 8e-07, |
| "loss": 0.0, |
| "num_tokens": 5195585.0, |
| "reward": -0.25668060779571533, |
| "reward_std": 0.2832298278808594, |
| "rewards/cosine_scaled_reward/mean": -0.25668060779571533, |
| "rewards/cosine_scaled_reward/std": 0.3347759544849396, |
| "step": 41 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.765625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1764.0, |
| "completions/mean_length": 1714.59375, |
| "completions/mean_terminated_length": 625.4666748046875, |
| "completions/min_length": 186.0, |
| "completions/min_terminated_length": 186.0, |
| "epoch": 0.048, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.34486907720565796, |
| "learning_rate": 8.199999999999999e-07, |
| "loss": -0.0, |
| "num_tokens": 5315679.0, |
| "reward": -0.2253742218017578, |
| "reward_std": 0.1778060495853424, |
| "rewards/cosine_scaled_reward/mean": -0.22537420690059662, |
| "rewards/cosine_scaled_reward/std": 0.19647939503192902, |
| "step": 42 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.828125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1638.0, |
| "completions/mean_length": 1863.78125, |
| "completions/mean_terminated_length": 976.1818237304688, |
| "completions/min_length": 669.0, |
| "completions/min_terminated_length": 669.0, |
| "epoch": 0.04914285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23907455801963806, |
| "learning_rate": 8.399999999999999e-07, |
| "loss": 0.0, |
| "num_tokens": 5446577.0, |
| "reward": -0.1142776757478714, |
| "reward_std": 0.21804723143577576, |
| "rewards/cosine_scaled_reward/mean": -0.1142776757478714, |
| "rewards/cosine_scaled_reward/std": 0.3637608587741852, |
| "step": 43 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1920.0, |
| "completions/mean_length": 1771.125, |
| "completions/mean_terminated_length": 940.5, |
| "completions/min_length": 344.0, |
| "completions/min_terminated_length": 344.0, |
| "epoch": 0.05028571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2888188362121582, |
| "learning_rate": 8.599999999999999e-07, |
| "loss": 0.0, |
| "num_tokens": 5570625.0, |
| "reward": -0.11845305562019348, |
| "reward_std": 0.2729855477809906, |
| "rewards/cosine_scaled_reward/mean": -0.11845306307077408, |
| "rewards/cosine_scaled_reward/std": 0.4279690086841583, |
| "step": 44 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.96875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1296.0, |
| "completions/mean_length": 2020.859375, |
| "completions/mean_terminated_length": 1179.5, |
| "completions/min_length": 1063.0, |
| "completions/min_terminated_length": 1063.0, |
| "epoch": 0.05142857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2232045829296112, |
| "learning_rate": 8.799999999999999e-07, |
| "loss": 0.0, |
| "num_tokens": 5711616.0, |
| "reward": -0.1830526441335678, |
| "reward_std": 0.20074567198753357, |
| "rewards/cosine_scaled_reward/mean": -0.1830526441335678, |
| "rewards/cosine_scaled_reward/std": 0.3221423327922821, |
| "step": 45 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.828125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1121.0, |
| "completions/mean_length": 1843.328125, |
| "completions/mean_terminated_length": 857.1818237304688, |
| "completions/min_length": 608.0, |
| "completions/min_terminated_length": 608.0, |
| "epoch": 0.052571428571428575, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2569328844547272, |
| "learning_rate": 9e-07, |
| "loss": 0.0, |
| "num_tokens": 5840757.0, |
| "reward": -0.21247822046279907, |
| "reward_std": 0.17188501358032227, |
| "rewards/cosine_scaled_reward/mean": -0.21247822046279907, |
| "rewards/cosine_scaled_reward/std": 0.183182492852211, |
| "step": 46 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.734375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2047.0, |
| "completions/mean_length": 1772.984375, |
| "completions/mean_terminated_length": 1012.6470336914062, |
| "completions/min_length": 461.0, |
| "completions/min_terminated_length": 461.0, |
| "epoch": 0.053714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2800576090812683, |
| "learning_rate": 9.2e-07, |
| "loss": -0.0, |
| "num_tokens": 5964628.0, |
| "reward": -0.1755329668521881, |
| "reward_std": 0.19662824273109436, |
| "rewards/cosine_scaled_reward/mean": -0.1755329668521881, |
| "rewards/cosine_scaled_reward/std": 0.3987559974193573, |
| "step": 47 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1949.0, |
| "completions/mean_length": 1787.046875, |
| "completions/mean_terminated_length": 1120.1666259765625, |
| "completions/min_length": 630.0, |
| "completions/min_terminated_length": 630.0, |
| "epoch": 0.054857142857142854, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2499135434627533, |
| "learning_rate": 9.399999999999999e-07, |
| "loss": -0.0, |
| "num_tokens": 6089543.0, |
| "reward": -0.07469595968723297, |
| "reward_std": 0.2802818715572357, |
| "rewards/cosine_scaled_reward/mean": -0.07469595968723297, |
| "rewards/cosine_scaled_reward/std": 0.39331451058387756, |
| "step": 48 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.578125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1818.0, |
| "completions/mean_length": 1611.65625, |
| "completions/mean_terminated_length": 1013.7037353515625, |
| "completions/min_length": 298.0, |
| "completions/min_terminated_length": 298.0, |
| "epoch": 0.056, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2976716160774231, |
| "learning_rate": 9.6e-07, |
| "loss": -0.0, |
| "num_tokens": 6202753.0, |
| "reward": -0.14219576120376587, |
| "reward_std": 0.3252427875995636, |
| "rewards/cosine_scaled_reward/mean": -0.14219576120376587, |
| "rewards/cosine_scaled_reward/std": 0.41946855187416077, |
| "step": 49 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.828125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1916.0, |
| "completions/mean_length": 1826.90625, |
| "completions/mean_terminated_length": 761.6364135742188, |
| "completions/min_length": 341.0, |
| "completions/min_terminated_length": 341.0, |
| "epoch": 0.05714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2344626933336258, |
| "learning_rate": 9.8e-07, |
| "loss": -0.0, |
| "num_tokens": 6330491.0, |
| "reward": -0.098542720079422, |
| "reward_std": 0.20483215153217316, |
| "rewards/cosine_scaled_reward/mean": -0.0985427126288414, |
| "rewards/cosine_scaled_reward/std": 0.396296888589859, |
| "step": 50 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.53125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2034.0, |
| "completions/mean_length": 1520.1875, |
| "completions/mean_terminated_length": 922.0000610351562, |
| "completions/min_length": 450.0, |
| "completions/min_terminated_length": 450.0, |
| "epoch": 0.05828571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.30348992347717285, |
| "learning_rate": 1e-06, |
| "loss": 0.0, |
| "num_tokens": 6437991.0, |
| "reward": -0.12996003031730652, |
| "reward_std": 0.2803010940551758, |
| "rewards/cosine_scaled_reward/mean": -0.12996003031730652, |
| "rewards/cosine_scaled_reward/std": 0.3464147746562958, |
| "step": 51 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.734375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1930.0, |
| "completions/mean_length": 1726.71875, |
| "completions/mean_terminated_length": 838.4705810546875, |
| "completions/min_length": 315.0, |
| "completions/min_terminated_length": 315.0, |
| "epoch": 0.05942857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2905585765838623, |
| "learning_rate": 9.999890338174275e-07, |
| "loss": -0.0, |
| "num_tokens": 6559853.0, |
| "reward": -0.2443142831325531, |
| "reward_std": 0.21010473370552063, |
| "rewards/cosine_scaled_reward/mean": -0.2443142831325531, |
| "rewards/cosine_scaled_reward/std": 0.32864055037498474, |
| "step": 52 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.734375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1585.0, |
| "completions/mean_length": 1757.015625, |
| "completions/mean_terminated_length": 952.5294189453125, |
| "completions/min_length": 463.0, |
| "completions/min_terminated_length": 463.0, |
| "epoch": 0.060571428571428575, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2699633538722992, |
| "learning_rate": 9.999561358041868e-07, |
| "loss": 0.0, |
| "num_tokens": 6683134.0, |
| "reward": -0.18116676807403564, |
| "reward_std": 0.2308851182460785, |
| "rewards/cosine_scaled_reward/mean": -0.18116676807403564, |
| "rewards/cosine_scaled_reward/std": 0.27486056089401245, |
| "step": 53 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.765625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2018.0, |
| "completions/mean_length": 1850.65625, |
| "completions/mean_terminated_length": 1206.0001220703125, |
| "completions/min_length": 695.0, |
| "completions/min_terminated_length": 695.0, |
| "epoch": 0.061714285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23383355140686035, |
| "learning_rate": 9.999013075636804e-07, |
| "loss": -0.0, |
| "num_tokens": 6812720.0, |
| "reward": -0.14257444441318512, |
| "reward_std": 0.29668545722961426, |
| "rewards/cosine_scaled_reward/mean": -0.14257442951202393, |
| "rewards/cosine_scaled_reward/std": 0.4257228672504425, |
| "step": 54 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1449.0, |
| "completions/mean_length": 1754.640625, |
| "completions/mean_terminated_length": 874.5625, |
| "completions/min_length": 581.0, |
| "completions/min_terminated_length": 581.0, |
| "epoch": 0.06285714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23320119082927704, |
| "learning_rate": 9.998245517681593e-07, |
| "loss": -0.0, |
| "num_tokens": 6935305.0, |
| "reward": -0.14078931510448456, |
| "reward_std": 0.17466726899147034, |
| "rewards/cosine_scaled_reward/mean": -0.14078931510448456, |
| "rewards/cosine_scaled_reward/std": 0.3331747353076935, |
| "step": 55 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.828125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1918.0, |
| "completions/mean_length": 1853.78125, |
| "completions/mean_terminated_length": 918.0, |
| "completions/min_length": 571.0, |
| "completions/min_terminated_length": 571.0, |
| "epoch": 0.064, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23405365645885468, |
| "learning_rate": 9.997258721585931e-07, |
| "loss": -0.0, |
| "num_tokens": 7064907.0, |
| "reward": -0.11611534655094147, |
| "reward_std": 0.19285616278648376, |
| "rewards/cosine_scaled_reward/mean": -0.11611534655094147, |
| "rewards/cosine_scaled_reward/std": 0.47406119108200073, |
| "step": 56 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1851.0, |
| "completions/mean_length": 1971.640625, |
| "completions/mean_terminated_length": 1437.125, |
| "completions/min_length": 1009.0, |
| "completions/min_terminated_length": 1009.0, |
| "epoch": 0.06514285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.20449356734752655, |
| "learning_rate": 9.996052735444862e-07, |
| "loss": 0.0, |
| "num_tokens": 7202660.0, |
| "reward": -0.27627938985824585, |
| "reward_std": 0.2080146074295044, |
| "rewards/cosine_scaled_reward/mean": -0.27627938985824585, |
| "rewards/cosine_scaled_reward/std": 0.2397139072418213, |
| "step": 57 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.65625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1813.0, |
| "completions/mean_length": 1678.09375, |
| "completions/mean_terminated_length": 971.9091186523438, |
| "completions/min_length": 540.0, |
| "completions/min_terminated_length": 540.0, |
| "epoch": 0.06628571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.251164048910141, |
| "learning_rate": 9.994627618036452e-07, |
| "loss": -0.0, |
| "num_tokens": 7320154.0, |
| "reward": -0.1333095282316208, |
| "reward_std": 0.27265745401382446, |
| "rewards/cosine_scaled_reward/mean": -0.1333095282316208, |
| "rewards/cosine_scaled_reward/std": 0.3821713328361511, |
| "step": 58 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.734375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1453.0, |
| "completions/mean_length": 1732.171875, |
| "completions/mean_terminated_length": 859.0, |
| "completions/min_length": 531.0, |
| "completions/min_terminated_length": 531.0, |
| "epoch": 0.06742857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22478283941745758, |
| "learning_rate": 9.992983438818915e-07, |
| "loss": -0.0, |
| "num_tokens": 7441477.0, |
| "reward": -0.18278491497039795, |
| "reward_std": 0.2154037207365036, |
| "rewards/cosine_scaled_reward/mean": -0.18278491497039795, |
| "rewards/cosine_scaled_reward/std": 0.3414745628833771, |
| "step": 59 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.765625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1935.0, |
| "completions/mean_length": 1798.375, |
| "completions/mean_terminated_length": 982.9334106445312, |
| "completions/min_length": 613.0, |
| "completions/min_terminated_length": 613.0, |
| "epoch": 0.06857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22602440416812897, |
| "learning_rate": 9.991120277927223e-07, |
| "loss": -0.0, |
| "num_tokens": 7567461.0, |
| "reward": -0.265900194644928, |
| "reward_std": 0.1530904918909073, |
| "rewards/cosine_scaled_reward/mean": -0.265900194644928, |
| "rewards/cosine_scaled_reward/std": 0.18254056572914124, |
| "step": 60 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.84375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1974.0, |
| "completions/mean_length": 1950.578125, |
| "completions/mean_terminated_length": 1424.5, |
| "completions/min_length": 808.0, |
| "completions/min_terminated_length": 808.0, |
| "epoch": 0.06971428571428571, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22684067487716675, |
| "learning_rate": 9.989038226169207e-07, |
| "loss": -0.0, |
| "num_tokens": 7703818.0, |
| "reward": -0.05269922316074371, |
| "reward_std": 0.3038993775844574, |
| "rewards/cosine_scaled_reward/mean": -0.052699219435453415, |
| "rewards/cosine_scaled_reward/std": 0.36445698142051697, |
| "step": 61 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.671875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1987.0, |
| "completions/mean_length": 1717.703125, |
| "completions/mean_terminated_length": 1041.3809814453125, |
| "completions/min_length": 432.0, |
| "completions/min_terminated_length": 432.0, |
| "epoch": 0.07085714285714285, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23552638292312622, |
| "learning_rate": 9.98673738502114e-07, |
| "loss": 0.0, |
| "num_tokens": 7823983.0, |
| "reward": -0.07779724895954132, |
| "reward_std": 0.2913648784160614, |
| "rewards/cosine_scaled_reward/mean": -0.07779725641012192, |
| "rewards/cosine_scaled_reward/std": 0.4099881649017334, |
| "step": 62 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.484375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1751.0, |
| "completions/mean_length": 1600.625, |
| "completions/mean_terminated_length": 1180.3636474609375, |
| "completions/min_length": 420.0, |
| "completions/min_terminated_length": 420.0, |
| "epoch": 0.072, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.28230276703834534, |
| "learning_rate": 9.98421786662277e-07, |
| "loss": 0.0, |
| "num_tokens": 7936679.0, |
| "reward": -0.02632874622941017, |
| "reward_std": 0.25066205859184265, |
| "rewards/cosine_scaled_reward/mean": -0.02632874995470047, |
| "rewards/cosine_scaled_reward/std": 0.4263686537742615, |
| "step": 63 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.6875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2047.0, |
| "completions/mean_length": 1743.578125, |
| "completions/mean_terminated_length": 1073.8499755859375, |
| "completions/min_length": 496.0, |
| "completions/min_terminated_length": 496.0, |
| "epoch": 0.07314285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.266590416431427, |
| "learning_rate": 9.981479793771866e-07, |
| "loss": 0.0, |
| "num_tokens": 8059220.0, |
| "reward": -0.10920079052448273, |
| "reward_std": 0.3089619576931, |
| "rewards/cosine_scaled_reward/mean": -0.10920079052448273, |
| "rewards/cosine_scaled_reward/std": 0.43342384696006775, |
| "step": 64 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1127.0, |
| "completions/mean_length": 1690.609375, |
| "completions/mean_terminated_length": 618.4375, |
| "completions/min_length": 331.0, |
| "completions/min_terminated_length": 331.0, |
| "epoch": 0.07428571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2891872525215149, |
| "learning_rate": 9.97852329991824e-07, |
| "loss": 0.0, |
| "num_tokens": 8178123.0, |
| "reward": -0.2091352641582489, |
| "reward_std": 0.18792679905891418, |
| "rewards/cosine_scaled_reward/mean": -0.2091352641582489, |
| "rewards/cosine_scaled_reward/std": 0.40636762976646423, |
| "step": 65 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.46875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1529.0, |
| "completions/mean_length": 1320.453125, |
| "completions/mean_terminated_length": 678.5, |
| "completions/min_length": 219.0, |
| "completions/min_terminated_length": 219.0, |
| "epoch": 0.07542857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.30139341950416565, |
| "learning_rate": 9.975348529157229e-07, |
| "loss": -0.0, |
| "num_tokens": 8272864.0, |
| "reward": -0.012375831604003906, |
| "reward_std": 0.2539718747138977, |
| "rewards/cosine_scaled_reward/mean": -0.01237582415342331, |
| "rewards/cosine_scaled_reward/std": 0.45652061700820923, |
| "step": 66 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.953125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1368.0, |
| "completions/mean_length": 2001.21875, |
| "completions/mean_terminated_length": 1050.0, |
| "completions/min_length": 817.0, |
| "completions/min_terminated_length": 817.0, |
| "epoch": 0.07657142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.21435414254665375, |
| "learning_rate": 9.971955636222684e-07, |
| "loss": 0.0, |
| "num_tokens": 8411678.0, |
| "reward": -0.27966073155403137, |
| "reward_std": 0.14496129751205444, |
| "rewards/cosine_scaled_reward/mean": -0.27966073155403137, |
| "rewards/cosine_scaled_reward/std": 0.1733873188495636, |
| "step": 67 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.453125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1963.0, |
| "completions/mean_length": 1359.65625, |
| "completions/mean_terminated_length": 789.3142700195312, |
| "completions/min_length": 347.0, |
| "completions/min_terminated_length": 347.0, |
| "epoch": 0.07771428571428571, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3244759440422058, |
| "learning_rate": 9.968344786479415e-07, |
| "loss": -0.0, |
| "num_tokens": 8507952.0, |
| "reward": -0.06231251358985901, |
| "reward_std": 0.31347835063934326, |
| "rewards/cosine_scaled_reward/mean": -0.062312521040439606, |
| "rewards/cosine_scaled_reward/std": 0.40184450149536133, |
| "step": 68 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.609375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1859.0, |
| "completions/mean_length": 1572.78125, |
| "completions/mean_terminated_length": 831.4400024414062, |
| "completions/min_length": 358.0, |
| "completions/min_terminated_length": 358.0, |
| "epoch": 0.07885714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3429071605205536, |
| "learning_rate": 9.964516155915151e-07, |
| "loss": 0.0, |
| "num_tokens": 8618954.0, |
| "reward": -0.24097035825252533, |
| "reward_std": 0.22784993052482605, |
| "rewards/cosine_scaled_reward/mean": -0.24097035825252533, |
| "rewards/cosine_scaled_reward/std": 0.2594495415687561, |
| "step": 69 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.828125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1921.0, |
| "completions/mean_length": 1859.578125, |
| "completions/mean_terminated_length": 951.727294921875, |
| "completions/min_length": 532.0, |
| "completions/min_terminated_length": 532.0, |
| "epoch": 0.08, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.221941277384758, |
| "learning_rate": 9.960469931131936e-07, |
| "loss": -0.0, |
| "num_tokens": 8749423.0, |
| "reward": -0.27105003595352173, |
| "reward_std": 0.16835230588912964, |
| "rewards/cosine_scaled_reward/mean": -0.27105003595352173, |
| "rewards/cosine_scaled_reward/std": 0.21196867525577545, |
| "step": 70 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.6875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1629.0, |
| "completions/mean_length": 1668.265625, |
| "completions/mean_terminated_length": 832.8500366210938, |
| "completions/min_length": 489.0, |
| "completions/min_terminated_length": 489.0, |
| "epoch": 0.08114285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2909034192562103, |
| "learning_rate": 9.956206309337066e-07, |
| "loss": -0.0, |
| "num_tokens": 8866912.0, |
| "reward": -0.09497882425785065, |
| "reward_std": 0.2813299000263214, |
| "rewards/cosine_scaled_reward/mean": -0.09497880935668945, |
| "rewards/cosine_scaled_reward/std": 0.4832696318626404, |
| "step": 71 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.6875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1903.0, |
| "completions/mean_length": 1697.671875, |
| "completions/mean_terminated_length": 926.9500122070312, |
| "completions/min_length": 275.0, |
| "completions/min_terminated_length": 275.0, |
| "epoch": 0.08228571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3016415238380432, |
| "learning_rate": 9.951725498333448e-07, |
| "loss": -0.0, |
| "num_tokens": 8985915.0, |
| "reward": -0.22967606782913208, |
| "reward_std": 0.18875859677791595, |
| "rewards/cosine_scaled_reward/mean": -0.2296760529279709, |
| "rewards/cosine_scaled_reward/std": 0.22012120485305786, |
| "step": 72 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.953125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1957.0, |
| "completions/mean_length": 2020.703125, |
| "completions/mean_terminated_length": 1465.666748046875, |
| "completions/min_length": 1143.0, |
| "completions/min_terminated_length": 1143.0, |
| "epoch": 0.08342857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.21586637198925018, |
| "learning_rate": 9.947027716509488e-07, |
| "loss": 0.0, |
| "num_tokens": 9125968.0, |
| "reward": -0.24284613132476807, |
| "reward_std": 0.22862236201763153, |
| "rewards/cosine_scaled_reward/mean": -0.24284613132476807, |
| "rewards/cosine_scaled_reward/std": 0.24740919470787048, |
| "step": 73 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.890625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1955.0, |
| "completions/mean_length": 1975.09375, |
| "completions/mean_terminated_length": 1381.4285888671875, |
| "completions/min_length": 532.0, |
| "completions/min_terminated_length": 532.0, |
| "epoch": 0.08457142857142858, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.21680164337158203, |
| "learning_rate": 9.942113192828444e-07, |
| "loss": 0.0, |
| "num_tokens": 9262302.0, |
| "reward": -0.1543380469083786, |
| "reward_std": 0.24083258211612701, |
| "rewards/cosine_scaled_reward/mean": -0.1543380618095398, |
| "rewards/cosine_scaled_reward/std": 0.3356986939907074, |
| "step": 74 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2038.0, |
| "completions/mean_length": 1836.3125, |
| "completions/mean_terminated_length": 1295.3333740234375, |
| "completions/min_length": 653.0, |
| "completions/min_terminated_length": 653.0, |
| "epoch": 0.08571428571428572, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 0.1845395565032959, |
| "learning_rate": 9.93698216681727e-07, |
| "loss": 0.0, |
| "num_tokens": 9390786.0, |
| "reward": -0.12792138755321503, |
| "reward_std": 0.10224759578704834, |
| "rewards/cosine_scaled_reward/mean": -0.12792138755321503, |
| "rewards/cosine_scaled_reward/std": 0.4530969560146332, |
| "step": 75 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.765625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1422.0, |
| "completions/mean_length": 1764.109375, |
| "completions/mean_terminated_length": 836.7333984375, |
| "completions/min_length": 320.0, |
| "completions/min_terminated_length": 320.0, |
| "epoch": 0.08685714285714285, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26535236835479736, |
| "learning_rate": 9.931634888554935e-07, |
| "loss": -0.0, |
| "num_tokens": 9514089.0, |
| "reward": -0.27717918157577515, |
| "reward_std": 0.19932743906974792, |
| "rewards/cosine_scaled_reward/mean": -0.27717918157577515, |
| "rewards/cosine_scaled_reward/std": 0.20844916999340057, |
| "step": 76 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1864.0, |
| "completions/mean_length": 1945.109375, |
| "completions/mean_terminated_length": 1224.875, |
| "completions/min_length": 702.0, |
| "completions/min_terminated_length": 702.0, |
| "epoch": 0.088, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2047174870967865, |
| "learning_rate": 9.926071618660237e-07, |
| "loss": -0.0, |
| "num_tokens": 9650152.0, |
| "reward": -0.09873012453317642, |
| "reward_std": 0.22244854271411896, |
| "rewards/cosine_scaled_reward/mean": -0.09873010218143463, |
| "rewards/cosine_scaled_reward/std": 0.34491515159606934, |
| "step": 77 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.859375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1976.0, |
| "completions/mean_length": 1928.703125, |
| "completions/mean_terminated_length": 1199.6666259765625, |
| "completions/min_length": 722.0, |
| "completions/min_terminated_length": 722.0, |
| "epoch": 0.08914285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22559019923210144, |
| "learning_rate": 9.9202926282791e-07, |
| "loss": 0.0, |
| "num_tokens": 9784309.0, |
| "reward": -0.09572747349739075, |
| "reward_std": 0.23068635165691376, |
| "rewards/cosine_scaled_reward/mean": -0.09572747349739075, |
| "rewards/cosine_scaled_reward/std": 0.38660773634910583, |
| "step": 78 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.5625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1864.0, |
| "completions/mean_length": 1508.40625, |
| "completions/mean_terminated_length": 814.6428833007812, |
| "completions/min_length": 339.0, |
| "completions/min_terminated_length": 339.0, |
| "epoch": 0.09028571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24668477475643158, |
| "learning_rate": 9.91429819907136e-07, |
| "loss": 0.0, |
| "num_tokens": 9890943.0, |
| "reward": -0.1618795394897461, |
| "reward_std": 0.22540031373500824, |
| "rewards/cosine_scaled_reward/mean": -0.1618795245885849, |
| "rewards/cosine_scaled_reward/std": 0.3233039081096649, |
| "step": 79 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.890625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1996.0, |
| "completions/mean_length": 2012.671875, |
| "completions/mean_terminated_length": 1725.0001220703125, |
| "completions/min_length": 1283.0, |
| "completions/min_terminated_length": 1283.0, |
| "epoch": 0.09142857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24282054603099823, |
| "learning_rate": 9.908088623197048e-07, |
| "loss": -0.0, |
| "num_tokens": 10030146.0, |
| "reward": -0.25591158866882324, |
| "reward_std": 0.15104801952838898, |
| "rewards/cosine_scaled_reward/mean": -0.25591158866882324, |
| "rewards/cosine_scaled_reward/std": 0.18741995096206665, |
| "step": 80 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.796875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1878.0, |
| "completions/mean_length": 1821.921875, |
| "completions/mean_terminated_length": 935.0000610351562, |
| "completions/min_length": 580.0, |
| "completions/min_terminated_length": 580.0, |
| "epoch": 0.09257142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3027254641056061, |
| "learning_rate": 9.901664203302124e-07, |
| "loss": 0.0, |
| "num_tokens": 10158021.0, |
| "reward": -0.15331333875656128, |
| "reward_std": 0.18424856662750244, |
| "rewards/cosine_scaled_reward/mean": -0.15331333875656128, |
| "rewards/cosine_scaled_reward/std": 0.24023762345314026, |
| "step": 81 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.703125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1852.0, |
| "completions/mean_length": 1734.28125, |
| "completions/mean_terminated_length": 991.26318359375, |
| "completions/min_length": 477.0, |
| "completions/min_terminated_length": 477.0, |
| "epoch": 0.09371428571428571, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2466808557510376, |
| "learning_rate": 9.895025252503755e-07, |
| "loss": -0.0, |
| "num_tokens": 10279343.0, |
| "reward": -0.07192108780145645, |
| "reward_std": 0.2587333917617798, |
| "rewards/cosine_scaled_reward/mean": -0.07192108780145645, |
| "rewards/cosine_scaled_reward/std": 0.46087121963500977, |
| "step": 82 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.671875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1571.0, |
| "completions/mean_length": 1688.78125, |
| "completions/mean_terminated_length": 953.2380981445312, |
| "completions/min_length": 495.0, |
| "completions/min_terminated_length": 495.0, |
| "epoch": 0.09485714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2600877285003662, |
| "learning_rate": 9.888172094375033e-07, |
| "loss": 0.0, |
| "num_tokens": 10398513.0, |
| "reward": -0.1718086451292038, |
| "reward_std": 0.2223512828350067, |
| "rewards/cosine_scaled_reward/mean": -0.1718086451292038, |
| "rewards/cosine_scaled_reward/std": 0.2828122675418854, |
| "step": 83 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.84375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1666.0, |
| "completions/mean_length": 1838.203125, |
| "completions/mean_terminated_length": 705.2999877929688, |
| "completions/min_length": 435.0, |
| "completions/min_terminated_length": 435.0, |
| "epoch": 0.096, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22531215846538544, |
| "learning_rate": 9.881105062929221e-07, |
| "loss": 0.0, |
| "num_tokens": 10526854.0, |
| "reward": -0.2154863476753235, |
| "reward_std": 0.261901319026947, |
| "rewards/cosine_scaled_reward/mean": -0.2154863476753235, |
| "rewards/cosine_scaled_reward/std": 0.29268571734428406, |
| "step": 84 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.8125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2026.0, |
| "completions/mean_length": 1926.34375, |
| "completions/mean_terminated_length": 1399.166748046875, |
| "completions/min_length": 880.0, |
| "completions/min_terminated_length": 880.0, |
| "epoch": 0.09714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.19031891226768494, |
| "learning_rate": 9.873824502603459e-07, |
| "loss": -0.0, |
| "num_tokens": 10660460.0, |
| "reward": -0.21009978652000427, |
| "reward_std": 0.19575349986553192, |
| "rewards/cosine_scaled_reward/mean": -0.21009978652000427, |
| "rewards/cosine_scaled_reward/std": 0.2456056773662567, |
| "step": 85 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.734375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1920.0, |
| "completions/mean_length": 1713.21875, |
| "completions/mean_terminated_length": 787.6470336914062, |
| "completions/min_length": 547.0, |
| "completions/min_terminated_length": 547.0, |
| "epoch": 0.09828571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.258359432220459, |
| "learning_rate": 9.866330768241983e-07, |
| "loss": -0.0, |
| "num_tokens": 10780962.0, |
| "reward": -0.1955144852399826, |
| "reward_std": 0.24323132634162903, |
| "rewards/cosine_scaled_reward/mean": -0.1955144852399826, |
| "rewards/cosine_scaled_reward/std": 0.3071554899215698, |
| "step": 86 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1815.0, |
| "completions/mean_length": 1656.0, |
| "completions/mean_terminated_length": 1002.6666870117188, |
| "completions/min_length": 532.0, |
| "completions/min_terminated_length": 532.0, |
| "epoch": 0.09942857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2636864185333252, |
| "learning_rate": 9.85862422507884e-07, |
| "loss": 0.0, |
| "num_tokens": 10897066.0, |
| "reward": -0.1988150179386139, |
| "reward_std": 0.24088150262832642, |
| "rewards/cosine_scaled_reward/mean": -0.1988150179386139, |
| "rewards/cosine_scaled_reward/std": 0.2925129532814026, |
| "step": 87 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.703125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1802.0, |
| "completions/mean_length": 1755.234375, |
| "completions/mean_terminated_length": 1061.8421630859375, |
| "completions/min_length": 500.0, |
| "completions/min_terminated_length": 500.0, |
| "epoch": 0.10057142857142858, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.29112017154693604, |
| "learning_rate": 9.850705248720068e-07, |
| "loss": 0.0, |
| "num_tokens": 11019913.0, |
| "reward": -0.02967459335923195, |
| "reward_std": 0.3240855932235718, |
| "rewards/cosine_scaled_reward/mean": -0.029674597084522247, |
| "rewards/cosine_scaled_reward/std": 0.3718070983886719, |
| "step": 88 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.671875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1768.0, |
| "completions/mean_length": 1752.78125, |
| "completions/mean_terminated_length": 1148.2857666015625, |
| "completions/min_length": 619.0, |
| "completions/min_terminated_length": 619.0, |
| "epoch": 0.10171428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2357943207025528, |
| "learning_rate": 9.8425742251254e-07, |
| "loss": -0.0, |
| "num_tokens": 11143091.0, |
| "reward": -0.1188301220536232, |
| "reward_std": 0.296513170003891, |
| "rewards/cosine_scaled_reward/mean": -0.1188301220536232, |
| "rewards/cosine_scaled_reward/std": 0.3878798484802246, |
| "step": 89 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.5625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1693.0, |
| "completions/mean_length": 1633.84375, |
| "completions/mean_terminated_length": 1101.357177734375, |
| "completions/min_length": 568.0, |
| "completions/min_terminated_length": 568.0, |
| "epoch": 0.10285714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.32384219765663147, |
| "learning_rate": 9.83423155058946e-07, |
| "loss": -0.0, |
| "num_tokens": 11257657.0, |
| "reward": -0.22837099432945251, |
| "reward_std": 0.18625205755233765, |
| "rewards/cosine_scaled_reward/mean": -0.22837099432945251, |
| "rewards/cosine_scaled_reward/std": 0.23636196553707123, |
| "step": 90 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1833.0, |
| "completions/mean_length": 1847.21875, |
| "completions/mean_terminated_length": 1244.875, |
| "completions/min_length": 716.0, |
| "completions/min_terminated_length": 716.0, |
| "epoch": 0.104, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24563109874725342, |
| "learning_rate": 9.825677631722435e-07, |
| "loss": -0.0, |
| "num_tokens": 11386447.0, |
| "reward": -0.11780542880296707, |
| "reward_std": 0.3100074827671051, |
| "rewards/cosine_scaled_reward/mean": -0.11780542135238647, |
| "rewards/cosine_scaled_reward/std": 0.39149248600006104, |
| "step": 91 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.609375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1462.0, |
| "completions/mean_length": 1595.125, |
| "completions/mean_terminated_length": 888.6399536132812, |
| "completions/min_length": 464.0, |
| "completions/min_terminated_length": 464.0, |
| "epoch": 0.10514285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2472057044506073, |
| "learning_rate": 9.816912885430258e-07, |
| "loss": -0.0, |
| "num_tokens": 11498527.0, |
| "reward": -0.2128506749868393, |
| "reward_std": 0.20926561951637268, |
| "rewards/cosine_scaled_reward/mean": -0.2128506898880005, |
| "rewards/cosine_scaled_reward/std": 0.23348061740398407, |
| "step": 92 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.9375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1221.0, |
| "completions/mean_length": 1979.953125, |
| "completions/mean_terminated_length": 959.25, |
| "completions/min_length": 822.0, |
| "completions/min_terminated_length": 822.0, |
| "epoch": 0.10628571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2550150454044342, |
| "learning_rate": 9.807937738894303e-07, |
| "loss": -0.0, |
| "num_tokens": 11636588.0, |
| "reward": -0.2922024428844452, |
| "reward_std": 0.1515069603919983, |
| "rewards/cosine_scaled_reward/mean": -0.2922024726867676, |
| "rewards/cosine_scaled_reward/std": 0.18899379670619965, |
| "step": 93 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.796875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1544.0, |
| "completions/mean_length": 1830.609375, |
| "completions/mean_terminated_length": 977.769287109375, |
| "completions/min_length": 533.0, |
| "completions/min_terminated_length": 533.0, |
| "epoch": 0.10742857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27164825797080994, |
| "learning_rate": 9.798752629550546e-07, |
| "loss": -0.0, |
| "num_tokens": 11763515.0, |
| "reward": -0.18001651763916016, |
| "reward_std": 0.18973413109779358, |
| "rewards/cosine_scaled_reward/mean": -0.18001650273799896, |
| "rewards/cosine_scaled_reward/std": 0.4316568076610565, |
| "step": 94 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.921875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2034.0, |
| "completions/mean_length": 2004.671875, |
| "completions/mean_terminated_length": 1493.4000244140625, |
| "completions/min_length": 960.0, |
| "completions/min_terminated_length": 960.0, |
| "epoch": 0.10857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.20861269533634186, |
| "learning_rate": 9.78935800506826e-07, |
| "loss": -0.0, |
| "num_tokens": 11902342.0, |
| "reward": -0.24148261547088623, |
| "reward_std": 0.18629083037376404, |
| "rewards/cosine_scaled_reward/mean": -0.24148263037204742, |
| "rewards/cosine_scaled_reward/std": 0.23122739791870117, |
| "step": 95 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.6875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1298.0, |
| "completions/mean_length": 1703.359375, |
| "completions/mean_terminated_length": 945.1500244140625, |
| "completions/min_length": 455.0, |
| "completions/min_terminated_length": 455.0, |
| "epoch": 0.10971428571428571, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2585296928882599, |
| "learning_rate": 9.779754323328192e-07, |
| "loss": 0.0, |
| "num_tokens": 12022493.0, |
| "reward": -0.11465626955032349, |
| "reward_std": 0.24939197301864624, |
| "rewards/cosine_scaled_reward/mean": -0.11465626955032349, |
| "rewards/cosine_scaled_reward/std": 0.4384477138519287, |
| "step": 96 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1809.0, |
| "completions/mean_length": 1819.921875, |
| "completions/mean_terminated_length": 1135.6875, |
| "completions/min_length": 425.0, |
| "completions/min_terminated_length": 425.0, |
| "epoch": 0.11085714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3019813299179077, |
| "learning_rate": 9.769942052400235e-07, |
| "loss": 0.0, |
| "num_tokens": 12149232.0, |
| "reward": -0.18846748769283295, |
| "reward_std": 0.2666187584400177, |
| "rewards/cosine_scaled_reward/mean": -0.18846750259399414, |
| "rewards/cosine_scaled_reward/std": 0.3043021559715271, |
| "step": 97 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.609375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2003.0, |
| "completions/mean_length": 1677.296875, |
| "completions/mean_terminated_length": 1099.0, |
| "completions/min_length": 315.0, |
| "completions/min_terminated_length": 315.0, |
| "epoch": 0.112, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2722402513027191, |
| "learning_rate": 9.759921670520634e-07, |
| "loss": 0.0, |
| "num_tokens": 12267643.0, |
| "reward": -0.09557384252548218, |
| "reward_std": 0.2643275558948517, |
| "rewards/cosine_scaled_reward/mean": -0.09557383507490158, |
| "rewards/cosine_scaled_reward/std": 0.3361329138278961, |
| "step": 98 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.765625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1141.0, |
| "completions/mean_length": 1716.59375, |
| "completions/mean_terminated_length": 634.0000610351562, |
| "completions/min_length": 393.0, |
| "completions/min_terminated_length": 393.0, |
| "epoch": 0.11314285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2789485454559326, |
| "learning_rate": 9.749693666068663e-07, |
| "loss": -0.0, |
| "num_tokens": 12388673.0, |
| "reward": -0.11132554709911346, |
| "reward_std": 0.1736970841884613, |
| "rewards/cosine_scaled_reward/mean": -0.11132554709911346, |
| "rewards/cosine_scaled_reward/std": 0.38663193583488464, |
| "step": 99 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1906.0, |
| "completions/mean_length": 1627.78125, |
| "completions/mean_terminated_length": 927.4166870117188, |
| "completions/min_length": 426.0, |
| "completions/min_terminated_length": 426.0, |
| "epoch": 0.11428571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2479974329471588, |
| "learning_rate": 9.739258537542835e-07, |
| "loss": 0.0, |
| "num_tokens": 12502563.0, |
| "reward": 0.05247430503368378, |
| "reward_std": 0.2633323669433594, |
| "rewards/cosine_scaled_reward/mean": 0.05247429758310318, |
| "rewards/cosine_scaled_reward/std": 0.44700634479522705, |
| "step": 100 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.640625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1780.0, |
| "completions/mean_length": 1684.75, |
| "completions/mean_terminated_length": 1037.2174072265625, |
| "completions/min_length": 555.0, |
| "completions/min_terminated_length": 555.0, |
| "epoch": 0.11542857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2880499064922333, |
| "learning_rate": 9.728616793536587e-07, |
| "loss": -0.0, |
| "num_tokens": 12621819.0, |
| "reward": -0.09590694308280945, |
| "reward_std": 0.21176990866661072, |
| "rewards/cosine_scaled_reward/mean": -0.09590694308280945, |
| "rewards/cosine_scaled_reward/std": 0.426421195268631, |
| "step": 101 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.421875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1585.0, |
| "completions/mean_length": 1361.265625, |
| "completions/mean_terminated_length": 860.1351318359375, |
| "completions/min_length": 416.0, |
| "completions/min_terminated_length": 416.0, |
| "epoch": 0.11657142857142858, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2874862551689148, |
| "learning_rate": 9.717768952713511e-07, |
| "loss": -0.0, |
| "num_tokens": 12719092.0, |
| "reward": -0.19330359995365143, |
| "reward_std": 0.1932550072669983, |
| "rewards/cosine_scaled_reward/mean": -0.19330358505249023, |
| "rewards/cosine_scaled_reward/std": 0.34549427032470703, |
| "step": 102 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1384.0, |
| "completions/mean_length": 1687.90625, |
| "completions/mean_terminated_length": 607.625, |
| "completions/min_length": 221.0, |
| "completions/min_terminated_length": 221.0, |
| "epoch": 0.11771428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.29745906591415405, |
| "learning_rate": 9.706715543782064e-07, |
| "loss": -0.0, |
| "num_tokens": 12837470.0, |
| "reward": -0.2588111162185669, |
| "reward_std": 0.26013171672821045, |
| "rewards/cosine_scaled_reward/mean": -0.2588111162185669, |
| "rewards/cosine_scaled_reward/std": 0.32377612590789795, |
| "step": 103 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.671875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1627.0, |
| "completions/mean_length": 1679.59375, |
| "completions/mean_terminated_length": 925.2380981445312, |
| "completions/min_length": 585.0, |
| "completions/min_terminated_length": 585.0, |
| "epoch": 0.11885714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27166086435317993, |
| "learning_rate": 9.695457105469804e-07, |
| "loss": -0.0, |
| "num_tokens": 12955428.0, |
| "reward": -0.17275363206863403, |
| "reward_std": 0.20137225091457367, |
| "rewards/cosine_scaled_reward/mean": -0.17275363206863403, |
| "rewards/cosine_scaled_reward/std": 0.2731510400772095, |
| "step": 104 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.609375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1993.0, |
| "completions/mean_length": 1568.203125, |
| "completions/mean_terminated_length": 819.719970703125, |
| "completions/min_length": 510.0, |
| "completions/min_terminated_length": 510.0, |
| "epoch": 0.12, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26759475469589233, |
| "learning_rate": 9.683994186497132e-07, |
| "loss": -0.0, |
| "num_tokens": 13067081.0, |
| "reward": -0.1266355961561203, |
| "reward_std": 0.3027850389480591, |
| "rewards/cosine_scaled_reward/mean": -0.1266355961561203, |
| "rewards/cosine_scaled_reward/std": 0.4276663362979889, |
| "step": 105 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.5, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1958.0, |
| "completions/mean_length": 1432.09375, |
| "completions/mean_terminated_length": 816.1875, |
| "completions/min_length": 219.0, |
| "completions/min_terminated_length": 219.0, |
| "epoch": 0.12114285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2912415862083435, |
| "learning_rate": 9.672327345550543e-07, |
| "loss": 0.0, |
| "num_tokens": 13169567.0, |
| "reward": 0.052130524069070816, |
| "reward_std": 0.30294427275657654, |
| "rewards/cosine_scaled_reward/mean": 0.052130527794361115, |
| "rewards/cosine_scaled_reward/std": 0.43769362568855286, |
| "step": 106 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.65625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2024.0, |
| "completions/mean_length": 1721.28125, |
| "completions/mean_terminated_length": 1097.5455322265625, |
| "completions/min_length": 425.0, |
| "completions/min_terminated_length": 425.0, |
| "epoch": 0.12228571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26628872752189636, |
| "learning_rate": 9.66045715125541e-07, |
| "loss": 0.0, |
| "num_tokens": 13290881.0, |
| "reward": -0.18292994797229767, |
| "reward_std": 0.25176504254341125, |
| "rewards/cosine_scaled_reward/mean": -0.18292994797229767, |
| "rewards/cosine_scaled_reward/std": 0.33385229110717773, |
| "step": 107 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.5, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1916.0, |
| "completions/mean_length": 1518.6875, |
| "completions/mean_terminated_length": 989.375, |
| "completions/min_length": 430.0, |
| "completions/min_terminated_length": 430.0, |
| "epoch": 0.12342857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25796031951904297, |
| "learning_rate": 9.648384182148252e-07, |
| "loss": -0.0, |
| "num_tokens": 13398437.0, |
| "reward": -0.17732736468315125, |
| "reward_std": 0.32095974683761597, |
| "rewards/cosine_scaled_reward/mean": -0.17732736468315125, |
| "rewards/cosine_scaled_reward/std": 0.3682377338409424, |
| "step": 108 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.8125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1963.0, |
| "completions/mean_length": 1871.890625, |
| "completions/mean_terminated_length": 1108.75, |
| "completions/min_length": 673.0, |
| "completions/min_terminated_length": 673.0, |
| "epoch": 0.12457142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2274676412343979, |
| "learning_rate": 9.636109026648554e-07, |
| "loss": 0.0, |
| "num_tokens": 13529486.0, |
| "reward": -0.13115660846233368, |
| "reward_std": 0.15383467078208923, |
| "rewards/cosine_scaled_reward/mean": -0.13115662336349487, |
| "rewards/cosine_scaled_reward/std": 0.4183727204799652, |
| "step": 109 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1941.0, |
| "completions/mean_length": 1584.125, |
| "completions/mean_terminated_length": 811.0, |
| "completions/min_length": 397.0, |
| "completions/min_terminated_length": 397.0, |
| "epoch": 0.12571428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2768951952457428, |
| "learning_rate": 9.623632283030077e-07, |
| "loss": -0.0, |
| "num_tokens": 13641646.0, |
| "reward": -0.27792292833328247, |
| "reward_std": 0.18945851922035217, |
| "rewards/cosine_scaled_reward/mean": -0.27792292833328247, |
| "rewards/cosine_scaled_reward/std": 0.20238204300403595, |
| "step": 110 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.65625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2032.0, |
| "completions/mean_length": 1709.1875, |
| "completions/mean_terminated_length": 1062.3636474609375, |
| "completions/min_length": 485.0, |
| "completions/min_terminated_length": 485.0, |
| "epoch": 0.12685714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24532362818717957, |
| "learning_rate": 9.610954559391704e-07, |
| "loss": -0.0, |
| "num_tokens": 13761154.0, |
| "reward": -0.0890636295080185, |
| "reward_std": 0.33067381381988525, |
| "rewards/cosine_scaled_reward/mean": -0.0890636295080185, |
| "rewards/cosine_scaled_reward/std": 0.40376362204551697, |
| "step": 111 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1987.0, |
| "completions/mean_length": 1895.234375, |
| "completions/mean_terminated_length": 1436.9375, |
| "completions/min_length": 783.0, |
| "completions/min_terminated_length": 783.0, |
| "epoch": 0.128, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22462251782417297, |
| "learning_rate": 9.598076473627796e-07, |
| "loss": -0.0, |
| "num_tokens": 13893545.0, |
| "reward": -0.1325383186340332, |
| "reward_std": 0.330952525138855, |
| "rewards/cosine_scaled_reward/mean": -0.1325383186340332, |
| "rewards/cosine_scaled_reward/std": 0.4280668795108795, |
| "step": 112 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1695.0, |
| "completions/mean_length": 1606.890625, |
| "completions/mean_terminated_length": 871.7083740234375, |
| "completions/min_length": 284.0, |
| "completions/min_terminated_length": 284.0, |
| "epoch": 0.12914285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3009057939052582, |
| "learning_rate": 9.58499865339809e-07, |
| "loss": 0.0, |
| "num_tokens": 14006682.0, |
| "reward": -0.05043189600110054, |
| "reward_std": 0.300018846988678, |
| "rewards/cosine_scaled_reward/mean": -0.050431910902261734, |
| "rewards/cosine_scaled_reward/std": 0.43634143471717834, |
| "step": 113 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1898.0, |
| "completions/mean_length": 1562.515625, |
| "completions/mean_terminated_length": 753.375, |
| "completions/min_length": 121.0, |
| "completions/min_terminated_length": 121.0, |
| "epoch": 0.13028571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.37847185134887695, |
| "learning_rate": 9.571721736097088e-07, |
| "loss": 0.0, |
| "num_tokens": 14116531.0, |
| "reward": -0.27539706230163574, |
| "reward_std": 0.18451666831970215, |
| "rewards/cosine_scaled_reward/mean": -0.27539709210395813, |
| "rewards/cosine_scaled_reward/std": 0.23580753803253174, |
| "step": 114 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.640625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2041.0, |
| "completions/mean_length": 1656.484375, |
| "completions/mean_terminated_length": 958.5652465820312, |
| "completions/min_length": 351.0, |
| "completions/min_terminated_length": 351.0, |
| "epoch": 0.13142857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26879096031188965, |
| "learning_rate": 9.55824636882301e-07, |
| "loss": 0.0, |
| "num_tokens": 14233762.0, |
| "reward": -0.058682698756456375, |
| "reward_std": 0.2945008873939514, |
| "rewards/cosine_scaled_reward/mean": -0.05868269130587578, |
| "rewards/cosine_scaled_reward/std": 0.40092962980270386, |
| "step": 115 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.90625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1413.0, |
| "completions/mean_length": 1924.84375, |
| "completions/mean_terminated_length": 734.3333740234375, |
| "completions/min_length": 426.0, |
| "completions/min_terminated_length": 426.0, |
| "epoch": 0.13257142857142856, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2654048800468445, |
| "learning_rate": 9.54457320834625e-07, |
| "loss": 0.0, |
| "num_tokens": 14368336.0, |
| "reward": -0.2030428647994995, |
| "reward_std": 0.18692326545715332, |
| "rewards/cosine_scaled_reward/mean": -0.2030428647994995, |
| "rewards/cosine_scaled_reward/std": 0.2246093899011612, |
| "step": 116 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1420.0, |
| "completions/mean_length": 1785.484375, |
| "completions/mean_terminated_length": 997.9375, |
| "completions/min_length": 549.0, |
| "completions/min_terminated_length": 549.0, |
| "epoch": 0.1337142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26004910469055176, |
| "learning_rate": 9.530702921077358e-07, |
| "loss": -0.0, |
| "num_tokens": 14493631.0, |
| "reward": -0.19770082831382751, |
| "reward_std": 0.25534579157829285, |
| "rewards/cosine_scaled_reward/mean": -0.19770082831382751, |
| "rewards/cosine_scaled_reward/std": 0.33773326873779297, |
| "step": 117 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1906.0, |
| "completions/mean_length": 1802.84375, |
| "completions/mean_terminated_length": 1067.375, |
| "completions/min_length": 554.0, |
| "completions/min_terminated_length": 554.0, |
| "epoch": 0.13485714285714287, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22992977499961853, |
| "learning_rate": 9.516636183034564e-07, |
| "loss": 0.0, |
| "num_tokens": 14619549.0, |
| "reward": -0.011579632759094238, |
| "reward_std": 0.3697226643562317, |
| "rewards/cosine_scaled_reward/mean": -0.011579625308513641, |
| "rewards/cosine_scaled_reward/std": 0.4647332727909088, |
| "step": 118 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1982.0, |
| "completions/mean_length": 1343.578125, |
| "completions/mean_terminated_length": 920.9249877929688, |
| "completions/min_length": 317.0, |
| "completions/min_terminated_length": 317.0, |
| "epoch": 0.136, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3279743492603302, |
| "learning_rate": 9.502373679810839e-07, |
| "loss": -0.0, |
| "num_tokens": 14715946.0, |
| "reward": -0.0004618987441062927, |
| "reward_std": 0.27856603264808655, |
| "rewards/cosine_scaled_reward/mean": -0.0004618987441062927, |
| "rewards/cosine_scaled_reward/std": 0.45174649357795715, |
| "step": 119 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.359375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1546.0, |
| "completions/mean_length": 1286.75, |
| "completions/mean_terminated_length": 859.707275390625, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 256.0, |
| "epoch": 0.13714285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3185117244720459, |
| "learning_rate": 9.487916106540465e-07, |
| "loss": -0.0, |
| "num_tokens": 14808754.0, |
| "reward": -0.06128609925508499, |
| "reward_std": 0.3139324188232422, |
| "rewards/cosine_scaled_reward/mean": -0.06128609925508499, |
| "rewards/cosine_scaled_reward/std": 0.46217504143714905, |
| "step": 120 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.25, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2039.0, |
| "completions/mean_length": 1103.90625, |
| "completions/mean_terminated_length": 789.2083740234375, |
| "completions/min_length": 312.0, |
| "completions/min_terminated_length": 312.0, |
| "epoch": 0.1382857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3791055381298065, |
| "learning_rate": 9.473264167865171e-07, |
| "loss": 0.0, |
| "num_tokens": 14889100.0, |
| "reward": -0.012373358011245728, |
| "reward_std": 0.3332873284816742, |
| "rewards/cosine_scaled_reward/mean": -0.012373358011245728, |
| "rewards/cosine_scaled_reward/std": 0.4969451427459717, |
| "step": 121 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2029.0, |
| "completions/mean_length": 1765.0625, |
| "completions/mean_terminated_length": 1042.0, |
| "completions/min_length": 330.0, |
| "completions/min_terminated_length": 330.0, |
| "epoch": 0.13942857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27713218331336975, |
| "learning_rate": 9.458418577899774e-07, |
| "loss": -0.0, |
| "num_tokens": 15013624.0, |
| "reward": -0.1387348771095276, |
| "reward_std": 0.25947195291519165, |
| "rewards/cosine_scaled_reward/mean": -0.1387348771095276, |
| "rewards/cosine_scaled_reward/std": 0.3304338753223419, |
| "step": 122 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.546875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2043.0, |
| "completions/mean_length": 1576.28125, |
| "completions/mean_terminated_length": 1006.9655151367188, |
| "completions/min_length": 502.0, |
| "completions/min_terminated_length": 502.0, |
| "epoch": 0.14057142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2664856016635895, |
| "learning_rate": 9.443380060197385e-07, |
| "loss": 0.0, |
| "num_tokens": 15124738.0, |
| "reward": -0.18317654728889465, |
| "reward_std": 0.16592136025428772, |
| "rewards/cosine_scaled_reward/mean": -0.18317654728889465, |
| "rewards/cosine_scaled_reward/std": 0.33475980162620544, |
| "step": 123 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.4375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1814.0, |
| "completions/mean_length": 1395.78125, |
| "completions/mean_terminated_length": 888.5, |
| "completions/min_length": 455.0, |
| "completions/min_terminated_length": 455.0, |
| "epoch": 0.1417142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2889535427093506, |
| "learning_rate": 9.428149347714143e-07, |
| "loss": 0.0, |
| "num_tokens": 15225020.0, |
| "reward": -0.12295320630073547, |
| "reward_std": 0.30637824535369873, |
| "rewards/cosine_scaled_reward/mean": -0.12295320630073547, |
| "rewards/cosine_scaled_reward/std": 0.4125574827194214, |
| "step": 124 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2027.0, |
| "completions/mean_length": 1622.8125, |
| "completions/mean_terminated_length": 914.1666870117188, |
| "completions/min_length": 379.0, |
| "completions/min_terminated_length": 379.0, |
| "epoch": 0.14285714285714285, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24003510177135468, |
| "learning_rate": 9.412727182773486e-07, |
| "loss": -0.0, |
| "num_tokens": 15339808.0, |
| "reward": -0.06917156279087067, |
| "reward_std": 0.19467812776565552, |
| "rewards/cosine_scaled_reward/mean": -0.06917153298854828, |
| "rewards/cosine_scaled_reward/std": 0.44139373302459717, |
| "step": 125 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.671875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2043.0, |
| "completions/mean_length": 1735.96875, |
| "completions/mean_terminated_length": 1097.047607421875, |
| "completions/min_length": 610.0, |
| "completions/min_terminated_length": 610.0, |
| "epoch": 0.144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23693455755710602, |
| "learning_rate": 9.397114317029974e-07, |
| "loss": -0.0, |
| "num_tokens": 15462206.0, |
| "reward": -0.15823431313037872, |
| "reward_std": 0.26196378469467163, |
| "rewards/cosine_scaled_reward/mean": -0.15823431313037872, |
| "rewards/cosine_scaled_reward/std": 0.3110467195510864, |
| "step": 126 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.640625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1990.0, |
| "completions/mean_length": 1729.421875, |
| "completions/mean_terminated_length": 1161.521728515625, |
| "completions/min_length": 655.0, |
| "completions/min_terminated_length": 655.0, |
| "epoch": 0.14514285714285713, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23715488612651825, |
| "learning_rate": 9.381311511432658e-07, |
| "loss": -0.0, |
| "num_tokens": 15583985.0, |
| "reward": -0.2520313262939453, |
| "reward_std": 0.1912405639886856, |
| "rewards/cosine_scaled_reward/mean": -0.2520313262939453, |
| "rewards/cosine_scaled_reward/std": 0.276276558637619, |
| "step": 127 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.546875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2025.0, |
| "completions/mean_length": 1614.125, |
| "completions/mean_terminated_length": 1090.4827880859375, |
| "completions/min_length": 392.0, |
| "completions/min_terminated_length": 392.0, |
| "epoch": 0.1462857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25245338678359985, |
| "learning_rate": 9.36531953618799e-07, |
| "loss": -0.0, |
| "num_tokens": 15697641.0, |
| "reward": 0.029929369688034058, |
| "reward_std": 0.2960119843482971, |
| "rewards/cosine_scaled_reward/mean": 0.029929369688034058, |
| "rewards/cosine_scaled_reward/std": 0.40772902965545654, |
| "step": 128 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.859375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1810.0, |
| "completions/mean_length": 1892.953125, |
| "completions/mean_terminated_length": 945.4444580078125, |
| "completions/min_length": 490.0, |
| "completions/min_terminated_length": 490.0, |
| "epoch": 0.14742857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22934643924236298, |
| "learning_rate": 9.34913917072228e-07, |
| "loss": -0.0, |
| "num_tokens": 15829494.0, |
| "reward": -0.27538371086120605, |
| "reward_std": 0.2161153256893158, |
| "rewards/cosine_scaled_reward/mean": -0.27538371086120605, |
| "rewards/cosine_scaled_reward/std": 0.25140947103500366, |
| "step": 129 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.640625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1384.0, |
| "completions/mean_length": 1631.5625, |
| "completions/mean_terminated_length": 889.2174072265625, |
| "completions/min_length": 545.0, |
| "completions/min_terminated_length": 545.0, |
| "epoch": 0.14857142857142858, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2312338650226593, |
| "learning_rate": 9.332771203643714e-07, |
| "loss": -0.0, |
| "num_tokens": 15944418.0, |
| "reward": -0.16326984763145447, |
| "reward_std": 0.22974258661270142, |
| "rewards/cosine_scaled_reward/mean": -0.16326983273029327, |
| "rewards/cosine_scaled_reward/std": 0.3127349317073822, |
| "step": 130 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.59375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1709.0, |
| "completions/mean_length": 1549.453125, |
| "completions/mean_terminated_length": 820.8077392578125, |
| "completions/min_length": 280.0, |
| "completions/min_terminated_length": 280.0, |
| "epoch": 0.14971428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.28737154603004456, |
| "learning_rate": 9.316216432703916e-07, |
| "loss": 0.0, |
| "num_tokens": 16053319.0, |
| "reward": -0.060378547757864, |
| "reward_std": 0.23251818120479584, |
| "rewards/cosine_scaled_reward/mean": -0.060378558933734894, |
| "rewards/cosine_scaled_reward/std": 0.4743967354297638, |
| "step": 131 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.53125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1667.0, |
| "completions/mean_length": 1536.859375, |
| "completions/mean_terminated_length": 957.5667114257812, |
| "completions/min_length": 370.0, |
| "completions/min_terminated_length": 370.0, |
| "epoch": 0.15085714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24873872101306915, |
| "learning_rate": 9.299475664759068e-07, |
| "loss": 0.0, |
| "num_tokens": 16162742.0, |
| "reward": -0.10933490097522736, |
| "reward_std": 0.2869688868522644, |
| "rewards/cosine_scaled_reward/mean": -0.10933491587638855, |
| "rewards/cosine_scaled_reward/std": 0.45436573028564453, |
| "step": 132 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1964.0, |
| "completions/mean_length": 1817.453125, |
| "completions/mean_terminated_length": 1125.8125, |
| "completions/min_length": 526.0, |
| "completions/min_terminated_length": 526.0, |
| "epoch": 0.152, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2753625810146332, |
| "learning_rate": 9.282549715730579e-07, |
| "loss": 0.0, |
| "num_tokens": 16290283.0, |
| "reward": -0.1931842416524887, |
| "reward_std": 0.2315790057182312, |
| "rewards/cosine_scaled_reward/mean": -0.1931842565536499, |
| "rewards/cosine_scaled_reward/std": 0.26366862654685974, |
| "step": 133 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.609375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1907.0, |
| "completions/mean_length": 1685.390625, |
| "completions/mean_terminated_length": 1119.719970703125, |
| "completions/min_length": 660.0, |
| "completions/min_terminated_length": 660.0, |
| "epoch": 0.15314285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25077056884765625, |
| "learning_rate": 9.265439410565328e-07, |
| "loss": -0.0, |
| "num_tokens": 16408716.0, |
| "reward": -0.1305551677942276, |
| "reward_std": 0.15626969933509827, |
| "rewards/cosine_scaled_reward/mean": -0.1305551677942276, |
| "rewards/cosine_scaled_reward/std": 0.35703787207603455, |
| "step": 134 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.328125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1869.0, |
| "completions/mean_length": 1111.578125, |
| "completions/mean_terminated_length": 654.2557983398438, |
| "completions/min_length": 259.0, |
| "completions/min_terminated_length": 259.0, |
| "epoch": 0.15428571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3456169366836548, |
| "learning_rate": 9.248145583195447e-07, |
| "loss": -0.0, |
| "num_tokens": 16490329.0, |
| "reward": 0.08614158630371094, |
| "reward_std": 0.3152117133140564, |
| "rewards/cosine_scaled_reward/mean": 0.08614158630371094, |
| "rewards/cosine_scaled_reward/std": 0.5073397159576416, |
| "step": 135 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.53125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1600.0, |
| "completions/mean_length": 1485.703125, |
| "completions/mean_terminated_length": 848.433349609375, |
| "completions/min_length": 401.0, |
| "completions/min_terminated_length": 401.0, |
| "epoch": 0.15542857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.28029024600982666, |
| "learning_rate": 9.230669076497687e-07, |
| "loss": -0.0, |
| "num_tokens": 16596086.0, |
| "reward": 0.01799224689602852, |
| "reward_std": 0.28087177872657776, |
| "rewards/cosine_scaled_reward/mean": 0.017992250621318817, |
| "rewards/cosine_scaled_reward/std": 0.5039587020874023, |
| "step": 136 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1632.0, |
| "completions/mean_length": 1765.421875, |
| "completions/mean_terminated_length": 1043.27783203125, |
| "completions/min_length": 659.0, |
| "completions/min_terminated_length": 659.0, |
| "epoch": 0.15657142857142858, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.21782204508781433, |
| "learning_rate": 9.213010742252327e-07, |
| "loss": 0.0, |
| "num_tokens": 16719681.0, |
| "reward": -0.2635670304298401, |
| "reward_std": 0.16446365416049957, |
| "rewards/cosine_scaled_reward/mean": -0.2635670304298401, |
| "rewards/cosine_scaled_reward/std": 0.1840340793132782, |
| "step": 137 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.46875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1888.0, |
| "completions/mean_length": 1529.734375, |
| "completions/mean_terminated_length": 1072.441162109375, |
| "completions/min_length": 362.0, |
| "completions/min_terminated_length": 362.0, |
| "epoch": 0.15771428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26588714122772217, |
| "learning_rate": 9.195171441101668e-07, |
| "loss": -0.0, |
| "num_tokens": 16828896.0, |
| "reward": -0.08665560930967331, |
| "reward_std": 0.23063711822032928, |
| "rewards/cosine_scaled_reward/mean": -0.08665560930967331, |
| "rewards/cosine_scaled_reward/std": 0.44113171100616455, |
| "step": 138 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.640625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2036.0, |
| "completions/mean_length": 1667.8125, |
| "completions/mean_terminated_length": 990.0869750976562, |
| "completions/min_length": 306.0, |
| "completions/min_terminated_length": 306.0, |
| "epoch": 0.15885714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2917172610759735, |
| "learning_rate": 9.177152042508077e-07, |
| "loss": -0.0, |
| "num_tokens": 16946276.0, |
| "reward": -0.19403964281082153, |
| "reward_std": 0.2673150300979614, |
| "rewards/cosine_scaled_reward/mean": -0.19403962790966034, |
| "rewards/cosine_scaled_reward/std": 0.32773110270500183, |
| "step": 139 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.421875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1934.0, |
| "completions/mean_length": 1412.796875, |
| "completions/mean_terminated_length": 949.270263671875, |
| "completions/min_length": 384.0, |
| "completions/min_terminated_length": 384.0, |
| "epoch": 0.16, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 0.28324976563453674, |
| "learning_rate": 9.158953424711624e-07, |
| "loss": -0.0, |
| "num_tokens": 17046919.0, |
| "reward": -0.13130062818527222, |
| "reward_std": 0.13907812535762787, |
| "rewards/cosine_scaled_reward/mean": -0.13130061328411102, |
| "rewards/cosine_scaled_reward/std": 0.46400320529937744, |
| "step": 140 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.328125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1615.0, |
| "completions/mean_length": 1272.25, |
| "completions/mean_terminated_length": 893.3953247070312, |
| "completions/min_length": 518.0, |
| "completions/min_terminated_length": 518.0, |
| "epoch": 0.16114285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.28660058975219727, |
| "learning_rate": 9.140576474687263e-07, |
| "loss": 0.0, |
| "num_tokens": 17138903.0, |
| "reward": -0.044462256133556366, |
| "reward_std": 0.3412697911262512, |
| "rewards/cosine_scaled_reward/mean": -0.04446224868297577, |
| "rewards/cosine_scaled_reward/std": 0.4661441445350647, |
| "step": 141 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.53125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1959.0, |
| "completions/mean_length": 1662.734375, |
| "completions/mean_terminated_length": 1226.10009765625, |
| "completions/min_length": 344.0, |
| "completions/min_terminated_length": 344.0, |
| "epoch": 0.16228571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3415294587612152, |
| "learning_rate": 9.122022088101613e-07, |
| "loss": -0.0, |
| "num_tokens": 17255822.0, |
| "reward": -0.15457069873809814, |
| "reward_std": 0.31260305643081665, |
| "rewards/cosine_scaled_reward/mean": -0.15457069873809814, |
| "rewards/cosine_scaled_reward/std": 0.3450033664703369, |
| "step": 142 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.421875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1791.0, |
| "completions/mean_length": 1441.203125, |
| "completions/mean_terminated_length": 998.4054565429688, |
| "completions/min_length": 462.0, |
| "completions/min_terminated_length": 462.0, |
| "epoch": 0.16342857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2900330424308777, |
| "learning_rate": 9.103291169269299e-07, |
| "loss": 0.0, |
| "num_tokens": 17358875.0, |
| "reward": -0.1936979442834854, |
| "reward_std": 0.26940327882766724, |
| "rewards/cosine_scaled_reward/mean": -0.1936979442834854, |
| "rewards/cosine_scaled_reward/std": 0.31407564878463745, |
| "step": 143 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.53125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2043.0, |
| "completions/mean_length": 1560.703125, |
| "completions/mean_terminated_length": 1008.4334106445312, |
| "completions/min_length": 318.0, |
| "completions/min_terminated_length": 318.0, |
| "epoch": 0.16457142857142856, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.29284507036209106, |
| "learning_rate": 9.084384631108882e-07, |
| "loss": -0.0, |
| "num_tokens": 17470248.0, |
| "reward": -0.14136260747909546, |
| "reward_std": 0.2985552251338959, |
| "rewards/cosine_scaled_reward/mean": -0.14136262238025665, |
| "rewards/cosine_scaled_reward/std": 0.4261241853237152, |
| "step": 144 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1864.0, |
| "completions/mean_length": 1226.0, |
| "completions/mean_terminated_length": 852.3636474609375, |
| "completions/min_length": 316.0, |
| "completions/min_terminated_length": 316.0, |
| "epoch": 0.1657142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.30853384733200073, |
| "learning_rate": 9.065303395098358e-07, |
| "loss": -0.0, |
| "num_tokens": 17558656.0, |
| "reward": -0.011180020868778229, |
| "reward_std": 0.3104313910007477, |
| "rewards/cosine_scaled_reward/mean": -0.011180016212165356, |
| "rewards/cosine_scaled_reward/std": 0.502927303314209, |
| "step": 145 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.5, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1440.0, |
| "completions/mean_length": 1468.8125, |
| "completions/mean_terminated_length": 889.625, |
| "completions/min_length": 537.0, |
| "completions/min_terminated_length": 537.0, |
| "epoch": 0.16685714285714287, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25645971298217773, |
| "learning_rate": 9.046048391230247e-07, |
| "loss": -0.0, |
| "num_tokens": 17663276.0, |
| "reward": -0.1956520974636078, |
| "reward_std": 0.24750414490699768, |
| "rewards/cosine_scaled_reward/mean": -0.1956520974636078, |
| "rewards/cosine_scaled_reward/std": 0.30754002928733826, |
| "step": 146 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.453125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2011.0, |
| "completions/mean_length": 1517.84375, |
| "completions/mean_terminated_length": 1078.5714111328125, |
| "completions/min_length": 615.0, |
| "completions/min_terminated_length": 615.0, |
| "epoch": 0.168, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.28331542015075684, |
| "learning_rate": 9.026620557966279e-07, |
| "loss": -0.0, |
| "num_tokens": 17771202.0, |
| "reward": -0.14546620845794678, |
| "reward_std": 0.307411253452301, |
| "rewards/cosine_scaled_reward/mean": -0.14546619355678558, |
| "rewards/cosine_scaled_reward/std": 0.3964070975780487, |
| "step": 147 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1735.0, |
| "completions/mean_length": 1319.75, |
| "completions/mean_terminated_length": 882.7999877929688, |
| "completions/min_length": 412.0, |
| "completions/min_terminated_length": 412.0, |
| "epoch": 0.16914285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24973155558109283, |
| "learning_rate": 9.007020842191634e-07, |
| "loss": -0.0, |
| "num_tokens": 17866850.0, |
| "reward": -0.05917578190565109, |
| "reward_std": 0.24221420288085938, |
| "rewards/cosine_scaled_reward/mean": -0.05917578190565109, |
| "rewards/cosine_scaled_reward/std": 0.39783161878585815, |
| "step": 148 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.609375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1718.0, |
| "completions/mean_length": 1641.578125, |
| "completions/mean_terminated_length": 1007.5599975585938, |
| "completions/min_length": 624.0, |
| "completions/min_terminated_length": 624.0, |
| "epoch": 0.1702857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23923377692699432, |
| "learning_rate": 8.987250199168808e-07, |
| "loss": 0.0, |
| "num_tokens": 17983807.0, |
| "reward": -0.16958971321582794, |
| "reward_std": 0.3115168809890747, |
| "rewards/cosine_scaled_reward/mean": -0.16958969831466675, |
| "rewards/cosine_scaled_reward/std": 0.4009650945663452, |
| "step": 149 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.296875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1699.0, |
| "completions/mean_length": 1294.734375, |
| "completions/mean_terminated_length": 976.6889038085938, |
| "completions/min_length": 359.0, |
| "completions/min_terminated_length": 359.0, |
| "epoch": 0.17142857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2846779525279999, |
| "learning_rate": 8.967309592491052e-07, |
| "loss": 0.0, |
| "num_tokens": 18077174.0, |
| "reward": -0.16757264733314514, |
| "reward_std": 0.26536184549331665, |
| "rewards/cosine_scaled_reward/mean": -0.16757264733314514, |
| "rewards/cosine_scaled_reward/std": 0.32911255955696106, |
| "step": 150 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.4375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1840.0, |
| "completions/mean_length": 1461.640625, |
| "completions/mean_terminated_length": 1005.5833129882812, |
| "completions/min_length": 446.0, |
| "completions/min_terminated_length": 446.0, |
| "epoch": 0.17257142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27918684482574463, |
| "learning_rate": 8.9471999940354e-07, |
| "loss": 0.0, |
| "num_tokens": 18181399.0, |
| "reward": -0.04434409737586975, |
| "reward_std": 0.21946659684181213, |
| "rewards/cosine_scaled_reward/mean": -0.04434409365057945, |
| "rewards/cosine_scaled_reward/std": 0.385776162147522, |
| "step": 151 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.671875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1827.0, |
| "completions/mean_length": 1685.984375, |
| "completions/mean_terminated_length": 944.7142944335938, |
| "completions/min_length": 191.0, |
| "completions/min_terminated_length": 191.0, |
| "epoch": 0.1737142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.31312429904937744, |
| "learning_rate": 8.926922383915315e-07, |
| "loss": -0.0, |
| "num_tokens": 18299966.0, |
| "reward": -0.16299618780612946, |
| "reward_std": 0.2579989731311798, |
| "rewards/cosine_scaled_reward/mean": -0.16299618780612946, |
| "rewards/cosine_scaled_reward/std": 0.2968141436576843, |
| "step": 152 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.484375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1977.0, |
| "completions/mean_length": 1507.171875, |
| "completions/mean_terminated_length": 999.1212158203125, |
| "completions/min_length": 365.0, |
| "completions/min_terminated_length": 365.0, |
| "epoch": 0.17485714285714285, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27001288533210754, |
| "learning_rate": 8.906477750432903e-07, |
| "loss": -0.0, |
| "num_tokens": 18407569.0, |
| "reward": -0.2650793790817261, |
| "reward_std": 0.2175406664609909, |
| "rewards/cosine_scaled_reward/mean": -0.2650793790817261, |
| "rewards/cosine_scaled_reward/std": 0.2671082317829132, |
| "step": 153 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.59375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1925.0, |
| "completions/mean_length": 1655.453125, |
| "completions/mean_terminated_length": 1081.7308349609375, |
| "completions/min_length": 337.0, |
| "completions/min_terminated_length": 337.0, |
| "epoch": 0.176, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23643171787261963, |
| "learning_rate": 8.88586709003076e-07, |
| "loss": -0.0, |
| "num_tokens": 18524582.0, |
| "reward": -0.1807454228401184, |
| "reward_std": 0.28304773569107056, |
| "rewards/cosine_scaled_reward/mean": -0.1807454228401184, |
| "rewards/cosine_scaled_reward/std": 0.35738474130630493, |
| "step": 154 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.421875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1884.0, |
| "completions/mean_length": 1342.046875, |
| "completions/mean_terminated_length": 826.8919067382812, |
| "completions/min_length": 55.0, |
| "completions/min_terminated_length": 55.0, |
| "epoch": 0.17714285714285713, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3501509726047516, |
| "learning_rate": 8.865091407243394e-07, |
| "loss": 0.0, |
| "num_tokens": 18621097.0, |
| "reward": -0.0294140987098217, |
| "reward_std": 0.1941235363483429, |
| "rewards/cosine_scaled_reward/mean": -0.029414094984531403, |
| "rewards/cosine_scaled_reward/std": 0.41702020168304443, |
| "step": 155 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.59375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1956.0, |
| "completions/mean_length": 1586.40625, |
| "completions/mean_terminated_length": 911.769287109375, |
| "completions/min_length": 321.0, |
| "completions/min_terminated_length": 321.0, |
| "epoch": 0.1782857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24873077869415283, |
| "learning_rate": 8.844151714648274e-07, |
| "loss": -0.0, |
| "num_tokens": 18732731.0, |
| "reward": -0.09675467014312744, |
| "reward_std": 0.2634894847869873, |
| "rewards/cosine_scaled_reward/mean": -0.09675467014312744, |
| "rewards/cosine_scaled_reward/std": 0.42875486612319946, |
| "step": 156 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2015.0, |
| "completions/mean_length": 1426.203125, |
| "completions/mean_terminated_length": 1053.125, |
| "completions/min_length": 493.0, |
| "completions/min_terminated_length": 493.0, |
| "epoch": 0.17942857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.28960639238357544, |
| "learning_rate": 8.823049032816478e-07, |
| "loss": -0.0, |
| "num_tokens": 18833968.0, |
| "reward": -0.19702841341495514, |
| "reward_std": 0.2148652821779251, |
| "rewards/cosine_scaled_reward/mean": -0.19702842831611633, |
| "rewards/cosine_scaled_reward/std": 0.2610262334346771, |
| "step": 157 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.28125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1971.0, |
| "completions/mean_length": 1293.140625, |
| "completions/mean_terminated_length": 997.7608642578125, |
| "completions/min_length": 190.0, |
| "completions/min_terminated_length": 190.0, |
| "epoch": 0.18057142857142858, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 0.24434594810009003, |
| "learning_rate": 8.801784390262943e-07, |
| "loss": -0.0, |
| "num_tokens": 18926849.0, |
| "reward": 0.03873754292726517, |
| "reward_std": 0.23464180529117584, |
| "rewards/cosine_scaled_reward/mean": 0.03873754292726517, |
| "rewards/cosine_scaled_reward/std": 0.5250495076179504, |
| "step": 158 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.46875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2001.0, |
| "completions/mean_length": 1565.171875, |
| "completions/mean_terminated_length": 1139.1470947265625, |
| "completions/min_length": 571.0, |
| "completions/min_terminated_length": 571.0, |
| "epoch": 0.18171428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22470001876354218, |
| "learning_rate": 8.780358823396352e-07, |
| "loss": 0.0, |
| "num_tokens": 19038700.0, |
| "reward": -0.202285498380661, |
| "reward_std": 0.20965763926506042, |
| "rewards/cosine_scaled_reward/mean": -0.202285498380661, |
| "rewards/cosine_scaled_reward/std": 0.3204317092895508, |
| "step": 159 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.390625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2042.0, |
| "completions/mean_length": 1400.296875, |
| "completions/mean_terminated_length": 985.1026000976562, |
| "completions/min_length": 466.0, |
| "completions/min_terminated_length": 466.0, |
| "epoch": 0.18285714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2976718246936798, |
| "learning_rate": 8.758773376468604e-07, |
| "loss": -0.0, |
| "num_tokens": 19139903.0, |
| "reward": 0.020067960023880005, |
| "reward_std": 0.4074331223964691, |
| "rewards/cosine_scaled_reward/mean": 0.020067960023880005, |
| "rewards/cosine_scaled_reward/std": 0.5162546038627625, |
| "step": 160 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1644.0, |
| "completions/mean_length": 1382.25, |
| "completions/mean_terminated_length": 982.7999877929688, |
| "completions/min_length": 392.0, |
| "completions/min_terminated_length": 392.0, |
| "epoch": 0.184, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27535930275917053, |
| "learning_rate": 8.737029101523929e-07, |
| "loss": -0.0, |
| "num_tokens": 19238359.0, |
| "reward": -0.2095160186290741, |
| "reward_std": 0.20490920543670654, |
| "rewards/cosine_scaled_reward/mean": -0.2095160186290741, |
| "rewards/cosine_scaled_reward/std": 0.22322162985801697, |
| "step": 161 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.296875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1960.0, |
| "completions/mean_length": 1301.0, |
| "completions/mean_terminated_length": 985.6000366210938, |
| "completions/min_length": 438.0, |
| "completions/min_terminated_length": 438.0, |
| "epoch": 0.18514285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3560119867324829, |
| "learning_rate": 8.715127058347614e-07, |
| "loss": -0.0, |
| "num_tokens": 19331927.0, |
| "reward": -0.23389019072055817, |
| "reward_std": 0.2546258866786957, |
| "rewards/cosine_scaled_reward/mean": -0.23389017581939697, |
| "rewards/cosine_scaled_reward/std": 0.28031107783317566, |
| "step": 162 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.40625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1674.0, |
| "completions/mean_length": 1394.09375, |
| "completions/mean_terminated_length": 946.6842041015625, |
| "completions/min_length": 392.0, |
| "completions/min_terminated_length": 392.0, |
| "epoch": 0.18628571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3067707121372223, |
| "learning_rate": 8.693068314414344e-07, |
| "loss": 0.0, |
| "num_tokens": 19432333.0, |
| "reward": 0.008387047797441483, |
| "reward_std": 0.2966369390487671, |
| "rewards/cosine_scaled_reward/mean": 0.008387047797441483, |
| "rewards/cosine_scaled_reward/std": 0.47443318367004395, |
| "step": 163 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.359375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1863.0, |
| "completions/mean_length": 1301.03125, |
| "completions/mean_terminated_length": 881.9999389648438, |
| "completions/min_length": 271.0, |
| "completions/min_terminated_length": 271.0, |
| "epoch": 0.18742857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26724106073379517, |
| "learning_rate": 8.670853944836176e-07, |
| "loss": -0.0, |
| "num_tokens": 19526127.0, |
| "reward": 0.015163253992795944, |
| "reward_std": 0.2171541154384613, |
| "rewards/cosine_scaled_reward/mean": 0.015163261443376541, |
| "rewards/cosine_scaled_reward/std": 0.43332821130752563, |
| "step": 164 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.296875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1984.0, |
| "completions/mean_length": 1290.8125, |
| "completions/mean_terminated_length": 971.1111450195312, |
| "completions/min_length": 435.0, |
| "completions/min_terminated_length": 435.0, |
| "epoch": 0.18857142857142858, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2774181663990021, |
| "learning_rate": 8.648485032310144e-07, |
| "loss": 0.0, |
| "num_tokens": 19620155.0, |
| "reward": -0.07460268586874008, |
| "reward_std": 0.25969409942626953, |
| "rewards/cosine_scaled_reward/mean": -0.07460269331932068, |
| "rewards/cosine_scaled_reward/std": 0.391157329082489, |
| "step": 165 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.390625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1998.0, |
| "completions/mean_length": 1456.3125, |
| "completions/mean_terminated_length": 1077.025634765625, |
| "completions/min_length": 560.0, |
| "completions/min_terminated_length": 560.0, |
| "epoch": 0.18971428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23980101943016052, |
| "learning_rate": 8.625962667065487e-07, |
| "loss": -0.0, |
| "num_tokens": 19724935.0, |
| "reward": -0.10265599191188812, |
| "reward_std": 0.3349866271018982, |
| "rewards/cosine_scaled_reward/mean": -0.10265599191188812, |
| "rewards/cosine_scaled_reward/std": 0.4455646872520447, |
| "step": 166 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.265625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1920.0, |
| "completions/mean_length": 1240.96875, |
| "completions/mean_terminated_length": 949.0637817382812, |
| "completions/min_length": 546.0, |
| "completions/min_terminated_length": 546.0, |
| "epoch": 0.19085714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27142858505249023, |
| "learning_rate": 8.603287946810513e-07, |
| "loss": 0.0, |
| "num_tokens": 19815901.0, |
| "reward": -0.11890637874603271, |
| "reward_std": 0.26525112986564636, |
| "rewards/cosine_scaled_reward/mean": -0.11890637129545212, |
| "rewards/cosine_scaled_reward/std": 0.3307341933250427, |
| "step": 167 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.359375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1862.0, |
| "completions/mean_length": 1368.515625, |
| "completions/mean_terminated_length": 987.3414306640625, |
| "completions/min_length": 540.0, |
| "completions/min_terminated_length": 540.0, |
| "epoch": 0.192, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23390193283557892, |
| "learning_rate": 8.580461976679099e-07, |
| "loss": 0.0, |
| "num_tokens": 19914326.0, |
| "reward": -0.08119502663612366, |
| "reward_std": 0.21067029237747192, |
| "rewards/cosine_scaled_reward/mean": -0.08119503408670425, |
| "rewards/cosine_scaled_reward/std": 0.3641049563884735, |
| "step": 168 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2037.0, |
| "completions/mean_length": 1400.109375, |
| "completions/mean_terminated_length": 1105.6136474609375, |
| "completions/min_length": 389.0, |
| "completions/min_terminated_length": 389.0, |
| "epoch": 0.19314285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25923916697502136, |
| "learning_rate": 8.557485869176825e-07, |
| "loss": -0.0, |
| "num_tokens": 20014557.0, |
| "reward": 0.2310131937265396, |
| "reward_std": 0.44008710980415344, |
| "rewards/cosine_scaled_reward/mean": 0.2310132086277008, |
| "rewards/cosine_scaled_reward/std": 0.5884551405906677, |
| "step": 169 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1816.0, |
| "completions/mean_length": 1230.984375, |
| "completions/mean_terminated_length": 859.6136474609375, |
| "completions/min_length": 359.0, |
| "completions/min_terminated_length": 359.0, |
| "epoch": 0.19428571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.297661691904068, |
| "learning_rate": 8.534360744126753e-07, |
| "loss": -0.0, |
| "num_tokens": 20103124.0, |
| "reward": -0.02752646803855896, |
| "reward_std": 0.2112906128168106, |
| "rewards/cosine_scaled_reward/mean": -0.027526460587978363, |
| "rewards/cosine_scaled_reward/std": 0.4330926835536957, |
| "step": 170 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.359375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2044.0, |
| "completions/mean_length": 1330.734375, |
| "completions/mean_terminated_length": 928.3658447265625, |
| "completions/min_length": 440.0, |
| "completions/min_terminated_length": 440.0, |
| "epoch": 0.19542857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27631711959838867, |
| "learning_rate": 8.511087728614862e-07, |
| "loss": -0.0, |
| "num_tokens": 20198683.0, |
| "reward": -0.02587110549211502, |
| "reward_std": 0.3332647681236267, |
| "rewards/cosine_scaled_reward/mean": -0.025871101766824722, |
| "rewards/cosine_scaled_reward/std": 0.4695811867713928, |
| "step": 171 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.359375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1626.0, |
| "completions/mean_length": 1300.171875, |
| "completions/mean_terminated_length": 880.6585083007812, |
| "completions/min_length": 268.0, |
| "completions/min_terminated_length": 268.0, |
| "epoch": 0.19657142857142856, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3011989891529083, |
| "learning_rate": 8.487667956935087e-07, |
| "loss": -0.0, |
| "num_tokens": 20292510.0, |
| "reward": 0.17403244972229004, |
| "reward_std": 0.23184293508529663, |
| "rewards/cosine_scaled_reward/mean": 0.17403244972229004, |
| "rewards/cosine_scaled_reward/std": 0.46001583337783813, |
| "step": 172 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.203125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2043.0, |
| "completions/mean_length": 1064.515625, |
| "completions/mean_terminated_length": 813.8235473632812, |
| "completions/min_length": 235.0, |
| "completions/min_terminated_length": 235.0, |
| "epoch": 0.1977142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3317919075489044, |
| "learning_rate": 8.464102570534061e-07, |
| "loss": 0.0, |
| "num_tokens": 20371951.0, |
| "reward": -0.14008744060993195, |
| "reward_std": 0.23045390844345093, |
| "rewards/cosine_scaled_reward/mean": -0.14008745551109314, |
| "rewards/cosine_scaled_reward/std": 0.327737033367157, |
| "step": 173 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.078125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1867.0, |
| "completions/mean_length": 1076.65625, |
| "completions/mean_terminated_length": 994.3389892578125, |
| "completions/min_length": 449.0, |
| "completions/min_terminated_length": 449.0, |
| "epoch": 0.19885714285714284, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2918402850627899, |
| "learning_rate": 8.440392717955475e-07, |
| "loss": -0.0, |
| "num_tokens": 20451193.0, |
| "reward": -0.020191974937915802, |
| "reward_std": 0.3699801564216614, |
| "rewards/cosine_scaled_reward/mean": -0.020191967487335205, |
| "rewards/cosine_scaled_reward/std": 0.4890177845954895, |
| "step": 174 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.328125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1883.0, |
| "completions/mean_length": 1360.859375, |
| "completions/mean_terminated_length": 1025.279052734375, |
| "completions/min_length": 454.0, |
| "completions/min_terminated_length": 454.0, |
| "epoch": 0.2, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24612702429294586, |
| "learning_rate": 8.416539554784089e-07, |
| "loss": -0.0, |
| "num_tokens": 20549112.0, |
| "reward": -0.07502052187919617, |
| "reward_std": 0.23629868030548096, |
| "rewards/cosine_scaled_reward/mean": -0.07502052187919617, |
| "rewards/cosine_scaled_reward/std": 0.4632040560245514, |
| "step": 175 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.390625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1966.0, |
| "completions/mean_length": 1351.046875, |
| "completions/mean_terminated_length": 904.2820434570312, |
| "completions/min_length": 305.0, |
| "completions/min_terminated_length": 305.0, |
| "epoch": 0.20114285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2809349298477173, |
| "learning_rate": 8.392544243589427e-07, |
| "loss": 0.0, |
| "num_tokens": 20646747.0, |
| "reward": -0.09139305353164673, |
| "reward_std": 0.3010050654411316, |
| "rewards/cosine_scaled_reward/mean": -0.09139305353164673, |
| "rewards/cosine_scaled_reward/std": 0.3958495557308197, |
| "step": 176 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.25, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1885.0, |
| "completions/mean_length": 1245.578125, |
| "completions/mean_terminated_length": 978.1041870117188, |
| "completions/min_length": 366.0, |
| "completions/min_terminated_length": 366.0, |
| "epoch": 0.2022857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3317105770111084, |
| "learning_rate": 8.368407953869103e-07, |
| "loss": 0.0, |
| "num_tokens": 20736688.0, |
| "reward": -0.0028449445962905884, |
| "reward_std": 0.3299737870693207, |
| "rewards/cosine_scaled_reward/mean": -0.0028449594974517822, |
| "rewards/cosine_scaled_reward/std": 0.505253255367279, |
| "step": 177 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.21875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1975.0, |
| "completions/mean_length": 1192.234375, |
| "completions/mean_terminated_length": 952.6199951171875, |
| "completions/min_length": 414.0, |
| "completions/min_terminated_length": 414.0, |
| "epoch": 0.20342857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2933591306209564, |
| "learning_rate": 8.344131861991828e-07, |
| "loss": 0.0, |
| "num_tokens": 20824527.0, |
| "reward": -0.06664696335792542, |
| "reward_std": 0.29735952615737915, |
| "rewards/cosine_scaled_reward/mean": -0.06664696335792542, |
| "rewards/cosine_scaled_reward/std": 0.41459333896636963, |
| "step": 178 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.515625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1992.0, |
| "completions/mean_length": 1546.0625, |
| "completions/mean_terminated_length": 1011.7418823242188, |
| "completions/min_length": 637.0, |
| "completions/min_terminated_length": 637.0, |
| "epoch": 0.20457142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2591443657875061, |
| "learning_rate": 8.319717151140072e-07, |
| "loss": -0.0, |
| "num_tokens": 20934307.0, |
| "reward": -0.18733876943588257, |
| "reward_std": 0.29792603850364685, |
| "rewards/cosine_scaled_reward/mean": -0.18733876943588257, |
| "rewards/cosine_scaled_reward/std": 0.33306172490119934, |
| "step": 179 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.234375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2009.0, |
| "completions/mean_length": 1200.609375, |
| "completions/mean_terminated_length": 941.2040405273438, |
| "completions/min_length": 307.0, |
| "completions/min_terminated_length": 307.0, |
| "epoch": 0.2057142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3007314205169678, |
| "learning_rate": 8.295165011252396e-07, |
| "loss": 0.0, |
| "num_tokens": 21022322.0, |
| "reward": 0.16183573007583618, |
| "reward_std": 0.3202260136604309, |
| "rewards/cosine_scaled_reward/mean": 0.16183573007583618, |
| "rewards/cosine_scaled_reward/std": 0.4895489513874054, |
| "step": 180 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.421875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2005.0, |
| "completions/mean_length": 1438.96875, |
| "completions/mean_terminated_length": 994.5405883789062, |
| "completions/min_length": 445.0, |
| "completions/min_terminated_length": 445.0, |
| "epoch": 0.20685714285714285, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26270556449890137, |
| "learning_rate": 8.270476638965461e-07, |
| "loss": -0.0, |
| "num_tokens": 21125888.0, |
| "reward": -0.09954661130905151, |
| "reward_std": 0.19112557172775269, |
| "rewards/cosine_scaled_reward/mean": -0.09954659640789032, |
| "rewards/cosine_scaled_reward/std": 0.4616987109184265, |
| "step": 181 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.390625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1810.0, |
| "completions/mean_length": 1401.1875, |
| "completions/mean_terminated_length": 986.5640869140625, |
| "completions/min_length": 455.0, |
| "completions/min_terminated_length": 455.0, |
| "epoch": 0.208, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2312396615743637, |
| "learning_rate": 8.245653237555705e-07, |
| "loss": -0.0, |
| "num_tokens": 21225356.0, |
| "reward": -0.1191510483622551, |
| "reward_std": 0.2993764877319336, |
| "rewards/cosine_scaled_reward/mean": -0.1191510558128357, |
| "rewards/cosine_scaled_reward/std": 0.4002695679664612, |
| "step": 182 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.140625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1778.0, |
| "completions/mean_length": 1059.46875, |
| "completions/mean_terminated_length": 897.7090454101562, |
| "completions/min_length": 384.0, |
| "completions/min_terminated_length": 384.0, |
| "epoch": 0.20914285714285713, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.31593599915504456, |
| "learning_rate": 8.220696016880687e-07, |
| "loss": 0.0, |
| "num_tokens": 21303778.0, |
| "reward": 0.02695992961525917, |
| "reward_std": 0.33188390731811523, |
| "rewards/cosine_scaled_reward/mean": 0.026959922164678574, |
| "rewards/cosine_scaled_reward/std": 0.462587833404541, |
| "step": 183 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.421875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1725.0, |
| "completions/mean_length": 1409.90625, |
| "completions/mean_terminated_length": 944.270263671875, |
| "completions/min_length": 116.0, |
| "completions/min_terminated_length": 116.0, |
| "epoch": 0.2102857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4352613687515259, |
| "learning_rate": 8.195606193320136e-07, |
| "loss": 0.0, |
| "num_tokens": 21405364.0, |
| "reward": -0.1421402245759964, |
| "reward_std": 0.16645817458629608, |
| "rewards/cosine_scaled_reward/mean": -0.1421402394771576, |
| "rewards/cosine_scaled_reward/std": 0.33322539925575256, |
| "step": 184 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1992.0, |
| "completions/mean_length": 1153.75, |
| "completions/mean_terminated_length": 947.3846435546875, |
| "completions/min_length": 263.0, |
| "completions/min_terminated_length": 263.0, |
| "epoch": 0.21142857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.33058229088783264, |
| "learning_rate": 8.170384989716657e-07, |
| "loss": 0.0, |
| "num_tokens": 21489388.0, |
| "reward": -0.13177143037319183, |
| "reward_std": 0.26749324798583984, |
| "rewards/cosine_scaled_reward/mean": -0.13177144527435303, |
| "rewards/cosine_scaled_reward/std": 0.42720580101013184, |
| "step": 185 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.21875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2031.0, |
| "completions/mean_length": 1299.109375, |
| "completions/mean_terminated_length": 1089.419921875, |
| "completions/min_length": 589.0, |
| "completions/min_terminated_length": 589.0, |
| "epoch": 0.21257142857142858, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2624933421611786, |
| "learning_rate": 8.145033635316128e-07, |
| "loss": 0.0, |
| "num_tokens": 21583227.0, |
| "reward": -0.03228667005896568, |
| "reward_std": 0.3138354420661926, |
| "rewards/cosine_scaled_reward/mean": -0.03228667378425598, |
| "rewards/cosine_scaled_reward/std": 0.47089555859565735, |
| "step": 186 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.25, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1941.0, |
| "completions/mean_length": 1294.046875, |
| "completions/mean_terminated_length": 1042.729248046875, |
| "completions/min_length": 478.0, |
| "completions/min_terminated_length": 478.0, |
| "epoch": 0.21371428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3034888207912445, |
| "learning_rate": 8.119553365707802e-07, |
| "loss": -0.0, |
| "num_tokens": 21676470.0, |
| "reward": -0.11602523177862167, |
| "reward_std": 0.2424153983592987, |
| "rewards/cosine_scaled_reward/mean": -0.11602522432804108, |
| "rewards/cosine_scaled_reward/std": 0.4187147617340088, |
| "step": 187 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.421875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1932.0, |
| "completions/mean_length": 1501.09375, |
| "completions/mean_terminated_length": 1102.0, |
| "completions/min_length": 488.0, |
| "completions/min_terminated_length": 488.0, |
| "epoch": 0.21485714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2895212769508362, |
| "learning_rate": 8.093945422764069e-07, |
| "loss": 0.0, |
| "num_tokens": 21784340.0, |
| "reward": -0.2754727005958557, |
| "reward_std": 0.2081308364868164, |
| "rewards/cosine_scaled_reward/mean": -0.2754727005958557, |
| "rewards/cosine_scaled_reward/std": 0.21545176208019257, |
| "step": 188 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.109375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1887.0, |
| "completions/mean_length": 974.765625, |
| "completions/mean_terminated_length": 842.9649047851562, |
| "completions/min_length": 68.0, |
| "completions/min_terminated_length": 68.0, |
| "epoch": 0.216, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5491365790367126, |
| "learning_rate": 8.068211054579943e-07, |
| "loss": -0.0, |
| "num_tokens": 21856013.0, |
| "reward": -0.030638471245765686, |
| "reward_std": 0.28698021173477173, |
| "rewards/cosine_scaled_reward/mean": -0.030638471245765686, |
| "rewards/cosine_scaled_reward/std": 0.4680361747741699, |
| "step": 189 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.15625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2008.0, |
| "completions/mean_length": 1131.0, |
| "completions/mean_terminated_length": 961.1851806640625, |
| "completions/min_length": 459.0, |
| "completions/min_terminated_length": 459.0, |
| "epoch": 0.21714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.31247782707214355, |
| "learning_rate": 8.04235151541222e-07, |
| "loss": -0.0, |
| "num_tokens": 21938165.0, |
| "reward": -0.0024216994643211365, |
| "reward_std": 0.3122457265853882, |
| "rewards/cosine_scaled_reward/mean": -0.002421695739030838, |
| "rewards/cosine_scaled_reward/std": 0.46288225054740906, |
| "step": 190 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 2011.0, |
| "completions/max_terminated_length": 2011.0, |
| "completions/mean_length": 968.671875, |
| "completions/mean_terminated_length": 968.671875, |
| "completions/min_length": 373.0, |
| "completions/min_terminated_length": 373.0, |
| "epoch": 0.21828571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3146747350692749, |
| "learning_rate": 8.01636806561836e-07, |
| "loss": -0.0, |
| "num_tokens": 22010448.0, |
| "reward": 0.060832589864730835, |
| "reward_std": 0.32057198882102966, |
| "rewards/cosine_scaled_reward/mean": 0.06083259731531143, |
| "rewards/cosine_scaled_reward/std": 0.5255656242370605, |
| "step": 191 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.234375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1822.0, |
| "completions/mean_length": 1386.765625, |
| "completions/mean_terminated_length": 1184.346923828125, |
| "completions/min_length": 495.0, |
| "completions/min_terminated_length": 495.0, |
| "epoch": 0.21942857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27087679505348206, |
| "learning_rate": 7.990261971595048e-07, |
| "loss": -0.0, |
| "num_tokens": 22110497.0, |
| "reward": -0.16324350237846375, |
| "reward_std": 0.2538989782333374, |
| "rewards/cosine_scaled_reward/mean": -0.16324350237846375, |
| "rewards/cosine_scaled_reward/std": 0.37434685230255127, |
| "step": 192 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1512.0, |
| "completions/mean_length": 1645.171875, |
| "completions/mean_terminated_length": 973.7916870117188, |
| "completions/min_length": 626.0, |
| "completions/min_terminated_length": 626.0, |
| "epoch": 0.22057142857142858, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2821788489818573, |
| "learning_rate": 7.964034505716476e-07, |
| "loss": -0.0, |
| "num_tokens": 22226932.0, |
| "reward": -0.2253284454345703, |
| "reward_std": 0.21184760332107544, |
| "rewards/cosine_scaled_reward/mean": -0.2253284454345703, |
| "rewards/cosine_scaled_reward/std": 0.2249312698841095, |
| "step": 193 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.421875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1818.0, |
| "completions/mean_length": 1466.953125, |
| "completions/mean_terminated_length": 1042.9459228515625, |
| "completions/min_length": 470.0, |
| "completions/min_terminated_length": 470.0, |
| "epoch": 0.22171428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2751137614250183, |
| "learning_rate": 7.93768694627233e-07, |
| "loss": -0.0, |
| "num_tokens": 22332177.0, |
| "reward": -0.014900192618370056, |
| "reward_std": 0.37233370542526245, |
| "rewards/cosine_scaled_reward/mean": -0.014900196343660355, |
| "rewards/cosine_scaled_reward/std": 0.45363670587539673, |
| "step": 194 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.28125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1916.0, |
| "completions/mean_length": 1380.515625, |
| "completions/mean_terminated_length": 1119.3260498046875, |
| "completions/min_length": 380.0, |
| "completions/min_terminated_length": 380.0, |
| "epoch": 0.22285714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23562170565128326, |
| "learning_rate": 7.911220577405484e-07, |
| "loss": -0.0, |
| "num_tokens": 22431674.0, |
| "reward": -0.1425207257270813, |
| "reward_std": 0.2992969751358032, |
| "rewards/cosine_scaled_reward/mean": -0.1425207257270813, |
| "rewards/cosine_scaled_reward/std": 0.37265509366989136, |
| "step": 195 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.328125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2024.0, |
| "completions/mean_length": 1474.296875, |
| "completions/mean_terminated_length": 1194.1163330078125, |
| "completions/min_length": 551.0, |
| "completions/min_terminated_length": 551.0, |
| "epoch": 0.224, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26706522703170776, |
| "learning_rate": 7.884636689049422e-07, |
| "loss": 0.0, |
| "num_tokens": 22537621.0, |
| "reward": -0.14723923802375793, |
| "reward_std": 0.34929120540618896, |
| "rewards/cosine_scaled_reward/mean": -0.14723923802375793, |
| "rewards/cosine_scaled_reward/std": 0.42120617628097534, |
| "step": 196 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.046875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1718.0, |
| "completions/mean_length": 961.5, |
| "completions/mean_terminated_length": 908.0655517578125, |
| "completions/min_length": 368.0, |
| "completions/min_terminated_length": 368.0, |
| "epoch": 0.22514285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.32898151874542236, |
| "learning_rate": 7.857936576865356e-07, |
| "loss": -0.0, |
| "num_tokens": 22609525.0, |
| "reward": 0.05756232142448425, |
| "reward_std": 0.42182230949401855, |
| "rewards/cosine_scaled_reward/mean": 0.05756233632564545, |
| "rewards/cosine_scaled_reward/std": 0.5148370862007141, |
| "step": 197 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2030.0, |
| "completions/mean_length": 1141.140625, |
| "completions/mean_terminated_length": 1011.58935546875, |
| "completions/min_length": 449.0, |
| "completions/min_terminated_length": 449.0, |
| "epoch": 0.22628571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3166671395301819, |
| "learning_rate": 7.831121542179086e-07, |
| "loss": -0.0, |
| "num_tokens": 22694062.0, |
| "reward": 0.11976294219493866, |
| "reward_std": 0.34080085158348083, |
| "rewards/cosine_scaled_reward/mean": 0.11976294219493866, |
| "rewards/cosine_scaled_reward/std": 0.5243961215019226, |
| "step": 198 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.21875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2001.0, |
| "completions/mean_length": 1369.25, |
| "completions/mean_terminated_length": 1179.199951171875, |
| "completions/min_length": 78.0, |
| "completions/min_terminated_length": 78.0, |
| "epoch": 0.22742857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.33395788073539734, |
| "learning_rate": 7.804192891917571e-07, |
| "loss": 0.0, |
| "num_tokens": 22793198.0, |
| "reward": -0.2272849828004837, |
| "reward_std": 0.21225537359714508, |
| "rewards/cosine_scaled_reward/mean": -0.2272849828004837, |
| "rewards/cosine_scaled_reward/std": 0.2696577310562134, |
| "step": 199 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.265625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1933.0, |
| "completions/mean_length": 1219.734375, |
| "completions/mean_terminated_length": 920.14892578125, |
| "completions/min_length": 323.0, |
| "completions/min_terminated_length": 323.0, |
| "epoch": 0.22857142857142856, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27558398246765137, |
| "learning_rate": 7.777151938545235e-07, |
| "loss": 0.0, |
| "num_tokens": 22881381.0, |
| "reward": 0.07320597767829895, |
| "reward_std": 0.3534944951534271, |
| "rewards/cosine_scaled_reward/mean": 0.07320597767829895, |
| "rewards/cosine_scaled_reward/std": 0.4344184398651123, |
| "step": 200 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 500, |
| "num_input_tokens_seen": 22881381, |
| "num_train_epochs": 1, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|