| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.05714285714285714, |
| "eval_steps": 500, |
| "global_step": 50, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.671875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1734.0, |
| "completions/mean_length": 1702.03125, |
| "completions/mean_terminated_length": 993.6190795898438, |
| "completions/min_length": 483.0, |
| "completions/min_terminated_length": 483.0, |
| "epoch": 0.001142857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.28377610445022583, |
| "learning_rate": 0.0, |
| "loss": -0.0, |
| "num_tokens": 118418.0, |
| "reward": -0.09800112247467041, |
| "reward_std": 0.3028089702129364, |
| "rewards/cosine_scaled_reward/mean": -0.09800112992525101, |
| "rewards/cosine_scaled_reward/std": 0.37953105568885803, |
| "step": 1 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1894.0, |
| "completions/mean_length": 1738.90625, |
| "completions/mean_terminated_length": 949.0, |
| "completions/min_length": 435.0, |
| "completions/min_terminated_length": 435.0, |
| "epoch": 0.002285714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24221572279930115, |
| "learning_rate": 2e-08, |
| "loss": -0.0, |
| "num_tokens": 239748.0, |
| "reward": 0.020556632429361343, |
| "reward_std": 0.3545936942100525, |
| "rewards/cosine_scaled_reward/mean": 0.020556632429361343, |
| "rewards/cosine_scaled_reward/std": 0.4492928683757782, |
| "step": 2 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.921875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1542.0, |
| "completions/mean_length": 1964.078125, |
| "completions/mean_terminated_length": 973.7999877929688, |
| "completions/min_length": 733.0, |
| "completions/min_terminated_length": 733.0, |
| "epoch": 0.0034285714285714284, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2472974807024002, |
| "learning_rate": 4e-08, |
| "loss": 0.0, |
| "num_tokens": 375921.0, |
| "reward": -0.20954538881778717, |
| "reward_std": 0.13813795149326324, |
| "rewards/cosine_scaled_reward/mean": -0.20954540371894836, |
| "rewards/cosine_scaled_reward/std": 0.16814909875392914, |
| "step": 3 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.484375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2010.0, |
| "completions/mean_length": 1555.6875, |
| "completions/mean_terminated_length": 1093.212158203125, |
| "completions/min_length": 502.0, |
| "completions/min_terminated_length": 502.0, |
| "epoch": 0.004571428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2868657112121582, |
| "learning_rate": 6e-08, |
| "loss": -0.0, |
| "num_tokens": 485293.0, |
| "reward": -0.12192361056804657, |
| "reward_std": 0.31710442900657654, |
| "rewards/cosine_scaled_reward/mean": -0.12192361056804657, |
| "rewards/cosine_scaled_reward/std": 0.35428565740585327, |
| "step": 4 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1758.0, |
| "completions/mean_length": 1958.5625, |
| "completions/mean_terminated_length": 1332.5, |
| "completions/min_length": 932.0, |
| "completions/min_terminated_length": 932.0, |
| "epoch": 0.005714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2460148185491562, |
| "learning_rate": 8e-08, |
| "loss": -0.0, |
| "num_tokens": 621457.0, |
| "reward": -0.21145480871200562, |
| "reward_std": 0.14890719950199127, |
| "rewards/cosine_scaled_reward/mean": -0.21145479381084442, |
| "rewards/cosine_scaled_reward/std": 0.20399661362171173, |
| "step": 5 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1670.0, |
| "completions/mean_length": 1908.375, |
| "completions/mean_terminated_length": 931.0, |
| "completions/min_length": 593.0, |
| "completions/min_terminated_length": 593.0, |
| "epoch": 0.006857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26549720764160156, |
| "learning_rate": 1e-07, |
| "loss": -0.0, |
| "num_tokens": 755241.0, |
| "reward": -0.2408866286277771, |
| "reward_std": 0.16572487354278564, |
| "rewards/cosine_scaled_reward/mean": -0.2408866286277771, |
| "rewards/cosine_scaled_reward/std": 0.17492830753326416, |
| "step": 6 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.8125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1955.0, |
| "completions/mean_length": 1889.296875, |
| "completions/mean_terminated_length": 1201.5833740234375, |
| "completions/min_length": 396.0, |
| "completions/min_terminated_length": 396.0, |
| "epoch": 0.008, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23518230020999908, |
| "learning_rate": 1.2e-07, |
| "loss": 0.0, |
| "num_tokens": 886564.0, |
| "reward": -0.16087877750396729, |
| "reward_std": 0.24579641222953796, |
| "rewards/cosine_scaled_reward/mean": -0.16087877750396729, |
| "rewards/cosine_scaled_reward/std": 0.37339961528778076, |
| "step": 7 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1920.0, |
| "completions/mean_length": 1751.578125, |
| "completions/mean_terminated_length": 994.0555419921875, |
| "completions/min_length": 330.0, |
| "completions/min_terminated_length": 330.0, |
| "epoch": 0.009142857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2354528158903122, |
| "learning_rate": 1.4e-07, |
| "loss": 0.0, |
| "num_tokens": 1009081.0, |
| "reward": -0.023812226951122284, |
| "reward_std": 0.2823081314563751, |
| "rewards/cosine_scaled_reward/mean": -0.02381223440170288, |
| "rewards/cosine_scaled_reward/std": 0.4484662115573883, |
| "step": 8 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.9375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1889.0, |
| "completions/mean_length": 2000.59375, |
| "completions/mean_terminated_length": 1289.5, |
| "completions/min_length": 903.0, |
| "completions/min_terminated_length": 903.0, |
| "epoch": 0.010285714285714285, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24302220344543457, |
| "learning_rate": 1.6e-07, |
| "loss": 0.0, |
| "num_tokens": 1148575.0, |
| "reward": -0.2453702688217163, |
| "reward_std": 0.18811637163162231, |
| "rewards/cosine_scaled_reward/mean": -0.2453702688217163, |
| "rewards/cosine_scaled_reward/std": 0.22203005850315094, |
| "step": 9 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.703125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1739.0, |
| "completions/mean_length": 1701.140625, |
| "completions/mean_terminated_length": 879.631591796875, |
| "completions/min_length": 484.0, |
| "completions/min_terminated_length": 484.0, |
| "epoch": 0.011428571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25642141699790955, |
| "learning_rate": 1.8e-07, |
| "loss": -0.0, |
| "num_tokens": 1268280.0, |
| "reward": -0.15177705883979797, |
| "reward_std": 0.2125300019979477, |
| "rewards/cosine_scaled_reward/mean": -0.15177705883979797, |
| "rewards/cosine_scaled_reward/std": 0.3240113854408264, |
| "step": 10 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.890625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1683.0, |
| "completions/mean_length": 1950.609375, |
| "completions/mean_terminated_length": 1157.571533203125, |
| "completions/min_length": 584.0, |
| "completions/min_terminated_length": 584.0, |
| "epoch": 0.012571428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24372951686382294, |
| "learning_rate": 2e-07, |
| "loss": 0.0, |
| "num_tokens": 1404791.0, |
| "reward": -0.23502977192401886, |
| "reward_std": 0.18896539509296417, |
| "rewards/cosine_scaled_reward/mean": -0.23502977192401886, |
| "rewards/cosine_scaled_reward/std": 0.24224351346492767, |
| "step": 11 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.640625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1944.0, |
| "completions/mean_length": 1751.03125, |
| "completions/mean_terminated_length": 1221.6522216796875, |
| "completions/min_length": 489.0, |
| "completions/min_terminated_length": 489.0, |
| "epoch": 0.013714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.28422027826309204, |
| "learning_rate": 2.1999999999999998e-07, |
| "loss": -0.0, |
| "num_tokens": 1527801.0, |
| "reward": -0.14280016720294952, |
| "reward_std": 0.32843896746635437, |
| "rewards/cosine_scaled_reward/mean": -0.14280015230178833, |
| "rewards/cosine_scaled_reward/std": 0.41895967721939087, |
| "step": 12 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1793.0, |
| "completions/mean_length": 1834.453125, |
| "completions/mean_terminated_length": 1193.8125, |
| "completions/min_length": 783.0, |
| "completions/min_terminated_length": 783.0, |
| "epoch": 0.014857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24033738672733307, |
| "learning_rate": 2.4e-07, |
| "loss": 0.0, |
| "num_tokens": 1656246.0, |
| "reward": -0.17057427763938904, |
| "reward_std": 0.24429668486118317, |
| "rewards/cosine_scaled_reward/mean": -0.17057427763938904, |
| "rewards/cosine_scaled_reward/std": 0.27816399931907654, |
| "step": 13 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.734375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1474.0, |
| "completions/mean_length": 1800.65625, |
| "completions/mean_terminated_length": 1116.823486328125, |
| "completions/min_length": 495.0, |
| "completions/min_terminated_length": 495.0, |
| "epoch": 0.016, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2312558889389038, |
| "learning_rate": 2.6e-07, |
| "loss": 0.0, |
| "num_tokens": 1782096.0, |
| "reward": -0.11817245185375214, |
| "reward_std": 0.24491220712661743, |
| "rewards/cosine_scaled_reward/mean": -0.11817245930433273, |
| "rewards/cosine_scaled_reward/std": 0.3942086696624756, |
| "step": 14 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1636.0, |
| "completions/mean_length": 1692.828125, |
| "completions/mean_terminated_length": 785.1666870117188, |
| "completions/min_length": 438.0, |
| "completions/min_terminated_length": 438.0, |
| "epoch": 0.017142857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2563658654689789, |
| "learning_rate": 2.8e-07, |
| "loss": -0.0, |
| "num_tokens": 1901357.0, |
| "reward": -0.027107469737529755, |
| "reward_std": 0.1853453516960144, |
| "rewards/cosine_scaled_reward/mean": -0.027107462286949158, |
| "rewards/cosine_scaled_reward/std": 0.4734213352203369, |
| "step": 15 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 2048.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 2048.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.018285714285714287, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24149107933044434, |
| "learning_rate": 3e-07, |
| "loss": -0.0, |
| "num_tokens": 2042869.0, |
| "reward": -0.2542623281478882, |
| "reward_std": 0.14302438497543335, |
| "rewards/cosine_scaled_reward/mean": -0.2542623281478882, |
| "rewards/cosine_scaled_reward/std": 0.160969540476799, |
| "step": 16 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.578125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1827.0, |
| "completions/mean_length": 1548.75, |
| "completions/mean_terminated_length": 864.5925903320312, |
| "completions/min_length": 357.0, |
| "completions/min_terminated_length": 357.0, |
| "epoch": 0.019428571428571427, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.31088724732398987, |
| "learning_rate": 3.2e-07, |
| "loss": 0.0, |
| "num_tokens": 2152509.0, |
| "reward": -0.12113451957702637, |
| "reward_std": 0.284165620803833, |
| "rewards/cosine_scaled_reward/mean": -0.12113452702760696, |
| "rewards/cosine_scaled_reward/std": 0.4259316623210907, |
| "step": 17 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1899.0, |
| "completions/mean_length": 1793.03125, |
| "completions/mean_terminated_length": 1028.125, |
| "completions/min_length": 531.0, |
| "completions/min_terminated_length": 531.0, |
| "epoch": 0.02057142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2451843023300171, |
| "learning_rate": 3.4000000000000003e-07, |
| "loss": 0.0, |
| "num_tokens": 2277639.0, |
| "reward": -0.18317042291164398, |
| "reward_std": 0.20634235441684723, |
| "rewards/cosine_scaled_reward/mean": -0.18317043781280518, |
| "rewards/cosine_scaled_reward/std": 0.27781662344932556, |
| "step": 18 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.703125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1633.0, |
| "completions/mean_length": 1735.984375, |
| "completions/mean_terminated_length": 997.0, |
| "completions/min_length": 462.0, |
| "completions/min_terminated_length": 462.0, |
| "epoch": 0.021714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24677637219429016, |
| "learning_rate": 3.6e-07, |
| "loss": 0.0, |
| "num_tokens": 2399998.0, |
| "reward": -0.04996331408619881, |
| "reward_std": 0.2841629385948181, |
| "rewards/cosine_scaled_reward/mean": -0.04996330291032791, |
| "rewards/cosine_scaled_reward/std": 0.4186851680278778, |
| "step": 19 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.640625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1643.0, |
| "completions/mean_length": 1614.890625, |
| "completions/mean_terminated_length": 842.8261108398438, |
| "completions/min_length": 411.0, |
| "completions/min_terminated_length": 411.0, |
| "epoch": 0.022857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2543003559112549, |
| "learning_rate": 3.7999999999999996e-07, |
| "loss": -0.0, |
| "num_tokens": 2514703.0, |
| "reward": -0.09282197058200836, |
| "reward_std": 0.2568933367729187, |
| "rewards/cosine_scaled_reward/mean": -0.09282197058200836, |
| "rewards/cosine_scaled_reward/std": 0.4104878604412079, |
| "step": 20 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1988.0, |
| "completions/mean_length": 1786.734375, |
| "completions/mean_terminated_length": 1119.0555419921875, |
| "completions/min_length": 348.0, |
| "completions/min_terminated_length": 348.0, |
| "epoch": 0.024, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3147278130054474, |
| "learning_rate": 4e-07, |
| "loss": 0.0, |
| "num_tokens": 2639862.0, |
| "reward": -0.16029146313667297, |
| "reward_std": 0.2322564721107483, |
| "rewards/cosine_scaled_reward/mean": -0.16029146313667297, |
| "rewards/cosine_scaled_reward/std": 0.36191171407699585, |
| "step": 21 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.40625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1833.0, |
| "completions/mean_length": 1300.484375, |
| "completions/mean_terminated_length": 789.0263061523438, |
| "completions/min_length": 287.0, |
| "completions/min_terminated_length": 287.0, |
| "epoch": 0.025142857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.32522445917129517, |
| "learning_rate": 4.1999999999999995e-07, |
| "loss": 0.0, |
| "num_tokens": 2732109.0, |
| "reward": 0.0033364146947860718, |
| "reward_std": 0.18878400325775146, |
| "rewards/cosine_scaled_reward/mean": 0.0033364109694957733, |
| "rewards/cosine_scaled_reward/std": 0.45390966534614563, |
| "step": 22 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.59375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1920.0, |
| "completions/mean_length": 1641.03125, |
| "completions/mean_terminated_length": 1046.2308349609375, |
| "completions/min_length": 422.0, |
| "completions/min_terminated_length": 422.0, |
| "epoch": 0.026285714285714287, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.28244850039482117, |
| "learning_rate": 4.3999999999999997e-07, |
| "loss": 0.0, |
| "num_tokens": 2847927.0, |
| "reward": -0.21077856421470642, |
| "reward_std": 0.24399788677692413, |
| "rewards/cosine_scaled_reward/mean": -0.21077856421470642, |
| "rewards/cosine_scaled_reward/std": 0.2925592362880707, |
| "step": 23 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1965.0, |
| "completions/mean_length": 1789.59375, |
| "completions/mean_terminated_length": 1129.2222900390625, |
| "completions/min_length": 560.0, |
| "completions/min_terminated_length": 560.0, |
| "epoch": 0.027428571428571427, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24896308779716492, |
| "learning_rate": 4.6e-07, |
| "loss": -0.0, |
| "num_tokens": 2973389.0, |
| "reward": -0.1665852814912796, |
| "reward_std": 0.307574987411499, |
| "rewards/cosine_scaled_reward/mean": -0.1665852665901184, |
| "rewards/cosine_scaled_reward/std": 0.4072873294353485, |
| "step": 24 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.65625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1851.0, |
| "completions/mean_length": 1696.40625, |
| "completions/mean_terminated_length": 1025.181884765625, |
| "completions/min_length": 434.0, |
| "completions/min_terminated_length": 434.0, |
| "epoch": 0.02857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.262716144323349, |
| "learning_rate": 4.8e-07, |
| "loss": 0.0, |
| "num_tokens": 3092255.0, |
| "reward": -0.14361324906349182, |
| "reward_std": 0.3466429114341736, |
| "rewards/cosine_scaled_reward/mean": -0.14361326396465302, |
| "rewards/cosine_scaled_reward/std": 0.3933021128177643, |
| "step": 25 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1975.0, |
| "completions/mean_length": 1973.046875, |
| "completions/mean_terminated_length": 1448.375, |
| "completions/min_length": 1035.0, |
| "completions/min_terminated_length": 1035.0, |
| "epoch": 0.029714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2365841567516327, |
| "learning_rate": 5e-07, |
| "loss": -0.0, |
| "num_tokens": 3229162.0, |
| "reward": -0.050574399530887604, |
| "reward_std": 0.22459164261817932, |
| "rewards/cosine_scaled_reward/mean": -0.050574399530887604, |
| "rewards/cosine_scaled_reward/std": 0.37290775775909424, |
| "step": 26 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.796875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1982.0, |
| "completions/mean_length": 1878.53125, |
| "completions/mean_terminated_length": 1213.6923828125, |
| "completions/min_length": 498.0, |
| "completions/min_terminated_length": 498.0, |
| "epoch": 0.030857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2821083068847656, |
| "learning_rate": 5.2e-07, |
| "loss": 0.0, |
| "num_tokens": 3359676.0, |
| "reward": -0.13096781075000763, |
| "reward_std": 0.26249831914901733, |
| "rewards/cosine_scaled_reward/mean": -0.13096781075000763, |
| "rewards/cosine_scaled_reward/std": 0.3478032350540161, |
| "step": 27 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.78125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1933.0, |
| "completions/mean_length": 1827.453125, |
| "completions/mean_terminated_length": 1039.7857666015625, |
| "completions/min_length": 419.0, |
| "completions/min_terminated_length": 419.0, |
| "epoch": 0.032, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2539210915565491, |
| "learning_rate": 5.4e-07, |
| "loss": 0.0, |
| "num_tokens": 3486969.0, |
| "reward": -0.11822876334190369, |
| "reward_std": 0.2370690554380417, |
| "rewards/cosine_scaled_reward/mean": -0.11822875589132309, |
| "rewards/cosine_scaled_reward/std": 0.4236762225627899, |
| "step": 28 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.9375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2047.0, |
| "completions/mean_length": 2020.5, |
| "completions/mean_terminated_length": 1608.0, |
| "completions/min_length": 887.0, |
| "completions/min_terminated_length": 887.0, |
| "epoch": 0.03314285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23259545862674713, |
| "learning_rate": 5.6e-07, |
| "loss": -0.0, |
| "num_tokens": 3626753.0, |
| "reward": -0.20220182836055756, |
| "reward_std": 0.15910759568214417, |
| "rewards/cosine_scaled_reward/mean": -0.20220182836055756, |
| "rewards/cosine_scaled_reward/std": 0.20781411230564117, |
| "step": 29 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.828125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1662.0, |
| "completions/mean_length": 1903.703125, |
| "completions/mean_terminated_length": 1208.45458984375, |
| "completions/min_length": 961.0, |
| "completions/min_terminated_length": 961.0, |
| "epoch": 0.03428571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24027252197265625, |
| "learning_rate": 5.8e-07, |
| "loss": 0.0, |
| "num_tokens": 3759126.0, |
| "reward": -0.19193249940872192, |
| "reward_std": 0.24584847688674927, |
| "rewards/cosine_scaled_reward/mean": -0.19193249940872192, |
| "rewards/cosine_scaled_reward/std": 0.28378522396087646, |
| "step": 30 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.796875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1922.0, |
| "completions/mean_length": 1847.34375, |
| "completions/mean_terminated_length": 1060.1539306640625, |
| "completions/min_length": 311.0, |
| "completions/min_terminated_length": 311.0, |
| "epoch": 0.03542857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2703397274017334, |
| "learning_rate": 6e-07, |
| "loss": -0.0, |
| "num_tokens": 3887852.0, |
| "reward": -0.25379180908203125, |
| "reward_std": 0.24661941826343536, |
| "rewards/cosine_scaled_reward/mean": -0.25379180908203125, |
| "rewards/cosine_scaled_reward/std": 0.29188498854637146, |
| "step": 31 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.828125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1995.0, |
| "completions/mean_length": 1950.3125, |
| "completions/mean_terminated_length": 1479.6363525390625, |
| "completions/min_length": 766.0, |
| "completions/min_terminated_length": 766.0, |
| "epoch": 0.036571428571428574, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.21763876080513, |
| "learning_rate": 6.2e-07, |
| "loss": -0.0, |
| "num_tokens": 4023024.0, |
| "reward": -0.16017228364944458, |
| "reward_std": 0.2255343496799469, |
| "rewards/cosine_scaled_reward/mean": -0.16017228364944458, |
| "rewards/cosine_scaled_reward/std": 0.3709539771080017, |
| "step": 32 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1957.0, |
| "completions/mean_length": 1996.28125, |
| "completions/mean_terminated_length": 1634.25, |
| "completions/min_length": 1237.0, |
| "completions/min_terminated_length": 1237.0, |
| "epoch": 0.037714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22758260369300842, |
| "learning_rate": 6.4e-07, |
| "loss": -0.0, |
| "num_tokens": 4162002.0, |
| "reward": -0.20318198204040527, |
| "reward_std": 0.18396919965744019, |
| "rewards/cosine_scaled_reward/mean": -0.20318198204040527, |
| "rewards/cosine_scaled_reward/std": 0.34913352131843567, |
| "step": 33 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.578125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1850.0, |
| "completions/mean_length": 1703.265625, |
| "completions/mean_terminated_length": 1230.851806640625, |
| "completions/min_length": 651.0, |
| "completions/min_terminated_length": 651.0, |
| "epoch": 0.038857142857142854, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.31658875942230225, |
| "learning_rate": 6.6e-07, |
| "loss": -0.0, |
| "num_tokens": 4280563.0, |
| "reward": -0.05977274850010872, |
| "reward_std": 0.30437377095222473, |
| "rewards/cosine_scaled_reward/mean": -0.059772733598947525, |
| "rewards/cosine_scaled_reward/std": 0.4424094259738922, |
| "step": 34 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.8125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1846.0, |
| "completions/mean_length": 1807.546875, |
| "completions/mean_terminated_length": 765.5833740234375, |
| "completions/min_length": 419.0, |
| "completions/min_terminated_length": 419.0, |
| "epoch": 0.04, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2792847156524658, |
| "learning_rate": 6.800000000000001e-07, |
| "loss": -0.0, |
| "num_tokens": 4407742.0, |
| "reward": -0.18658886849880219, |
| "reward_std": 0.2910658121109009, |
| "rewards/cosine_scaled_reward/mean": -0.18658888339996338, |
| "rewards/cosine_scaled_reward/std": 0.34802255034446716, |
| "step": 35 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.921875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1912.0, |
| "completions/mean_length": 1995.65625, |
| "completions/mean_terminated_length": 1378.0, |
| "completions/min_length": 1090.0, |
| "completions/min_terminated_length": 1090.0, |
| "epoch": 0.04114285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23547738790512085, |
| "learning_rate": 7e-07, |
| "loss": 0.0, |
| "num_tokens": 4546576.0, |
| "reward": -0.23918019235134125, |
| "reward_std": 0.19598917663097382, |
| "rewards/cosine_scaled_reward/mean": -0.23918019235134125, |
| "rewards/cosine_scaled_reward/std": 0.2425125539302826, |
| "step": 36 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.90625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2015.0, |
| "completions/mean_length": 1994.75, |
| "completions/mean_terminated_length": 1480.0, |
| "completions/min_length": 545.0, |
| "completions/min_terminated_length": 545.0, |
| "epoch": 0.04228571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22962674498558044, |
| "learning_rate": 7.2e-07, |
| "loss": -0.0, |
| "num_tokens": 4685264.0, |
| "reward": -0.25335729122161865, |
| "reward_std": 0.15323391556739807, |
| "rewards/cosine_scaled_reward/mean": -0.25335729122161865, |
| "rewards/cosine_scaled_reward/std": 0.17556406557559967, |
| "step": 37 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.890625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1934.0, |
| "completions/mean_length": 1957.484375, |
| "completions/mean_terminated_length": 1220.4285888671875, |
| "completions/min_length": 965.0, |
| "completions/min_terminated_length": 965.0, |
| "epoch": 0.04342857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24781912565231323, |
| "learning_rate": 7.4e-07, |
| "loss": -0.0, |
| "num_tokens": 4822255.0, |
| "reward": -0.13536512851715088, |
| "reward_std": 0.19208545982837677, |
| "rewards/cosine_scaled_reward/mean": -0.13536511361598969, |
| "rewards/cosine_scaled_reward/std": 0.30052343010902405, |
| "step": 38 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1824.0, |
| "completions/mean_length": 1744.421875, |
| "completions/mean_terminated_length": 833.6875, |
| "completions/min_length": 317.0, |
| "completions/min_terminated_length": 317.0, |
| "epoch": 0.044571428571428574, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2562144994735718, |
| "learning_rate": 7.599999999999999e-07, |
| "loss": -0.0, |
| "num_tokens": 4944682.0, |
| "reward": -0.041110455989837646, |
| "reward_std": 0.21381449699401855, |
| "rewards/cosine_scaled_reward/mean": -0.04111045226454735, |
| "rewards/cosine_scaled_reward/std": 0.35980772972106934, |
| "step": 39 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.734375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2043.0, |
| "completions/mean_length": 1774.359375, |
| "completions/mean_terminated_length": 1017.8235473632812, |
| "completions/min_length": 445.0, |
| "completions/min_terminated_length": 445.0, |
| "epoch": 0.045714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25478634238243103, |
| "learning_rate": 7.799999999999999e-07, |
| "loss": 0.0, |
| "num_tokens": 5068313.0, |
| "reward": -0.12165145576000214, |
| "reward_std": 0.17204006016254425, |
| "rewards/cosine_scaled_reward/mean": -0.12165144830942154, |
| "rewards/cosine_scaled_reward/std": 0.4099982678890228, |
| "step": 40 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.640625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1991.0, |
| "completions/mean_length": 1814.375, |
| "completions/mean_terminated_length": 1397.9130859375, |
| "completions/min_length": 968.0, |
| "completions/min_terminated_length": 968.0, |
| "epoch": 0.046857142857142854, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.21750310063362122, |
| "learning_rate": 8e-07, |
| "loss": 0.0, |
| "num_tokens": 5195585.0, |
| "reward": -0.25668060779571533, |
| "reward_std": 0.2832298278808594, |
| "rewards/cosine_scaled_reward/mean": -0.25668060779571533, |
| "rewards/cosine_scaled_reward/std": 0.3347759544849396, |
| "step": 41 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.765625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1764.0, |
| "completions/mean_length": 1714.59375, |
| "completions/mean_terminated_length": 625.4666748046875, |
| "completions/min_length": 186.0, |
| "completions/min_terminated_length": 186.0, |
| "epoch": 0.048, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.34486907720565796, |
| "learning_rate": 8.199999999999999e-07, |
| "loss": -0.0, |
| "num_tokens": 5315679.0, |
| "reward": -0.2253742218017578, |
| "reward_std": 0.1778060495853424, |
| "rewards/cosine_scaled_reward/mean": -0.22537420690059662, |
| "rewards/cosine_scaled_reward/std": 0.19647939503192902, |
| "step": 42 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.828125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1638.0, |
| "completions/mean_length": 1863.78125, |
| "completions/mean_terminated_length": 976.1818237304688, |
| "completions/min_length": 669.0, |
| "completions/min_terminated_length": 669.0, |
| "epoch": 0.04914285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23907455801963806, |
| "learning_rate": 8.399999999999999e-07, |
| "loss": 0.0, |
| "num_tokens": 5446577.0, |
| "reward": -0.1142776757478714, |
| "reward_std": 0.21804723143577576, |
| "rewards/cosine_scaled_reward/mean": -0.1142776757478714, |
| "rewards/cosine_scaled_reward/std": 0.3637608587741852, |
| "step": 43 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1920.0, |
| "completions/mean_length": 1771.125, |
| "completions/mean_terminated_length": 940.5, |
| "completions/min_length": 344.0, |
| "completions/min_terminated_length": 344.0, |
| "epoch": 0.05028571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2888188362121582, |
| "learning_rate": 8.599999999999999e-07, |
| "loss": 0.0, |
| "num_tokens": 5570625.0, |
| "reward": -0.11845305562019348, |
| "reward_std": 0.2729855477809906, |
| "rewards/cosine_scaled_reward/mean": -0.11845306307077408, |
| "rewards/cosine_scaled_reward/std": 0.4279690086841583, |
| "step": 44 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.96875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1296.0, |
| "completions/mean_length": 2020.859375, |
| "completions/mean_terminated_length": 1179.5, |
| "completions/min_length": 1063.0, |
| "completions/min_terminated_length": 1063.0, |
| "epoch": 0.05142857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2232045829296112, |
| "learning_rate": 8.799999999999999e-07, |
| "loss": 0.0, |
| "num_tokens": 5711616.0, |
| "reward": -0.1830526441335678, |
| "reward_std": 0.20074567198753357, |
| "rewards/cosine_scaled_reward/mean": -0.1830526441335678, |
| "rewards/cosine_scaled_reward/std": 0.3221423327922821, |
| "step": 45 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.828125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1121.0, |
| "completions/mean_length": 1843.328125, |
| "completions/mean_terminated_length": 857.1818237304688, |
| "completions/min_length": 608.0, |
| "completions/min_terminated_length": 608.0, |
| "epoch": 0.052571428571428575, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2569328844547272, |
| "learning_rate": 9e-07, |
| "loss": 0.0, |
| "num_tokens": 5840757.0, |
| "reward": -0.21247822046279907, |
| "reward_std": 0.17188501358032227, |
| "rewards/cosine_scaled_reward/mean": -0.21247822046279907, |
| "rewards/cosine_scaled_reward/std": 0.183182492852211, |
| "step": 46 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.734375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2047.0, |
| "completions/mean_length": 1772.984375, |
| "completions/mean_terminated_length": 1012.6470336914062, |
| "completions/min_length": 461.0, |
| "completions/min_terminated_length": 461.0, |
| "epoch": 0.053714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2800576090812683, |
| "learning_rate": 9.2e-07, |
| "loss": -0.0, |
| "num_tokens": 5964628.0, |
| "reward": -0.1755329668521881, |
| "reward_std": 0.19662824273109436, |
| "rewards/cosine_scaled_reward/mean": -0.1755329668521881, |
| "rewards/cosine_scaled_reward/std": 0.3987559974193573, |
| "step": 47 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1949.0, |
| "completions/mean_length": 1787.046875, |
| "completions/mean_terminated_length": 1120.1666259765625, |
| "completions/min_length": 630.0, |
| "completions/min_terminated_length": 630.0, |
| "epoch": 0.054857142857142854, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2499135434627533, |
| "learning_rate": 9.399999999999999e-07, |
| "loss": -0.0, |
| "num_tokens": 6089543.0, |
| "reward": -0.07469595968723297, |
| "reward_std": 0.2802818715572357, |
| "rewards/cosine_scaled_reward/mean": -0.07469595968723297, |
| "rewards/cosine_scaled_reward/std": 0.39331451058387756, |
| "step": 48 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.578125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1818.0, |
| "completions/mean_length": 1611.65625, |
| "completions/mean_terminated_length": 1013.7037353515625, |
| "completions/min_length": 298.0, |
| "completions/min_terminated_length": 298.0, |
| "epoch": 0.056, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2976716160774231, |
| "learning_rate": 9.6e-07, |
| "loss": -0.0, |
| "num_tokens": 6202753.0, |
| "reward": -0.14219576120376587, |
| "reward_std": 0.3252427875995636, |
| "rewards/cosine_scaled_reward/mean": -0.14219576120376587, |
| "rewards/cosine_scaled_reward/std": 0.41946855187416077, |
| "step": 49 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.828125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1916.0, |
| "completions/mean_length": 1826.90625, |
| "completions/mean_terminated_length": 761.6364135742188, |
| "completions/min_length": 341.0, |
| "completions/min_terminated_length": 341.0, |
| "epoch": 0.05714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2344626933336258, |
| "learning_rate": 9.8e-07, |
| "loss": -0.0, |
| "num_tokens": 6330491.0, |
| "reward": -0.098542720079422, |
| "reward_std": 0.20483215153217316, |
| "rewards/cosine_scaled_reward/mean": -0.0985427126288414, |
| "rewards/cosine_scaled_reward/std": 0.396296888589859, |
| "step": 50 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 500, |
| "num_input_tokens_seen": 6330491, |
| "num_train_epochs": 1, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|