{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.22857142857142856, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1734.0, "completions/mean_length": 1702.03125, "completions/mean_terminated_length": 993.6190795898438, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 0.001142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.28377610445022583, "learning_rate": 0.0, "loss": -0.0, "num_tokens": 118418.0, "reward": -0.09800112247467041, "reward_std": 0.3028089702129364, "rewards/cosine_scaled_reward/mean": -0.09800112992525101, "rewards/cosine_scaled_reward/std": 0.37953105568885803, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1894.0, "completions/mean_length": 1738.90625, "completions/mean_terminated_length": 949.0, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.002285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.24221572279930115, "learning_rate": 2e-08, "loss": -0.0, "num_tokens": 239748.0, "reward": 0.020556632429361343, "reward_std": 0.3545936942100525, "rewards/cosine_scaled_reward/mean": 0.020556632429361343, "rewards/cosine_scaled_reward/std": 0.4492928683757782, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1542.0, "completions/mean_length": 1964.078125, "completions/mean_terminated_length": 973.7999877929688, "completions/min_length": 733.0, "completions/min_terminated_length": 733.0, "epoch": 0.0034285714285714284, "frac_reward_zero_std": 0.0, "grad_norm": 0.2472974807024002, "learning_rate": 4e-08, "loss": 0.0, "num_tokens": 375921.0, "reward": -0.20954538881778717, "reward_std": 0.13813795149326324, "rewards/cosine_scaled_reward/mean": -0.20954540371894836, "rewards/cosine_scaled_reward/std": 0.16814909875392914, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1555.6875, "completions/mean_terminated_length": 1093.212158203125, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 0.004571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.2868657112121582, "learning_rate": 6e-08, "loss": -0.0, "num_tokens": 485293.0, "reward": -0.12192361056804657, "reward_std": 0.31710442900657654, "rewards/cosine_scaled_reward/mean": -0.12192361056804657, "rewards/cosine_scaled_reward/std": 0.35428565740585327, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1758.0, "completions/mean_length": 1958.5625, "completions/mean_terminated_length": 1332.5, "completions/min_length": 932.0, "completions/min_terminated_length": 932.0, "epoch": 0.005714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2460148185491562, "learning_rate": 8e-08, "loss": -0.0, "num_tokens": 621457.0, "reward": -0.21145480871200562, "reward_std": 0.14890719950199127, "rewards/cosine_scaled_reward/mean": -0.21145479381084442, "rewards/cosine_scaled_reward/std": 0.20399661362171173, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1670.0, "completions/mean_length": 1908.375, "completions/mean_terminated_length": 931.0, "completions/min_length": 593.0, "completions/min_terminated_length": 593.0, "epoch": 0.006857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.26549720764160156, "learning_rate": 1e-07, "loss": -0.0, "num_tokens": 755241.0, "reward": -0.2408866286277771, "reward_std": 0.16572487354278564, "rewards/cosine_scaled_reward/mean": -0.2408866286277771, "rewards/cosine_scaled_reward/std": 0.17492830753326416, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 1889.296875, "completions/mean_terminated_length": 1201.5833740234375, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.008, "frac_reward_zero_std": 0.0, "grad_norm": 0.23518230020999908, "learning_rate": 1.2e-07, "loss": 0.0, "num_tokens": 886564.0, "reward": -0.16087877750396729, "reward_std": 0.24579641222953796, "rewards/cosine_scaled_reward/mean": -0.16087877750396729, "rewards/cosine_scaled_reward/std": 0.37339961528778076, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1920.0, "completions/mean_length": 1751.578125, "completions/mean_terminated_length": 994.0555419921875, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.009142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.2354528158903122, "learning_rate": 1.4e-07, "loss": 0.0, "num_tokens": 1009081.0, "reward": -0.023812226951122284, "reward_std": 0.2823081314563751, "rewards/cosine_scaled_reward/mean": -0.02381223440170288, "rewards/cosine_scaled_reward/std": 0.4484662115573883, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1889.0, "completions/mean_length": 2000.59375, "completions/mean_terminated_length": 1289.5, "completions/min_length": 903.0, "completions/min_terminated_length": 903.0, "epoch": 0.010285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.24302220344543457, "learning_rate": 1.6e-07, "loss": 0.0, "num_tokens": 1148575.0, "reward": -0.2453702688217163, "reward_std": 0.18811637163162231, "rewards/cosine_scaled_reward/mean": -0.2453702688217163, "rewards/cosine_scaled_reward/std": 0.22203005850315094, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 1701.140625, "completions/mean_terminated_length": 879.631591796875, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 0.011428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.25642141699790955, "learning_rate": 1.8e-07, "loss": -0.0, "num_tokens": 1268280.0, "reward": -0.15177705883979797, "reward_std": 0.2125300019979477, "rewards/cosine_scaled_reward/mean": -0.15177705883979797, "rewards/cosine_scaled_reward/std": 0.3240113854408264, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1683.0, "completions/mean_length": 1950.609375, "completions/mean_terminated_length": 1157.571533203125, "completions/min_length": 584.0, "completions/min_terminated_length": 584.0, "epoch": 0.012571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.24372951686382294, "learning_rate": 2e-07, "loss": 0.0, "num_tokens": 1404791.0, "reward": -0.23502977192401886, "reward_std": 0.18896539509296417, "rewards/cosine_scaled_reward/mean": -0.23502977192401886, "rewards/cosine_scaled_reward/std": 0.24224351346492767, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1944.0, "completions/mean_length": 1751.03125, "completions/mean_terminated_length": 1221.6522216796875, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.013714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.28422027826309204, "learning_rate": 2.1999999999999998e-07, "loss": -0.0, "num_tokens": 1527801.0, "reward": -0.14280016720294952, "reward_std": 0.32843896746635437, "rewards/cosine_scaled_reward/mean": -0.14280015230178833, "rewards/cosine_scaled_reward/std": 0.41895967721939087, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1793.0, "completions/mean_length": 1834.453125, "completions/mean_terminated_length": 1193.8125, "completions/min_length": 783.0, "completions/min_terminated_length": 783.0, "epoch": 0.014857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.24033738672733307, "learning_rate": 2.4e-07, "loss": 0.0, "num_tokens": 1656246.0, "reward": -0.17057427763938904, "reward_std": 0.24429668486118317, "rewards/cosine_scaled_reward/mean": -0.17057427763938904, "rewards/cosine_scaled_reward/std": 0.27816399931907654, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1474.0, "completions/mean_length": 1800.65625, "completions/mean_terminated_length": 1116.823486328125, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 0.016, "frac_reward_zero_std": 0.0, "grad_norm": 0.2312558889389038, "learning_rate": 2.6e-07, "loss": 0.0, "num_tokens": 1782096.0, "reward": -0.11817245185375214, "reward_std": 0.24491220712661743, "rewards/cosine_scaled_reward/mean": -0.11817245930433273, "rewards/cosine_scaled_reward/std": 0.3942086696624756, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1636.0, "completions/mean_length": 1692.828125, "completions/mean_terminated_length": 785.1666870117188, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.017142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.2563658654689789, "learning_rate": 2.8e-07, "loss": -0.0, "num_tokens": 1901357.0, "reward": -0.027107469737529755, "reward_std": 0.1853453516960144, "rewards/cosine_scaled_reward/mean": -0.027107462286949158, "rewards/cosine_scaled_reward/std": 0.4734213352203369, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.018285714285714287, "frac_reward_zero_std": 0.0, "grad_norm": 0.24149107933044434, "learning_rate": 3e-07, "loss": -0.0, "num_tokens": 2042869.0, "reward": -0.2542623281478882, "reward_std": 0.14302438497543335, "rewards/cosine_scaled_reward/mean": -0.2542623281478882, "rewards/cosine_scaled_reward/std": 0.160969540476799, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1827.0, "completions/mean_length": 1548.75, "completions/mean_terminated_length": 864.5925903320312, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.019428571428571427, "frac_reward_zero_std": 0.0, "grad_norm": 0.31088724732398987, "learning_rate": 3.2e-07, "loss": 0.0, "num_tokens": 2152509.0, "reward": -0.12113451957702637, "reward_std": 0.284165620803833, "rewards/cosine_scaled_reward/mean": -0.12113452702760696, "rewards/cosine_scaled_reward/std": 0.4259316623210907, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1899.0, "completions/mean_length": 1793.03125, "completions/mean_terminated_length": 1028.125, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 0.02057142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2451843023300171, "learning_rate": 3.4000000000000003e-07, "loss": 0.0, "num_tokens": 2277639.0, "reward": -0.18317042291164398, "reward_std": 0.20634235441684723, "rewards/cosine_scaled_reward/mean": -0.18317043781280518, "rewards/cosine_scaled_reward/std": 0.27781662344932556, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1633.0, "completions/mean_length": 1735.984375, "completions/mean_terminated_length": 997.0, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.021714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.24677637219429016, "learning_rate": 3.6e-07, "loss": 0.0, "num_tokens": 2399998.0, "reward": -0.04996331408619881, "reward_std": 0.2841629385948181, "rewards/cosine_scaled_reward/mean": -0.04996330291032791, "rewards/cosine_scaled_reward/std": 0.4186851680278778, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1643.0, "completions/mean_length": 1614.890625, "completions/mean_terminated_length": 842.8261108398438, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.022857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2543003559112549, "learning_rate": 3.7999999999999996e-07, "loss": -0.0, "num_tokens": 2514703.0, "reward": -0.09282197058200836, "reward_std": 0.2568933367729187, "rewards/cosine_scaled_reward/mean": -0.09282197058200836, "rewards/cosine_scaled_reward/std": 0.4104878604412079, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 1786.734375, "completions/mean_terminated_length": 1119.0555419921875, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.024, "frac_reward_zero_std": 0.0, "grad_norm": 0.3147278130054474, "learning_rate": 4e-07, "loss": 0.0, "num_tokens": 2639862.0, "reward": -0.16029146313667297, "reward_std": 0.2322564721107483, "rewards/cosine_scaled_reward/mean": -0.16029146313667297, "rewards/cosine_scaled_reward/std": 0.36191171407699585, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1833.0, "completions/mean_length": 1300.484375, "completions/mean_terminated_length": 789.0263061523438, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.025142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.32522445917129517, "learning_rate": 4.1999999999999995e-07, "loss": 0.0, "num_tokens": 2732109.0, "reward": 0.0033364146947860718, "reward_std": 0.18878400325775146, "rewards/cosine_scaled_reward/mean": 0.0033364109694957733, "rewards/cosine_scaled_reward/std": 0.45390966534614563, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1920.0, "completions/mean_length": 1641.03125, "completions/mean_terminated_length": 1046.2308349609375, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.026285714285714287, "frac_reward_zero_std": 0.0, "grad_norm": 0.28244850039482117, "learning_rate": 4.3999999999999997e-07, "loss": 0.0, "num_tokens": 2847927.0, "reward": -0.21077856421470642, "reward_std": 0.24399788677692413, "rewards/cosine_scaled_reward/mean": -0.21077856421470642, "rewards/cosine_scaled_reward/std": 0.2925592362880707, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1965.0, "completions/mean_length": 1789.59375, "completions/mean_terminated_length": 1129.2222900390625, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "epoch": 0.027428571428571427, "frac_reward_zero_std": 0.0, "grad_norm": 0.24896308779716492, "learning_rate": 4.6e-07, "loss": -0.0, "num_tokens": 2973389.0, "reward": -0.1665852814912796, "reward_std": 0.307574987411499, "rewards/cosine_scaled_reward/mean": -0.1665852665901184, "rewards/cosine_scaled_reward/std": 0.4072873294353485, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1851.0, "completions/mean_length": 1696.40625, "completions/mean_terminated_length": 1025.181884765625, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 0.02857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.262716144323349, "learning_rate": 4.8e-07, "loss": 0.0, "num_tokens": 3092255.0, "reward": -0.14361324906349182, "reward_std": 0.3466429114341736, "rewards/cosine_scaled_reward/mean": -0.14361326396465302, "rewards/cosine_scaled_reward/std": 0.3933021128177643, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 1973.046875, "completions/mean_terminated_length": 1448.375, "completions/min_length": 1035.0, "completions/min_terminated_length": 1035.0, "epoch": 0.029714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2365841567516327, "learning_rate": 5e-07, "loss": -0.0, "num_tokens": 3229162.0, "reward": -0.050574399530887604, "reward_std": 0.22459164261817932, "rewards/cosine_scaled_reward/mean": -0.050574399530887604, "rewards/cosine_scaled_reward/std": 0.37290775775909424, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 1878.53125, "completions/mean_terminated_length": 1213.6923828125, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "epoch": 0.030857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2821083068847656, "learning_rate": 5.2e-07, "loss": 0.0, "num_tokens": 3359676.0, "reward": -0.13096781075000763, "reward_std": 0.26249831914901733, "rewards/cosine_scaled_reward/mean": -0.13096781075000763, "rewards/cosine_scaled_reward/std": 0.3478032350540161, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1933.0, "completions/mean_length": 1827.453125, "completions/mean_terminated_length": 1039.7857666015625, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.032, "frac_reward_zero_std": 0.0, "grad_norm": 0.2539210915565491, "learning_rate": 5.4e-07, "loss": 0.0, "num_tokens": 3486969.0, "reward": -0.11822876334190369, "reward_std": 0.2370690554380417, "rewards/cosine_scaled_reward/mean": -0.11822875589132309, "rewards/cosine_scaled_reward/std": 0.4236762225627899, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 2020.5, "completions/mean_terminated_length": 1608.0, "completions/min_length": 887.0, "completions/min_terminated_length": 887.0, "epoch": 0.03314285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.23259545862674713, "learning_rate": 5.6e-07, "loss": -0.0, "num_tokens": 3626753.0, "reward": -0.20220182836055756, "reward_std": 0.15910759568214417, "rewards/cosine_scaled_reward/mean": -0.20220182836055756, "rewards/cosine_scaled_reward/std": 0.20781411230564117, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1662.0, "completions/mean_length": 1903.703125, "completions/mean_terminated_length": 1208.45458984375, "completions/min_length": 961.0, "completions/min_terminated_length": 961.0, "epoch": 0.03428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.24027252197265625, "learning_rate": 5.8e-07, "loss": 0.0, "num_tokens": 3759126.0, "reward": -0.19193249940872192, "reward_std": 0.24584847688674927, "rewards/cosine_scaled_reward/mean": -0.19193249940872192, "rewards/cosine_scaled_reward/std": 0.28378522396087646, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1922.0, "completions/mean_length": 1847.34375, "completions/mean_terminated_length": 1060.1539306640625, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.03542857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.2703397274017334, "learning_rate": 6e-07, "loss": -0.0, "num_tokens": 3887852.0, "reward": -0.25379180908203125, "reward_std": 0.24661941826343536, "rewards/cosine_scaled_reward/mean": -0.25379180908203125, "rewards/cosine_scaled_reward/std": 0.29188498854637146, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 1950.3125, "completions/mean_terminated_length": 1479.6363525390625, "completions/min_length": 766.0, "completions/min_terminated_length": 766.0, "epoch": 0.036571428571428574, "frac_reward_zero_std": 0.0, "grad_norm": 0.21763876080513, "learning_rate": 6.2e-07, "loss": -0.0, "num_tokens": 4023024.0, "reward": -0.16017228364944458, "reward_std": 0.2255343496799469, "rewards/cosine_scaled_reward/mean": -0.16017228364944458, "rewards/cosine_scaled_reward/std": 0.3709539771080017, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 1996.28125, "completions/mean_terminated_length": 1634.25, "completions/min_length": 1237.0, "completions/min_terminated_length": 1237.0, "epoch": 0.037714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.22758260369300842, "learning_rate": 6.4e-07, "loss": -0.0, "num_tokens": 4162002.0, "reward": -0.20318198204040527, "reward_std": 0.18396919965744019, "rewards/cosine_scaled_reward/mean": -0.20318198204040527, "rewards/cosine_scaled_reward/std": 0.34913352131843567, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1850.0, "completions/mean_length": 1703.265625, "completions/mean_terminated_length": 1230.851806640625, "completions/min_length": 651.0, "completions/min_terminated_length": 651.0, "epoch": 0.038857142857142854, "frac_reward_zero_std": 0.0, "grad_norm": 0.31658875942230225, "learning_rate": 6.6e-07, "loss": -0.0, "num_tokens": 4280563.0, "reward": -0.05977274850010872, "reward_std": 0.30437377095222473, "rewards/cosine_scaled_reward/mean": -0.059772733598947525, "rewards/cosine_scaled_reward/std": 0.4424094259738922, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 1807.546875, "completions/mean_terminated_length": 765.5833740234375, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.04, "frac_reward_zero_std": 0.0, "grad_norm": 0.2792847156524658, "learning_rate": 6.800000000000001e-07, "loss": -0.0, "num_tokens": 4407742.0, "reward": -0.18658886849880219, "reward_std": 0.2910658121109009, "rewards/cosine_scaled_reward/mean": -0.18658888339996338, "rewards/cosine_scaled_reward/std": 0.34802255034446716, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1912.0, "completions/mean_length": 1995.65625, "completions/mean_terminated_length": 1378.0, "completions/min_length": 1090.0, "completions/min_terminated_length": 1090.0, "epoch": 0.04114285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.23547738790512085, "learning_rate": 7e-07, "loss": 0.0, "num_tokens": 4546576.0, "reward": -0.23918019235134125, "reward_std": 0.19598917663097382, "rewards/cosine_scaled_reward/mean": -0.23918019235134125, "rewards/cosine_scaled_reward/std": 0.2425125539302826, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1994.75, "completions/mean_terminated_length": 1480.0, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "epoch": 0.04228571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.22962674498558044, "learning_rate": 7.2e-07, "loss": -0.0, "num_tokens": 4685264.0, "reward": -0.25335729122161865, "reward_std": 0.15323391556739807, "rewards/cosine_scaled_reward/mean": -0.25335729122161865, "rewards/cosine_scaled_reward/std": 0.17556406557559967, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 1957.484375, "completions/mean_terminated_length": 1220.4285888671875, "completions/min_length": 965.0, "completions/min_terminated_length": 965.0, "epoch": 0.04342857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.24781912565231323, "learning_rate": 7.4e-07, "loss": -0.0, "num_tokens": 4822255.0, "reward": -0.13536512851715088, "reward_std": 0.19208545982837677, "rewards/cosine_scaled_reward/mean": -0.13536511361598969, "rewards/cosine_scaled_reward/std": 0.30052343010902405, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1824.0, "completions/mean_length": 1744.421875, "completions/mean_terminated_length": 833.6875, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.044571428571428574, "frac_reward_zero_std": 0.0, "grad_norm": 0.2562144994735718, "learning_rate": 7.599999999999999e-07, "loss": -0.0, "num_tokens": 4944682.0, "reward": -0.041110455989837646, "reward_std": 0.21381449699401855, "rewards/cosine_scaled_reward/mean": -0.04111045226454735, "rewards/cosine_scaled_reward/std": 0.35980772972106934, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1774.359375, "completions/mean_terminated_length": 1017.8235473632812, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.045714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.25478634238243103, "learning_rate": 7.799999999999999e-07, "loss": 0.0, "num_tokens": 5068313.0, "reward": -0.12165145576000214, "reward_std": 0.17204006016254425, "rewards/cosine_scaled_reward/mean": -0.12165144830942154, "rewards/cosine_scaled_reward/std": 0.4099982678890228, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 1814.375, "completions/mean_terminated_length": 1397.9130859375, "completions/min_length": 968.0, "completions/min_terminated_length": 968.0, "epoch": 0.046857142857142854, "frac_reward_zero_std": 0.0, "grad_norm": 0.21750310063362122, "learning_rate": 8e-07, "loss": 0.0, "num_tokens": 5195585.0, "reward": -0.25668060779571533, "reward_std": 0.2832298278808594, "rewards/cosine_scaled_reward/mean": -0.25668060779571533, "rewards/cosine_scaled_reward/std": 0.3347759544849396, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1764.0, "completions/mean_length": 1714.59375, "completions/mean_terminated_length": 625.4666748046875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.048, "frac_reward_zero_std": 0.0, "grad_norm": 0.34486907720565796, "learning_rate": 8.199999999999999e-07, "loss": -0.0, "num_tokens": 5315679.0, "reward": -0.2253742218017578, "reward_std": 0.1778060495853424, "rewards/cosine_scaled_reward/mean": -0.22537420690059662, "rewards/cosine_scaled_reward/std": 0.19647939503192902, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1638.0, "completions/mean_length": 1863.78125, "completions/mean_terminated_length": 976.1818237304688, "completions/min_length": 669.0, "completions/min_terminated_length": 669.0, "epoch": 0.04914285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.23907455801963806, "learning_rate": 8.399999999999999e-07, "loss": 0.0, "num_tokens": 5446577.0, "reward": -0.1142776757478714, "reward_std": 0.21804723143577576, "rewards/cosine_scaled_reward/mean": -0.1142776757478714, "rewards/cosine_scaled_reward/std": 0.3637608587741852, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1920.0, "completions/mean_length": 1771.125, "completions/mean_terminated_length": 940.5, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.05028571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.2888188362121582, "learning_rate": 8.599999999999999e-07, "loss": 0.0, "num_tokens": 5570625.0, "reward": -0.11845305562019348, "reward_std": 0.2729855477809906, "rewards/cosine_scaled_reward/mean": -0.11845306307077408, "rewards/cosine_scaled_reward/std": 0.4279690086841583, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1296.0, "completions/mean_length": 2020.859375, "completions/mean_terminated_length": 1179.5, "completions/min_length": 1063.0, "completions/min_terminated_length": 1063.0, "epoch": 0.05142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.2232045829296112, "learning_rate": 8.799999999999999e-07, "loss": 0.0, "num_tokens": 5711616.0, "reward": -0.1830526441335678, "reward_std": 0.20074567198753357, "rewards/cosine_scaled_reward/mean": -0.1830526441335678, "rewards/cosine_scaled_reward/std": 0.3221423327922821, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1121.0, "completions/mean_length": 1843.328125, "completions/mean_terminated_length": 857.1818237304688, "completions/min_length": 608.0, "completions/min_terminated_length": 608.0, "epoch": 0.052571428571428575, "frac_reward_zero_std": 0.0, "grad_norm": 0.2569328844547272, "learning_rate": 9e-07, "loss": 0.0, "num_tokens": 5840757.0, "reward": -0.21247822046279907, "reward_std": 0.17188501358032227, "rewards/cosine_scaled_reward/mean": -0.21247822046279907, "rewards/cosine_scaled_reward/std": 0.183182492852211, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1772.984375, "completions/mean_terminated_length": 1012.6470336914062, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 0.053714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2800576090812683, "learning_rate": 9.2e-07, "loss": -0.0, "num_tokens": 5964628.0, "reward": -0.1755329668521881, "reward_std": 0.19662824273109436, "rewards/cosine_scaled_reward/mean": -0.1755329668521881, "rewards/cosine_scaled_reward/std": 0.3987559974193573, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1949.0, "completions/mean_length": 1787.046875, "completions/mean_terminated_length": 1120.1666259765625, "completions/min_length": 630.0, "completions/min_terminated_length": 630.0, "epoch": 0.054857142857142854, "frac_reward_zero_std": 0.0, "grad_norm": 0.2499135434627533, "learning_rate": 9.399999999999999e-07, "loss": -0.0, "num_tokens": 6089543.0, "reward": -0.07469595968723297, "reward_std": 0.2802818715572357, "rewards/cosine_scaled_reward/mean": -0.07469595968723297, "rewards/cosine_scaled_reward/std": 0.39331451058387756, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1818.0, "completions/mean_length": 1611.65625, "completions/mean_terminated_length": 1013.7037353515625, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.056, "frac_reward_zero_std": 0.0, "grad_norm": 0.2976716160774231, "learning_rate": 9.6e-07, "loss": -0.0, "num_tokens": 6202753.0, "reward": -0.14219576120376587, "reward_std": 0.3252427875995636, "rewards/cosine_scaled_reward/mean": -0.14219576120376587, "rewards/cosine_scaled_reward/std": 0.41946855187416077, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1916.0, "completions/mean_length": 1826.90625, "completions/mean_terminated_length": 761.6364135742188, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.05714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2344626933336258, "learning_rate": 9.8e-07, "loss": -0.0, "num_tokens": 6330491.0, "reward": -0.098542720079422, "reward_std": 0.20483215153217316, "rewards/cosine_scaled_reward/mean": -0.0985427126288414, "rewards/cosine_scaled_reward/std": 0.396296888589859, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1520.1875, "completions/mean_terminated_length": 922.0000610351562, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 0.05828571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.30348992347717285, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 6437991.0, "reward": -0.12996003031730652, "reward_std": 0.2803010940551758, "rewards/cosine_scaled_reward/mean": -0.12996003031730652, "rewards/cosine_scaled_reward/std": 0.3464147746562958, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1930.0, "completions/mean_length": 1726.71875, "completions/mean_terminated_length": 838.4705810546875, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.05942857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.2905585765838623, "learning_rate": 9.999890338174275e-07, "loss": -0.0, "num_tokens": 6559853.0, "reward": -0.2443142831325531, "reward_std": 0.21010473370552063, "rewards/cosine_scaled_reward/mean": -0.2443142831325531, "rewards/cosine_scaled_reward/std": 0.32864055037498474, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1585.0, "completions/mean_length": 1757.015625, "completions/mean_terminated_length": 952.5294189453125, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.060571428571428575, "frac_reward_zero_std": 0.0, "grad_norm": 0.2699633538722992, "learning_rate": 9.999561358041868e-07, "loss": 0.0, "num_tokens": 6683134.0, "reward": -0.18116676807403564, "reward_std": 0.2308851182460785, "rewards/cosine_scaled_reward/mean": -0.18116676807403564, "rewards/cosine_scaled_reward/std": 0.27486056089401245, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1850.65625, "completions/mean_terminated_length": 1206.0001220703125, "completions/min_length": 695.0, "completions/min_terminated_length": 695.0, "epoch": 0.061714285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.23383355140686035, "learning_rate": 9.999013075636804e-07, "loss": -0.0, "num_tokens": 6812720.0, "reward": -0.14257444441318512, "reward_std": 0.29668545722961426, "rewards/cosine_scaled_reward/mean": -0.14257442951202393, "rewards/cosine_scaled_reward/std": 0.4257228672504425, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1449.0, "completions/mean_length": 1754.640625, "completions/mean_terminated_length": 874.5625, "completions/min_length": 581.0, "completions/min_terminated_length": 581.0, "epoch": 0.06285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.23320119082927704, "learning_rate": 9.998245517681593e-07, "loss": -0.0, "num_tokens": 6935305.0, "reward": -0.14078931510448456, "reward_std": 0.17466726899147034, "rewards/cosine_scaled_reward/mean": -0.14078931510448456, "rewards/cosine_scaled_reward/std": 0.3331747353076935, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1918.0, "completions/mean_length": 1853.78125, "completions/mean_terminated_length": 918.0, "completions/min_length": 571.0, "completions/min_terminated_length": 571.0, "epoch": 0.064, "frac_reward_zero_std": 0.0, "grad_norm": 0.23405365645885468, "learning_rate": 9.997258721585931e-07, "loss": -0.0, "num_tokens": 7064907.0, "reward": -0.11611534655094147, "reward_std": 0.19285616278648376, "rewards/cosine_scaled_reward/mean": -0.11611534655094147, "rewards/cosine_scaled_reward/std": 0.47406119108200073, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1851.0, "completions/mean_length": 1971.640625, "completions/mean_terminated_length": 1437.125, "completions/min_length": 1009.0, "completions/min_terminated_length": 1009.0, "epoch": 0.06514285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.20449356734752655, "learning_rate": 9.996052735444862e-07, "loss": 0.0, "num_tokens": 7202660.0, "reward": -0.27627938985824585, "reward_std": 0.2080146074295044, "rewards/cosine_scaled_reward/mean": -0.27627938985824585, "rewards/cosine_scaled_reward/std": 0.2397139072418213, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1813.0, "completions/mean_length": 1678.09375, "completions/mean_terminated_length": 971.9091186523438, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "epoch": 0.06628571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.251164048910141, "learning_rate": 9.994627618036452e-07, "loss": -0.0, "num_tokens": 7320154.0, "reward": -0.1333095282316208, "reward_std": 0.27265745401382446, "rewards/cosine_scaled_reward/mean": -0.1333095282316208, "rewards/cosine_scaled_reward/std": 0.3821713328361511, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1453.0, "completions/mean_length": 1732.171875, "completions/mean_terminated_length": 859.0, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 0.06742857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.22478283941745758, "learning_rate": 9.992983438818915e-07, "loss": -0.0, "num_tokens": 7441477.0, "reward": -0.18278491497039795, "reward_std": 0.2154037207365036, "rewards/cosine_scaled_reward/mean": -0.18278491497039795, "rewards/cosine_scaled_reward/std": 0.3414745628833771, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 1798.375, "completions/mean_terminated_length": 982.9334106445312, "completions/min_length": 613.0, "completions/min_terminated_length": 613.0, "epoch": 0.06857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.22602440416812897, "learning_rate": 9.991120277927223e-07, "loss": -0.0, "num_tokens": 7567461.0, "reward": -0.265900194644928, "reward_std": 0.1530904918909073, "rewards/cosine_scaled_reward/mean": -0.265900194644928, "rewards/cosine_scaled_reward/std": 0.18254056572914124, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 1950.578125, "completions/mean_terminated_length": 1424.5, "completions/min_length": 808.0, "completions/min_terminated_length": 808.0, "epoch": 0.06971428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.22684067487716675, "learning_rate": 9.989038226169207e-07, "loss": -0.0, "num_tokens": 7703818.0, "reward": -0.05269922316074371, "reward_std": 0.3038993775844574, "rewards/cosine_scaled_reward/mean": -0.052699219435453415, "rewards/cosine_scaled_reward/std": 0.36445698142051697, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 1717.703125, "completions/mean_terminated_length": 1041.3809814453125, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 0.07085714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.23552638292312622, "learning_rate": 9.98673738502114e-07, "loss": 0.0, "num_tokens": 7823983.0, "reward": -0.07779724895954132, "reward_std": 0.2913648784160614, "rewards/cosine_scaled_reward/mean": -0.07779725641012192, "rewards/cosine_scaled_reward/std": 0.4099881649017334, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1751.0, "completions/mean_length": 1600.625, "completions/mean_terminated_length": 1180.3636474609375, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 0.072, "frac_reward_zero_std": 0.0, "grad_norm": 0.28230276703834534, "learning_rate": 9.98421786662277e-07, "loss": 0.0, "num_tokens": 7936679.0, "reward": -0.02632874622941017, "reward_std": 0.25066205859184265, "rewards/cosine_scaled_reward/mean": -0.02632874995470047, "rewards/cosine_scaled_reward/std": 0.4263686537742615, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1743.578125, "completions/mean_terminated_length": 1073.8499755859375, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.07314285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.266590416431427, "learning_rate": 9.981479793771866e-07, "loss": 0.0, "num_tokens": 8059220.0, "reward": -0.10920079052448273, "reward_std": 0.3089619576931, "rewards/cosine_scaled_reward/mean": -0.10920079052448273, "rewards/cosine_scaled_reward/std": 0.43342384696006775, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1127.0, "completions/mean_length": 1690.609375, "completions/mean_terminated_length": 618.4375, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.07428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.2891872525215149, "learning_rate": 9.97852329991824e-07, "loss": 0.0, "num_tokens": 8178123.0, "reward": -0.2091352641582489, "reward_std": 0.18792679905891418, "rewards/cosine_scaled_reward/mean": -0.2091352641582489, "rewards/cosine_scaled_reward/std": 0.40636762976646423, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1529.0, "completions/mean_length": 1320.453125, "completions/mean_terminated_length": 678.5, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.07542857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.30139341950416565, "learning_rate": 9.975348529157229e-07, "loss": -0.0, "num_tokens": 8272864.0, "reward": -0.012375831604003906, "reward_std": 0.2539718747138977, "rewards/cosine_scaled_reward/mean": -0.01237582415342331, "rewards/cosine_scaled_reward/std": 0.45652061700820923, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1368.0, "completions/mean_length": 2001.21875, "completions/mean_terminated_length": 1050.0, "completions/min_length": 817.0, "completions/min_terminated_length": 817.0, "epoch": 0.07657142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.21435414254665375, "learning_rate": 9.971955636222684e-07, "loss": 0.0, "num_tokens": 8411678.0, "reward": -0.27966073155403137, "reward_std": 0.14496129751205444, "rewards/cosine_scaled_reward/mean": -0.27966073155403137, "rewards/cosine_scaled_reward/std": 0.1733873188495636, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1963.0, "completions/mean_length": 1359.65625, "completions/mean_terminated_length": 789.3142700195312, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.07771428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.3244759440422058, "learning_rate": 9.968344786479415e-07, "loss": -0.0, "num_tokens": 8507952.0, "reward": -0.06231251358985901, "reward_std": 0.31347835063934326, "rewards/cosine_scaled_reward/mean": -0.062312521040439606, "rewards/cosine_scaled_reward/std": 0.40184450149536133, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1859.0, "completions/mean_length": 1572.78125, "completions/mean_terminated_length": 831.4400024414062, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.07885714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.3429071605205536, "learning_rate": 9.964516155915151e-07, "loss": 0.0, "num_tokens": 8618954.0, "reward": -0.24097035825252533, "reward_std": 0.22784993052482605, "rewards/cosine_scaled_reward/mean": -0.24097035825252533, "rewards/cosine_scaled_reward/std": 0.2594495415687561, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1921.0, "completions/mean_length": 1859.578125, "completions/mean_terminated_length": 951.727294921875, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 0.08, "frac_reward_zero_std": 0.0, "grad_norm": 0.221941277384758, "learning_rate": 9.960469931131936e-07, "loss": -0.0, "num_tokens": 8749423.0, "reward": -0.27105003595352173, "reward_std": 0.16835230588912964, "rewards/cosine_scaled_reward/mean": -0.27105003595352173, "rewards/cosine_scaled_reward/std": 0.21196867525577545, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1629.0, "completions/mean_length": 1668.265625, "completions/mean_terminated_length": 832.8500366210938, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.08114285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2909034192562103, "learning_rate": 9.956206309337066e-07, "loss": -0.0, "num_tokens": 8866912.0, "reward": -0.09497882425785065, "reward_std": 0.2813299000263214, "rewards/cosine_scaled_reward/mean": -0.09497880935668945, "rewards/cosine_scaled_reward/std": 0.4832696318626404, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1903.0, "completions/mean_length": 1697.671875, "completions/mean_terminated_length": 926.9500122070312, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.08228571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.3016415238380432, "learning_rate": 9.951725498333448e-07, "loss": -0.0, "num_tokens": 8985915.0, "reward": -0.22967606782913208, "reward_std": 0.18875859677791595, "rewards/cosine_scaled_reward/mean": -0.2296760529279709, "rewards/cosine_scaled_reward/std": 0.22012120485305786, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 2020.703125, "completions/mean_terminated_length": 1465.666748046875, "completions/min_length": 1143.0, "completions/min_terminated_length": 1143.0, "epoch": 0.08342857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.21586637198925018, "learning_rate": 9.947027716509488e-07, "loss": 0.0, "num_tokens": 9125968.0, "reward": -0.24284613132476807, "reward_std": 0.22862236201763153, "rewards/cosine_scaled_reward/mean": -0.24284613132476807, "rewards/cosine_scaled_reward/std": 0.24740919470787048, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 1975.09375, "completions/mean_terminated_length": 1381.4285888671875, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 0.08457142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.21680164337158203, "learning_rate": 9.942113192828444e-07, "loss": 0.0, "num_tokens": 9262302.0, "reward": -0.1543380469083786, "reward_std": 0.24083258211612701, "rewards/cosine_scaled_reward/mean": -0.1543380618095398, "rewards/cosine_scaled_reward/std": 0.3356986939907074, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1836.3125, "completions/mean_terminated_length": 1295.3333740234375, "completions/min_length": 653.0, "completions/min_terminated_length": 653.0, "epoch": 0.08571428571428572, "frac_reward_zero_std": 0.125, "grad_norm": 0.1845395565032959, "learning_rate": 9.93698216681727e-07, "loss": 0.0, "num_tokens": 9390786.0, "reward": -0.12792138755321503, "reward_std": 0.10224759578704834, "rewards/cosine_scaled_reward/mean": -0.12792138755321503, "rewards/cosine_scaled_reward/std": 0.4530969560146332, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1422.0, "completions/mean_length": 1764.109375, "completions/mean_terminated_length": 836.7333984375, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.08685714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.26535236835479736, "learning_rate": 9.931634888554935e-07, "loss": -0.0, "num_tokens": 9514089.0, "reward": -0.27717918157577515, "reward_std": 0.19932743906974792, "rewards/cosine_scaled_reward/mean": -0.27717918157577515, "rewards/cosine_scaled_reward/std": 0.20844916999340057, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1864.0, "completions/mean_length": 1945.109375, "completions/mean_terminated_length": 1224.875, "completions/min_length": 702.0, "completions/min_terminated_length": 702.0, "epoch": 0.088, "frac_reward_zero_std": 0.0, "grad_norm": 0.2047174870967865, "learning_rate": 9.926071618660237e-07, "loss": -0.0, "num_tokens": 9650152.0, "reward": -0.09873012453317642, "reward_std": 0.22244854271411896, "rewards/cosine_scaled_reward/mean": -0.09873010218143463, "rewards/cosine_scaled_reward/std": 0.34491515159606934, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1976.0, "completions/mean_length": 1928.703125, "completions/mean_terminated_length": 1199.6666259765625, "completions/min_length": 722.0, "completions/min_terminated_length": 722.0, "epoch": 0.08914285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.22559019923210144, "learning_rate": 9.9202926282791e-07, "loss": 0.0, "num_tokens": 9784309.0, "reward": -0.09572747349739075, "reward_std": 0.23068635165691376, "rewards/cosine_scaled_reward/mean": -0.09572747349739075, "rewards/cosine_scaled_reward/std": 0.38660773634910583, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1864.0, "completions/mean_length": 1508.40625, "completions/mean_terminated_length": 814.6428833007812, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.09028571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.24668477475643158, "learning_rate": 9.91429819907136e-07, "loss": 0.0, "num_tokens": 9890943.0, "reward": -0.1618795394897461, "reward_std": 0.22540031373500824, "rewards/cosine_scaled_reward/mean": -0.1618795245885849, "rewards/cosine_scaled_reward/std": 0.3233039081096649, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 2012.671875, "completions/mean_terminated_length": 1725.0001220703125, "completions/min_length": 1283.0, "completions/min_terminated_length": 1283.0, "epoch": 0.09142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.24282054603099823, "learning_rate": 9.908088623197048e-07, "loss": -0.0, "num_tokens": 10030146.0, "reward": -0.25591158866882324, "reward_std": 0.15104801952838898, "rewards/cosine_scaled_reward/mean": -0.25591158866882324, "rewards/cosine_scaled_reward/std": 0.18741995096206665, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1878.0, "completions/mean_length": 1821.921875, "completions/mean_terminated_length": 935.0000610351562, "completions/min_length": 580.0, "completions/min_terminated_length": 580.0, "epoch": 0.09257142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.3027254641056061, "learning_rate": 9.901664203302124e-07, "loss": 0.0, "num_tokens": 10158021.0, "reward": -0.15331333875656128, "reward_std": 0.18424856662750244, "rewards/cosine_scaled_reward/mean": -0.15331333875656128, "rewards/cosine_scaled_reward/std": 0.24023762345314026, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1852.0, "completions/mean_length": 1734.28125, "completions/mean_terminated_length": 991.26318359375, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 0.09371428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.2466808557510376, "learning_rate": 9.895025252503755e-07, "loss": -0.0, "num_tokens": 10279343.0, "reward": -0.07192108780145645, "reward_std": 0.2587333917617798, "rewards/cosine_scaled_reward/mean": -0.07192108780145645, "rewards/cosine_scaled_reward/std": 0.46087121963500977, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1571.0, "completions/mean_length": 1688.78125, "completions/mean_terminated_length": 953.2380981445312, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 0.09485714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.2600877285003662, "learning_rate": 9.888172094375033e-07, "loss": 0.0, "num_tokens": 10398513.0, "reward": -0.1718086451292038, "reward_std": 0.2223512828350067, "rewards/cosine_scaled_reward/mean": -0.1718086451292038, "rewards/cosine_scaled_reward/std": 0.2828122675418854, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1666.0, "completions/mean_length": 1838.203125, "completions/mean_terminated_length": 705.2999877929688, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.096, "frac_reward_zero_std": 0.0, "grad_norm": 0.22531215846538544, "learning_rate": 9.881105062929221e-07, "loss": 0.0, "num_tokens": 10526854.0, "reward": -0.2154863476753235, "reward_std": 0.261901319026947, "rewards/cosine_scaled_reward/mean": -0.2154863476753235, "rewards/cosine_scaled_reward/std": 0.29268571734428406, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1926.34375, "completions/mean_terminated_length": 1399.166748046875, "completions/min_length": 880.0, "completions/min_terminated_length": 880.0, "epoch": 0.09714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.19031891226768494, "learning_rate": 9.873824502603459e-07, "loss": -0.0, "num_tokens": 10660460.0, "reward": -0.21009978652000427, "reward_std": 0.19575349986553192, "rewards/cosine_scaled_reward/mean": -0.21009978652000427, "rewards/cosine_scaled_reward/std": 0.2456056773662567, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1920.0, "completions/mean_length": 1713.21875, "completions/mean_terminated_length": 787.6470336914062, "completions/min_length": 547.0, "completions/min_terminated_length": 547.0, "epoch": 0.09828571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.258359432220459, "learning_rate": 9.866330768241983e-07, "loss": -0.0, "num_tokens": 10780962.0, "reward": -0.1955144852399826, "reward_std": 0.24323132634162903, "rewards/cosine_scaled_reward/mean": -0.1955144852399826, "rewards/cosine_scaled_reward/std": 0.3071554899215698, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1815.0, "completions/mean_length": 1656.0, "completions/mean_terminated_length": 1002.6666870117188, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 0.09942857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.2636864185333252, "learning_rate": 9.85862422507884e-07, "loss": 0.0, "num_tokens": 10897066.0, "reward": -0.1988150179386139, "reward_std": 0.24088150262832642, "rewards/cosine_scaled_reward/mean": -0.1988150179386139, "rewards/cosine_scaled_reward/std": 0.2925129532814026, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1802.0, "completions/mean_length": 1755.234375, "completions/mean_terminated_length": 1061.8421630859375, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 0.10057142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.29112017154693604, "learning_rate": 9.850705248720068e-07, "loss": 0.0, "num_tokens": 11019913.0, "reward": -0.02967459335923195, "reward_std": 0.3240855932235718, "rewards/cosine_scaled_reward/mean": -0.029674597084522247, "rewards/cosine_scaled_reward/std": 0.3718070983886719, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1768.0, "completions/mean_length": 1752.78125, "completions/mean_terminated_length": 1148.2857666015625, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "epoch": 0.10171428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.2357943207025528, "learning_rate": 9.8425742251254e-07, "loss": -0.0, "num_tokens": 11143091.0, "reward": -0.1188301220536232, "reward_std": 0.296513170003891, "rewards/cosine_scaled_reward/mean": -0.1188301220536232, "rewards/cosine_scaled_reward/std": 0.3878798484802246, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1693.0, "completions/mean_length": 1633.84375, "completions/mean_terminated_length": 1101.357177734375, "completions/min_length": 568.0, "completions/min_terminated_length": 568.0, "epoch": 0.10285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.32384219765663147, "learning_rate": 9.83423155058946e-07, "loss": -0.0, "num_tokens": 11257657.0, "reward": -0.22837099432945251, "reward_std": 0.18625205755233765, "rewards/cosine_scaled_reward/mean": -0.22837099432945251, "rewards/cosine_scaled_reward/std": 0.23636196553707123, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1833.0, "completions/mean_length": 1847.21875, "completions/mean_terminated_length": 1244.875, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "epoch": 0.104, "frac_reward_zero_std": 0.0, "grad_norm": 0.24563109874725342, "learning_rate": 9.825677631722435e-07, "loss": -0.0, "num_tokens": 11386447.0, "reward": -0.11780542880296707, "reward_std": 0.3100074827671051, "rewards/cosine_scaled_reward/mean": -0.11780542135238647, "rewards/cosine_scaled_reward/std": 0.39149248600006104, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1462.0, "completions/mean_length": 1595.125, "completions/mean_terminated_length": 888.6399536132812, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 0.10514285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.2472057044506073, "learning_rate": 9.816912885430258e-07, "loss": -0.0, "num_tokens": 11498527.0, "reward": -0.2128506749868393, "reward_std": 0.20926561951637268, "rewards/cosine_scaled_reward/mean": -0.2128506898880005, "rewards/cosine_scaled_reward/std": 0.23348061740398407, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1221.0, "completions/mean_length": 1979.953125, "completions/mean_terminated_length": 959.25, "completions/min_length": 822.0, "completions/min_terminated_length": 822.0, "epoch": 0.10628571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.2550150454044342, "learning_rate": 9.807937738894303e-07, "loss": -0.0, "num_tokens": 11636588.0, "reward": -0.2922024428844452, "reward_std": 0.1515069603919983, "rewards/cosine_scaled_reward/mean": -0.2922024726867676, "rewards/cosine_scaled_reward/std": 0.18899379670619965, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1544.0, "completions/mean_length": 1830.609375, "completions/mean_terminated_length": 977.769287109375, "completions/min_length": 533.0, "completions/min_terminated_length": 533.0, "epoch": 0.10742857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.27164825797080994, "learning_rate": 9.798752629550546e-07, "loss": -0.0, "num_tokens": 11763515.0, "reward": -0.18001651763916016, "reward_std": 0.18973413109779358, "rewards/cosine_scaled_reward/mean": -0.18001650273799896, "rewards/cosine_scaled_reward/std": 0.4316568076610565, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 2004.671875, "completions/mean_terminated_length": 1493.4000244140625, "completions/min_length": 960.0, "completions/min_terminated_length": 960.0, "epoch": 0.10857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.20861269533634186, "learning_rate": 9.78935800506826e-07, "loss": -0.0, "num_tokens": 11902342.0, "reward": -0.24148261547088623, "reward_std": 0.18629083037376404, "rewards/cosine_scaled_reward/mean": -0.24148263037204742, "rewards/cosine_scaled_reward/std": 0.23122739791870117, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1298.0, "completions/mean_length": 1703.359375, "completions/mean_terminated_length": 945.1500244140625, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 0.10971428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.2585296928882599, "learning_rate": 9.779754323328192e-07, "loss": 0.0, "num_tokens": 12022493.0, "reward": -0.11465626955032349, "reward_std": 0.24939197301864624, "rewards/cosine_scaled_reward/mean": -0.11465626955032349, "rewards/cosine_scaled_reward/std": 0.4384477138519287, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1809.0, "completions/mean_length": 1819.921875, "completions/mean_terminated_length": 1135.6875, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.11085714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.3019813299179077, "learning_rate": 9.769942052400235e-07, "loss": 0.0, "num_tokens": 12149232.0, "reward": -0.18846748769283295, "reward_std": 0.2666187584400177, "rewards/cosine_scaled_reward/mean": -0.18846750259399414, "rewards/cosine_scaled_reward/std": 0.3043021559715271, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 1677.296875, "completions/mean_terminated_length": 1099.0, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.112, "frac_reward_zero_std": 0.0, "grad_norm": 0.2722402513027191, "learning_rate": 9.759921670520634e-07, "loss": 0.0, "num_tokens": 12267643.0, "reward": -0.09557384252548218, "reward_std": 0.2643275558948517, "rewards/cosine_scaled_reward/mean": -0.09557383507490158, "rewards/cosine_scaled_reward/std": 0.3361329138278961, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1141.0, "completions/mean_length": 1716.59375, "completions/mean_terminated_length": 634.0000610351562, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.11314285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2789485454559326, "learning_rate": 9.749693666068663e-07, "loss": -0.0, "num_tokens": 12388673.0, "reward": -0.11132554709911346, "reward_std": 0.1736970841884613, "rewards/cosine_scaled_reward/mean": -0.11132554709911346, "rewards/cosine_scaled_reward/std": 0.38663193583488464, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1906.0, "completions/mean_length": 1627.78125, "completions/mean_terminated_length": 927.4166870117188, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.11428571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.2479974329471588, "learning_rate": 9.739258537542835e-07, "loss": 0.0, "num_tokens": 12502563.0, "reward": 0.05247430503368378, "reward_std": 0.2633323669433594, "rewards/cosine_scaled_reward/mean": 0.05247429758310318, "rewards/cosine_scaled_reward/std": 0.44700634479522705, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1780.0, "completions/mean_length": 1684.75, "completions/mean_terminated_length": 1037.2174072265625, "completions/min_length": 555.0, "completions/min_terminated_length": 555.0, "epoch": 0.11542857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.2880499064922333, "learning_rate": 9.728616793536587e-07, "loss": -0.0, "num_tokens": 12621819.0, "reward": -0.09590694308280945, "reward_std": 0.21176990866661072, "rewards/cosine_scaled_reward/mean": -0.09590694308280945, "rewards/cosine_scaled_reward/std": 0.426421195268631, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1585.0, "completions/mean_length": 1361.265625, "completions/mean_terminated_length": 860.1351318359375, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 0.11657142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.2874862551689148, "learning_rate": 9.717768952713511e-07, "loss": -0.0, "num_tokens": 12719092.0, "reward": -0.19330359995365143, "reward_std": 0.1932550072669983, "rewards/cosine_scaled_reward/mean": -0.19330358505249023, "rewards/cosine_scaled_reward/std": 0.34549427032470703, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1384.0, "completions/mean_length": 1687.90625, "completions/mean_terminated_length": 607.625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.11771428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.29745906591415405, "learning_rate": 9.706715543782064e-07, "loss": -0.0, "num_tokens": 12837470.0, "reward": -0.2588111162185669, "reward_std": 0.26013171672821045, "rewards/cosine_scaled_reward/mean": -0.2588111162185669, "rewards/cosine_scaled_reward/std": 0.32377612590789795, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1627.0, "completions/mean_length": 1679.59375, "completions/mean_terminated_length": 925.2380981445312, "completions/min_length": 585.0, "completions/min_terminated_length": 585.0, "epoch": 0.11885714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.27166086435317993, "learning_rate": 9.695457105469804e-07, "loss": -0.0, "num_tokens": 12955428.0, "reward": -0.17275363206863403, "reward_std": 0.20137225091457367, "rewards/cosine_scaled_reward/mean": -0.17275363206863403, "rewards/cosine_scaled_reward/std": 0.2731510400772095, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1568.203125, "completions/mean_terminated_length": 819.719970703125, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "epoch": 0.12, "frac_reward_zero_std": 0.0, "grad_norm": 0.26759475469589233, "learning_rate": 9.683994186497132e-07, "loss": -0.0, "num_tokens": 13067081.0, "reward": -0.1266355961561203, "reward_std": 0.3027850389480591, "rewards/cosine_scaled_reward/mean": -0.1266355961561203, "rewards/cosine_scaled_reward/std": 0.4276663362979889, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1958.0, "completions/mean_length": 1432.09375, "completions/mean_terminated_length": 816.1875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.12114285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.2912415862083435, "learning_rate": 9.672327345550543e-07, "loss": 0.0, "num_tokens": 13169567.0, "reward": 0.052130524069070816, "reward_std": 0.30294427275657654, "rewards/cosine_scaled_reward/mean": 0.052130527794361115, "rewards/cosine_scaled_reward/std": 0.43769362568855286, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1721.28125, "completions/mean_terminated_length": 1097.5455322265625, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.12228571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.26628872752189636, "learning_rate": 9.66045715125541e-07, "loss": 0.0, "num_tokens": 13290881.0, "reward": -0.18292994797229767, "reward_std": 0.25176504254341125, "rewards/cosine_scaled_reward/mean": -0.18292994797229767, "rewards/cosine_scaled_reward/std": 0.33385229110717773, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1916.0, "completions/mean_length": 1518.6875, "completions/mean_terminated_length": 989.375, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 0.12342857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.25796031951904297, "learning_rate": 9.648384182148252e-07, "loss": -0.0, "num_tokens": 13398437.0, "reward": -0.17732736468315125, "reward_std": 0.32095974683761597, "rewards/cosine_scaled_reward/mean": -0.17732736468315125, "rewards/cosine_scaled_reward/std": 0.3682377338409424, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1963.0, "completions/mean_length": 1871.890625, "completions/mean_terminated_length": 1108.75, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "epoch": 0.12457142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2274676412343979, "learning_rate": 9.636109026648554e-07, "loss": 0.0, "num_tokens": 13529486.0, "reward": -0.13115660846233368, "reward_std": 0.15383467078208923, "rewards/cosine_scaled_reward/mean": -0.13115662336349487, "rewards/cosine_scaled_reward/std": 0.4183727204799652, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1941.0, "completions/mean_length": 1584.125, "completions/mean_terminated_length": 811.0, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.12571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.2768951952457428, "learning_rate": 9.623632283030077e-07, "loss": -0.0, "num_tokens": 13641646.0, "reward": -0.27792292833328247, "reward_std": 0.18945851922035217, "rewards/cosine_scaled_reward/mean": -0.27792292833328247, "rewards/cosine_scaled_reward/std": 0.20238204300403595, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1709.1875, "completions/mean_terminated_length": 1062.3636474609375, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 0.12685714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.24532362818717957, "learning_rate": 9.610954559391704e-07, "loss": -0.0, "num_tokens": 13761154.0, "reward": -0.0890636295080185, "reward_std": 0.33067381381988525, "rewards/cosine_scaled_reward/mean": -0.0890636295080185, "rewards/cosine_scaled_reward/std": 0.40376362204551697, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 1895.234375, "completions/mean_terminated_length": 1436.9375, "completions/min_length": 783.0, "completions/min_terminated_length": 783.0, "epoch": 0.128, "frac_reward_zero_std": 0.0, "grad_norm": 0.22462251782417297, "learning_rate": 9.598076473627796e-07, "loss": -0.0, "num_tokens": 13893545.0, "reward": -0.1325383186340332, "reward_std": 0.330952525138855, "rewards/cosine_scaled_reward/mean": -0.1325383186340332, "rewards/cosine_scaled_reward/std": 0.4280668795108795, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1695.0, "completions/mean_length": 1606.890625, "completions/mean_terminated_length": 871.7083740234375, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.12914285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.3009057939052582, "learning_rate": 9.58499865339809e-07, "loss": 0.0, "num_tokens": 14006682.0, "reward": -0.05043189600110054, "reward_std": 0.300018846988678, "rewards/cosine_scaled_reward/mean": -0.050431910902261734, "rewards/cosine_scaled_reward/std": 0.43634143471717834, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1898.0, "completions/mean_length": 1562.515625, "completions/mean_terminated_length": 753.375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.13028571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.37847185134887695, "learning_rate": 9.571721736097088e-07, "loss": 0.0, "num_tokens": 14116531.0, "reward": -0.27539706230163574, "reward_std": 0.18451666831970215, "rewards/cosine_scaled_reward/mean": -0.27539709210395813, "rewards/cosine_scaled_reward/std": 0.23580753803253174, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1656.484375, "completions/mean_terminated_length": 958.5652465820312, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.13142857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.26879096031188965, "learning_rate": 9.55824636882301e-07, "loss": 0.0, "num_tokens": 14233762.0, "reward": -0.058682698756456375, "reward_std": 0.2945008873939514, "rewards/cosine_scaled_reward/mean": -0.05868269130587578, "rewards/cosine_scaled_reward/std": 0.40092962980270386, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1413.0, "completions/mean_length": 1924.84375, "completions/mean_terminated_length": 734.3333740234375, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.13257142857142856, "frac_reward_zero_std": 0.0, "grad_norm": 0.2654048800468445, "learning_rate": 9.54457320834625e-07, "loss": 0.0, "num_tokens": 14368336.0, "reward": -0.2030428647994995, "reward_std": 0.18692326545715332, "rewards/cosine_scaled_reward/mean": -0.2030428647994995, "rewards/cosine_scaled_reward/std": 0.2246093899011612, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1420.0, "completions/mean_length": 1785.484375, "completions/mean_terminated_length": 997.9375, "completions/min_length": 549.0, "completions/min_terminated_length": 549.0, "epoch": 0.1337142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.26004910469055176, "learning_rate": 9.530702921077358e-07, "loss": -0.0, "num_tokens": 14493631.0, "reward": -0.19770082831382751, "reward_std": 0.25534579157829285, "rewards/cosine_scaled_reward/mean": -0.19770082831382751, "rewards/cosine_scaled_reward/std": 0.33773326873779297, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1906.0, "completions/mean_length": 1802.84375, "completions/mean_terminated_length": 1067.375, "completions/min_length": 554.0, "completions/min_terminated_length": 554.0, "epoch": 0.13485714285714287, "frac_reward_zero_std": 0.0, "grad_norm": 0.22992977499961853, "learning_rate": 9.516636183034564e-07, "loss": 0.0, "num_tokens": 14619549.0, "reward": -0.011579632759094238, "reward_std": 0.3697226643562317, "rewards/cosine_scaled_reward/mean": -0.011579625308513641, "rewards/cosine_scaled_reward/std": 0.4647332727909088, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 1343.578125, "completions/mean_terminated_length": 920.9249877929688, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.136, "frac_reward_zero_std": 0.0, "grad_norm": 0.3279743492603302, "learning_rate": 9.502373679810839e-07, "loss": -0.0, "num_tokens": 14715946.0, "reward": -0.0004618987441062927, "reward_std": 0.27856603264808655, "rewards/cosine_scaled_reward/mean": -0.0004618987441062927, "rewards/cosine_scaled_reward/std": 0.45174649357795715, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1546.0, "completions/mean_length": 1286.75, "completions/mean_terminated_length": 859.707275390625, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.13714285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.3185117244720459, "learning_rate": 9.487916106540465e-07, "loss": -0.0, "num_tokens": 14808754.0, "reward": -0.06128609925508499, "reward_std": 0.3139324188232422, "rewards/cosine_scaled_reward/mean": -0.06128609925508499, "rewards/cosine_scaled_reward/std": 0.46217504143714905, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1103.90625, "completions/mean_terminated_length": 789.2083740234375, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.1382857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.3791055381298065, "learning_rate": 9.473264167865171e-07, "loss": 0.0, "num_tokens": 14889100.0, "reward": -0.012373358011245728, "reward_std": 0.3332873284816742, "rewards/cosine_scaled_reward/mean": -0.012373358011245728, "rewards/cosine_scaled_reward/std": 0.4969451427459717, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1765.0625, "completions/mean_terminated_length": 1042.0, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.13942857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.27713218331336975, "learning_rate": 9.458418577899774e-07, "loss": -0.0, "num_tokens": 15013624.0, "reward": -0.1387348771095276, "reward_std": 0.25947195291519165, "rewards/cosine_scaled_reward/mean": -0.1387348771095276, "rewards/cosine_scaled_reward/std": 0.3304338753223419, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1576.28125, "completions/mean_terminated_length": 1006.9655151367188, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 0.14057142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2664856016635895, "learning_rate": 9.443380060197385e-07, "loss": 0.0, "num_tokens": 15124738.0, "reward": -0.18317654728889465, "reward_std": 0.16592136025428772, "rewards/cosine_scaled_reward/mean": -0.18317654728889465, "rewards/cosine_scaled_reward/std": 0.33475980162620544, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1814.0, "completions/mean_length": 1395.78125, "completions/mean_terminated_length": 888.5, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 0.1417142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2889535427093506, "learning_rate": 9.428149347714143e-07, "loss": 0.0, "num_tokens": 15225020.0, "reward": -0.12295320630073547, "reward_std": 0.30637824535369873, "rewards/cosine_scaled_reward/mean": -0.12295320630073547, "rewards/cosine_scaled_reward/std": 0.4125574827194214, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1622.8125, "completions/mean_terminated_length": 914.1666870117188, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.14285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.24003510177135468, "learning_rate": 9.412727182773486e-07, "loss": -0.0, "num_tokens": 15339808.0, "reward": -0.06917156279087067, "reward_std": 0.19467812776565552, "rewards/cosine_scaled_reward/mean": -0.06917153298854828, "rewards/cosine_scaled_reward/std": 0.44139373302459717, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1735.96875, "completions/mean_terminated_length": 1097.047607421875, "completions/min_length": 610.0, "completions/min_terminated_length": 610.0, "epoch": 0.144, "frac_reward_zero_std": 0.0, "grad_norm": 0.23693455755710602, "learning_rate": 9.397114317029974e-07, "loss": -0.0, "num_tokens": 15462206.0, "reward": -0.15823431313037872, "reward_std": 0.26196378469467163, "rewards/cosine_scaled_reward/mean": -0.15823431313037872, "rewards/cosine_scaled_reward/std": 0.3110467195510864, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 1729.421875, "completions/mean_terminated_length": 1161.521728515625, "completions/min_length": 655.0, "completions/min_terminated_length": 655.0, "epoch": 0.14514285714285713, "frac_reward_zero_std": 0.0, "grad_norm": 0.23715488612651825, "learning_rate": 9.381311511432658e-07, "loss": -0.0, "num_tokens": 15583985.0, "reward": -0.2520313262939453, "reward_std": 0.1912405639886856, "rewards/cosine_scaled_reward/mean": -0.2520313262939453, "rewards/cosine_scaled_reward/std": 0.276276558637619, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1614.125, "completions/mean_terminated_length": 1090.4827880859375, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.1462857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.25245338678359985, "learning_rate": 9.36531953618799e-07, "loss": -0.0, "num_tokens": 15697641.0, "reward": 0.029929369688034058, "reward_std": 0.2960119843482971, "rewards/cosine_scaled_reward/mean": 0.029929369688034058, "rewards/cosine_scaled_reward/std": 0.40772902965545654, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1810.0, "completions/mean_length": 1892.953125, "completions/mean_terminated_length": 945.4444580078125, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 0.14742857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.22934643924236298, "learning_rate": 9.34913917072228e-07, "loss": -0.0, "num_tokens": 15829494.0, "reward": -0.27538371086120605, "reward_std": 0.2161153256893158, "rewards/cosine_scaled_reward/mean": -0.27538371086120605, "rewards/cosine_scaled_reward/std": 0.25140947103500366, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1384.0, "completions/mean_length": 1631.5625, "completions/mean_terminated_length": 889.2174072265625, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "epoch": 0.14857142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.2312338650226593, "learning_rate": 9.332771203643714e-07, "loss": -0.0, "num_tokens": 15944418.0, "reward": -0.16326984763145447, "reward_std": 0.22974258661270142, "rewards/cosine_scaled_reward/mean": -0.16326983273029327, "rewards/cosine_scaled_reward/std": 0.3127349317073822, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1709.0, "completions/mean_length": 1549.453125, "completions/mean_terminated_length": 820.8077392578125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.14971428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.28737154603004456, "learning_rate": 9.316216432703916e-07, "loss": 0.0, "num_tokens": 16053319.0, "reward": -0.060378547757864, "reward_std": 0.23251818120479584, "rewards/cosine_scaled_reward/mean": -0.060378558933734894, "rewards/cosine_scaled_reward/std": 0.4743967354297638, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1667.0, "completions/mean_length": 1536.859375, "completions/mean_terminated_length": 957.5667114257812, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.15085714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.24873872101306915, "learning_rate": 9.299475664759068e-07, "loss": 0.0, "num_tokens": 16162742.0, "reward": -0.10933490097522736, "reward_std": 0.2869688868522644, "rewards/cosine_scaled_reward/mean": -0.10933491587638855, "rewards/cosine_scaled_reward/std": 0.45436573028564453, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 1817.453125, "completions/mean_terminated_length": 1125.8125, "completions/min_length": 526.0, "completions/min_terminated_length": 526.0, "epoch": 0.152, "frac_reward_zero_std": 0.0, "grad_norm": 0.2753625810146332, "learning_rate": 9.282549715730579e-07, "loss": 0.0, "num_tokens": 16290283.0, "reward": -0.1931842416524887, "reward_std": 0.2315790057182312, "rewards/cosine_scaled_reward/mean": -0.1931842565536499, "rewards/cosine_scaled_reward/std": 0.26366862654685974, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1907.0, "completions/mean_length": 1685.390625, "completions/mean_terminated_length": 1119.719970703125, "completions/min_length": 660.0, "completions/min_terminated_length": 660.0, "epoch": 0.15314285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.25077056884765625, "learning_rate": 9.265439410565328e-07, "loss": -0.0, "num_tokens": 16408716.0, "reward": -0.1305551677942276, "reward_std": 0.15626969933509827, "rewards/cosine_scaled_reward/mean": -0.1305551677942276, "rewards/cosine_scaled_reward/std": 0.35703787207603455, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1869.0, "completions/mean_length": 1111.578125, "completions/mean_terminated_length": 654.2557983398438, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.15428571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.3456169366836548, "learning_rate": 9.248145583195447e-07, "loss": -0.0, "num_tokens": 16490329.0, "reward": 0.08614158630371094, "reward_std": 0.3152117133140564, "rewards/cosine_scaled_reward/mean": 0.08614158630371094, "rewards/cosine_scaled_reward/std": 0.5073397159576416, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1600.0, "completions/mean_length": 1485.703125, "completions/mean_terminated_length": 848.433349609375, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.15542857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.28029024600982666, "learning_rate": 9.230669076497687e-07, "loss": -0.0, "num_tokens": 16596086.0, "reward": 0.01799224689602852, "reward_std": 0.28087177872657776, "rewards/cosine_scaled_reward/mean": 0.017992250621318817, "rewards/cosine_scaled_reward/std": 0.5039587020874023, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1632.0, "completions/mean_length": 1765.421875, "completions/mean_terminated_length": 1043.27783203125, "completions/min_length": 659.0, "completions/min_terminated_length": 659.0, "epoch": 0.15657142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.21782204508781433, "learning_rate": 9.213010742252327e-07, "loss": 0.0, "num_tokens": 16719681.0, "reward": -0.2635670304298401, "reward_std": 0.16446365416049957, "rewards/cosine_scaled_reward/mean": -0.2635670304298401, "rewards/cosine_scaled_reward/std": 0.1840340793132782, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1888.0, "completions/mean_length": 1529.734375, "completions/mean_terminated_length": 1072.441162109375, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.15771428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.26588714122772217, "learning_rate": 9.195171441101668e-07, "loss": -0.0, "num_tokens": 16828896.0, "reward": -0.08665560930967331, "reward_std": 0.23063711822032928, "rewards/cosine_scaled_reward/mean": -0.08665560930967331, "rewards/cosine_scaled_reward/std": 0.44113171100616455, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1667.8125, "completions/mean_terminated_length": 990.0869750976562, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.15885714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.2917172610759735, "learning_rate": 9.177152042508077e-07, "loss": -0.0, "num_tokens": 16946276.0, "reward": -0.19403964281082153, "reward_std": 0.2673150300979614, "rewards/cosine_scaled_reward/mean": -0.19403962790966034, "rewards/cosine_scaled_reward/std": 0.32773110270500183, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 1412.796875, "completions/mean_terminated_length": 949.270263671875, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.16, "frac_reward_zero_std": 0.125, "grad_norm": 0.28324976563453674, "learning_rate": 9.158953424711624e-07, "loss": -0.0, "num_tokens": 17046919.0, "reward": -0.13130062818527222, "reward_std": 0.13907812535762787, "rewards/cosine_scaled_reward/mean": -0.13130061328411102, "rewards/cosine_scaled_reward/std": 0.46400320529937744, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1615.0, "completions/mean_length": 1272.25, "completions/mean_terminated_length": 893.3953247070312, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 0.16114285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.28660058975219727, "learning_rate": 9.140576474687263e-07, "loss": 0.0, "num_tokens": 17138903.0, "reward": -0.044462256133556366, "reward_std": 0.3412697911262512, "rewards/cosine_scaled_reward/mean": -0.04446224868297577, "rewards/cosine_scaled_reward/std": 0.4661441445350647, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1959.0, "completions/mean_length": 1662.734375, "completions/mean_terminated_length": 1226.10009765625, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.16228571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.3415294587612152, "learning_rate": 9.122022088101613e-07, "loss": -0.0, "num_tokens": 17255822.0, "reward": -0.15457069873809814, "reward_std": 0.31260305643081665, "rewards/cosine_scaled_reward/mean": -0.15457069873809814, "rewards/cosine_scaled_reward/std": 0.3450033664703369, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1791.0, "completions/mean_length": 1441.203125, "completions/mean_terminated_length": 998.4054565429688, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.16342857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.2900330424308777, "learning_rate": 9.103291169269299e-07, "loss": 0.0, "num_tokens": 17358875.0, "reward": -0.1936979442834854, "reward_std": 0.26940327882766724, "rewards/cosine_scaled_reward/mean": -0.1936979442834854, "rewards/cosine_scaled_reward/std": 0.31407564878463745, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1560.703125, "completions/mean_terminated_length": 1008.4334106445312, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.16457142857142856, "frac_reward_zero_std": 0.0, "grad_norm": 0.29284507036209106, "learning_rate": 9.084384631108882e-07, "loss": -0.0, "num_tokens": 17470248.0, "reward": -0.14136260747909546, "reward_std": 0.2985552251338959, "rewards/cosine_scaled_reward/mean": -0.14136262238025665, "rewards/cosine_scaled_reward/std": 0.4261241853237152, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1864.0, "completions/mean_length": 1226.0, "completions/mean_terminated_length": 852.3636474609375, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.1657142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.30853384733200073, "learning_rate": 9.065303395098358e-07, "loss": -0.0, "num_tokens": 17558656.0, "reward": -0.011180020868778229, "reward_std": 0.3104313910007477, "rewards/cosine_scaled_reward/mean": -0.011180016212165356, "rewards/cosine_scaled_reward/std": 0.502927303314209, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1440.0, "completions/mean_length": 1468.8125, "completions/mean_terminated_length": 889.625, "completions/min_length": 537.0, "completions/min_terminated_length": 537.0, "epoch": 0.16685714285714287, "frac_reward_zero_std": 0.0, "grad_norm": 0.25645971298217773, "learning_rate": 9.046048391230247e-07, "loss": -0.0, "num_tokens": 17663276.0, "reward": -0.1956520974636078, "reward_std": 0.24750414490699768, "rewards/cosine_scaled_reward/mean": -0.1956520974636078, "rewards/cosine_scaled_reward/std": 0.30754002928733826, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1517.84375, "completions/mean_terminated_length": 1078.5714111328125, "completions/min_length": 615.0, "completions/min_terminated_length": 615.0, "epoch": 0.168, "frac_reward_zero_std": 0.0, "grad_norm": 0.28331542015075684, "learning_rate": 9.026620557966279e-07, "loss": -0.0, "num_tokens": 17771202.0, "reward": -0.14546620845794678, "reward_std": 0.307411253452301, "rewards/cosine_scaled_reward/mean": -0.14546619355678558, "rewards/cosine_scaled_reward/std": 0.3964070975780487, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1735.0, "completions/mean_length": 1319.75, "completions/mean_terminated_length": 882.7999877929688, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.16914285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.24973155558109283, "learning_rate": 9.007020842191634e-07, "loss": -0.0, "num_tokens": 17866850.0, "reward": -0.05917578190565109, "reward_std": 0.24221420288085938, "rewards/cosine_scaled_reward/mean": -0.05917578190565109, "rewards/cosine_scaled_reward/std": 0.39783161878585815, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1718.0, "completions/mean_length": 1641.578125, "completions/mean_terminated_length": 1007.5599975585938, "completions/min_length": 624.0, "completions/min_terminated_length": 624.0, "epoch": 0.1702857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.23923377692699432, "learning_rate": 8.987250199168808e-07, "loss": 0.0, "num_tokens": 17983807.0, "reward": -0.16958971321582794, "reward_std": 0.3115168809890747, "rewards/cosine_scaled_reward/mean": -0.16958969831466675, "rewards/cosine_scaled_reward/std": 0.4009650945663452, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1699.0, "completions/mean_length": 1294.734375, "completions/mean_terminated_length": 976.6889038085938, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.17142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.2846779525279999, "learning_rate": 8.967309592491052e-07, "loss": 0.0, "num_tokens": 18077174.0, "reward": -0.16757264733314514, "reward_std": 0.26536184549331665, "rewards/cosine_scaled_reward/mean": -0.16757264733314514, "rewards/cosine_scaled_reward/std": 0.32911255955696106, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1840.0, "completions/mean_length": 1461.640625, "completions/mean_terminated_length": 1005.5833129882812, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 0.17257142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.27918684482574463, "learning_rate": 8.9471999940354e-07, "loss": 0.0, "num_tokens": 18181399.0, "reward": -0.04434409737586975, "reward_std": 0.21946659684181213, "rewards/cosine_scaled_reward/mean": -0.04434409365057945, "rewards/cosine_scaled_reward/std": 0.385776162147522, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1827.0, "completions/mean_length": 1685.984375, "completions/mean_terminated_length": 944.7142944335938, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.1737142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.31312429904937744, "learning_rate": 8.926922383915315e-07, "loss": -0.0, "num_tokens": 18299966.0, "reward": -0.16299618780612946, "reward_std": 0.2579989731311798, "rewards/cosine_scaled_reward/mean": -0.16299618780612946, "rewards/cosine_scaled_reward/std": 0.2968141436576843, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 1507.171875, "completions/mean_terminated_length": 999.1212158203125, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.17485714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.27001288533210754, "learning_rate": 8.906477750432903e-07, "loss": -0.0, "num_tokens": 18407569.0, "reward": -0.2650793790817261, "reward_std": 0.2175406664609909, "rewards/cosine_scaled_reward/mean": -0.2650793790817261, "rewards/cosine_scaled_reward/std": 0.2671082317829132, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1925.0, "completions/mean_length": 1655.453125, "completions/mean_terminated_length": 1081.7308349609375, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.176, "frac_reward_zero_std": 0.0, "grad_norm": 0.23643171787261963, "learning_rate": 8.88586709003076e-07, "loss": -0.0, "num_tokens": 18524582.0, "reward": -0.1807454228401184, "reward_std": 0.28304773569107056, "rewards/cosine_scaled_reward/mean": -0.1807454228401184, "rewards/cosine_scaled_reward/std": 0.35738474130630493, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1884.0, "completions/mean_length": 1342.046875, "completions/mean_terminated_length": 826.8919067382812, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.17714285714285713, "frac_reward_zero_std": 0.0, "grad_norm": 0.3501509726047516, "learning_rate": 8.865091407243394e-07, "loss": 0.0, "num_tokens": 18621097.0, "reward": -0.0294140987098217, "reward_std": 0.1941235363483429, "rewards/cosine_scaled_reward/mean": -0.029414094984531403, "rewards/cosine_scaled_reward/std": 0.41702020168304443, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1956.0, "completions/mean_length": 1586.40625, "completions/mean_terminated_length": 911.769287109375, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.1782857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.24873077869415283, "learning_rate": 8.844151714648274e-07, "loss": -0.0, "num_tokens": 18732731.0, "reward": -0.09675467014312744, "reward_std": 0.2634894847869873, "rewards/cosine_scaled_reward/mean": -0.09675467014312744, "rewards/cosine_scaled_reward/std": 0.42875486612319946, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1426.203125, "completions/mean_terminated_length": 1053.125, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "epoch": 0.17942857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.28960639238357544, "learning_rate": 8.823049032816478e-07, "loss": -0.0, "num_tokens": 18833968.0, "reward": -0.19702841341495514, "reward_std": 0.2148652821779251, "rewards/cosine_scaled_reward/mean": -0.19702842831611633, "rewards/cosine_scaled_reward/std": 0.2610262334346771, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1971.0, "completions/mean_length": 1293.140625, "completions/mean_terminated_length": 997.7608642578125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.18057142857142858, "frac_reward_zero_std": 0.125, "grad_norm": 0.24434594810009003, "learning_rate": 8.801784390262943e-07, "loss": -0.0, "num_tokens": 18926849.0, "reward": 0.03873754292726517, "reward_std": 0.23464180529117584, "rewards/cosine_scaled_reward/mean": 0.03873754292726517, "rewards/cosine_scaled_reward/std": 0.5250495076179504, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 1565.171875, "completions/mean_terminated_length": 1139.1470947265625, "completions/min_length": 571.0, "completions/min_terminated_length": 571.0, "epoch": 0.18171428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.22470001876354218, "learning_rate": 8.780358823396352e-07, "loss": 0.0, "num_tokens": 19038700.0, "reward": -0.202285498380661, "reward_std": 0.20965763926506042, "rewards/cosine_scaled_reward/mean": -0.202285498380661, "rewards/cosine_scaled_reward/std": 0.3204317092895508, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1400.296875, "completions/mean_terminated_length": 985.1026000976562, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 0.18285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.2976718246936798, "learning_rate": 8.758773376468604e-07, "loss": -0.0, "num_tokens": 19139903.0, "reward": 0.020067960023880005, "reward_std": 0.4074331223964691, "rewards/cosine_scaled_reward/mean": 0.020067960023880005, "rewards/cosine_scaled_reward/std": 0.5162546038627625, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1644.0, "completions/mean_length": 1382.25, "completions/mean_terminated_length": 982.7999877929688, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.184, "frac_reward_zero_std": 0.0, "grad_norm": 0.27535930275917053, "learning_rate": 8.737029101523929e-07, "loss": -0.0, "num_tokens": 19238359.0, "reward": -0.2095160186290741, "reward_std": 0.20490920543670654, "rewards/cosine_scaled_reward/mean": -0.2095160186290741, "rewards/cosine_scaled_reward/std": 0.22322162985801697, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1960.0, "completions/mean_length": 1301.0, "completions/mean_terminated_length": 985.6000366210938, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.18514285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.3560119867324829, "learning_rate": 8.715127058347614e-07, "loss": -0.0, "num_tokens": 19331927.0, "reward": -0.23389019072055817, "reward_std": 0.2546258866786957, "rewards/cosine_scaled_reward/mean": -0.23389017581939697, "rewards/cosine_scaled_reward/std": 0.28031107783317566, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1674.0, "completions/mean_length": 1394.09375, "completions/mean_terminated_length": 946.6842041015625, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.18628571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.3067707121372223, "learning_rate": 8.693068314414344e-07, "loss": 0.0, "num_tokens": 19432333.0, "reward": 0.008387047797441483, "reward_std": 0.2966369390487671, "rewards/cosine_scaled_reward/mean": 0.008387047797441483, "rewards/cosine_scaled_reward/std": 0.47443318367004395, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1863.0, "completions/mean_length": 1301.03125, "completions/mean_terminated_length": 881.9999389648438, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.18742857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.26724106073379517, "learning_rate": 8.670853944836176e-07, "loss": -0.0, "num_tokens": 19526127.0, "reward": 0.015163253992795944, "reward_std": 0.2171541154384613, "rewards/cosine_scaled_reward/mean": 0.015163261443376541, "rewards/cosine_scaled_reward/std": 0.43332821130752563, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 1290.8125, "completions/mean_terminated_length": 971.1111450195312, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.18857142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.2774181663990021, "learning_rate": 8.648485032310144e-07, "loss": 0.0, "num_tokens": 19620155.0, "reward": -0.07460268586874008, "reward_std": 0.25969409942626953, "rewards/cosine_scaled_reward/mean": -0.07460269331932068, "rewards/cosine_scaled_reward/std": 0.391157329082489, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 1456.3125, "completions/mean_terminated_length": 1077.025634765625, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "epoch": 0.18971428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.23980101943016052, "learning_rate": 8.625962667065487e-07, "loss": -0.0, "num_tokens": 19724935.0, "reward": -0.10265599191188812, "reward_std": 0.3349866271018982, "rewards/cosine_scaled_reward/mean": -0.10265599191188812, "rewards/cosine_scaled_reward/std": 0.4455646872520447, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1920.0, "completions/mean_length": 1240.96875, "completions/mean_terminated_length": 949.0637817382812, "completions/min_length": 546.0, "completions/min_terminated_length": 546.0, "epoch": 0.19085714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.27142858505249023, "learning_rate": 8.603287946810513e-07, "loss": 0.0, "num_tokens": 19815901.0, "reward": -0.11890637874603271, "reward_std": 0.26525112986564636, "rewards/cosine_scaled_reward/mean": -0.11890637129545212, "rewards/cosine_scaled_reward/std": 0.3307341933250427, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1862.0, "completions/mean_length": 1368.515625, "completions/mean_terminated_length": 987.3414306640625, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "epoch": 0.192, "frac_reward_zero_std": 0.0, "grad_norm": 0.23390193283557892, "learning_rate": 8.580461976679099e-07, "loss": 0.0, "num_tokens": 19914326.0, "reward": -0.08119502663612366, "reward_std": 0.21067029237747192, "rewards/cosine_scaled_reward/mean": -0.08119503408670425, "rewards/cosine_scaled_reward/std": 0.3641049563884735, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1400.109375, "completions/mean_terminated_length": 1105.6136474609375, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.19314285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.25923916697502136, "learning_rate": 8.557485869176825e-07, "loss": -0.0, "num_tokens": 20014557.0, "reward": 0.2310131937265396, "reward_std": 0.44008710980415344, "rewards/cosine_scaled_reward/mean": 0.2310132086277008, "rewards/cosine_scaled_reward/std": 0.5884551405906677, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1816.0, "completions/mean_length": 1230.984375, "completions/mean_terminated_length": 859.6136474609375, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.19428571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.297661691904068, "learning_rate": 8.534360744126753e-07, "loss": -0.0, "num_tokens": 20103124.0, "reward": -0.02752646803855896, "reward_std": 0.2112906128168106, "rewards/cosine_scaled_reward/mean": -0.027526460587978363, "rewards/cosine_scaled_reward/std": 0.4330926835536957, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1330.734375, "completions/mean_terminated_length": 928.3658447265625, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 0.19542857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.27631711959838867, "learning_rate": 8.511087728614862e-07, "loss": -0.0, "num_tokens": 20198683.0, "reward": -0.02587110549211502, "reward_std": 0.3332647681236267, "rewards/cosine_scaled_reward/mean": -0.025871101766824722, "rewards/cosine_scaled_reward/std": 0.4695811867713928, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1626.0, "completions/mean_length": 1300.171875, "completions/mean_terminated_length": 880.6585083007812, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.19657142857142856, "frac_reward_zero_std": 0.0, "grad_norm": 0.3011989891529083, "learning_rate": 8.487667956935087e-07, "loss": -0.0, "num_tokens": 20292510.0, "reward": 0.17403244972229004, "reward_std": 0.23184293508529663, "rewards/cosine_scaled_reward/mean": 0.17403244972229004, "rewards/cosine_scaled_reward/std": 0.46001583337783813, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1064.515625, "completions/mean_terminated_length": 813.8235473632812, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.1977142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.3317919075489044, "learning_rate": 8.464102570534061e-07, "loss": 0.0, "num_tokens": 20371951.0, "reward": -0.14008744060993195, "reward_std": 0.23045390844345093, "rewards/cosine_scaled_reward/mean": -0.14008745551109314, "rewards/cosine_scaled_reward/std": 0.327737033367157, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1867.0, "completions/mean_length": 1076.65625, "completions/mean_terminated_length": 994.3389892578125, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.19885714285714284, "frac_reward_zero_std": 0.0, "grad_norm": 0.2918402850627899, "learning_rate": 8.440392717955475e-07, "loss": -0.0, "num_tokens": 20451193.0, "reward": -0.020191974937915802, "reward_std": 0.3699801564216614, "rewards/cosine_scaled_reward/mean": -0.020191967487335205, "rewards/cosine_scaled_reward/std": 0.4890177845954895, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1883.0, "completions/mean_length": 1360.859375, "completions/mean_terminated_length": 1025.279052734375, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 0.2, "frac_reward_zero_std": 0.0, "grad_norm": 0.24612702429294586, "learning_rate": 8.416539554784089e-07, "loss": -0.0, "num_tokens": 20549112.0, "reward": -0.07502052187919617, "reward_std": 0.23629868030548096, "rewards/cosine_scaled_reward/mean": -0.07502052187919617, "rewards/cosine_scaled_reward/std": 0.4632040560245514, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 1351.046875, "completions/mean_terminated_length": 904.2820434570312, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.20114285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.2809349298477173, "learning_rate": 8.392544243589427e-07, "loss": 0.0, "num_tokens": 20646747.0, "reward": -0.09139305353164673, "reward_std": 0.3010050654411316, "rewards/cosine_scaled_reward/mean": -0.09139305353164673, "rewards/cosine_scaled_reward/std": 0.3958495557308197, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1885.0, "completions/mean_length": 1245.578125, "completions/mean_terminated_length": 978.1041870117188, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.2022857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.3317105770111084, "learning_rate": 8.368407953869103e-07, "loss": 0.0, "num_tokens": 20736688.0, "reward": -0.0028449445962905884, "reward_std": 0.3299737870693207, "rewards/cosine_scaled_reward/mean": -0.0028449594974517822, "rewards/cosine_scaled_reward/std": 0.505253255367279, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 1192.234375, "completions/mean_terminated_length": 952.6199951171875, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 0.20342857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.2933591306209564, "learning_rate": 8.344131861991828e-07, "loss": 0.0, "num_tokens": 20824527.0, "reward": -0.06664696335792542, "reward_std": 0.29735952615737915, "rewards/cosine_scaled_reward/mean": -0.06664696335792542, "rewards/cosine_scaled_reward/std": 0.41459333896636963, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 1546.0625, "completions/mean_terminated_length": 1011.7418823242188, "completions/min_length": 637.0, "completions/min_terminated_length": 637.0, "epoch": 0.20457142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2591443657875061, "learning_rate": 8.319717151140072e-07, "loss": -0.0, "num_tokens": 20934307.0, "reward": -0.18733876943588257, "reward_std": 0.29792603850364685, "rewards/cosine_scaled_reward/mean": -0.18733876943588257, "rewards/cosine_scaled_reward/std": 0.33306172490119934, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1200.609375, "completions/mean_terminated_length": 941.2040405273438, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.2057142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.3007314205169678, "learning_rate": 8.295165011252396e-07, "loss": 0.0, "num_tokens": 21022322.0, "reward": 0.16183573007583618, "reward_std": 0.3202260136604309, "rewards/cosine_scaled_reward/mean": 0.16183573007583618, "rewards/cosine_scaled_reward/std": 0.4895489513874054, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 1438.96875, "completions/mean_terminated_length": 994.5405883789062, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.20685714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.26270556449890137, "learning_rate": 8.270476638965461e-07, "loss": -0.0, "num_tokens": 21125888.0, "reward": -0.09954661130905151, "reward_std": 0.19112557172775269, "rewards/cosine_scaled_reward/mean": -0.09954659640789032, "rewards/cosine_scaled_reward/std": 0.4616987109184265, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1810.0, "completions/mean_length": 1401.1875, "completions/mean_terminated_length": 986.5640869140625, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 0.208, "frac_reward_zero_std": 0.0, "grad_norm": 0.2312396615743637, "learning_rate": 8.245653237555705e-07, "loss": -0.0, "num_tokens": 21225356.0, "reward": -0.1191510483622551, "reward_std": 0.2993764877319336, "rewards/cosine_scaled_reward/mean": -0.1191510558128357, "rewards/cosine_scaled_reward/std": 0.4002695679664612, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1778.0, "completions/mean_length": 1059.46875, "completions/mean_terminated_length": 897.7090454101562, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.20914285714285713, "frac_reward_zero_std": 0.0, "grad_norm": 0.31593599915504456, "learning_rate": 8.220696016880687e-07, "loss": 0.0, "num_tokens": 21303778.0, "reward": 0.02695992961525917, "reward_std": 0.33188390731811523, "rewards/cosine_scaled_reward/mean": 0.026959922164678574, "rewards/cosine_scaled_reward/std": 0.462587833404541, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1725.0, "completions/mean_length": 1409.90625, "completions/mean_terminated_length": 944.270263671875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.2102857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.4352613687515259, "learning_rate": 8.195606193320136e-07, "loss": 0.0, "num_tokens": 21405364.0, "reward": -0.1421402245759964, "reward_std": 0.16645817458629608, "rewards/cosine_scaled_reward/mean": -0.1421402394771576, "rewards/cosine_scaled_reward/std": 0.33322539925575256, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 1153.75, "completions/mean_terminated_length": 947.3846435546875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.21142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.33058229088783264, "learning_rate": 8.170384989716657e-07, "loss": 0.0, "num_tokens": 21489388.0, "reward": -0.13177143037319183, "reward_std": 0.26749324798583984, "rewards/cosine_scaled_reward/mean": -0.13177144527435303, "rewards/cosine_scaled_reward/std": 0.42720580101013184, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1299.109375, "completions/mean_terminated_length": 1089.419921875, "completions/min_length": 589.0, "completions/min_terminated_length": 589.0, "epoch": 0.21257142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.2624933421611786, "learning_rate": 8.145033635316128e-07, "loss": 0.0, "num_tokens": 21583227.0, "reward": -0.03228667005896568, "reward_std": 0.3138354420661926, "rewards/cosine_scaled_reward/mean": -0.03228667378425598, "rewards/cosine_scaled_reward/std": 0.47089555859565735, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1941.0, "completions/mean_length": 1294.046875, "completions/mean_terminated_length": 1042.729248046875, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 0.21371428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.3034888207912445, "learning_rate": 8.119553365707802e-07, "loss": -0.0, "num_tokens": 21676470.0, "reward": -0.11602523177862167, "reward_std": 0.2424153983592987, "rewards/cosine_scaled_reward/mean": -0.11602522432804108, "rewards/cosine_scaled_reward/std": 0.4187147617340088, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1932.0, "completions/mean_length": 1501.09375, "completions/mean_terminated_length": 1102.0, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 0.21485714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.2895212769508362, "learning_rate": 8.093945422764069e-07, "loss": 0.0, "num_tokens": 21784340.0, "reward": -0.2754727005958557, "reward_std": 0.2081308364868164, "rewards/cosine_scaled_reward/mean": -0.2754727005958557, "rewards/cosine_scaled_reward/std": 0.21545176208019257, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1887.0, "completions/mean_length": 974.765625, "completions/mean_terminated_length": 842.9649047851562, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.216, "frac_reward_zero_std": 0.0, "grad_norm": 0.5491365790367126, "learning_rate": 8.068211054579943e-07, "loss": -0.0, "num_tokens": 21856013.0, "reward": -0.030638471245765686, "reward_std": 0.28698021173477173, "rewards/cosine_scaled_reward/mean": -0.030638471245765686, "rewards/cosine_scaled_reward/std": 0.4680361747741699, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 1131.0, "completions/mean_terminated_length": 961.1851806640625, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 0.21714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.31247782707214355, "learning_rate": 8.04235151541222e-07, "loss": -0.0, "num_tokens": 21938165.0, "reward": -0.0024216994643211365, "reward_std": 0.3122457265853882, "rewards/cosine_scaled_reward/mean": -0.002421695739030838, "rewards/cosine_scaled_reward/std": 0.46288225054740906, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2011.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 968.671875, "completions/mean_terminated_length": 968.671875, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.21828571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.3146747350692749, "learning_rate": 8.01636806561836e-07, "loss": -0.0, "num_tokens": 22010448.0, "reward": 0.060832589864730835, "reward_std": 0.32057198882102966, "rewards/cosine_scaled_reward/mean": 0.06083259731531143, "rewards/cosine_scaled_reward/std": 0.5255656242370605, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1822.0, "completions/mean_length": 1386.765625, "completions/mean_terminated_length": 1184.346923828125, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 0.21942857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.27087679505348206, "learning_rate": 7.990261971595048e-07, "loss": -0.0, "num_tokens": 22110497.0, "reward": -0.16324350237846375, "reward_std": 0.2538989782333374, "rewards/cosine_scaled_reward/mean": -0.16324350237846375, "rewards/cosine_scaled_reward/std": 0.37434685230255127, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1512.0, "completions/mean_length": 1645.171875, "completions/mean_terminated_length": 973.7916870117188, "completions/min_length": 626.0, "completions/min_terminated_length": 626.0, "epoch": 0.22057142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.2821788489818573, "learning_rate": 7.964034505716476e-07, "loss": -0.0, "num_tokens": 22226932.0, "reward": -0.2253284454345703, "reward_std": 0.21184760332107544, "rewards/cosine_scaled_reward/mean": -0.2253284454345703, "rewards/cosine_scaled_reward/std": 0.2249312698841095, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1818.0, "completions/mean_length": 1466.953125, "completions/mean_terminated_length": 1042.9459228515625, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 0.22171428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.2751137614250183, "learning_rate": 7.93768694627233e-07, "loss": -0.0, "num_tokens": 22332177.0, "reward": -0.014900192618370056, "reward_std": 0.37233370542526245, "rewards/cosine_scaled_reward/mean": -0.014900196343660355, "rewards/cosine_scaled_reward/std": 0.45363670587539673, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1916.0, "completions/mean_length": 1380.515625, "completions/mean_terminated_length": 1119.3260498046875, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.22285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.23562170565128326, "learning_rate": 7.911220577405484e-07, "loss": -0.0, "num_tokens": 22431674.0, "reward": -0.1425207257270813, "reward_std": 0.2992969751358032, "rewards/cosine_scaled_reward/mean": -0.1425207257270813, "rewards/cosine_scaled_reward/std": 0.37265509366989136, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1474.296875, "completions/mean_terminated_length": 1194.1163330078125, "completions/min_length": 551.0, "completions/min_terminated_length": 551.0, "epoch": 0.224, "frac_reward_zero_std": 0.0, "grad_norm": 0.26706522703170776, "learning_rate": 7.884636689049422e-07, "loss": 0.0, "num_tokens": 22537621.0, "reward": -0.14723923802375793, "reward_std": 0.34929120540618896, "rewards/cosine_scaled_reward/mean": -0.14723923802375793, "rewards/cosine_scaled_reward/std": 0.42120617628097534, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1718.0, "completions/mean_length": 961.5, "completions/mean_terminated_length": 908.0655517578125, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.22514285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.32898151874542236, "learning_rate": 7.857936576865356e-07, "loss": -0.0, "num_tokens": 22609525.0, "reward": 0.05756232142448425, "reward_std": 0.42182230949401855, "rewards/cosine_scaled_reward/mean": 0.05756233632564545, "rewards/cosine_scaled_reward/std": 0.5148370862007141, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1141.140625, "completions/mean_terminated_length": 1011.58935546875, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.22628571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.3166671395301819, "learning_rate": 7.831121542179086e-07, "loss": -0.0, "num_tokens": 22694062.0, "reward": 0.11976294219493866, "reward_std": 0.34080085158348083, "rewards/cosine_scaled_reward/mean": 0.11976294219493866, "rewards/cosine_scaled_reward/std": 0.5243961215019226, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 1369.25, "completions/mean_terminated_length": 1179.199951171875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.22742857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.33395788073539734, "learning_rate": 7.804192891917571e-07, "loss": 0.0, "num_tokens": 22793198.0, "reward": -0.2272849828004837, "reward_std": 0.21225537359714508, "rewards/cosine_scaled_reward/mean": -0.2272849828004837, "rewards/cosine_scaled_reward/std": 0.2696577310562134, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1933.0, "completions/mean_length": 1219.734375, "completions/mean_terminated_length": 920.14892578125, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.22857142857142856, "frac_reward_zero_std": 0.0, "grad_norm": 0.27558398246765137, "learning_rate": 7.777151938545235e-07, "loss": 0.0, "num_tokens": 22881381.0, "reward": 0.07320597767829895, "reward_std": 0.3534944951534271, "rewards/cosine_scaled_reward/mean": 0.07320597767829895, "rewards/cosine_scaled_reward/std": 0.4344184398651123, "step": 200 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 22881381, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }