{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.17142857142857143, "eval_steps": 500, "global_step": 150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1734.0, "completions/mean_length": 1702.03125, "completions/mean_terminated_length": 993.6190795898438, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 0.001142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.28377610445022583, "learning_rate": 0.0, "loss": -0.0, "num_tokens": 118418.0, "reward": -0.09800112247467041, "reward_std": 0.3028089702129364, "rewards/cosine_scaled_reward/mean": -0.09800112992525101, "rewards/cosine_scaled_reward/std": 0.37953105568885803, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1894.0, "completions/mean_length": 1738.90625, "completions/mean_terminated_length": 949.0, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.002285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.24221572279930115, "learning_rate": 2e-08, "loss": -0.0, "num_tokens": 239748.0, "reward": 0.020556632429361343, "reward_std": 0.3545936942100525, "rewards/cosine_scaled_reward/mean": 0.020556632429361343, "rewards/cosine_scaled_reward/std": 0.4492928683757782, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1542.0, "completions/mean_length": 1964.078125, "completions/mean_terminated_length": 973.7999877929688, "completions/min_length": 733.0, "completions/min_terminated_length": 733.0, "epoch": 0.0034285714285714284, "frac_reward_zero_std": 0.0, "grad_norm": 0.2472974807024002, "learning_rate": 4e-08, "loss": 0.0, "num_tokens": 375921.0, "reward": -0.20954538881778717, "reward_std": 0.13813795149326324, "rewards/cosine_scaled_reward/mean": -0.20954540371894836, "rewards/cosine_scaled_reward/std": 0.16814909875392914, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1555.6875, "completions/mean_terminated_length": 1093.212158203125, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 0.004571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.2868657112121582, "learning_rate": 6e-08, "loss": -0.0, "num_tokens": 485293.0, "reward": -0.12192361056804657, "reward_std": 0.31710442900657654, "rewards/cosine_scaled_reward/mean": -0.12192361056804657, "rewards/cosine_scaled_reward/std": 0.35428565740585327, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1758.0, "completions/mean_length": 1958.5625, "completions/mean_terminated_length": 1332.5, "completions/min_length": 932.0, "completions/min_terminated_length": 932.0, "epoch": 0.005714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2460148185491562, "learning_rate": 8e-08, "loss": -0.0, "num_tokens": 621457.0, "reward": -0.21145480871200562, "reward_std": 0.14890719950199127, "rewards/cosine_scaled_reward/mean": -0.21145479381084442, "rewards/cosine_scaled_reward/std": 0.20399661362171173, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1670.0, "completions/mean_length": 1908.375, "completions/mean_terminated_length": 931.0, "completions/min_length": 593.0, "completions/min_terminated_length": 593.0, "epoch": 0.006857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.26549720764160156, "learning_rate": 1e-07, "loss": -0.0, "num_tokens": 755241.0, "reward": -0.2408866286277771, "reward_std": 0.16572487354278564, "rewards/cosine_scaled_reward/mean": -0.2408866286277771, "rewards/cosine_scaled_reward/std": 0.17492830753326416, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 1889.296875, "completions/mean_terminated_length": 1201.5833740234375, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.008, "frac_reward_zero_std": 0.0, "grad_norm": 0.23518230020999908, "learning_rate": 1.2e-07, "loss": 0.0, "num_tokens": 886564.0, "reward": -0.16087877750396729, "reward_std": 0.24579641222953796, "rewards/cosine_scaled_reward/mean": -0.16087877750396729, "rewards/cosine_scaled_reward/std": 0.37339961528778076, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1920.0, "completions/mean_length": 1751.578125, "completions/mean_terminated_length": 994.0555419921875, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.009142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.2354528158903122, "learning_rate": 1.4e-07, "loss": 0.0, "num_tokens": 1009081.0, "reward": -0.023812226951122284, "reward_std": 0.2823081314563751, "rewards/cosine_scaled_reward/mean": -0.02381223440170288, "rewards/cosine_scaled_reward/std": 0.4484662115573883, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1889.0, "completions/mean_length": 2000.59375, "completions/mean_terminated_length": 1289.5, "completions/min_length": 903.0, "completions/min_terminated_length": 903.0, "epoch": 0.010285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.24302220344543457, "learning_rate": 1.6e-07, "loss": 0.0, "num_tokens": 1148575.0, "reward": -0.2453702688217163, "reward_std": 0.18811637163162231, "rewards/cosine_scaled_reward/mean": -0.2453702688217163, "rewards/cosine_scaled_reward/std": 0.22203005850315094, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 1701.140625, "completions/mean_terminated_length": 879.631591796875, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 0.011428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.25642141699790955, "learning_rate": 1.8e-07, "loss": -0.0, "num_tokens": 1268280.0, "reward": -0.15177705883979797, "reward_std": 0.2125300019979477, "rewards/cosine_scaled_reward/mean": -0.15177705883979797, "rewards/cosine_scaled_reward/std": 0.3240113854408264, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1683.0, "completions/mean_length": 1950.609375, "completions/mean_terminated_length": 1157.571533203125, "completions/min_length": 584.0, "completions/min_terminated_length": 584.0, "epoch": 0.012571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.24372951686382294, "learning_rate": 2e-07, "loss": 0.0, "num_tokens": 1404791.0, "reward": -0.23502977192401886, "reward_std": 0.18896539509296417, "rewards/cosine_scaled_reward/mean": -0.23502977192401886, "rewards/cosine_scaled_reward/std": 0.24224351346492767, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1944.0, "completions/mean_length": 1751.03125, "completions/mean_terminated_length": 1221.6522216796875, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.013714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.28422027826309204, "learning_rate": 2.1999999999999998e-07, "loss": -0.0, "num_tokens": 1527801.0, "reward": -0.14280016720294952, "reward_std": 0.32843896746635437, "rewards/cosine_scaled_reward/mean": -0.14280015230178833, "rewards/cosine_scaled_reward/std": 0.41895967721939087, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1793.0, "completions/mean_length": 1834.453125, "completions/mean_terminated_length": 1193.8125, "completions/min_length": 783.0, "completions/min_terminated_length": 783.0, "epoch": 0.014857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.24033738672733307, "learning_rate": 2.4e-07, "loss": 0.0, "num_tokens": 1656246.0, "reward": -0.17057427763938904, "reward_std": 0.24429668486118317, "rewards/cosine_scaled_reward/mean": -0.17057427763938904, "rewards/cosine_scaled_reward/std": 0.27816399931907654, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1474.0, "completions/mean_length": 1800.65625, "completions/mean_terminated_length": 1116.823486328125, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 0.016, "frac_reward_zero_std": 0.0, "grad_norm": 0.2312558889389038, "learning_rate": 2.6e-07, "loss": 0.0, "num_tokens": 1782096.0, "reward": -0.11817245185375214, "reward_std": 0.24491220712661743, "rewards/cosine_scaled_reward/mean": -0.11817245930433273, "rewards/cosine_scaled_reward/std": 0.3942086696624756, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1636.0, "completions/mean_length": 1692.828125, "completions/mean_terminated_length": 785.1666870117188, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.017142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.2563658654689789, "learning_rate": 2.8e-07, "loss": -0.0, "num_tokens": 1901357.0, "reward": -0.027107469737529755, "reward_std": 0.1853453516960144, "rewards/cosine_scaled_reward/mean": -0.027107462286949158, "rewards/cosine_scaled_reward/std": 0.4734213352203369, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.018285714285714287, "frac_reward_zero_std": 0.0, "grad_norm": 0.24149107933044434, "learning_rate": 3e-07, "loss": -0.0, "num_tokens": 2042869.0, "reward": -0.2542623281478882, "reward_std": 0.14302438497543335, "rewards/cosine_scaled_reward/mean": -0.2542623281478882, "rewards/cosine_scaled_reward/std": 0.160969540476799, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1827.0, "completions/mean_length": 1548.75, "completions/mean_terminated_length": 864.5925903320312, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.019428571428571427, "frac_reward_zero_std": 0.0, "grad_norm": 0.31088724732398987, "learning_rate": 3.2e-07, "loss": 0.0, "num_tokens": 2152509.0, "reward": -0.12113451957702637, "reward_std": 0.284165620803833, "rewards/cosine_scaled_reward/mean": -0.12113452702760696, "rewards/cosine_scaled_reward/std": 0.4259316623210907, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1899.0, "completions/mean_length": 1793.03125, "completions/mean_terminated_length": 1028.125, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 0.02057142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2451843023300171, "learning_rate": 3.4000000000000003e-07, "loss": 0.0, "num_tokens": 2277639.0, "reward": -0.18317042291164398, "reward_std": 0.20634235441684723, "rewards/cosine_scaled_reward/mean": -0.18317043781280518, "rewards/cosine_scaled_reward/std": 0.27781662344932556, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1633.0, "completions/mean_length": 1735.984375, "completions/mean_terminated_length": 997.0, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.021714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.24677637219429016, "learning_rate": 3.6e-07, "loss": 0.0, "num_tokens": 2399998.0, "reward": -0.04996331408619881, "reward_std": 0.2841629385948181, "rewards/cosine_scaled_reward/mean": -0.04996330291032791, "rewards/cosine_scaled_reward/std": 0.4186851680278778, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1643.0, "completions/mean_length": 1614.890625, "completions/mean_terminated_length": 842.8261108398438, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.022857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2543003559112549, "learning_rate": 3.7999999999999996e-07, "loss": -0.0, "num_tokens": 2514703.0, "reward": -0.09282197058200836, "reward_std": 0.2568933367729187, "rewards/cosine_scaled_reward/mean": -0.09282197058200836, "rewards/cosine_scaled_reward/std": 0.4104878604412079, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 1786.734375, "completions/mean_terminated_length": 1119.0555419921875, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.024, "frac_reward_zero_std": 0.0, "grad_norm": 0.3147278130054474, "learning_rate": 4e-07, "loss": 0.0, "num_tokens": 2639862.0, "reward": -0.16029146313667297, "reward_std": 0.2322564721107483, "rewards/cosine_scaled_reward/mean": -0.16029146313667297, "rewards/cosine_scaled_reward/std": 0.36191171407699585, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1833.0, "completions/mean_length": 1300.484375, "completions/mean_terminated_length": 789.0263061523438, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.025142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.32522445917129517, "learning_rate": 4.1999999999999995e-07, "loss": 0.0, "num_tokens": 2732109.0, "reward": 0.0033364146947860718, "reward_std": 0.18878400325775146, "rewards/cosine_scaled_reward/mean": 0.0033364109694957733, "rewards/cosine_scaled_reward/std": 0.45390966534614563, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1920.0, "completions/mean_length": 1641.03125, "completions/mean_terminated_length": 1046.2308349609375, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.026285714285714287, "frac_reward_zero_std": 0.0, "grad_norm": 0.28244850039482117, "learning_rate": 4.3999999999999997e-07, "loss": 0.0, "num_tokens": 2847927.0, "reward": -0.21077856421470642, "reward_std": 0.24399788677692413, "rewards/cosine_scaled_reward/mean": -0.21077856421470642, "rewards/cosine_scaled_reward/std": 0.2925592362880707, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1965.0, "completions/mean_length": 1789.59375, "completions/mean_terminated_length": 1129.2222900390625, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "epoch": 0.027428571428571427, "frac_reward_zero_std": 0.0, "grad_norm": 0.24896308779716492, "learning_rate": 4.6e-07, "loss": -0.0, "num_tokens": 2973389.0, "reward": -0.1665852814912796, "reward_std": 0.307574987411499, "rewards/cosine_scaled_reward/mean": -0.1665852665901184, "rewards/cosine_scaled_reward/std": 0.4072873294353485, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1851.0, "completions/mean_length": 1696.40625, "completions/mean_terminated_length": 1025.181884765625, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 0.02857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.262716144323349, "learning_rate": 4.8e-07, "loss": 0.0, "num_tokens": 3092255.0, "reward": -0.14361324906349182, "reward_std": 0.3466429114341736, "rewards/cosine_scaled_reward/mean": -0.14361326396465302, "rewards/cosine_scaled_reward/std": 0.3933021128177643, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 1973.046875, "completions/mean_terminated_length": 1448.375, "completions/min_length": 1035.0, "completions/min_terminated_length": 1035.0, "epoch": 0.029714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2365841567516327, "learning_rate": 5e-07, "loss": -0.0, "num_tokens": 3229162.0, "reward": -0.050574399530887604, "reward_std": 0.22459164261817932, "rewards/cosine_scaled_reward/mean": -0.050574399530887604, "rewards/cosine_scaled_reward/std": 0.37290775775909424, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 1878.53125, "completions/mean_terminated_length": 1213.6923828125, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "epoch": 0.030857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2821083068847656, "learning_rate": 5.2e-07, "loss": 0.0, "num_tokens": 3359676.0, "reward": -0.13096781075000763, "reward_std": 0.26249831914901733, "rewards/cosine_scaled_reward/mean": -0.13096781075000763, "rewards/cosine_scaled_reward/std": 0.3478032350540161, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1933.0, "completions/mean_length": 1827.453125, "completions/mean_terminated_length": 1039.7857666015625, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.032, "frac_reward_zero_std": 0.0, "grad_norm": 0.2539210915565491, "learning_rate": 5.4e-07, "loss": 0.0, "num_tokens": 3486969.0, "reward": -0.11822876334190369, "reward_std": 0.2370690554380417, "rewards/cosine_scaled_reward/mean": -0.11822875589132309, "rewards/cosine_scaled_reward/std": 0.4236762225627899, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 2020.5, "completions/mean_terminated_length": 1608.0, "completions/min_length": 887.0, "completions/min_terminated_length": 887.0, "epoch": 0.03314285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.23259545862674713, "learning_rate": 5.6e-07, "loss": -0.0, "num_tokens": 3626753.0, "reward": -0.20220182836055756, "reward_std": 0.15910759568214417, "rewards/cosine_scaled_reward/mean": -0.20220182836055756, "rewards/cosine_scaled_reward/std": 0.20781411230564117, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1662.0, "completions/mean_length": 1903.703125, "completions/mean_terminated_length": 1208.45458984375, "completions/min_length": 961.0, "completions/min_terminated_length": 961.0, "epoch": 0.03428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.24027252197265625, "learning_rate": 5.8e-07, "loss": 0.0, "num_tokens": 3759126.0, "reward": -0.19193249940872192, "reward_std": 0.24584847688674927, "rewards/cosine_scaled_reward/mean": -0.19193249940872192, "rewards/cosine_scaled_reward/std": 0.28378522396087646, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1922.0, "completions/mean_length": 1847.34375, "completions/mean_terminated_length": 1060.1539306640625, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.03542857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.2703397274017334, "learning_rate": 6e-07, "loss": -0.0, "num_tokens": 3887852.0, "reward": -0.25379180908203125, "reward_std": 0.24661941826343536, "rewards/cosine_scaled_reward/mean": -0.25379180908203125, "rewards/cosine_scaled_reward/std": 0.29188498854637146, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 1950.3125, "completions/mean_terminated_length": 1479.6363525390625, "completions/min_length": 766.0, "completions/min_terminated_length": 766.0, "epoch": 0.036571428571428574, "frac_reward_zero_std": 0.0, "grad_norm": 0.21763876080513, "learning_rate": 6.2e-07, "loss": -0.0, "num_tokens": 4023024.0, "reward": -0.16017228364944458, "reward_std": 0.2255343496799469, "rewards/cosine_scaled_reward/mean": -0.16017228364944458, "rewards/cosine_scaled_reward/std": 0.3709539771080017, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 1996.28125, "completions/mean_terminated_length": 1634.25, "completions/min_length": 1237.0, "completions/min_terminated_length": 1237.0, "epoch": 0.037714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.22758260369300842, "learning_rate": 6.4e-07, "loss": -0.0, "num_tokens": 4162002.0, "reward": -0.20318198204040527, "reward_std": 0.18396919965744019, "rewards/cosine_scaled_reward/mean": -0.20318198204040527, "rewards/cosine_scaled_reward/std": 0.34913352131843567, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1850.0, "completions/mean_length": 1703.265625, "completions/mean_terminated_length": 1230.851806640625, "completions/min_length": 651.0, "completions/min_terminated_length": 651.0, "epoch": 0.038857142857142854, "frac_reward_zero_std": 0.0, "grad_norm": 0.31658875942230225, "learning_rate": 6.6e-07, "loss": -0.0, "num_tokens": 4280563.0, "reward": -0.05977274850010872, "reward_std": 0.30437377095222473, "rewards/cosine_scaled_reward/mean": -0.059772733598947525, "rewards/cosine_scaled_reward/std": 0.4424094259738922, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 1807.546875, "completions/mean_terminated_length": 765.5833740234375, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.04, "frac_reward_zero_std": 0.0, "grad_norm": 0.2792847156524658, "learning_rate": 6.800000000000001e-07, "loss": -0.0, "num_tokens": 4407742.0, "reward": -0.18658886849880219, "reward_std": 0.2910658121109009, "rewards/cosine_scaled_reward/mean": -0.18658888339996338, "rewards/cosine_scaled_reward/std": 0.34802255034446716, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1912.0, "completions/mean_length": 1995.65625, "completions/mean_terminated_length": 1378.0, "completions/min_length": 1090.0, "completions/min_terminated_length": 1090.0, "epoch": 0.04114285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.23547738790512085, "learning_rate": 7e-07, "loss": 0.0, "num_tokens": 4546576.0, "reward": -0.23918019235134125, "reward_std": 0.19598917663097382, "rewards/cosine_scaled_reward/mean": -0.23918019235134125, "rewards/cosine_scaled_reward/std": 0.2425125539302826, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1994.75, "completions/mean_terminated_length": 1480.0, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "epoch": 0.04228571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.22962674498558044, "learning_rate": 7.2e-07, "loss": -0.0, "num_tokens": 4685264.0, "reward": -0.25335729122161865, "reward_std": 0.15323391556739807, "rewards/cosine_scaled_reward/mean": -0.25335729122161865, "rewards/cosine_scaled_reward/std": 0.17556406557559967, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 1957.484375, "completions/mean_terminated_length": 1220.4285888671875, "completions/min_length": 965.0, "completions/min_terminated_length": 965.0, "epoch": 0.04342857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.24781912565231323, "learning_rate": 7.4e-07, "loss": -0.0, "num_tokens": 4822255.0, "reward": -0.13536512851715088, "reward_std": 0.19208545982837677, "rewards/cosine_scaled_reward/mean": -0.13536511361598969, "rewards/cosine_scaled_reward/std": 0.30052343010902405, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1824.0, "completions/mean_length": 1744.421875, "completions/mean_terminated_length": 833.6875, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.044571428571428574, "frac_reward_zero_std": 0.0, "grad_norm": 0.2562144994735718, "learning_rate": 7.599999999999999e-07, "loss": -0.0, "num_tokens": 4944682.0, "reward": -0.041110455989837646, "reward_std": 0.21381449699401855, "rewards/cosine_scaled_reward/mean": -0.04111045226454735, "rewards/cosine_scaled_reward/std": 0.35980772972106934, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1774.359375, "completions/mean_terminated_length": 1017.8235473632812, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.045714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.25478634238243103, "learning_rate": 7.799999999999999e-07, "loss": 0.0, "num_tokens": 5068313.0, "reward": -0.12165145576000214, "reward_std": 0.17204006016254425, "rewards/cosine_scaled_reward/mean": -0.12165144830942154, "rewards/cosine_scaled_reward/std": 0.4099982678890228, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 1814.375, "completions/mean_terminated_length": 1397.9130859375, "completions/min_length": 968.0, "completions/min_terminated_length": 968.0, "epoch": 0.046857142857142854, "frac_reward_zero_std": 0.0, "grad_norm": 0.21750310063362122, "learning_rate": 8e-07, "loss": 0.0, "num_tokens": 5195585.0, "reward": -0.25668060779571533, "reward_std": 0.2832298278808594, "rewards/cosine_scaled_reward/mean": -0.25668060779571533, "rewards/cosine_scaled_reward/std": 0.3347759544849396, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1764.0, "completions/mean_length": 1714.59375, "completions/mean_terminated_length": 625.4666748046875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.048, "frac_reward_zero_std": 0.0, "grad_norm": 0.34486907720565796, "learning_rate": 8.199999999999999e-07, "loss": -0.0, "num_tokens": 5315679.0, "reward": -0.2253742218017578, "reward_std": 0.1778060495853424, "rewards/cosine_scaled_reward/mean": -0.22537420690059662, "rewards/cosine_scaled_reward/std": 0.19647939503192902, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1638.0, "completions/mean_length": 1863.78125, "completions/mean_terminated_length": 976.1818237304688, "completions/min_length": 669.0, "completions/min_terminated_length": 669.0, "epoch": 0.04914285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.23907455801963806, "learning_rate": 8.399999999999999e-07, "loss": 0.0, "num_tokens": 5446577.0, "reward": -0.1142776757478714, "reward_std": 0.21804723143577576, "rewards/cosine_scaled_reward/mean": -0.1142776757478714, "rewards/cosine_scaled_reward/std": 0.3637608587741852, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1920.0, "completions/mean_length": 1771.125, "completions/mean_terminated_length": 940.5, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.05028571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.2888188362121582, "learning_rate": 8.599999999999999e-07, "loss": 0.0, "num_tokens": 5570625.0, "reward": -0.11845305562019348, "reward_std": 0.2729855477809906, "rewards/cosine_scaled_reward/mean": -0.11845306307077408, "rewards/cosine_scaled_reward/std": 0.4279690086841583, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1296.0, "completions/mean_length": 2020.859375, "completions/mean_terminated_length": 1179.5, "completions/min_length": 1063.0, "completions/min_terminated_length": 1063.0, "epoch": 0.05142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.2232045829296112, "learning_rate": 8.799999999999999e-07, "loss": 0.0, "num_tokens": 5711616.0, "reward": -0.1830526441335678, "reward_std": 0.20074567198753357, "rewards/cosine_scaled_reward/mean": -0.1830526441335678, "rewards/cosine_scaled_reward/std": 0.3221423327922821, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1121.0, "completions/mean_length": 1843.328125, "completions/mean_terminated_length": 857.1818237304688, "completions/min_length": 608.0, "completions/min_terminated_length": 608.0, "epoch": 0.052571428571428575, "frac_reward_zero_std": 0.0, "grad_norm": 0.2569328844547272, "learning_rate": 9e-07, "loss": 0.0, "num_tokens": 5840757.0, "reward": -0.21247822046279907, "reward_std": 0.17188501358032227, "rewards/cosine_scaled_reward/mean": -0.21247822046279907, "rewards/cosine_scaled_reward/std": 0.183182492852211, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1772.984375, "completions/mean_terminated_length": 1012.6470336914062, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 0.053714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2800576090812683, "learning_rate": 9.2e-07, "loss": -0.0, "num_tokens": 5964628.0, "reward": -0.1755329668521881, "reward_std": 0.19662824273109436, "rewards/cosine_scaled_reward/mean": -0.1755329668521881, "rewards/cosine_scaled_reward/std": 0.3987559974193573, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1949.0, "completions/mean_length": 1787.046875, "completions/mean_terminated_length": 1120.1666259765625, "completions/min_length": 630.0, "completions/min_terminated_length": 630.0, "epoch": 0.054857142857142854, "frac_reward_zero_std": 0.0, "grad_norm": 0.2499135434627533, "learning_rate": 9.399999999999999e-07, "loss": -0.0, "num_tokens": 6089543.0, "reward": -0.07469595968723297, "reward_std": 0.2802818715572357, "rewards/cosine_scaled_reward/mean": -0.07469595968723297, "rewards/cosine_scaled_reward/std": 0.39331451058387756, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1818.0, "completions/mean_length": 1611.65625, "completions/mean_terminated_length": 1013.7037353515625, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.056, "frac_reward_zero_std": 0.0, "grad_norm": 0.2976716160774231, "learning_rate": 9.6e-07, "loss": -0.0, "num_tokens": 6202753.0, "reward": -0.14219576120376587, "reward_std": 0.3252427875995636, "rewards/cosine_scaled_reward/mean": -0.14219576120376587, "rewards/cosine_scaled_reward/std": 0.41946855187416077, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1916.0, "completions/mean_length": 1826.90625, "completions/mean_terminated_length": 761.6364135742188, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.05714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2344626933336258, "learning_rate": 9.8e-07, "loss": -0.0, "num_tokens": 6330491.0, "reward": -0.098542720079422, "reward_std": 0.20483215153217316, "rewards/cosine_scaled_reward/mean": -0.0985427126288414, "rewards/cosine_scaled_reward/std": 0.396296888589859, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1520.1875, "completions/mean_terminated_length": 922.0000610351562, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 0.05828571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.30348992347717285, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 6437991.0, "reward": -0.12996003031730652, "reward_std": 0.2803010940551758, "rewards/cosine_scaled_reward/mean": -0.12996003031730652, "rewards/cosine_scaled_reward/std": 0.3464147746562958, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1930.0, "completions/mean_length": 1726.71875, "completions/mean_terminated_length": 838.4705810546875, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.05942857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.2905585765838623, "learning_rate": 9.999890338174275e-07, "loss": -0.0, "num_tokens": 6559853.0, "reward": -0.2443142831325531, "reward_std": 0.21010473370552063, "rewards/cosine_scaled_reward/mean": -0.2443142831325531, "rewards/cosine_scaled_reward/std": 0.32864055037498474, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1585.0, "completions/mean_length": 1757.015625, "completions/mean_terminated_length": 952.5294189453125, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.060571428571428575, "frac_reward_zero_std": 0.0, "grad_norm": 0.2699633538722992, "learning_rate": 9.999561358041868e-07, "loss": 0.0, "num_tokens": 6683134.0, "reward": -0.18116676807403564, "reward_std": 0.2308851182460785, "rewards/cosine_scaled_reward/mean": -0.18116676807403564, "rewards/cosine_scaled_reward/std": 0.27486056089401245, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1850.65625, "completions/mean_terminated_length": 1206.0001220703125, "completions/min_length": 695.0, "completions/min_terminated_length": 695.0, "epoch": 0.061714285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.23383355140686035, "learning_rate": 9.999013075636804e-07, "loss": -0.0, "num_tokens": 6812720.0, "reward": -0.14257444441318512, "reward_std": 0.29668545722961426, "rewards/cosine_scaled_reward/mean": -0.14257442951202393, "rewards/cosine_scaled_reward/std": 0.4257228672504425, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1449.0, "completions/mean_length": 1754.640625, "completions/mean_terminated_length": 874.5625, "completions/min_length": 581.0, "completions/min_terminated_length": 581.0, "epoch": 0.06285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.23320119082927704, "learning_rate": 9.998245517681593e-07, "loss": -0.0, "num_tokens": 6935305.0, "reward": -0.14078931510448456, "reward_std": 0.17466726899147034, "rewards/cosine_scaled_reward/mean": -0.14078931510448456, "rewards/cosine_scaled_reward/std": 0.3331747353076935, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1918.0, "completions/mean_length": 1853.78125, "completions/mean_terminated_length": 918.0, "completions/min_length": 571.0, "completions/min_terminated_length": 571.0, "epoch": 0.064, "frac_reward_zero_std": 0.0, "grad_norm": 0.23405365645885468, "learning_rate": 9.997258721585931e-07, "loss": -0.0, "num_tokens": 7064907.0, "reward": -0.11611534655094147, "reward_std": 0.19285616278648376, "rewards/cosine_scaled_reward/mean": -0.11611534655094147, "rewards/cosine_scaled_reward/std": 0.47406119108200073, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1851.0, "completions/mean_length": 1971.640625, "completions/mean_terminated_length": 1437.125, "completions/min_length": 1009.0, "completions/min_terminated_length": 1009.0, "epoch": 0.06514285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.20449356734752655, "learning_rate": 9.996052735444862e-07, "loss": 0.0, "num_tokens": 7202660.0, "reward": -0.27627938985824585, "reward_std": 0.2080146074295044, "rewards/cosine_scaled_reward/mean": -0.27627938985824585, "rewards/cosine_scaled_reward/std": 0.2397139072418213, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1813.0, "completions/mean_length": 1678.09375, "completions/mean_terminated_length": 971.9091186523438, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "epoch": 0.06628571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.251164048910141, "learning_rate": 9.994627618036452e-07, "loss": -0.0, "num_tokens": 7320154.0, "reward": -0.1333095282316208, "reward_std": 0.27265745401382446, "rewards/cosine_scaled_reward/mean": -0.1333095282316208, "rewards/cosine_scaled_reward/std": 0.3821713328361511, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1453.0, "completions/mean_length": 1732.171875, "completions/mean_terminated_length": 859.0, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 0.06742857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.22478283941745758, "learning_rate": 9.992983438818915e-07, "loss": -0.0, "num_tokens": 7441477.0, "reward": -0.18278491497039795, "reward_std": 0.2154037207365036, "rewards/cosine_scaled_reward/mean": -0.18278491497039795, "rewards/cosine_scaled_reward/std": 0.3414745628833771, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 1798.375, "completions/mean_terminated_length": 982.9334106445312, "completions/min_length": 613.0, "completions/min_terminated_length": 613.0, "epoch": 0.06857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.22602440416812897, "learning_rate": 9.991120277927223e-07, "loss": -0.0, "num_tokens": 7567461.0, "reward": -0.265900194644928, "reward_std": 0.1530904918909073, "rewards/cosine_scaled_reward/mean": -0.265900194644928, "rewards/cosine_scaled_reward/std": 0.18254056572914124, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 1950.578125, "completions/mean_terminated_length": 1424.5, "completions/min_length": 808.0, "completions/min_terminated_length": 808.0, "epoch": 0.06971428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.22684067487716675, "learning_rate": 9.989038226169207e-07, "loss": -0.0, "num_tokens": 7703818.0, "reward": -0.05269922316074371, "reward_std": 0.3038993775844574, "rewards/cosine_scaled_reward/mean": -0.052699219435453415, "rewards/cosine_scaled_reward/std": 0.36445698142051697, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 1717.703125, "completions/mean_terminated_length": 1041.3809814453125, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 0.07085714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.23552638292312622, "learning_rate": 9.98673738502114e-07, "loss": 0.0, "num_tokens": 7823983.0, "reward": -0.07779724895954132, "reward_std": 0.2913648784160614, "rewards/cosine_scaled_reward/mean": -0.07779725641012192, "rewards/cosine_scaled_reward/std": 0.4099881649017334, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1751.0, "completions/mean_length": 1600.625, "completions/mean_terminated_length": 1180.3636474609375, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 0.072, "frac_reward_zero_std": 0.0, "grad_norm": 0.28230276703834534, "learning_rate": 9.98421786662277e-07, "loss": 0.0, "num_tokens": 7936679.0, "reward": -0.02632874622941017, "reward_std": 0.25066205859184265, "rewards/cosine_scaled_reward/mean": -0.02632874995470047, "rewards/cosine_scaled_reward/std": 0.4263686537742615, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1743.578125, "completions/mean_terminated_length": 1073.8499755859375, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.07314285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.266590416431427, "learning_rate": 9.981479793771866e-07, "loss": 0.0, "num_tokens": 8059220.0, "reward": -0.10920079052448273, "reward_std": 0.3089619576931, "rewards/cosine_scaled_reward/mean": -0.10920079052448273, "rewards/cosine_scaled_reward/std": 0.43342384696006775, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1127.0, "completions/mean_length": 1690.609375, "completions/mean_terminated_length": 618.4375, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.07428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.2891872525215149, "learning_rate": 9.97852329991824e-07, "loss": 0.0, "num_tokens": 8178123.0, "reward": -0.2091352641582489, "reward_std": 0.18792679905891418, "rewards/cosine_scaled_reward/mean": -0.2091352641582489, "rewards/cosine_scaled_reward/std": 0.40636762976646423, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1529.0, "completions/mean_length": 1320.453125, "completions/mean_terminated_length": 678.5, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.07542857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.30139341950416565, "learning_rate": 9.975348529157229e-07, "loss": -0.0, "num_tokens": 8272864.0, "reward": -0.012375831604003906, "reward_std": 0.2539718747138977, "rewards/cosine_scaled_reward/mean": -0.01237582415342331, "rewards/cosine_scaled_reward/std": 0.45652061700820923, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1368.0, "completions/mean_length": 2001.21875, "completions/mean_terminated_length": 1050.0, "completions/min_length": 817.0, "completions/min_terminated_length": 817.0, "epoch": 0.07657142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.21435414254665375, "learning_rate": 9.971955636222684e-07, "loss": 0.0, "num_tokens": 8411678.0, "reward": -0.27966073155403137, "reward_std": 0.14496129751205444, "rewards/cosine_scaled_reward/mean": -0.27966073155403137, "rewards/cosine_scaled_reward/std": 0.1733873188495636, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1963.0, "completions/mean_length": 1359.65625, "completions/mean_terminated_length": 789.3142700195312, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.07771428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.3244759440422058, "learning_rate": 9.968344786479415e-07, "loss": -0.0, "num_tokens": 8507952.0, "reward": -0.06231251358985901, "reward_std": 0.31347835063934326, "rewards/cosine_scaled_reward/mean": -0.062312521040439606, "rewards/cosine_scaled_reward/std": 0.40184450149536133, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1859.0, "completions/mean_length": 1572.78125, "completions/mean_terminated_length": 831.4400024414062, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.07885714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.3429071605205536, "learning_rate": 9.964516155915151e-07, "loss": 0.0, "num_tokens": 8618954.0, "reward": -0.24097035825252533, "reward_std": 0.22784993052482605, "rewards/cosine_scaled_reward/mean": -0.24097035825252533, "rewards/cosine_scaled_reward/std": 0.2594495415687561, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1921.0, "completions/mean_length": 1859.578125, "completions/mean_terminated_length": 951.727294921875, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 0.08, "frac_reward_zero_std": 0.0, "grad_norm": 0.221941277384758, "learning_rate": 9.960469931131936e-07, "loss": -0.0, "num_tokens": 8749423.0, "reward": -0.27105003595352173, "reward_std": 0.16835230588912964, "rewards/cosine_scaled_reward/mean": -0.27105003595352173, "rewards/cosine_scaled_reward/std": 0.21196867525577545, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1629.0, "completions/mean_length": 1668.265625, "completions/mean_terminated_length": 832.8500366210938, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.08114285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2909034192562103, "learning_rate": 9.956206309337066e-07, "loss": -0.0, "num_tokens": 8866912.0, "reward": -0.09497882425785065, "reward_std": 0.2813299000263214, "rewards/cosine_scaled_reward/mean": -0.09497880935668945, "rewards/cosine_scaled_reward/std": 0.4832696318626404, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1903.0, "completions/mean_length": 1697.671875, "completions/mean_terminated_length": 926.9500122070312, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.08228571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.3016415238380432, "learning_rate": 9.951725498333448e-07, "loss": -0.0, "num_tokens": 8985915.0, "reward": -0.22967606782913208, "reward_std": 0.18875859677791595, "rewards/cosine_scaled_reward/mean": -0.2296760529279709, "rewards/cosine_scaled_reward/std": 0.22012120485305786, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 2020.703125, "completions/mean_terminated_length": 1465.666748046875, "completions/min_length": 1143.0, "completions/min_terminated_length": 1143.0, "epoch": 0.08342857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.21586637198925018, "learning_rate": 9.947027716509488e-07, "loss": 0.0, "num_tokens": 9125968.0, "reward": -0.24284613132476807, "reward_std": 0.22862236201763153, "rewards/cosine_scaled_reward/mean": -0.24284613132476807, "rewards/cosine_scaled_reward/std": 0.24740919470787048, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 1975.09375, "completions/mean_terminated_length": 1381.4285888671875, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 0.08457142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.21680164337158203, "learning_rate": 9.942113192828444e-07, "loss": 0.0, "num_tokens": 9262302.0, "reward": -0.1543380469083786, "reward_std": 0.24083258211612701, "rewards/cosine_scaled_reward/mean": -0.1543380618095398, "rewards/cosine_scaled_reward/std": 0.3356986939907074, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1836.3125, "completions/mean_terminated_length": 1295.3333740234375, "completions/min_length": 653.0, "completions/min_terminated_length": 653.0, "epoch": 0.08571428571428572, "frac_reward_zero_std": 0.125, "grad_norm": 0.1845395565032959, "learning_rate": 9.93698216681727e-07, "loss": 0.0, "num_tokens": 9390786.0, "reward": -0.12792138755321503, "reward_std": 0.10224759578704834, "rewards/cosine_scaled_reward/mean": -0.12792138755321503, "rewards/cosine_scaled_reward/std": 0.4530969560146332, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1422.0, "completions/mean_length": 1764.109375, "completions/mean_terminated_length": 836.7333984375, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.08685714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.26535236835479736, "learning_rate": 9.931634888554935e-07, "loss": -0.0, "num_tokens": 9514089.0, "reward": -0.27717918157577515, "reward_std": 0.19932743906974792, "rewards/cosine_scaled_reward/mean": -0.27717918157577515, "rewards/cosine_scaled_reward/std": 0.20844916999340057, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1864.0, "completions/mean_length": 1945.109375, "completions/mean_terminated_length": 1224.875, "completions/min_length": 702.0, "completions/min_terminated_length": 702.0, "epoch": 0.088, "frac_reward_zero_std": 0.0, "grad_norm": 0.2047174870967865, "learning_rate": 9.926071618660237e-07, "loss": -0.0, "num_tokens": 9650152.0, "reward": -0.09873012453317642, "reward_std": 0.22244854271411896, "rewards/cosine_scaled_reward/mean": -0.09873010218143463, "rewards/cosine_scaled_reward/std": 0.34491515159606934, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1976.0, "completions/mean_length": 1928.703125, "completions/mean_terminated_length": 1199.6666259765625, "completions/min_length": 722.0, "completions/min_terminated_length": 722.0, "epoch": 0.08914285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.22559019923210144, "learning_rate": 9.9202926282791e-07, "loss": 0.0, "num_tokens": 9784309.0, "reward": -0.09572747349739075, "reward_std": 0.23068635165691376, "rewards/cosine_scaled_reward/mean": -0.09572747349739075, "rewards/cosine_scaled_reward/std": 0.38660773634910583, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1864.0, "completions/mean_length": 1508.40625, "completions/mean_terminated_length": 814.6428833007812, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.09028571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.24668477475643158, "learning_rate": 9.91429819907136e-07, "loss": 0.0, "num_tokens": 9890943.0, "reward": -0.1618795394897461, "reward_std": 0.22540031373500824, "rewards/cosine_scaled_reward/mean": -0.1618795245885849, "rewards/cosine_scaled_reward/std": 0.3233039081096649, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 2012.671875, "completions/mean_terminated_length": 1725.0001220703125, "completions/min_length": 1283.0, "completions/min_terminated_length": 1283.0, "epoch": 0.09142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.24282054603099823, "learning_rate": 9.908088623197048e-07, "loss": -0.0, "num_tokens": 10030146.0, "reward": -0.25591158866882324, "reward_std": 0.15104801952838898, "rewards/cosine_scaled_reward/mean": -0.25591158866882324, "rewards/cosine_scaled_reward/std": 0.18741995096206665, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1878.0, "completions/mean_length": 1821.921875, "completions/mean_terminated_length": 935.0000610351562, "completions/min_length": 580.0, "completions/min_terminated_length": 580.0, "epoch": 0.09257142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.3027254641056061, "learning_rate": 9.901664203302124e-07, "loss": 0.0, "num_tokens": 10158021.0, "reward": -0.15331333875656128, "reward_std": 0.18424856662750244, "rewards/cosine_scaled_reward/mean": -0.15331333875656128, "rewards/cosine_scaled_reward/std": 0.24023762345314026, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1852.0, "completions/mean_length": 1734.28125, "completions/mean_terminated_length": 991.26318359375, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 0.09371428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.2466808557510376, "learning_rate": 9.895025252503755e-07, "loss": -0.0, "num_tokens": 10279343.0, "reward": -0.07192108780145645, "reward_std": 0.2587333917617798, "rewards/cosine_scaled_reward/mean": -0.07192108780145645, "rewards/cosine_scaled_reward/std": 0.46087121963500977, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1571.0, "completions/mean_length": 1688.78125, "completions/mean_terminated_length": 953.2380981445312, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 0.09485714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.2600877285003662, "learning_rate": 9.888172094375033e-07, "loss": 0.0, "num_tokens": 10398513.0, "reward": -0.1718086451292038, "reward_std": 0.2223512828350067, "rewards/cosine_scaled_reward/mean": -0.1718086451292038, "rewards/cosine_scaled_reward/std": 0.2828122675418854, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1666.0, "completions/mean_length": 1838.203125, "completions/mean_terminated_length": 705.2999877929688, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.096, "frac_reward_zero_std": 0.0, "grad_norm": 0.22531215846538544, "learning_rate": 9.881105062929221e-07, "loss": 0.0, "num_tokens": 10526854.0, "reward": -0.2154863476753235, "reward_std": 0.261901319026947, "rewards/cosine_scaled_reward/mean": -0.2154863476753235, "rewards/cosine_scaled_reward/std": 0.29268571734428406, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1926.34375, "completions/mean_terminated_length": 1399.166748046875, "completions/min_length": 880.0, "completions/min_terminated_length": 880.0, "epoch": 0.09714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.19031891226768494, "learning_rate": 9.873824502603459e-07, "loss": -0.0, "num_tokens": 10660460.0, "reward": -0.21009978652000427, "reward_std": 0.19575349986553192, "rewards/cosine_scaled_reward/mean": -0.21009978652000427, "rewards/cosine_scaled_reward/std": 0.2456056773662567, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1920.0, "completions/mean_length": 1713.21875, "completions/mean_terminated_length": 787.6470336914062, "completions/min_length": 547.0, "completions/min_terminated_length": 547.0, "epoch": 0.09828571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.258359432220459, "learning_rate": 9.866330768241983e-07, "loss": -0.0, "num_tokens": 10780962.0, "reward": -0.1955144852399826, "reward_std": 0.24323132634162903, "rewards/cosine_scaled_reward/mean": -0.1955144852399826, "rewards/cosine_scaled_reward/std": 0.3071554899215698, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1815.0, "completions/mean_length": 1656.0, "completions/mean_terminated_length": 1002.6666870117188, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 0.09942857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.2636864185333252, "learning_rate": 9.85862422507884e-07, "loss": 0.0, "num_tokens": 10897066.0, "reward": -0.1988150179386139, "reward_std": 0.24088150262832642, "rewards/cosine_scaled_reward/mean": -0.1988150179386139, "rewards/cosine_scaled_reward/std": 0.2925129532814026, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1802.0, "completions/mean_length": 1755.234375, "completions/mean_terminated_length": 1061.8421630859375, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 0.10057142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.29112017154693604, "learning_rate": 9.850705248720068e-07, "loss": 0.0, "num_tokens": 11019913.0, "reward": -0.02967459335923195, "reward_std": 0.3240855932235718, "rewards/cosine_scaled_reward/mean": -0.029674597084522247, "rewards/cosine_scaled_reward/std": 0.3718070983886719, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1768.0, "completions/mean_length": 1752.78125, "completions/mean_terminated_length": 1148.2857666015625, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "epoch": 0.10171428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.2357943207025528, "learning_rate": 9.8425742251254e-07, "loss": -0.0, "num_tokens": 11143091.0, "reward": -0.1188301220536232, "reward_std": 0.296513170003891, "rewards/cosine_scaled_reward/mean": -0.1188301220536232, "rewards/cosine_scaled_reward/std": 0.3878798484802246, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1693.0, "completions/mean_length": 1633.84375, "completions/mean_terminated_length": 1101.357177734375, "completions/min_length": 568.0, "completions/min_terminated_length": 568.0, "epoch": 0.10285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.32384219765663147, "learning_rate": 9.83423155058946e-07, "loss": -0.0, "num_tokens": 11257657.0, "reward": -0.22837099432945251, "reward_std": 0.18625205755233765, "rewards/cosine_scaled_reward/mean": -0.22837099432945251, "rewards/cosine_scaled_reward/std": 0.23636196553707123, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1833.0, "completions/mean_length": 1847.21875, "completions/mean_terminated_length": 1244.875, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "epoch": 0.104, "frac_reward_zero_std": 0.0, "grad_norm": 0.24563109874725342, "learning_rate": 9.825677631722435e-07, "loss": -0.0, "num_tokens": 11386447.0, "reward": -0.11780542880296707, "reward_std": 0.3100074827671051, "rewards/cosine_scaled_reward/mean": -0.11780542135238647, "rewards/cosine_scaled_reward/std": 0.39149248600006104, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1462.0, "completions/mean_length": 1595.125, "completions/mean_terminated_length": 888.6399536132812, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 0.10514285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.2472057044506073, "learning_rate": 9.816912885430258e-07, "loss": -0.0, "num_tokens": 11498527.0, "reward": -0.2128506749868393, "reward_std": 0.20926561951637268, "rewards/cosine_scaled_reward/mean": -0.2128506898880005, "rewards/cosine_scaled_reward/std": 0.23348061740398407, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1221.0, "completions/mean_length": 1979.953125, "completions/mean_terminated_length": 959.25, "completions/min_length": 822.0, "completions/min_terminated_length": 822.0, "epoch": 0.10628571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.2550150454044342, "learning_rate": 9.807937738894303e-07, "loss": -0.0, "num_tokens": 11636588.0, "reward": -0.2922024428844452, "reward_std": 0.1515069603919983, "rewards/cosine_scaled_reward/mean": -0.2922024726867676, "rewards/cosine_scaled_reward/std": 0.18899379670619965, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1544.0, "completions/mean_length": 1830.609375, "completions/mean_terminated_length": 977.769287109375, "completions/min_length": 533.0, "completions/min_terminated_length": 533.0, "epoch": 0.10742857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.27164825797080994, "learning_rate": 9.798752629550546e-07, "loss": -0.0, "num_tokens": 11763515.0, "reward": -0.18001651763916016, "reward_std": 0.18973413109779358, "rewards/cosine_scaled_reward/mean": -0.18001650273799896, "rewards/cosine_scaled_reward/std": 0.4316568076610565, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 2004.671875, "completions/mean_terminated_length": 1493.4000244140625, "completions/min_length": 960.0, "completions/min_terminated_length": 960.0, "epoch": 0.10857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.20861269533634186, "learning_rate": 9.78935800506826e-07, "loss": -0.0, "num_tokens": 11902342.0, "reward": -0.24148261547088623, "reward_std": 0.18629083037376404, "rewards/cosine_scaled_reward/mean": -0.24148263037204742, "rewards/cosine_scaled_reward/std": 0.23122739791870117, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1298.0, "completions/mean_length": 1703.359375, "completions/mean_terminated_length": 945.1500244140625, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 0.10971428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.2585296928882599, "learning_rate": 9.779754323328192e-07, "loss": 0.0, "num_tokens": 12022493.0, "reward": -0.11465626955032349, "reward_std": 0.24939197301864624, "rewards/cosine_scaled_reward/mean": -0.11465626955032349, "rewards/cosine_scaled_reward/std": 0.4384477138519287, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1809.0, "completions/mean_length": 1819.921875, "completions/mean_terminated_length": 1135.6875, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.11085714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.3019813299179077, "learning_rate": 9.769942052400235e-07, "loss": 0.0, "num_tokens": 12149232.0, "reward": -0.18846748769283295, "reward_std": 0.2666187584400177, "rewards/cosine_scaled_reward/mean": -0.18846750259399414, "rewards/cosine_scaled_reward/std": 0.3043021559715271, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 1677.296875, "completions/mean_terminated_length": 1099.0, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.112, "frac_reward_zero_std": 0.0, "grad_norm": 0.2722402513027191, "learning_rate": 9.759921670520634e-07, "loss": 0.0, "num_tokens": 12267643.0, "reward": -0.09557384252548218, "reward_std": 0.2643275558948517, "rewards/cosine_scaled_reward/mean": -0.09557383507490158, "rewards/cosine_scaled_reward/std": 0.3361329138278961, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1141.0, "completions/mean_length": 1716.59375, "completions/mean_terminated_length": 634.0000610351562, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.11314285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2789485454559326, "learning_rate": 9.749693666068663e-07, "loss": -0.0, "num_tokens": 12388673.0, "reward": -0.11132554709911346, "reward_std": 0.1736970841884613, "rewards/cosine_scaled_reward/mean": -0.11132554709911346, "rewards/cosine_scaled_reward/std": 0.38663193583488464, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1906.0, "completions/mean_length": 1627.78125, "completions/mean_terminated_length": 927.4166870117188, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.11428571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.2479974329471588, "learning_rate": 9.739258537542835e-07, "loss": 0.0, "num_tokens": 12502563.0, "reward": 0.05247430503368378, "reward_std": 0.2633323669433594, "rewards/cosine_scaled_reward/mean": 0.05247429758310318, "rewards/cosine_scaled_reward/std": 0.44700634479522705, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1780.0, "completions/mean_length": 1684.75, "completions/mean_terminated_length": 1037.2174072265625, "completions/min_length": 555.0, "completions/min_terminated_length": 555.0, "epoch": 0.11542857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.2880499064922333, "learning_rate": 9.728616793536587e-07, "loss": -0.0, "num_tokens": 12621819.0, "reward": -0.09590694308280945, "reward_std": 0.21176990866661072, "rewards/cosine_scaled_reward/mean": -0.09590694308280945, "rewards/cosine_scaled_reward/std": 0.426421195268631, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1585.0, "completions/mean_length": 1361.265625, "completions/mean_terminated_length": 860.1351318359375, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 0.11657142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.2874862551689148, "learning_rate": 9.717768952713511e-07, "loss": -0.0, "num_tokens": 12719092.0, "reward": -0.19330359995365143, "reward_std": 0.1932550072669983, "rewards/cosine_scaled_reward/mean": -0.19330358505249023, "rewards/cosine_scaled_reward/std": 0.34549427032470703, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1384.0, "completions/mean_length": 1687.90625, "completions/mean_terminated_length": 607.625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.11771428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.29745906591415405, "learning_rate": 9.706715543782064e-07, "loss": -0.0, "num_tokens": 12837470.0, "reward": -0.2588111162185669, "reward_std": 0.26013171672821045, "rewards/cosine_scaled_reward/mean": -0.2588111162185669, "rewards/cosine_scaled_reward/std": 0.32377612590789795, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1627.0, "completions/mean_length": 1679.59375, "completions/mean_terminated_length": 925.2380981445312, "completions/min_length": 585.0, "completions/min_terminated_length": 585.0, "epoch": 0.11885714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.27166086435317993, "learning_rate": 9.695457105469804e-07, "loss": -0.0, "num_tokens": 12955428.0, "reward": -0.17275363206863403, "reward_std": 0.20137225091457367, "rewards/cosine_scaled_reward/mean": -0.17275363206863403, "rewards/cosine_scaled_reward/std": 0.2731510400772095, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1568.203125, "completions/mean_terminated_length": 819.719970703125, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "epoch": 0.12, "frac_reward_zero_std": 0.0, "grad_norm": 0.26759475469589233, "learning_rate": 9.683994186497132e-07, "loss": -0.0, "num_tokens": 13067081.0, "reward": -0.1266355961561203, "reward_std": 0.3027850389480591, "rewards/cosine_scaled_reward/mean": -0.1266355961561203, "rewards/cosine_scaled_reward/std": 0.4276663362979889, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1958.0, "completions/mean_length": 1432.09375, "completions/mean_terminated_length": 816.1875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.12114285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.2912415862083435, "learning_rate": 9.672327345550543e-07, "loss": 0.0, "num_tokens": 13169567.0, "reward": 0.052130524069070816, "reward_std": 0.30294427275657654, "rewards/cosine_scaled_reward/mean": 0.052130527794361115, "rewards/cosine_scaled_reward/std": 0.43769362568855286, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1721.28125, "completions/mean_terminated_length": 1097.5455322265625, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.12228571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.26628872752189636, "learning_rate": 9.66045715125541e-07, "loss": 0.0, "num_tokens": 13290881.0, "reward": -0.18292994797229767, "reward_std": 0.25176504254341125, "rewards/cosine_scaled_reward/mean": -0.18292994797229767, "rewards/cosine_scaled_reward/std": 0.33385229110717773, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1916.0, "completions/mean_length": 1518.6875, "completions/mean_terminated_length": 989.375, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 0.12342857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.25796031951904297, "learning_rate": 9.648384182148252e-07, "loss": -0.0, "num_tokens": 13398437.0, "reward": -0.17732736468315125, "reward_std": 0.32095974683761597, "rewards/cosine_scaled_reward/mean": -0.17732736468315125, "rewards/cosine_scaled_reward/std": 0.3682377338409424, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1963.0, "completions/mean_length": 1871.890625, "completions/mean_terminated_length": 1108.75, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "epoch": 0.12457142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2274676412343979, "learning_rate": 9.636109026648554e-07, "loss": 0.0, "num_tokens": 13529486.0, "reward": -0.13115660846233368, "reward_std": 0.15383467078208923, "rewards/cosine_scaled_reward/mean": -0.13115662336349487, "rewards/cosine_scaled_reward/std": 0.4183727204799652, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1941.0, "completions/mean_length": 1584.125, "completions/mean_terminated_length": 811.0, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.12571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.2768951952457428, "learning_rate": 9.623632283030077e-07, "loss": -0.0, "num_tokens": 13641646.0, "reward": -0.27792292833328247, "reward_std": 0.18945851922035217, "rewards/cosine_scaled_reward/mean": -0.27792292833328247, "rewards/cosine_scaled_reward/std": 0.20238204300403595, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1709.1875, "completions/mean_terminated_length": 1062.3636474609375, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 0.12685714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.24532362818717957, "learning_rate": 9.610954559391704e-07, "loss": -0.0, "num_tokens": 13761154.0, "reward": -0.0890636295080185, "reward_std": 0.33067381381988525, "rewards/cosine_scaled_reward/mean": -0.0890636295080185, "rewards/cosine_scaled_reward/std": 0.40376362204551697, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 1895.234375, "completions/mean_terminated_length": 1436.9375, "completions/min_length": 783.0, "completions/min_terminated_length": 783.0, "epoch": 0.128, "frac_reward_zero_std": 0.0, "grad_norm": 0.22462251782417297, "learning_rate": 9.598076473627796e-07, "loss": -0.0, "num_tokens": 13893545.0, "reward": -0.1325383186340332, "reward_std": 0.330952525138855, "rewards/cosine_scaled_reward/mean": -0.1325383186340332, "rewards/cosine_scaled_reward/std": 0.4280668795108795, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1695.0, "completions/mean_length": 1606.890625, "completions/mean_terminated_length": 871.7083740234375, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.12914285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.3009057939052582, "learning_rate": 9.58499865339809e-07, "loss": 0.0, "num_tokens": 14006682.0, "reward": -0.05043189600110054, "reward_std": 0.300018846988678, "rewards/cosine_scaled_reward/mean": -0.050431910902261734, "rewards/cosine_scaled_reward/std": 0.43634143471717834, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1898.0, "completions/mean_length": 1562.515625, "completions/mean_terminated_length": 753.375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.13028571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.37847185134887695, "learning_rate": 9.571721736097088e-07, "loss": 0.0, "num_tokens": 14116531.0, "reward": -0.27539706230163574, "reward_std": 0.18451666831970215, "rewards/cosine_scaled_reward/mean": -0.27539709210395813, "rewards/cosine_scaled_reward/std": 0.23580753803253174, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1656.484375, "completions/mean_terminated_length": 958.5652465820312, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.13142857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.26879096031188965, "learning_rate": 9.55824636882301e-07, "loss": 0.0, "num_tokens": 14233762.0, "reward": -0.058682698756456375, "reward_std": 0.2945008873939514, "rewards/cosine_scaled_reward/mean": -0.05868269130587578, "rewards/cosine_scaled_reward/std": 0.40092962980270386, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1413.0, "completions/mean_length": 1924.84375, "completions/mean_terminated_length": 734.3333740234375, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.13257142857142856, "frac_reward_zero_std": 0.0, "grad_norm": 0.2654048800468445, "learning_rate": 9.54457320834625e-07, "loss": 0.0, "num_tokens": 14368336.0, "reward": -0.2030428647994995, "reward_std": 0.18692326545715332, "rewards/cosine_scaled_reward/mean": -0.2030428647994995, "rewards/cosine_scaled_reward/std": 0.2246093899011612, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1420.0, "completions/mean_length": 1785.484375, "completions/mean_terminated_length": 997.9375, "completions/min_length": 549.0, "completions/min_terminated_length": 549.0, "epoch": 0.1337142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.26004910469055176, "learning_rate": 9.530702921077358e-07, "loss": -0.0, "num_tokens": 14493631.0, "reward": -0.19770082831382751, "reward_std": 0.25534579157829285, "rewards/cosine_scaled_reward/mean": -0.19770082831382751, "rewards/cosine_scaled_reward/std": 0.33773326873779297, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1906.0, "completions/mean_length": 1802.84375, "completions/mean_terminated_length": 1067.375, "completions/min_length": 554.0, "completions/min_terminated_length": 554.0, "epoch": 0.13485714285714287, "frac_reward_zero_std": 0.0, "grad_norm": 0.22992977499961853, "learning_rate": 9.516636183034564e-07, "loss": 0.0, "num_tokens": 14619549.0, "reward": -0.011579632759094238, "reward_std": 0.3697226643562317, "rewards/cosine_scaled_reward/mean": -0.011579625308513641, "rewards/cosine_scaled_reward/std": 0.4647332727909088, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 1343.578125, "completions/mean_terminated_length": 920.9249877929688, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.136, "frac_reward_zero_std": 0.0, "grad_norm": 0.3279743492603302, "learning_rate": 9.502373679810839e-07, "loss": -0.0, "num_tokens": 14715946.0, "reward": -0.0004618987441062927, "reward_std": 0.27856603264808655, "rewards/cosine_scaled_reward/mean": -0.0004618987441062927, "rewards/cosine_scaled_reward/std": 0.45174649357795715, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1546.0, "completions/mean_length": 1286.75, "completions/mean_terminated_length": 859.707275390625, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.13714285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.3185117244720459, "learning_rate": 9.487916106540465e-07, "loss": -0.0, "num_tokens": 14808754.0, "reward": -0.06128609925508499, "reward_std": 0.3139324188232422, "rewards/cosine_scaled_reward/mean": -0.06128609925508499, "rewards/cosine_scaled_reward/std": 0.46217504143714905, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1103.90625, "completions/mean_terminated_length": 789.2083740234375, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.1382857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.3791055381298065, "learning_rate": 9.473264167865171e-07, "loss": 0.0, "num_tokens": 14889100.0, "reward": -0.012373358011245728, "reward_std": 0.3332873284816742, "rewards/cosine_scaled_reward/mean": -0.012373358011245728, "rewards/cosine_scaled_reward/std": 0.4969451427459717, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1765.0625, "completions/mean_terminated_length": 1042.0, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.13942857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.27713218331336975, "learning_rate": 9.458418577899774e-07, "loss": -0.0, "num_tokens": 15013624.0, "reward": -0.1387348771095276, "reward_std": 0.25947195291519165, "rewards/cosine_scaled_reward/mean": -0.1387348771095276, "rewards/cosine_scaled_reward/std": 0.3304338753223419, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1576.28125, "completions/mean_terminated_length": 1006.9655151367188, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 0.14057142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2664856016635895, "learning_rate": 9.443380060197385e-07, "loss": 0.0, "num_tokens": 15124738.0, "reward": -0.18317654728889465, "reward_std": 0.16592136025428772, "rewards/cosine_scaled_reward/mean": -0.18317654728889465, "rewards/cosine_scaled_reward/std": 0.33475980162620544, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1814.0, "completions/mean_length": 1395.78125, "completions/mean_terminated_length": 888.5, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 0.1417142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2889535427093506, "learning_rate": 9.428149347714143e-07, "loss": 0.0, "num_tokens": 15225020.0, "reward": -0.12295320630073547, "reward_std": 0.30637824535369873, "rewards/cosine_scaled_reward/mean": -0.12295320630073547, "rewards/cosine_scaled_reward/std": 0.4125574827194214, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1622.8125, "completions/mean_terminated_length": 914.1666870117188, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.14285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.24003510177135468, "learning_rate": 9.412727182773486e-07, "loss": -0.0, "num_tokens": 15339808.0, "reward": -0.06917156279087067, "reward_std": 0.19467812776565552, "rewards/cosine_scaled_reward/mean": -0.06917153298854828, "rewards/cosine_scaled_reward/std": 0.44139373302459717, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1735.96875, "completions/mean_terminated_length": 1097.047607421875, "completions/min_length": 610.0, "completions/min_terminated_length": 610.0, "epoch": 0.144, "frac_reward_zero_std": 0.0, "grad_norm": 0.23693455755710602, "learning_rate": 9.397114317029974e-07, "loss": -0.0, "num_tokens": 15462206.0, "reward": -0.15823431313037872, "reward_std": 0.26196378469467163, "rewards/cosine_scaled_reward/mean": -0.15823431313037872, "rewards/cosine_scaled_reward/std": 0.3110467195510864, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 1729.421875, "completions/mean_terminated_length": 1161.521728515625, "completions/min_length": 655.0, "completions/min_terminated_length": 655.0, "epoch": 0.14514285714285713, "frac_reward_zero_std": 0.0, "grad_norm": 0.23715488612651825, "learning_rate": 9.381311511432658e-07, "loss": -0.0, "num_tokens": 15583985.0, "reward": -0.2520313262939453, "reward_std": 0.1912405639886856, "rewards/cosine_scaled_reward/mean": -0.2520313262939453, "rewards/cosine_scaled_reward/std": 0.276276558637619, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1614.125, "completions/mean_terminated_length": 1090.4827880859375, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.1462857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.25245338678359985, "learning_rate": 9.36531953618799e-07, "loss": -0.0, "num_tokens": 15697641.0, "reward": 0.029929369688034058, "reward_std": 0.2960119843482971, "rewards/cosine_scaled_reward/mean": 0.029929369688034058, "rewards/cosine_scaled_reward/std": 0.40772902965545654, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1810.0, "completions/mean_length": 1892.953125, "completions/mean_terminated_length": 945.4444580078125, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 0.14742857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.22934643924236298, "learning_rate": 9.34913917072228e-07, "loss": -0.0, "num_tokens": 15829494.0, "reward": -0.27538371086120605, "reward_std": 0.2161153256893158, "rewards/cosine_scaled_reward/mean": -0.27538371086120605, "rewards/cosine_scaled_reward/std": 0.25140947103500366, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1384.0, "completions/mean_length": 1631.5625, "completions/mean_terminated_length": 889.2174072265625, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "epoch": 0.14857142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.2312338650226593, "learning_rate": 9.332771203643714e-07, "loss": -0.0, "num_tokens": 15944418.0, "reward": -0.16326984763145447, "reward_std": 0.22974258661270142, "rewards/cosine_scaled_reward/mean": -0.16326983273029327, "rewards/cosine_scaled_reward/std": 0.3127349317073822, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1709.0, "completions/mean_length": 1549.453125, "completions/mean_terminated_length": 820.8077392578125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.14971428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.28737154603004456, "learning_rate": 9.316216432703916e-07, "loss": 0.0, "num_tokens": 16053319.0, "reward": -0.060378547757864, "reward_std": 0.23251818120479584, "rewards/cosine_scaled_reward/mean": -0.060378558933734894, "rewards/cosine_scaled_reward/std": 0.4743967354297638, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1667.0, "completions/mean_length": 1536.859375, "completions/mean_terminated_length": 957.5667114257812, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.15085714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.24873872101306915, "learning_rate": 9.299475664759068e-07, "loss": 0.0, "num_tokens": 16162742.0, "reward": -0.10933490097522736, "reward_std": 0.2869688868522644, "rewards/cosine_scaled_reward/mean": -0.10933491587638855, "rewards/cosine_scaled_reward/std": 0.45436573028564453, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 1817.453125, "completions/mean_terminated_length": 1125.8125, "completions/min_length": 526.0, "completions/min_terminated_length": 526.0, "epoch": 0.152, "frac_reward_zero_std": 0.0, "grad_norm": 0.2753625810146332, "learning_rate": 9.282549715730579e-07, "loss": 0.0, "num_tokens": 16290283.0, "reward": -0.1931842416524887, "reward_std": 0.2315790057182312, "rewards/cosine_scaled_reward/mean": -0.1931842565536499, "rewards/cosine_scaled_reward/std": 0.26366862654685974, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1907.0, "completions/mean_length": 1685.390625, "completions/mean_terminated_length": 1119.719970703125, "completions/min_length": 660.0, "completions/min_terminated_length": 660.0, "epoch": 0.15314285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.25077056884765625, "learning_rate": 9.265439410565328e-07, "loss": -0.0, "num_tokens": 16408716.0, "reward": -0.1305551677942276, "reward_std": 0.15626969933509827, "rewards/cosine_scaled_reward/mean": -0.1305551677942276, "rewards/cosine_scaled_reward/std": 0.35703787207603455, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1869.0, "completions/mean_length": 1111.578125, "completions/mean_terminated_length": 654.2557983398438, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.15428571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.3456169366836548, "learning_rate": 9.248145583195447e-07, "loss": -0.0, "num_tokens": 16490329.0, "reward": 0.08614158630371094, "reward_std": 0.3152117133140564, "rewards/cosine_scaled_reward/mean": 0.08614158630371094, "rewards/cosine_scaled_reward/std": 0.5073397159576416, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1600.0, "completions/mean_length": 1485.703125, "completions/mean_terminated_length": 848.433349609375, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.15542857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.28029024600982666, "learning_rate": 9.230669076497687e-07, "loss": -0.0, "num_tokens": 16596086.0, "reward": 0.01799224689602852, "reward_std": 0.28087177872657776, "rewards/cosine_scaled_reward/mean": 0.017992250621318817, "rewards/cosine_scaled_reward/std": 0.5039587020874023, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1632.0, "completions/mean_length": 1765.421875, "completions/mean_terminated_length": 1043.27783203125, "completions/min_length": 659.0, "completions/min_terminated_length": 659.0, "epoch": 0.15657142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.21782204508781433, "learning_rate": 9.213010742252327e-07, "loss": 0.0, "num_tokens": 16719681.0, "reward": -0.2635670304298401, "reward_std": 0.16446365416049957, "rewards/cosine_scaled_reward/mean": -0.2635670304298401, "rewards/cosine_scaled_reward/std": 0.1840340793132782, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1888.0, "completions/mean_length": 1529.734375, "completions/mean_terminated_length": 1072.441162109375, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.15771428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.26588714122772217, "learning_rate": 9.195171441101668e-07, "loss": -0.0, "num_tokens": 16828896.0, "reward": -0.08665560930967331, "reward_std": 0.23063711822032928, "rewards/cosine_scaled_reward/mean": -0.08665560930967331, "rewards/cosine_scaled_reward/std": 0.44113171100616455, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1667.8125, "completions/mean_terminated_length": 990.0869750976562, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.15885714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.2917172610759735, "learning_rate": 9.177152042508077e-07, "loss": -0.0, "num_tokens": 16946276.0, "reward": -0.19403964281082153, "reward_std": 0.2673150300979614, "rewards/cosine_scaled_reward/mean": -0.19403962790966034, "rewards/cosine_scaled_reward/std": 0.32773110270500183, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 1412.796875, "completions/mean_terminated_length": 949.270263671875, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.16, "frac_reward_zero_std": 0.125, "grad_norm": 0.28324976563453674, "learning_rate": 9.158953424711624e-07, "loss": -0.0, "num_tokens": 17046919.0, "reward": -0.13130062818527222, "reward_std": 0.13907812535762787, "rewards/cosine_scaled_reward/mean": -0.13130061328411102, "rewards/cosine_scaled_reward/std": 0.46400320529937744, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1615.0, "completions/mean_length": 1272.25, "completions/mean_terminated_length": 893.3953247070312, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 0.16114285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.28660058975219727, "learning_rate": 9.140576474687263e-07, "loss": 0.0, "num_tokens": 17138903.0, "reward": -0.044462256133556366, "reward_std": 0.3412697911262512, "rewards/cosine_scaled_reward/mean": -0.04446224868297577, "rewards/cosine_scaled_reward/std": 0.4661441445350647, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1959.0, "completions/mean_length": 1662.734375, "completions/mean_terminated_length": 1226.10009765625, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.16228571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.3415294587612152, "learning_rate": 9.122022088101613e-07, "loss": -0.0, "num_tokens": 17255822.0, "reward": -0.15457069873809814, "reward_std": 0.31260305643081665, "rewards/cosine_scaled_reward/mean": -0.15457069873809814, "rewards/cosine_scaled_reward/std": 0.3450033664703369, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1791.0, "completions/mean_length": 1441.203125, "completions/mean_terminated_length": 998.4054565429688, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.16342857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.2900330424308777, "learning_rate": 9.103291169269299e-07, "loss": 0.0, "num_tokens": 17358875.0, "reward": -0.1936979442834854, "reward_std": 0.26940327882766724, "rewards/cosine_scaled_reward/mean": -0.1936979442834854, "rewards/cosine_scaled_reward/std": 0.31407564878463745, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1560.703125, "completions/mean_terminated_length": 1008.4334106445312, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.16457142857142856, "frac_reward_zero_std": 0.0, "grad_norm": 0.29284507036209106, "learning_rate": 9.084384631108882e-07, "loss": -0.0, "num_tokens": 17470248.0, "reward": -0.14136260747909546, "reward_std": 0.2985552251338959, "rewards/cosine_scaled_reward/mean": -0.14136262238025665, "rewards/cosine_scaled_reward/std": 0.4261241853237152, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1864.0, "completions/mean_length": 1226.0, "completions/mean_terminated_length": 852.3636474609375, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.1657142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.30853384733200073, "learning_rate": 9.065303395098358e-07, "loss": -0.0, "num_tokens": 17558656.0, "reward": -0.011180020868778229, "reward_std": 0.3104313910007477, "rewards/cosine_scaled_reward/mean": -0.011180016212165356, "rewards/cosine_scaled_reward/std": 0.502927303314209, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1440.0, "completions/mean_length": 1468.8125, "completions/mean_terminated_length": 889.625, "completions/min_length": 537.0, "completions/min_terminated_length": 537.0, "epoch": 0.16685714285714287, "frac_reward_zero_std": 0.0, "grad_norm": 0.25645971298217773, "learning_rate": 9.046048391230247e-07, "loss": -0.0, "num_tokens": 17663276.0, "reward": -0.1956520974636078, "reward_std": 0.24750414490699768, "rewards/cosine_scaled_reward/mean": -0.1956520974636078, "rewards/cosine_scaled_reward/std": 0.30754002928733826, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1517.84375, "completions/mean_terminated_length": 1078.5714111328125, "completions/min_length": 615.0, "completions/min_terminated_length": 615.0, "epoch": 0.168, "frac_reward_zero_std": 0.0, "grad_norm": 0.28331542015075684, "learning_rate": 9.026620557966279e-07, "loss": -0.0, "num_tokens": 17771202.0, "reward": -0.14546620845794678, "reward_std": 0.307411253452301, "rewards/cosine_scaled_reward/mean": -0.14546619355678558, "rewards/cosine_scaled_reward/std": 0.3964070975780487, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1735.0, "completions/mean_length": 1319.75, "completions/mean_terminated_length": 882.7999877929688, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.16914285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.24973155558109283, "learning_rate": 9.007020842191634e-07, "loss": -0.0, "num_tokens": 17866850.0, "reward": -0.05917578190565109, "reward_std": 0.24221420288085938, "rewards/cosine_scaled_reward/mean": -0.05917578190565109, "rewards/cosine_scaled_reward/std": 0.39783161878585815, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1718.0, "completions/mean_length": 1641.578125, "completions/mean_terminated_length": 1007.5599975585938, "completions/min_length": 624.0, "completions/min_terminated_length": 624.0, "epoch": 0.1702857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.23923377692699432, "learning_rate": 8.987250199168808e-07, "loss": 0.0, "num_tokens": 17983807.0, "reward": -0.16958971321582794, "reward_std": 0.3115168809890747, "rewards/cosine_scaled_reward/mean": -0.16958969831466675, "rewards/cosine_scaled_reward/std": 0.4009650945663452, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1699.0, "completions/mean_length": 1294.734375, "completions/mean_terminated_length": 976.6889038085938, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.17142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.2846779525279999, "learning_rate": 8.967309592491052e-07, "loss": 0.0, "num_tokens": 18077174.0, "reward": -0.16757264733314514, "reward_std": 0.26536184549331665, "rewards/cosine_scaled_reward/mean": -0.16757264733314514, "rewards/cosine_scaled_reward/std": 0.32911255955696106, "step": 150 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 18077174, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }