{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.05714285714285714, "eval_steps": 500, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1734.0, "completions/mean_length": 1702.03125, "completions/mean_terminated_length": 993.6190795898438, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 0.001142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.28377610445022583, "learning_rate": 0.0, "loss": -0.0, "num_tokens": 118418.0, "reward": -0.09800112247467041, "reward_std": 0.3028089702129364, "rewards/cosine_scaled_reward/mean": -0.09800112992525101, "rewards/cosine_scaled_reward/std": 0.37953105568885803, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1894.0, "completions/mean_length": 1738.90625, "completions/mean_terminated_length": 949.0, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.002285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.24221572279930115, "learning_rate": 2e-08, "loss": -0.0, "num_tokens": 239748.0, "reward": 0.020556632429361343, "reward_std": 0.3545936942100525, "rewards/cosine_scaled_reward/mean": 0.020556632429361343, "rewards/cosine_scaled_reward/std": 0.4492928683757782, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1542.0, "completions/mean_length": 1964.078125, "completions/mean_terminated_length": 973.7999877929688, "completions/min_length": 733.0, "completions/min_terminated_length": 733.0, "epoch": 0.0034285714285714284, "frac_reward_zero_std": 0.0, "grad_norm": 0.2472974807024002, "learning_rate": 4e-08, "loss": 0.0, "num_tokens": 375921.0, "reward": -0.20954538881778717, "reward_std": 0.13813795149326324, "rewards/cosine_scaled_reward/mean": -0.20954540371894836, "rewards/cosine_scaled_reward/std": 0.16814909875392914, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1555.6875, "completions/mean_terminated_length": 1093.212158203125, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 0.004571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.2868657112121582, "learning_rate": 6e-08, "loss": -0.0, "num_tokens": 485293.0, "reward": -0.12192361056804657, "reward_std": 0.31710442900657654, "rewards/cosine_scaled_reward/mean": -0.12192361056804657, "rewards/cosine_scaled_reward/std": 0.35428565740585327, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1758.0, "completions/mean_length": 1958.5625, "completions/mean_terminated_length": 1332.5, "completions/min_length": 932.0, "completions/min_terminated_length": 932.0, "epoch": 0.005714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2460148185491562, "learning_rate": 8e-08, "loss": -0.0, "num_tokens": 621457.0, "reward": -0.21145480871200562, "reward_std": 0.14890719950199127, "rewards/cosine_scaled_reward/mean": -0.21145479381084442, "rewards/cosine_scaled_reward/std": 0.20399661362171173, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1670.0, "completions/mean_length": 1908.375, "completions/mean_terminated_length": 931.0, "completions/min_length": 593.0, "completions/min_terminated_length": 593.0, "epoch": 0.006857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.26549720764160156, "learning_rate": 1e-07, "loss": -0.0, "num_tokens": 755241.0, "reward": -0.2408866286277771, "reward_std": 0.16572487354278564, "rewards/cosine_scaled_reward/mean": -0.2408866286277771, "rewards/cosine_scaled_reward/std": 0.17492830753326416, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 1889.296875, "completions/mean_terminated_length": 1201.5833740234375, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.008, "frac_reward_zero_std": 0.0, "grad_norm": 0.23518230020999908, "learning_rate": 1.2e-07, "loss": 0.0, "num_tokens": 886564.0, "reward": -0.16087877750396729, "reward_std": 0.24579641222953796, "rewards/cosine_scaled_reward/mean": -0.16087877750396729, "rewards/cosine_scaled_reward/std": 0.37339961528778076, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1920.0, "completions/mean_length": 1751.578125, "completions/mean_terminated_length": 994.0555419921875, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.009142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.2354528158903122, "learning_rate": 1.4e-07, "loss": 0.0, "num_tokens": 1009081.0, "reward": -0.023812226951122284, "reward_std": 0.2823081314563751, "rewards/cosine_scaled_reward/mean": -0.02381223440170288, "rewards/cosine_scaled_reward/std": 0.4484662115573883, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1889.0, "completions/mean_length": 2000.59375, "completions/mean_terminated_length": 1289.5, "completions/min_length": 903.0, "completions/min_terminated_length": 903.0, "epoch": 0.010285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.24302220344543457, "learning_rate": 1.6e-07, "loss": 0.0, "num_tokens": 1148575.0, "reward": -0.2453702688217163, "reward_std": 0.18811637163162231, "rewards/cosine_scaled_reward/mean": -0.2453702688217163, "rewards/cosine_scaled_reward/std": 0.22203005850315094, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 1701.140625, "completions/mean_terminated_length": 879.631591796875, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 0.011428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.25642141699790955, "learning_rate": 1.8e-07, "loss": -0.0, "num_tokens": 1268280.0, "reward": -0.15177705883979797, "reward_std": 0.2125300019979477, "rewards/cosine_scaled_reward/mean": -0.15177705883979797, "rewards/cosine_scaled_reward/std": 0.3240113854408264, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1683.0, "completions/mean_length": 1950.609375, "completions/mean_terminated_length": 1157.571533203125, "completions/min_length": 584.0, "completions/min_terminated_length": 584.0, "epoch": 0.012571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.24372951686382294, "learning_rate": 2e-07, "loss": 0.0, "num_tokens": 1404791.0, "reward": -0.23502977192401886, "reward_std": 0.18896539509296417, "rewards/cosine_scaled_reward/mean": -0.23502977192401886, "rewards/cosine_scaled_reward/std": 0.24224351346492767, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1944.0, "completions/mean_length": 1751.03125, "completions/mean_terminated_length": 1221.6522216796875, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.013714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.28422027826309204, "learning_rate": 2.1999999999999998e-07, "loss": -0.0, "num_tokens": 1527801.0, "reward": -0.14280016720294952, "reward_std": 0.32843896746635437, "rewards/cosine_scaled_reward/mean": -0.14280015230178833, "rewards/cosine_scaled_reward/std": 0.41895967721939087, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1793.0, "completions/mean_length": 1834.453125, "completions/mean_terminated_length": 1193.8125, "completions/min_length": 783.0, "completions/min_terminated_length": 783.0, "epoch": 0.014857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.24033738672733307, "learning_rate": 2.4e-07, "loss": 0.0, "num_tokens": 1656246.0, "reward": -0.17057427763938904, "reward_std": 0.24429668486118317, "rewards/cosine_scaled_reward/mean": -0.17057427763938904, "rewards/cosine_scaled_reward/std": 0.27816399931907654, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1474.0, "completions/mean_length": 1800.65625, "completions/mean_terminated_length": 1116.823486328125, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 0.016, "frac_reward_zero_std": 0.0, "grad_norm": 0.2312558889389038, "learning_rate": 2.6e-07, "loss": 0.0, "num_tokens": 1782096.0, "reward": -0.11817245185375214, "reward_std": 0.24491220712661743, "rewards/cosine_scaled_reward/mean": -0.11817245930433273, "rewards/cosine_scaled_reward/std": 0.3942086696624756, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1636.0, "completions/mean_length": 1692.828125, "completions/mean_terminated_length": 785.1666870117188, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.017142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.2563658654689789, "learning_rate": 2.8e-07, "loss": -0.0, "num_tokens": 1901357.0, "reward": -0.027107469737529755, "reward_std": 0.1853453516960144, "rewards/cosine_scaled_reward/mean": -0.027107462286949158, "rewards/cosine_scaled_reward/std": 0.4734213352203369, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.018285714285714287, "frac_reward_zero_std": 0.0, "grad_norm": 0.24149107933044434, "learning_rate": 3e-07, "loss": -0.0, "num_tokens": 2042869.0, "reward": -0.2542623281478882, "reward_std": 0.14302438497543335, "rewards/cosine_scaled_reward/mean": -0.2542623281478882, "rewards/cosine_scaled_reward/std": 0.160969540476799, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1827.0, "completions/mean_length": 1548.75, "completions/mean_terminated_length": 864.5925903320312, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.019428571428571427, "frac_reward_zero_std": 0.0, "grad_norm": 0.31088724732398987, "learning_rate": 3.2e-07, "loss": 0.0, "num_tokens": 2152509.0, "reward": -0.12113451957702637, "reward_std": 0.284165620803833, "rewards/cosine_scaled_reward/mean": -0.12113452702760696, "rewards/cosine_scaled_reward/std": 0.4259316623210907, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1899.0, "completions/mean_length": 1793.03125, "completions/mean_terminated_length": 1028.125, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 0.02057142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2451843023300171, "learning_rate": 3.4000000000000003e-07, "loss": 0.0, "num_tokens": 2277639.0, "reward": -0.18317042291164398, "reward_std": 0.20634235441684723, "rewards/cosine_scaled_reward/mean": -0.18317043781280518, "rewards/cosine_scaled_reward/std": 0.27781662344932556, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1633.0, "completions/mean_length": 1735.984375, "completions/mean_terminated_length": 997.0, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.021714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.24677637219429016, "learning_rate": 3.6e-07, "loss": 0.0, "num_tokens": 2399998.0, "reward": -0.04996331408619881, "reward_std": 0.2841629385948181, "rewards/cosine_scaled_reward/mean": -0.04996330291032791, "rewards/cosine_scaled_reward/std": 0.4186851680278778, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1643.0, "completions/mean_length": 1614.890625, "completions/mean_terminated_length": 842.8261108398438, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.022857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2543003559112549, "learning_rate": 3.7999999999999996e-07, "loss": -0.0, "num_tokens": 2514703.0, "reward": -0.09282197058200836, "reward_std": 0.2568933367729187, "rewards/cosine_scaled_reward/mean": -0.09282197058200836, "rewards/cosine_scaled_reward/std": 0.4104878604412079, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 1786.734375, "completions/mean_terminated_length": 1119.0555419921875, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.024, "frac_reward_zero_std": 0.0, "grad_norm": 0.3147278130054474, "learning_rate": 4e-07, "loss": 0.0, "num_tokens": 2639862.0, "reward": -0.16029146313667297, "reward_std": 0.2322564721107483, "rewards/cosine_scaled_reward/mean": -0.16029146313667297, "rewards/cosine_scaled_reward/std": 0.36191171407699585, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1833.0, "completions/mean_length": 1300.484375, "completions/mean_terminated_length": 789.0263061523438, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.025142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.32522445917129517, "learning_rate": 4.1999999999999995e-07, "loss": 0.0, "num_tokens": 2732109.0, "reward": 0.0033364146947860718, "reward_std": 0.18878400325775146, "rewards/cosine_scaled_reward/mean": 0.0033364109694957733, "rewards/cosine_scaled_reward/std": 0.45390966534614563, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1920.0, "completions/mean_length": 1641.03125, "completions/mean_terminated_length": 1046.2308349609375, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.026285714285714287, "frac_reward_zero_std": 0.0, "grad_norm": 0.28244850039482117, "learning_rate": 4.3999999999999997e-07, "loss": 0.0, "num_tokens": 2847927.0, "reward": -0.21077856421470642, "reward_std": 0.24399788677692413, "rewards/cosine_scaled_reward/mean": -0.21077856421470642, "rewards/cosine_scaled_reward/std": 0.2925592362880707, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1965.0, "completions/mean_length": 1789.59375, "completions/mean_terminated_length": 1129.2222900390625, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "epoch": 0.027428571428571427, "frac_reward_zero_std": 0.0, "grad_norm": 0.24896308779716492, "learning_rate": 4.6e-07, "loss": -0.0, "num_tokens": 2973389.0, "reward": -0.1665852814912796, "reward_std": 0.307574987411499, "rewards/cosine_scaled_reward/mean": -0.1665852665901184, "rewards/cosine_scaled_reward/std": 0.4072873294353485, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1851.0, "completions/mean_length": 1696.40625, "completions/mean_terminated_length": 1025.181884765625, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 0.02857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.262716144323349, "learning_rate": 4.8e-07, "loss": 0.0, "num_tokens": 3092255.0, "reward": -0.14361324906349182, "reward_std": 0.3466429114341736, "rewards/cosine_scaled_reward/mean": -0.14361326396465302, "rewards/cosine_scaled_reward/std": 0.3933021128177643, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 1973.046875, "completions/mean_terminated_length": 1448.375, "completions/min_length": 1035.0, "completions/min_terminated_length": 1035.0, "epoch": 0.029714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2365841567516327, "learning_rate": 5e-07, "loss": -0.0, "num_tokens": 3229162.0, "reward": -0.050574399530887604, "reward_std": 0.22459164261817932, "rewards/cosine_scaled_reward/mean": -0.050574399530887604, "rewards/cosine_scaled_reward/std": 0.37290775775909424, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 1878.53125, "completions/mean_terminated_length": 1213.6923828125, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "epoch": 0.030857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2821083068847656, "learning_rate": 5.2e-07, "loss": 0.0, "num_tokens": 3359676.0, "reward": -0.13096781075000763, "reward_std": 0.26249831914901733, "rewards/cosine_scaled_reward/mean": -0.13096781075000763, "rewards/cosine_scaled_reward/std": 0.3478032350540161, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1933.0, "completions/mean_length": 1827.453125, "completions/mean_terminated_length": 1039.7857666015625, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.032, "frac_reward_zero_std": 0.0, "grad_norm": 0.2539210915565491, "learning_rate": 5.4e-07, "loss": 0.0, "num_tokens": 3486969.0, "reward": -0.11822876334190369, "reward_std": 0.2370690554380417, "rewards/cosine_scaled_reward/mean": -0.11822875589132309, "rewards/cosine_scaled_reward/std": 0.4236762225627899, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 2020.5, "completions/mean_terminated_length": 1608.0, "completions/min_length": 887.0, "completions/min_terminated_length": 887.0, "epoch": 0.03314285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.23259545862674713, "learning_rate": 5.6e-07, "loss": -0.0, "num_tokens": 3626753.0, "reward": -0.20220182836055756, "reward_std": 0.15910759568214417, "rewards/cosine_scaled_reward/mean": -0.20220182836055756, "rewards/cosine_scaled_reward/std": 0.20781411230564117, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1662.0, "completions/mean_length": 1903.703125, "completions/mean_terminated_length": 1208.45458984375, "completions/min_length": 961.0, "completions/min_terminated_length": 961.0, "epoch": 0.03428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.24027252197265625, "learning_rate": 5.8e-07, "loss": 0.0, "num_tokens": 3759126.0, "reward": -0.19193249940872192, "reward_std": 0.24584847688674927, "rewards/cosine_scaled_reward/mean": -0.19193249940872192, "rewards/cosine_scaled_reward/std": 0.28378522396087646, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1922.0, "completions/mean_length": 1847.34375, "completions/mean_terminated_length": 1060.1539306640625, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.03542857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.2703397274017334, "learning_rate": 6e-07, "loss": -0.0, "num_tokens": 3887852.0, "reward": -0.25379180908203125, "reward_std": 0.24661941826343536, "rewards/cosine_scaled_reward/mean": -0.25379180908203125, "rewards/cosine_scaled_reward/std": 0.29188498854637146, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 1950.3125, "completions/mean_terminated_length": 1479.6363525390625, "completions/min_length": 766.0, "completions/min_terminated_length": 766.0, "epoch": 0.036571428571428574, "frac_reward_zero_std": 0.0, "grad_norm": 0.21763876080513, "learning_rate": 6.2e-07, "loss": -0.0, "num_tokens": 4023024.0, "reward": -0.16017228364944458, "reward_std": 0.2255343496799469, "rewards/cosine_scaled_reward/mean": -0.16017228364944458, "rewards/cosine_scaled_reward/std": 0.3709539771080017, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 1996.28125, "completions/mean_terminated_length": 1634.25, "completions/min_length": 1237.0, "completions/min_terminated_length": 1237.0, "epoch": 0.037714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.22758260369300842, "learning_rate": 6.4e-07, "loss": -0.0, "num_tokens": 4162002.0, "reward": -0.20318198204040527, "reward_std": 0.18396919965744019, "rewards/cosine_scaled_reward/mean": -0.20318198204040527, "rewards/cosine_scaled_reward/std": 0.34913352131843567, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1850.0, "completions/mean_length": 1703.265625, "completions/mean_terminated_length": 1230.851806640625, "completions/min_length": 651.0, "completions/min_terminated_length": 651.0, "epoch": 0.038857142857142854, "frac_reward_zero_std": 0.0, "grad_norm": 0.31658875942230225, "learning_rate": 6.6e-07, "loss": -0.0, "num_tokens": 4280563.0, "reward": -0.05977274850010872, "reward_std": 0.30437377095222473, "rewards/cosine_scaled_reward/mean": -0.059772733598947525, "rewards/cosine_scaled_reward/std": 0.4424094259738922, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 1807.546875, "completions/mean_terminated_length": 765.5833740234375, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.04, "frac_reward_zero_std": 0.0, "grad_norm": 0.2792847156524658, "learning_rate": 6.800000000000001e-07, "loss": -0.0, "num_tokens": 4407742.0, "reward": -0.18658886849880219, "reward_std": 0.2910658121109009, "rewards/cosine_scaled_reward/mean": -0.18658888339996338, "rewards/cosine_scaled_reward/std": 0.34802255034446716, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1912.0, "completions/mean_length": 1995.65625, "completions/mean_terminated_length": 1378.0, "completions/min_length": 1090.0, "completions/min_terminated_length": 1090.0, "epoch": 0.04114285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.23547738790512085, "learning_rate": 7e-07, "loss": 0.0, "num_tokens": 4546576.0, "reward": -0.23918019235134125, "reward_std": 0.19598917663097382, "rewards/cosine_scaled_reward/mean": -0.23918019235134125, "rewards/cosine_scaled_reward/std": 0.2425125539302826, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1994.75, "completions/mean_terminated_length": 1480.0, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "epoch": 0.04228571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.22962674498558044, "learning_rate": 7.2e-07, "loss": -0.0, "num_tokens": 4685264.0, "reward": -0.25335729122161865, "reward_std": 0.15323391556739807, "rewards/cosine_scaled_reward/mean": -0.25335729122161865, "rewards/cosine_scaled_reward/std": 0.17556406557559967, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 1957.484375, "completions/mean_terminated_length": 1220.4285888671875, "completions/min_length": 965.0, "completions/min_terminated_length": 965.0, "epoch": 0.04342857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.24781912565231323, "learning_rate": 7.4e-07, "loss": -0.0, "num_tokens": 4822255.0, "reward": -0.13536512851715088, "reward_std": 0.19208545982837677, "rewards/cosine_scaled_reward/mean": -0.13536511361598969, "rewards/cosine_scaled_reward/std": 0.30052343010902405, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1824.0, "completions/mean_length": 1744.421875, "completions/mean_terminated_length": 833.6875, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.044571428571428574, "frac_reward_zero_std": 0.0, "grad_norm": 0.2562144994735718, "learning_rate": 7.599999999999999e-07, "loss": -0.0, "num_tokens": 4944682.0, "reward": -0.041110455989837646, "reward_std": 0.21381449699401855, "rewards/cosine_scaled_reward/mean": -0.04111045226454735, "rewards/cosine_scaled_reward/std": 0.35980772972106934, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1774.359375, "completions/mean_terminated_length": 1017.8235473632812, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.045714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.25478634238243103, "learning_rate": 7.799999999999999e-07, "loss": 0.0, "num_tokens": 5068313.0, "reward": -0.12165145576000214, "reward_std": 0.17204006016254425, "rewards/cosine_scaled_reward/mean": -0.12165144830942154, "rewards/cosine_scaled_reward/std": 0.4099982678890228, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 1814.375, "completions/mean_terminated_length": 1397.9130859375, "completions/min_length": 968.0, "completions/min_terminated_length": 968.0, "epoch": 0.046857142857142854, "frac_reward_zero_std": 0.0, "grad_norm": 0.21750310063362122, "learning_rate": 8e-07, "loss": 0.0, "num_tokens": 5195585.0, "reward": -0.25668060779571533, "reward_std": 0.2832298278808594, "rewards/cosine_scaled_reward/mean": -0.25668060779571533, "rewards/cosine_scaled_reward/std": 0.3347759544849396, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1764.0, "completions/mean_length": 1714.59375, "completions/mean_terminated_length": 625.4666748046875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.048, "frac_reward_zero_std": 0.0, "grad_norm": 0.34486907720565796, "learning_rate": 8.199999999999999e-07, "loss": -0.0, "num_tokens": 5315679.0, "reward": -0.2253742218017578, "reward_std": 0.1778060495853424, "rewards/cosine_scaled_reward/mean": -0.22537420690059662, "rewards/cosine_scaled_reward/std": 0.19647939503192902, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1638.0, "completions/mean_length": 1863.78125, "completions/mean_terminated_length": 976.1818237304688, "completions/min_length": 669.0, "completions/min_terminated_length": 669.0, "epoch": 0.04914285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.23907455801963806, "learning_rate": 8.399999999999999e-07, "loss": 0.0, "num_tokens": 5446577.0, "reward": -0.1142776757478714, "reward_std": 0.21804723143577576, "rewards/cosine_scaled_reward/mean": -0.1142776757478714, "rewards/cosine_scaled_reward/std": 0.3637608587741852, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1920.0, "completions/mean_length": 1771.125, "completions/mean_terminated_length": 940.5, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.05028571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.2888188362121582, "learning_rate": 8.599999999999999e-07, "loss": 0.0, "num_tokens": 5570625.0, "reward": -0.11845305562019348, "reward_std": 0.2729855477809906, "rewards/cosine_scaled_reward/mean": -0.11845306307077408, "rewards/cosine_scaled_reward/std": 0.4279690086841583, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1296.0, "completions/mean_length": 2020.859375, "completions/mean_terminated_length": 1179.5, "completions/min_length": 1063.0, "completions/min_terminated_length": 1063.0, "epoch": 0.05142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.2232045829296112, "learning_rate": 8.799999999999999e-07, "loss": 0.0, "num_tokens": 5711616.0, "reward": -0.1830526441335678, "reward_std": 0.20074567198753357, "rewards/cosine_scaled_reward/mean": -0.1830526441335678, "rewards/cosine_scaled_reward/std": 0.3221423327922821, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1121.0, "completions/mean_length": 1843.328125, "completions/mean_terminated_length": 857.1818237304688, "completions/min_length": 608.0, "completions/min_terminated_length": 608.0, "epoch": 0.052571428571428575, "frac_reward_zero_std": 0.0, "grad_norm": 0.2569328844547272, "learning_rate": 9e-07, "loss": 0.0, "num_tokens": 5840757.0, "reward": -0.21247822046279907, "reward_std": 0.17188501358032227, "rewards/cosine_scaled_reward/mean": -0.21247822046279907, "rewards/cosine_scaled_reward/std": 0.183182492852211, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1772.984375, "completions/mean_terminated_length": 1012.6470336914062, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 0.053714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2800576090812683, "learning_rate": 9.2e-07, "loss": -0.0, "num_tokens": 5964628.0, "reward": -0.1755329668521881, "reward_std": 0.19662824273109436, "rewards/cosine_scaled_reward/mean": -0.1755329668521881, "rewards/cosine_scaled_reward/std": 0.3987559974193573, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1949.0, "completions/mean_length": 1787.046875, "completions/mean_terminated_length": 1120.1666259765625, "completions/min_length": 630.0, "completions/min_terminated_length": 630.0, "epoch": 0.054857142857142854, "frac_reward_zero_std": 0.0, "grad_norm": 0.2499135434627533, "learning_rate": 9.399999999999999e-07, "loss": -0.0, "num_tokens": 6089543.0, "reward": -0.07469595968723297, "reward_std": 0.2802818715572357, "rewards/cosine_scaled_reward/mean": -0.07469595968723297, "rewards/cosine_scaled_reward/std": 0.39331451058387756, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1818.0, "completions/mean_length": 1611.65625, "completions/mean_terminated_length": 1013.7037353515625, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.056, "frac_reward_zero_std": 0.0, "grad_norm": 0.2976716160774231, "learning_rate": 9.6e-07, "loss": -0.0, "num_tokens": 6202753.0, "reward": -0.14219576120376587, "reward_std": 0.3252427875995636, "rewards/cosine_scaled_reward/mean": -0.14219576120376587, "rewards/cosine_scaled_reward/std": 0.41946855187416077, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1916.0, "completions/mean_length": 1826.90625, "completions/mean_terminated_length": 761.6364135742188, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.05714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2344626933336258, "learning_rate": 9.8e-07, "loss": -0.0, "num_tokens": 6330491.0, "reward": -0.098542720079422, "reward_std": 0.20483215153217316, "rewards/cosine_scaled_reward/mean": -0.0985427126288414, "rewards/cosine_scaled_reward/std": 0.396296888589859, "step": 50 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 6330491, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }