| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.22857142857142856, | |
| "eval_steps": 500, | |
| "global_step": 200, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.671875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1734.0, | |
| "completions/mean_length": 1702.03125, | |
| "completions/mean_terminated_length": 993.6190795898438, | |
| "completions/min_length": 483.0, | |
| "completions/min_terminated_length": 483.0, | |
| "epoch": 0.001142857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.20052470266819, | |
| "learning_rate": 0.0, | |
| "loss": 0.0427, | |
| "num_tokens": 118418.0, | |
| "reward": 0.17899775505065918, | |
| "reward_std": 0.7650213241577148, | |
| "rewards/cosine_scaled_reward/mean": -0.09800112992525101, | |
| "rewards/cosine_scaled_reward/std": 0.37953105568885803, | |
| "rewards/format_reward/mean": 0.375, | |
| "rewards/format_reward/std": 0.48795005679130554, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.71875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1894.0, | |
| "completions/mean_length": 1738.90625, | |
| "completions/mean_terminated_length": 949.0, | |
| "completions/min_length": 435.0, | |
| "completions/min_terminated_length": 435.0, | |
| "epoch": 0.002285714285714286, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.19504369795322418, | |
| "learning_rate": 5e-08, | |
| "loss": 0.0561, | |
| "num_tokens": 239748.0, | |
| "reward": 0.3848632574081421, | |
| "reward_std": 0.9111153483390808, | |
| "rewards/cosine_scaled_reward/mean": 0.020556632429361343, | |
| "rewards/cosine_scaled_reward/std": 0.4492928683757782, | |
| "rewards/format_reward/mean": 0.34375, | |
| "rewards/format_reward/std": 0.4787135720252991, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.90625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1896.0, | |
| "completions/mean_length": 1948.96875, | |
| "completions/mean_terminated_length": 991.6666870117188, | |
| "completions/min_length": 534.0, | |
| "completions/min_terminated_length": 534.0, | |
| "epoch": 0.0034285714285714284, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.23850594460964203, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0525, | |
| "num_tokens": 374954.0, | |
| "reward": -0.2894650101661682, | |
| "reward_std": 0.40320682525634766, | |
| "rewards/cosine_scaled_reward/mean": -0.1916075050830841, | |
| "rewards/cosine_scaled_reward/std": 0.17467568814754486, | |
| "rewards/format_reward/mean": 0.09375, | |
| "rewards/format_reward/std": 0.29378482699394226, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.53125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1653.0, | |
| "completions/mean_length": 1545.390625, | |
| "completions/mean_terminated_length": 975.7667236328125, | |
| "completions/min_length": 564.0, | |
| "completions/min_terminated_length": 564.0, | |
| "epoch": 0.004571428571428572, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.19908685982227325, | |
| "learning_rate": 1.5e-07, | |
| "loss": 0.0836, | |
| "num_tokens": 483667.0, | |
| "reward": 0.1905757486820221, | |
| "reward_std": 0.6709368824958801, | |
| "rewards/cosine_scaled_reward/mean": -0.16252461075782776, | |
| "rewards/cosine_scaled_reward/std": 0.27594515681266785, | |
| "rewards/format_reward/mean": 0.515625, | |
| "rewards/format_reward/std": 0.5037065148353577, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.90625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2019.0, | |
| "completions/mean_length": 1966.78125, | |
| "completions/mean_terminated_length": 1181.666748046875, | |
| "completions/min_length": 474.0, | |
| "completions/min_terminated_length": 474.0, | |
| "epoch": 0.005714285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.21755796670913696, | |
| "learning_rate": 2e-07, | |
| "loss": 0.0519, | |
| "num_tokens": 620357.0, | |
| "reward": -0.402042031288147, | |
| "reward_std": 0.399784117937088, | |
| "rewards/cosine_scaled_reward/mean": -0.24789603054523468, | |
| "rewards/cosine_scaled_reward/std": 0.18156999349594116, | |
| "rewards/format_reward/mean": 0.09375, | |
| "rewards/format_reward/std": 0.29378482699394226, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1264.0, | |
| "completions/mean_length": 1897.390625, | |
| "completions/mean_terminated_length": 843.125, | |
| "completions/min_length": 628.0, | |
| "completions/min_terminated_length": 628.0, | |
| "epoch": 0.006857142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2295181304216385, | |
| "learning_rate": 2.5e-07, | |
| "loss": 0.0729, | |
| "num_tokens": 753438.0, | |
| "reward": -0.3786737024784088, | |
| "reward_std": 0.4345499277114868, | |
| "rewards/cosine_scaled_reward/mean": -0.2596493363380432, | |
| "rewards/cosine_scaled_reward/std": 0.1708926111459732, | |
| "rewards/format_reward/mean": 0.140625, | |
| "rewards/format_reward/std": 0.3503824472427368, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.859375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2000.0, | |
| "completions/mean_length": 1933.21875, | |
| "completions/mean_terminated_length": 1231.77783203125, | |
| "completions/min_length": 863.0, | |
| "completions/min_terminated_length": 863.0, | |
| "epoch": 0.008, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.20217153429985046, | |
| "learning_rate": 3e-07, | |
| "loss": 0.0254, | |
| "num_tokens": 887572.0, | |
| "reward": -0.13325583934783936, | |
| "reward_std": 0.5423575639724731, | |
| "rewards/cosine_scaled_reward/mean": -0.17600291967391968, | |
| "rewards/cosine_scaled_reward/std": 0.35686567425727844, | |
| "rewards/format_reward/mean": 0.21875, | |
| "rewards/format_reward/std": 0.4166666865348816, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.6875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2024.0, | |
| "completions/mean_length": 1776.96875, | |
| "completions/mean_terminated_length": 1180.7000732421875, | |
| "completions/min_length": 342.0, | |
| "completions/min_terminated_length": 342.0, | |
| "epoch": 0.009142857142857144, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.19864660501480103, | |
| "learning_rate": 3.5e-07, | |
| "loss": -0.0092, | |
| "num_tokens": 1011714.0, | |
| "reward": 0.35212597250938416, | |
| "reward_std": 0.7144544720649719, | |
| "rewards/cosine_scaled_reward/mean": -0.003624534234404564, | |
| "rewards/cosine_scaled_reward/std": 0.515006422996521, | |
| "rewards/format_reward/mean": 0.359375, | |
| "rewards/format_reward/std": 0.4836103618144989, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.890625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1941.0, | |
| "completions/mean_length": 1951.0625, | |
| "completions/mean_terminated_length": 1161.71435546875, | |
| "completions/min_length": 636.0, | |
| "completions/min_terminated_length": 636.0, | |
| "epoch": 0.010285714285714285, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.20887432992458344, | |
| "learning_rate": 4e-07, | |
| "loss": 0.0806, | |
| "num_tokens": 1148038.0, | |
| "reward": -0.3706062436103821, | |
| "reward_std": 0.4610140025615692, | |
| "rewards/cosine_scaled_reward/mean": -0.25561562180519104, | |
| "rewards/cosine_scaled_reward/std": 0.1772036999464035, | |
| "rewards/format_reward/mean": 0.140625, | |
| "rewards/format_reward/std": 0.3503824472427368, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.703125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1271.0, | |
| "completions/mean_length": 1669.9375, | |
| "completions/mean_terminated_length": 774.5263061523438, | |
| "completions/min_length": 303.0, | |
| "completions/min_terminated_length": 303.0, | |
| "epoch": 0.011428571428571429, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.20181182026863098, | |
| "learning_rate": 4.5e-07, | |
| "loss": 0.043, | |
| "num_tokens": 1265746.0, | |
| "reward": 0.0919075608253479, | |
| "reward_std": 0.5226040482521057, | |
| "rewards/cosine_scaled_reward/mean": -0.10248372703790665, | |
| "rewards/cosine_scaled_reward/std": 0.37469154596328735, | |
| "rewards/format_reward/mean": 0.296875, | |
| "rewards/format_reward/std": 0.4604927599430084, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.921875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 839.0, | |
| "completions/mean_length": 1948.453125, | |
| "completions/mean_terminated_length": 773.7999877929688, | |
| "completions/min_length": 659.0, | |
| "completions/min_terminated_length": 659.0, | |
| "epoch": 0.012571428571428572, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.21668891608715057, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0312, | |
| "num_tokens": 1402119.0, | |
| "reward": -0.4548088014125824, | |
| "reward_std": 0.35335251688957214, | |
| "rewards/cosine_scaled_reward/mean": -0.2664669156074524, | |
| "rewards/cosine_scaled_reward/std": 0.1670963168144226, | |
| "rewards/format_reward/mean": 0.078125, | |
| "rewards/format_reward/std": 0.27048972249031067, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.578125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1787.0, | |
| "completions/mean_length": 1666.046875, | |
| "completions/mean_terminated_length": 1142.629638671875, | |
| "completions/min_length": 157.0, | |
| "completions/min_terminated_length": 157.0, | |
| "epoch": 0.013714285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.22070375084877014, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.0437, | |
| "num_tokens": 1519690.0, | |
| "reward": 0.07585961371660233, | |
| "reward_std": 0.7337090373039246, | |
| "rewards/cosine_scaled_reward/mean": -0.21207019686698914, | |
| "rewards/cosine_scaled_reward/std": 0.32506927847862244, | |
| "rewards/format_reward/mean": 0.5, | |
| "rewards/format_reward/std": 0.5039526224136353, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.703125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1837.0, | |
| "completions/mean_length": 1780.578125, | |
| "completions/mean_terminated_length": 1147.2105712890625, | |
| "completions/min_length": 780.0, | |
| "completions/min_terminated_length": 780.0, | |
| "epoch": 0.014857142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.21096666157245636, | |
| "learning_rate": 6e-07, | |
| "loss": 0.0463, | |
| "num_tokens": 1644687.0, | |
| "reward": 0.10567126423120499, | |
| "reward_std": 0.7079647779464722, | |
| "rewards/cosine_scaled_reward/mean": -0.11122686415910721, | |
| "rewards/cosine_scaled_reward/std": 0.3569961190223694, | |
| "rewards/format_reward/mean": 0.328125, | |
| "rewards/format_reward/std": 0.4732423722743988, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.765625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1880.0, | |
| "completions/mean_length": 1887.984375, | |
| "completions/mean_terminated_length": 1365.2667236328125, | |
| "completions/min_length": 824.0, | |
| "completions/min_terminated_length": 824.0, | |
| "epoch": 0.016, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.21131716668605804, | |
| "learning_rate": 6.5e-07, | |
| "loss": 0.0144, | |
| "num_tokens": 1776126.0, | |
| "reward": -0.0225231796503067, | |
| "reward_std": 0.5179126262664795, | |
| "rewards/cosine_scaled_reward/mean": -0.14407408237457275, | |
| "rewards/cosine_scaled_reward/std": 0.33444011211395264, | |
| "rewards/format_reward/mean": 0.265625, | |
| "rewards/format_reward/std": 0.44515693187713623, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1351.0, | |
| "completions/mean_length": 1718.78125, | |
| "completions/mean_terminated_length": 731.125, | |
| "completions/min_length": 420.0, | |
| "completions/min_terminated_length": 420.0, | |
| "epoch": 0.017142857142857144, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1991148591041565, | |
| "learning_rate": 7e-07, | |
| "loss": 0.0049, | |
| "num_tokens": 1897048.0, | |
| "reward": 0.19555333256721497, | |
| "reward_std": 0.40205830335617065, | |
| "rewards/cosine_scaled_reward/mean": -0.04284832626581192, | |
| "rewards/cosine_scaled_reward/std": 0.4670048952102661, | |
| "rewards/format_reward/mean": 0.28125, | |
| "rewards/format_reward/std": 0.4531635046005249, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.96875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1697.0, | |
| "completions/mean_length": 2027.5, | |
| "completions/mean_terminated_length": 1392.0, | |
| "completions/min_length": 1087.0, | |
| "completions/min_terminated_length": 1087.0, | |
| "epoch": 0.018285714285714287, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.22394295036792755, | |
| "learning_rate": 7.5e-07, | |
| "loss": 0.0187, | |
| "num_tokens": 2037248.0, | |
| "reward": -0.47975414991378784, | |
| "reward_std": 0.3722427487373352, | |
| "rewards/cosine_scaled_reward/mean": -0.2555020749568939, | |
| "rewards/cosine_scaled_reward/std": 0.17358116805553436, | |
| "rewards/format_reward/mean": 0.03125, | |
| "rewards/format_reward/std": 0.17536810040473938, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.640625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1789.0, | |
| "completions/mean_length": 1608.859375, | |
| "completions/mean_terminated_length": 826.0435180664062, | |
| "completions/min_length": 325.0, | |
| "completions/min_terminated_length": 325.0, | |
| "epoch": 0.019428571428571427, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.20954757928848267, | |
| "learning_rate": 8e-07, | |
| "loss": 0.0717, | |
| "num_tokens": 2150735.0, | |
| "reward": 0.09985511004924774, | |
| "reward_std": 0.7668930292129517, | |
| "rewards/cosine_scaled_reward/mean": -0.13757243752479553, | |
| "rewards/cosine_scaled_reward/std": 0.3857298791408539, | |
| "rewards/format_reward/mean": 0.375, | |
| "rewards/format_reward/std": 0.48795005679130554, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.78125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1656.0, | |
| "completions/mean_length": 1832.9375, | |
| "completions/mean_terminated_length": 1064.857177734375, | |
| "completions/min_length": 616.0, | |
| "completions/min_terminated_length": 616.0, | |
| "epoch": 0.02057142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.19936956465244293, | |
| "learning_rate": 8.499999999999999e-07, | |
| "loss": 0.0415, | |
| "num_tokens": 2278419.0, | |
| "reward": -0.09606979787349701, | |
| "reward_std": 0.6028552055358887, | |
| "rewards/cosine_scaled_reward/mean": -0.1886598914861679, | |
| "rewards/cosine_scaled_reward/std": 0.2934761047363281, | |
| "rewards/format_reward/mean": 0.28125, | |
| "rewards/format_reward/std": 0.4531635046005249, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.71875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1992.0, | |
| "completions/mean_length": 1797.421875, | |
| "completions/mean_terminated_length": 1157.0555419921875, | |
| "completions/min_length": 548.0, | |
| "completions/min_terminated_length": 548.0, | |
| "epoch": 0.021714285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.20787546038627625, | |
| "learning_rate": 9e-07, | |
| "loss": 0.0691, | |
| "num_tokens": 2404710.0, | |
| "reward": 0.3256925344467163, | |
| "reward_std": 0.7026835680007935, | |
| "rewards/cosine_scaled_reward/mean": -0.02465374395251274, | |
| "rewards/cosine_scaled_reward/std": 0.48578760027885437, | |
| "rewards/format_reward/mean": 0.375, | |
| "rewards/format_reward/std": 0.48795005679130554, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.609375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1981.0, | |
| "completions/mean_length": 1595.921875, | |
| "completions/mean_terminated_length": 890.6799926757812, | |
| "completions/min_length": 357.0, | |
| "completions/min_terminated_length": 357.0, | |
| "epoch": 0.022857142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.19203181564807892, | |
| "learning_rate": 9.499999999999999e-07, | |
| "loss": 0.0843, | |
| "num_tokens": 2518201.0, | |
| "reward": 0.2115776240825653, | |
| "reward_std": 0.6924929618835449, | |
| "rewards/cosine_scaled_reward/mean": -0.09733618050813675, | |
| "rewards/cosine_scaled_reward/std": 0.4008020758628845, | |
| "rewards/format_reward/mean": 0.40625, | |
| "rewards/format_reward/std": 0.49501484632492065, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.65625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1892.0, | |
| "completions/mean_length": 1669.71875, | |
| "completions/mean_terminated_length": 947.5454711914062, | |
| "completions/min_length": 333.0, | |
| "completions/min_terminated_length": 333.0, | |
| "epoch": 0.024, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.19905951619148254, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0554, | |
| "num_tokens": 2635871.0, | |
| "reward": -0.04711771011352539, | |
| "reward_std": 0.6225218772888184, | |
| "rewards/cosine_scaled_reward/mean": -0.2032463699579239, | |
| "rewards/cosine_scaled_reward/std": 0.32066139578819275, | |
| "rewards/format_reward/mean": 0.359375, | |
| "rewards/format_reward/std": 0.4836103618144989, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.46875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1776.0, | |
| "completions/mean_length": 1381.5625, | |
| "completions/mean_terminated_length": 793.5294189453125, | |
| "completions/min_length": 290.0, | |
| "completions/min_terminated_length": 290.0, | |
| "epoch": 0.025142857142857144, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2047095149755478, | |
| "learning_rate": 9.99931462820376e-07, | |
| "loss": 0.0102, | |
| "num_tokens": 2733307.0, | |
| "reward": 0.5420082807540894, | |
| "reward_std": 0.5808548927307129, | |
| "rewards/cosine_scaled_reward/mean": -0.04149584099650383, | |
| "rewards/cosine_scaled_reward/std": 0.45060864090919495, | |
| "rewards/format_reward/mean": 0.625, | |
| "rewards/format_reward/std": 0.48795005679130554, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.640625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1883.0, | |
| "completions/mean_length": 1658.0, | |
| "completions/mean_terminated_length": 962.7826538085938, | |
| "completions/min_length": 405.0, | |
| "completions/min_terminated_length": 405.0, | |
| "epoch": 0.026285714285714287, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.19252249598503113, | |
| "learning_rate": 9.997258721585931e-07, | |
| "loss": 0.0556, | |
| "num_tokens": 2850211.0, | |
| "reward": -0.003935225307941437, | |
| "reward_std": 0.5448156595230103, | |
| "rewards/cosine_scaled_reward/mean": -0.21290510892868042, | |
| "rewards/cosine_scaled_reward/std": 0.3244985342025757, | |
| "rewards/format_reward/mean": 0.421875, | |
| "rewards/format_reward/std": 0.49776285886764526, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.65625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1979.0, | |
| "completions/mean_length": 1739.015625, | |
| "completions/mean_terminated_length": 1149.1363525390625, | |
| "completions/min_length": 512.0, | |
| "completions/min_terminated_length": 512.0, | |
| "epoch": 0.027428571428571427, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.20268025994300842, | |
| "learning_rate": 9.993832906395582e-07, | |
| "loss": 0.0283, | |
| "num_tokens": 2972436.0, | |
| "reward": 0.023234538733959198, | |
| "reward_std": 0.5804120898246765, | |
| "rewards/cosine_scaled_reward/mean": -0.1836952269077301, | |
| "rewards/cosine_scaled_reward/std": 0.3640914857387543, | |
| "rewards/format_reward/mean": 0.390625, | |
| "rewards/format_reward/std": 0.4917473793029785, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.71875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1737.0, | |
| "completions/mean_length": 1718.3125, | |
| "completions/mean_terminated_length": 875.7777709960938, | |
| "completions/min_length": 484.0, | |
| "completions/min_terminated_length": 484.0, | |
| "epoch": 0.02857142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.21169544756412506, | |
| "learning_rate": 9.989038226169207e-07, | |
| "loss": 0.0649, | |
| "num_tokens": 3092704.0, | |
| "reward": -0.048267342150211334, | |
| "reward_std": 0.6947153210639954, | |
| "rewards/cosine_scaled_reward/mean": -0.17257116734981537, | |
| "rewards/cosine_scaled_reward/std": 0.33179494738578796, | |
| "rewards/format_reward/mean": 0.296875, | |
| "rewards/format_reward/std": 0.4604927599430084, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.796875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2039.0, | |
| "completions/mean_length": 1931.46875, | |
| "completions/mean_terminated_length": 1474.3077392578125, | |
| "completions/min_length": 860.0, | |
| "completions/min_terminated_length": 860.0, | |
| "epoch": 0.029714285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.21874327957630157, | |
| "learning_rate": 9.982876141412855e-07, | |
| "loss": 0.0248, | |
| "num_tokens": 3226950.0, | |
| "reward": 0.07520664483308792, | |
| "reward_std": 0.5721991658210754, | |
| "rewards/cosine_scaled_reward/mean": -0.09520917385816574, | |
| "rewards/cosine_scaled_reward/std": 0.355131059885025, | |
| "rewards/format_reward/mean": 0.265625, | |
| "rewards/format_reward/std": 0.44515693187713623, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.859375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1709.0, | |
| "completions/mean_length": 1887.21875, | |
| "completions/mean_terminated_length": 904.6666870117188, | |
| "completions/min_length": 505.0, | |
| "completions/min_terminated_length": 505.0, | |
| "epoch": 0.030857142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2260063886642456, | |
| "learning_rate": 9.975348529157229e-07, | |
| "loss": 0.0282, | |
| "num_tokens": 3358020.0, | |
| "reward": -0.12340383231639862, | |
| "reward_std": 0.6229674220085144, | |
| "rewards/cosine_scaled_reward/mean": -0.1788894236087799, | |
| "rewards/cosine_scaled_reward/std": 0.27315112948417664, | |
| "rewards/format_reward/mean": 0.234375, | |
| "rewards/format_reward/std": 0.42695629596710205, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1976.0, | |
| "completions/mean_length": 1818.03125, | |
| "completions/mean_terminated_length": 1128.125, | |
| "completions/min_length": 441.0, | |
| "completions/min_terminated_length": 441.0, | |
| "epoch": 0.032, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2172878384590149, | |
| "learning_rate": 9.96645768238595e-07, | |
| "loss": 0.0203, | |
| "num_tokens": 3484710.0, | |
| "reward": -0.06130418926477432, | |
| "reward_std": 0.6516651511192322, | |
| "rewards/cosine_scaled_reward/mean": -0.17908960580825806, | |
| "rewards/cosine_scaled_reward/std": 0.3907976746559143, | |
| "rewards/format_reward/mean": 0.296875, | |
| "rewards/format_reward/std": 0.4604927599430084, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.953125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1187.0, | |
| "completions/mean_length": 1990.765625, | |
| "completions/mean_terminated_length": 827.0, | |
| "completions/min_length": 625.0, | |
| "completions/min_terminated_length": 625.0, | |
| "epoch": 0.03314285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.21073698997497559, | |
| "learning_rate": 9.956206309337066e-07, | |
| "loss": 0.0469, | |
| "num_tokens": 3622591.0, | |
| "reward": -0.33952879905700684, | |
| "reward_std": 0.447256475687027, | |
| "rewards/cosine_scaled_reward/mean": -0.20882689952850342, | |
| "rewards/cosine_scaled_reward/std": 0.20297211408615112, | |
| "rewards/format_reward/mean": 0.078125, | |
| "rewards/format_reward/std": 0.27048972249031067, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2034.0, | |
| "completions/mean_length": 1843.828125, | |
| "completions/mean_terminated_length": 1231.3125, | |
| "completions/min_length": 767.0, | |
| "completions/min_terminated_length": 767.0, | |
| "epoch": 0.03428571428571429, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.21709226071834564, | |
| "learning_rate": 9.944597532678119e-07, | |
| "loss": 0.0171, | |
| "num_tokens": 3751132.0, | |
| "reward": -0.024381320923566818, | |
| "reward_std": 0.6315211057662964, | |
| "rewards/cosine_scaled_reward/mean": -0.16062816977500916, | |
| "rewards/cosine_scaled_reward/std": 0.2835782468318939, | |
| "rewards/format_reward/mean": 0.296875, | |
| "rewards/format_reward/std": 0.4604927599430084, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.859375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1040.0, | |
| "completions/mean_length": 1853.625, | |
| "completions/mean_terminated_length": 665.7777709960938, | |
| "completions/min_length": 496.0, | |
| "completions/min_terminated_length": 496.0, | |
| "epoch": 0.03542857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.20489497482776642, | |
| "learning_rate": 9.931634888554935e-07, | |
| "loss": 0.0071, | |
| "num_tokens": 3880260.0, | |
| "reward": -0.22396349906921387, | |
| "reward_std": 0.6550674438476562, | |
| "rewards/cosine_scaled_reward/mean": -0.19791924953460693, | |
| "rewards/cosine_scaled_reward/std": 0.3350917100906372, | |
| "rewards/format_reward/mean": 0.171875, | |
| "rewards/format_reward/std": 0.38025420904159546, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.8125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1997.0, | |
| "completions/mean_length": 1902.109375, | |
| "completions/mean_terminated_length": 1269.916748046875, | |
| "completions/min_length": 772.0, | |
| "completions/min_terminated_length": 772.0, | |
| "epoch": 0.036571428571428574, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.20957782864570618, | |
| "learning_rate": 9.917322325514487e-07, | |
| "loss": 0.0611, | |
| "num_tokens": 4012347.0, | |
| "reward": -0.22782376408576965, | |
| "reward_std": 0.6326622366905212, | |
| "rewards/cosine_scaled_reward/mean": -0.22328688204288483, | |
| "rewards/cosine_scaled_reward/std": 0.3028508722782135, | |
| "rewards/format_reward/mean": 0.21875, | |
| "rewards/format_reward/std": 0.4166666865348816, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1698.0, | |
| "completions/mean_length": 1945.34375, | |
| "completions/mean_terminated_length": 1226.75, | |
| "completions/min_length": 887.0, | |
| "completions/min_terminated_length": 887.0, | |
| "epoch": 0.037714285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.22317089140415192, | |
| "learning_rate": 9.901664203302124e-07, | |
| "loss": 0.0347, | |
| "num_tokens": 4148065.0, | |
| "reward": -0.47040778398513794, | |
| "reward_std": 0.4409722089767456, | |
| "rewards/cosine_scaled_reward/mean": -0.30551639199256897, | |
| "rewards/cosine_scaled_reward/std": 0.22323259711265564, | |
| "rewards/format_reward/mean": 0.140625, | |
| "rewards/format_reward/std": 0.3503824472427368, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.515625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1826.0, | |
| "completions/mean_length": 1541.515625, | |
| "completions/mean_terminated_length": 1002.3547973632812, | |
| "completions/min_length": 475.0, | |
| "completions/min_terminated_length": 475.0, | |
| "epoch": 0.038857142857142854, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2360963523387909, | |
| "learning_rate": 9.88466529153356e-07, | |
| "loss": 0.0712, | |
| "num_tokens": 4256274.0, | |
| "reward": 0.5805569291114807, | |
| "reward_std": 0.8525061011314392, | |
| "rewards/cosine_scaled_reward/mean": 0.04027845710515976, | |
| "rewards/cosine_scaled_reward/std": 0.49936607480049133, | |
| "rewards/format_reward/mean": 0.5, | |
| "rewards/format_reward/std": 0.5039526224136353, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.796875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1861.0, | |
| "completions/mean_length": 1808.921875, | |
| "completions/mean_terminated_length": 871.0000610351562, | |
| "completions/min_length": 466.0, | |
| "completions/min_terminated_length": 466.0, | |
| "epoch": 0.04, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1972445547580719, | |
| "learning_rate": 9.866330768241983e-07, | |
| "loss": 0.0577, | |
| "num_tokens": 4383541.0, | |
| "reward": 0.00036025047302246094, | |
| "reward_std": 0.8111597895622253, | |
| "rewards/cosine_scaled_reward/mean": -0.10919487476348877, | |
| "rewards/cosine_scaled_reward/std": 0.44675883650779724, | |
| "rewards/format_reward/mean": 0.21875, | |
| "rewards/format_reward/std": 0.4166666865348816, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.921875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1979.0, | |
| "completions/mean_length": 1990.765625, | |
| "completions/mean_terminated_length": 1315.4000244140625, | |
| "completions/min_length": 937.0, | |
| "completions/min_terminated_length": 937.0, | |
| "epoch": 0.04114285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2348623126745224, | |
| "learning_rate": 9.846666218300807e-07, | |
| "loss": 0.0216, | |
| "num_tokens": 4522062.0, | |
| "reward": -0.4222595691680908, | |
| "reward_std": 0.4755689203739166, | |
| "rewards/cosine_scaled_reward/mean": -0.2501922845840454, | |
| "rewards/cosine_scaled_reward/std": 0.2129606157541275, | |
| "rewards/format_reward/mean": 0.078125, | |
| "rewards/format_reward/std": 0.27048972249031067, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.84375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1928.0, | |
| "completions/mean_length": 1911.296875, | |
| "completions/mean_terminated_length": 1173.0999755859375, | |
| "completions/min_length": 629.0, | |
| "completions/min_terminated_length": 629.0, | |
| "epoch": 0.04228571428571429, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.22154958546161652, | |
| "learning_rate": 9.825677631722435e-07, | |
| "loss": 0.0467, | |
| "num_tokens": 4655409.0, | |
| "reward": -0.2846450209617615, | |
| "reward_std": 0.4525028467178345, | |
| "rewards/cosine_scaled_reward/mean": -0.23607251048088074, | |
| "rewards/cosine_scaled_reward/std": 0.19240929186344147, | |
| "rewards/format_reward/mean": 0.1875, | |
| "rewards/format_reward/std": 0.39339789748191833, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.84375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1530.0, | |
| "completions/mean_length": 1906.65625, | |
| "completions/mean_terminated_length": 1143.4000244140625, | |
| "completions/min_length": 530.0, | |
| "completions/min_terminated_length": 530.0, | |
| "epoch": 0.04342857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2259596437215805, | |
| "learning_rate": 9.80337140183366e-07, | |
| "loss": 0.0219, | |
| "num_tokens": 4789147.0, | |
| "reward": -0.14314083755016327, | |
| "reward_std": 0.4587753117084503, | |
| "rewards/cosine_scaled_reward/mean": -0.14969542622566223, | |
| "rewards/cosine_scaled_reward/std": 0.30969110131263733, | |
| "rewards/format_reward/mean": 0.15625, | |
| "rewards/format_reward/std": 0.36596253514289856, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.765625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2042.0, | |
| "completions/mean_length": 1729.984375, | |
| "completions/mean_terminated_length": 691.1333618164062, | |
| "completions/min_length": 312.0, | |
| "completions/min_terminated_length": 312.0, | |
| "epoch": 0.044571428571428574, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1975395530462265, | |
| "learning_rate": 9.779754323328192e-07, | |
| "loss": 0.0518, | |
| "num_tokens": 4910650.0, | |
| "reward": 0.20782151818275452, | |
| "reward_std": 0.5801891088485718, | |
| "rewards/cosine_scaled_reward/mean": -0.08358924090862274, | |
| "rewards/cosine_scaled_reward/std": 0.3715744912624359, | |
| "rewards/format_reward/mean": 0.375, | |
| "rewards/format_reward/std": 0.48795005679130554, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.546875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1936.0, | |
| "completions/mean_length": 1565.40625, | |
| "completions/mean_terminated_length": 982.9655151367188, | |
| "completions/min_length": 393.0, | |
| "completions/min_terminated_length": 393.0, | |
| "epoch": 0.045714285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.19556699693202972, | |
| "learning_rate": 9.754833590196926e-07, | |
| "loss": 0.0176, | |
| "num_tokens": 5020908.0, | |
| "reward": 0.21666434407234192, | |
| "reward_std": 0.47607892751693726, | |
| "rewards/cosine_scaled_reward/mean": -0.12604281306266785, | |
| "rewards/cosine_scaled_reward/std": 0.4459211230278015, | |
| "rewards/format_reward/mean": 0.46875, | |
| "rewards/format_reward/std": 0.5029674172401428, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1971.0, | |
| "completions/mean_length": 1847.96875, | |
| "completions/mean_terminated_length": 1247.875, | |
| "completions/min_length": 799.0, | |
| "completions/min_terminated_length": 799.0, | |
| "epoch": 0.046857142857142854, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.19488316774368286, | |
| "learning_rate": 9.728616793536587e-07, | |
| "loss": 0.0491, | |
| "num_tokens": 5150330.0, | |
| "reward": -0.15268605947494507, | |
| "reward_std": 0.6881446838378906, | |
| "rewards/cosine_scaled_reward/mean": -0.22478052973747253, | |
| "rewards/cosine_scaled_reward/std": 0.3324533700942993, | |
| "rewards/format_reward/mean": 0.296875, | |
| "rewards/format_reward/std": 0.4604927599430084, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.71875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2018.0, | |
| "completions/mean_length": 1661.296875, | |
| "completions/mean_terminated_length": 673.0555419921875, | |
| "completions/min_length": 134.0, | |
| "completions/min_terminated_length": 134.0, | |
| "epoch": 0.048, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.21376171708106995, | |
| "learning_rate": 9.701111919237408e-07, | |
| "loss": 0.0433, | |
| "num_tokens": 5267013.0, | |
| "reward": -0.20060807466506958, | |
| "reward_std": 0.34422361850738525, | |
| "rewards/cosine_scaled_reward/mean": -0.24874155223369598, | |
| "rewards/cosine_scaled_reward/std": 0.17742608487606049, | |
| "rewards/format_reward/mean": 0.296875, | |
| "rewards/format_reward/std": 0.4604927599430084, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.78125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2040.0, | |
| "completions/mean_length": 1802.484375, | |
| "completions/mean_terminated_length": 925.6428833007812, | |
| "completions/min_length": 580.0, | |
| "completions/min_terminated_length": 580.0, | |
| "epoch": 0.04914285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.20949861407279968, | |
| "learning_rate": 9.672327345550543e-07, | |
| "loss": 0.0468, | |
| "num_tokens": 5393988.0, | |
| "reward": 0.1097467839717865, | |
| "reward_std": 0.4439903795719147, | |
| "rewards/cosine_scaled_reward/mean": -0.07012660801410675, | |
| "rewards/cosine_scaled_reward/std": 0.35852304100990295, | |
| "rewards/format_reward/mean": 0.25, | |
| "rewards/format_reward/std": 0.4364357888698578, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.6875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1497.0, | |
| "completions/mean_length": 1639.375, | |
| "completions/mean_terminated_length": 740.4000244140625, | |
| "completions/min_length": 186.0, | |
| "completions/min_terminated_length": 186.0, | |
| "epoch": 0.05028571428571429, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.20765061676502228, | |
| "learning_rate": 9.64227184053598e-07, | |
| "loss": 0.0677, | |
| "num_tokens": 5509604.0, | |
| "reward": 0.1744289994239807, | |
| "reward_std": 0.7545564770698547, | |
| "rewards/cosine_scaled_reward/mean": -0.09247300028800964, | |
| "rewards/cosine_scaled_reward/std": 0.486594021320343, | |
| "rewards/format_reward/mean": 0.359375, | |
| "rewards/format_reward/std": 0.4836103618144989, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.921875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1784.0, | |
| "completions/mean_length": 2015.1875, | |
| "completions/mean_terminated_length": 1628.0, | |
| "completions/min_length": 1485.0, | |
| "completions/min_terminated_length": 1485.0, | |
| "epoch": 0.05142857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.22293689846992493, | |
| "learning_rate": 9.610954559391704e-07, | |
| "loss": 0.0141, | |
| "num_tokens": 5650232.0, | |
| "reward": -0.28319618105888367, | |
| "reward_std": 0.44461578130722046, | |
| "rewards/cosine_scaled_reward/mean": -0.19628559052944183, | |
| "rewards/cosine_scaled_reward/std": 0.2942677140235901, | |
| "rewards/format_reward/mean": 0.109375, | |
| "rewards/format_reward/std": 0.3145764470100403, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.765625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1232.0, | |
| "completions/mean_length": 1769.8125, | |
| "completions/mean_terminated_length": 861.0667114257812, | |
| "completions/min_length": 538.0, | |
| "completions/min_terminated_length": 538.0, | |
| "epoch": 0.052571428571428575, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.21012793481349945, | |
| "learning_rate": 9.578385041664925e-07, | |
| "loss": 0.0845, | |
| "num_tokens": 5774668.0, | |
| "reward": -0.19958055019378662, | |
| "reward_std": 0.37389740347862244, | |
| "rewards/cosine_scaled_reward/mean": -0.2247902750968933, | |
| "rewards/cosine_scaled_reward/std": 0.18379005789756775, | |
| "rewards/format_reward/mean": 0.25, | |
| "rewards/format_reward/std": 0.4364357888698578, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.6875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1969.0, | |
| "completions/mean_length": 1761.734375, | |
| "completions/mean_terminated_length": 1131.9500732421875, | |
| "completions/min_length": 370.0, | |
| "completions/min_terminated_length": 370.0, | |
| "epoch": 0.053714285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2044854760169983, | |
| "learning_rate": 9.54457320834625e-07, | |
| "loss": 0.0366, | |
| "num_tokens": 5897819.0, | |
| "reward": -0.11128583550453186, | |
| "reward_std": 0.7243642210960388, | |
| "rewards/cosine_scaled_reward/mean": -0.22751793265342712, | |
| "rewards/cosine_scaled_reward/std": 0.341621071100235, | |
| "rewards/format_reward/mean": 0.34375, | |
| "rewards/format_reward/std": 0.4787135720252991, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.671875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2015.0, | |
| "completions/mean_length": 1720.890625, | |
| "completions/mean_terminated_length": 1051.09521484375, | |
| "completions/min_length": 430.0, | |
| "completions/min_terminated_length": 430.0, | |
| "epoch": 0.054857142857142854, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.19758965075016022, | |
| "learning_rate": 9.509529358847654e-07, | |
| "loss": 0.046, | |
| "num_tokens": 6018500.0, | |
| "reward": 0.026797622442245483, | |
| "reward_std": 0.5594782829284668, | |
| "rewards/cosine_scaled_reward/mean": -0.16628868877887726, | |
| "rewards/cosine_scaled_reward/std": 0.29110410809516907, | |
| "rewards/format_reward/mean": 0.359375, | |
| "rewards/format_reward/std": 0.4836103618144989, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.546875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1723.0, | |
| "completions/mean_length": 1488.71875, | |
| "completions/mean_terminated_length": 813.72412109375, | |
| "completions/min_length": 402.0, | |
| "completions/min_terminated_length": 402.0, | |
| "epoch": 0.056, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1846495270729065, | |
| "learning_rate": 9.473264167865171e-07, | |
| "loss": 0.0203, | |
| "num_tokens": 6123842.0, | |
| "reward": 0.3029339909553528, | |
| "reward_std": 0.6658899188041687, | |
| "rewards/cosine_scaled_reward/mean": -0.09853300452232361, | |
| "rewards/cosine_scaled_reward/std": 0.4083656370639801, | |
| "rewards/format_reward/mean": 0.5, | |
| "rewards/format_reward/std": 0.5039526224136353, | |
| "step": 49 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1959.0, | |
| "completions/mean_length": 1733.59375, | |
| "completions/mean_terminated_length": 790.375, | |
| "completions/min_length": 305.0, | |
| "completions/min_terminated_length": 305.0, | |
| "epoch": 0.05714285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.19527027010917664, | |
| "learning_rate": 9.43578868212728e-07, | |
| "loss": 0.0136, | |
| "num_tokens": 6245608.0, | |
| "reward": 0.15902790427207947, | |
| "reward_std": 0.46005839109420776, | |
| "rewards/cosine_scaled_reward/mean": -0.06892354786396027, | |
| "rewards/cosine_scaled_reward/std": 0.4567166864871979, | |
| "rewards/format_reward/mean": 0.296875, | |
| "rewards/format_reward/std": 0.4604927599430084, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.515625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1916.0, | |
| "completions/mean_length": 1432.421875, | |
| "completions/mean_terminated_length": 777.1290283203125, | |
| "completions/min_length": 401.0, | |
| "completions/min_terminated_length": 401.0, | |
| "epoch": 0.05828571428571429, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.21701110899448395, | |
| "learning_rate": 9.397114317029974e-07, | |
| "loss": 0.0239, | |
| "num_tokens": 6347491.0, | |
| "reward": 0.2233203500509262, | |
| "reward_std": 0.6041151285171509, | |
| "rewards/cosine_scaled_reward/mean": -0.1383398175239563, | |
| "rewards/cosine_scaled_reward/std": 0.3747152090072632, | |
| "rewards/format_reward/mean": 0.5, | |
| "rewards/format_reward/std": 0.5039526224136353, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1448.0, | |
| "completions/mean_length": 1720.046875, | |
| "completions/mean_terminated_length": 736.1875, | |
| "completions/min_length": 301.0, | |
| "completions/min_terminated_length": 301.0, | |
| "epoch": 0.05942857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.19577208161354065, | |
| "learning_rate": 9.357252853159505e-07, | |
| "loss": 0.0066, | |
| "num_tokens": 6468926.0, | |
| "reward": -0.1786521077156067, | |
| "reward_std": 0.3358575701713562, | |
| "rewards/cosine_scaled_reward/mean": -0.21432605385780334, | |
| "rewards/cosine_scaled_reward/std": 0.3689535856246948, | |
| "rewards/format_reward/mean": 0.25, | |
| "rewards/format_reward/std": 0.4364357888698578, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.71875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1931.0, | |
| "completions/mean_length": 1718.9375, | |
| "completions/mean_terminated_length": 878.0, | |
| "completions/min_length": 468.0, | |
| "completions/min_terminated_length": 468.0, | |
| "epoch": 0.060571428571428575, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.21421696245670319, | |
| "learning_rate": 9.316216432703916e-07, | |
| "loss": 0.0423, | |
| "num_tokens": 6589770.0, | |
| "reward": -0.03741084039211273, | |
| "reward_std": 0.7027454376220703, | |
| "rewards/cosine_scaled_reward/mean": -0.17495542764663696, | |
| "rewards/cosine_scaled_reward/std": 0.29642969369888306, | |
| "rewards/format_reward/mean": 0.3125, | |
| "rewards/format_reward/std": 0.467176616191864, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.5625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2031.0, | |
| "completions/mean_length": 1664.625, | |
| "completions/mean_terminated_length": 1171.71435546875, | |
| "completions/min_length": 518.0, | |
| "completions/min_terminated_length": 518.0, | |
| "epoch": 0.061714285714285715, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.19009248912334442, | |
| "learning_rate": 9.274017555754407e-07, | |
| "loss": 0.0958, | |
| "num_tokens": 6707450.0, | |
| "reward": 0.2984742522239685, | |
| "reward_std": 1.0811007022857666, | |
| "rewards/cosine_scaled_reward/mean": -0.08513787388801575, | |
| "rewards/cosine_scaled_reward/std": 0.455229252576828, | |
| "rewards/format_reward/mean": 0.46875, | |
| "rewards/format_reward/std": 0.5029674172401428, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.765625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1352.0, | |
| "completions/mean_length": 1757.359375, | |
| "completions/mean_terminated_length": 807.933349609375, | |
| "completions/min_length": 517.0, | |
| "completions/min_terminated_length": 517.0, | |
| "epoch": 0.06285714285714286, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1981392800807953, | |
| "learning_rate": 9.230669076497687e-07, | |
| "loss": 0.0294, | |
| "num_tokens": 6830209.0, | |
| "reward": 0.0005421042442321777, | |
| "reward_std": 0.512083888053894, | |
| "rewards/cosine_scaled_reward/mean": -0.1403539478778839, | |
| "rewards/cosine_scaled_reward/std": 0.37260064482688904, | |
| "rewards/format_reward/mean": 0.28125, | |
| "rewards/format_reward/std": 0.4531635046005249, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.65625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1955.0, | |
| "completions/mean_length": 1717.890625, | |
| "completions/mean_terminated_length": 1087.681884765625, | |
| "completions/min_length": 494.0, | |
| "completions/min_terminated_length": 494.0, | |
| "epoch": 0.064, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.21031354367733002, | |
| "learning_rate": 9.186184199300463e-07, | |
| "loss": 0.0425, | |
| "num_tokens": 6951114.0, | |
| "reward": 0.25747445225715637, | |
| "reward_std": 0.5027350187301636, | |
| "rewards/cosine_scaled_reward/mean": -0.08220025897026062, | |
| "rewards/cosine_scaled_reward/std": 0.4609789550304413, | |
| "rewards/format_reward/mean": 0.421875, | |
| "rewards/format_reward/std": 0.49776285886764526, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.828125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1899.0, | |
| "completions/mean_length": 1946.203125, | |
| "completions/mean_terminated_length": 1455.727294921875, | |
| "completions/min_length": 844.0, | |
| "completions/min_terminated_length": 844.0, | |
| "epoch": 0.06514285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1840263158082962, | |
| "learning_rate": 9.140576474687263e-07, | |
| "loss": 0.0406, | |
| "num_tokens": 7087239.0, | |
| "reward": -0.31278592348098755, | |
| "reward_std": 0.5103937387466431, | |
| "rewards/cosine_scaled_reward/mean": -0.2501429617404938, | |
| "rewards/cosine_scaled_reward/std": 0.23870430886745453, | |
| "rewards/format_reward/mean": 0.1875, | |
| "rewards/format_reward/std": 0.39339789748191833, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.515625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2034.0, | |
| "completions/mean_length": 1514.5625, | |
| "completions/mean_terminated_length": 946.7096557617188, | |
| "completions/min_length": 411.0, | |
| "completions/min_terminated_length": 411.0, | |
| "epoch": 0.06628571428571428, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.18003003299236298, | |
| "learning_rate": 9.093859795212817e-07, | |
| "loss": 0.0669, | |
| "num_tokens": 7194267.0, | |
| "reward": 0.3626611530780792, | |
| "reward_std": 0.6513576507568359, | |
| "rewards/cosine_scaled_reward/mean": -0.09991942346096039, | |
| "rewards/cosine_scaled_reward/std": 0.42993852496147156, | |
| "rewards/format_reward/mean": 0.5625, | |
| "rewards/format_reward/std": 0.5, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2017.0, | |
| "completions/mean_length": 1704.8125, | |
| "completions/mean_terminated_length": 1132.8333740234375, | |
| "completions/min_length": 524.0, | |
| "completions/min_terminated_length": 524.0, | |
| "epoch": 0.06742857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.17114725708961487, | |
| "learning_rate": 9.046048391230247e-07, | |
| "loss": 0.0061, | |
| "num_tokens": 7313839.0, | |
| "reward": 0.15319865942001343, | |
| "reward_std": 0.6165874004364014, | |
| "rewards/cosine_scaled_reward/mean": -0.11871317774057388, | |
| "rewards/cosine_scaled_reward/std": 0.3659735918045044, | |
| "rewards/format_reward/mean": 0.390625, | |
| "rewards/format_reward/std": 0.4917473793029785, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.71875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1896.0, | |
| "completions/mean_length": 1767.53125, | |
| "completions/mean_terminated_length": 1050.77783203125, | |
| "completions/min_length": 459.0, | |
| "completions/min_terminated_length": 459.0, | |
| "epoch": 0.06857142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1782463639974594, | |
| "learning_rate": 8.997156826556369e-07, | |
| "loss": 0.0527, | |
| "num_tokens": 7437849.0, | |
| "reward": -0.09879650175571442, | |
| "reward_std": 0.6538424491882324, | |
| "rewards/cosine_scaled_reward/mean": -0.2212732434272766, | |
| "rewards/cosine_scaled_reward/std": 0.3128809630870819, | |
| "rewards/format_reward/mean": 0.34375, | |
| "rewards/format_reward/std": 0.4787135720252991, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1989.0, | |
| "completions/mean_length": 1799.53125, | |
| "completions/mean_terminated_length": 1054.125, | |
| "completions/min_length": 420.0, | |
| "completions/min_terminated_length": 420.0, | |
| "epoch": 0.06971428571428571, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.19245384633541107, | |
| "learning_rate": 8.9471999940354e-07, | |
| "loss": 0.0533, | |
| "num_tokens": 7564539.0, | |
| "reward": 0.1226256862282753, | |
| "reward_std": 0.7401602268218994, | |
| "rewards/cosine_scaled_reward/mean": -0.11056216061115265, | |
| "rewards/cosine_scaled_reward/std": 0.314616322517395, | |
| "rewards/format_reward/mean": 0.34375, | |
| "rewards/format_reward/std": 0.4787135720252991, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.453125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2029.0, | |
| "completions/mean_length": 1458.1875, | |
| "completions/mean_terminated_length": 969.4857177734375, | |
| "completions/min_length": 364.0, | |
| "completions/min_terminated_length": 364.0, | |
| "epoch": 0.07085714285714285, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.17495828866958618, | |
| "learning_rate": 8.896193111002475e-07, | |
| "loss": 0.0785, | |
| "num_tokens": 7668095.0, | |
| "reward": 0.6185990571975708, | |
| "reward_std": 0.6951406598091125, | |
| "rewards/cosine_scaled_reward/mean": 0.020237013697624207, | |
| "rewards/cosine_scaled_reward/std": 0.42793402075767517, | |
| "rewards/format_reward/mean": 0.578125, | |
| "rewards/format_reward/std": 0.49776285886764526, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1880.0, | |
| "completions/mean_length": 1369.65625, | |
| "completions/mean_terminated_length": 962.6500244140625, | |
| "completions/min_length": 384.0, | |
| "completions/min_terminated_length": 384.0, | |
| "epoch": 0.072, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.17925356328487396, | |
| "learning_rate": 8.844151714648274e-07, | |
| "loss": 0.0802, | |
| "num_tokens": 7766009.0, | |
| "reward": 0.588592529296875, | |
| "reward_std": 0.7614073753356934, | |
| "rewards/cosine_scaled_reward/mean": -0.0260162390768528, | |
| "rewards/cosine_scaled_reward/std": 0.47686251997947693, | |
| "rewards/format_reward/mean": 0.640625, | |
| "rewards/format_reward/std": 0.4836103618144989, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.4375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1849.0, | |
| "completions/mean_length": 1493.0625, | |
| "completions/mean_terminated_length": 1061.4444580078125, | |
| "completions/min_length": 421.0, | |
| "completions/min_terminated_length": 421.0, | |
| "epoch": 0.07314285714285715, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.197045236825943, | |
| "learning_rate": 8.791091657286267e-07, | |
| "loss": 0.1112, | |
| "num_tokens": 7872517.0, | |
| "reward": 0.4587404727935791, | |
| "reward_std": 0.7483726739883423, | |
| "rewards/cosine_scaled_reward/mean": -0.08312976360321045, | |
| "rewards/cosine_scaled_reward/std": 0.3704431354999542, | |
| "rewards/format_reward/mean": 0.625, | |
| "rewards/format_reward/std": 0.48795005679130554, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1805.0, | |
| "completions/mean_length": 1561.09375, | |
| "completions/mean_terminated_length": 749.5833740234375, | |
| "completions/min_length": 276.0, | |
| "completions/min_terminated_length": 276.0, | |
| "epoch": 0.07428571428571429, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.17185057699680328, | |
| "learning_rate": 8.737029101523929e-07, | |
| "loss": 0.0652, | |
| "num_tokens": 7983131.0, | |
| "reward": -0.022998124361038208, | |
| "reward_std": 0.5443873405456543, | |
| "rewards/cosine_scaled_reward/mean": -0.2146240472793579, | |
| "rewards/cosine_scaled_reward/std": 0.39696088433265686, | |
| "rewards/format_reward/mean": 0.40625, | |
| "rewards/format_reward/std": 0.49501484632492065, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.3125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1822.0, | |
| "completions/mean_length": 1160.96875, | |
| "completions/mean_terminated_length": 757.7727661132812, | |
| "completions/min_length": 245.0, | |
| "completions/min_terminated_length": 245.0, | |
| "epoch": 0.07542857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.15682660043239594, | |
| "learning_rate": 8.681980515339463e-07, | |
| "loss": 0.0317, | |
| "num_tokens": 8067665.0, | |
| "reward": 0.7723344564437866, | |
| "reward_std": 0.5304180979728699, | |
| "rewards/cosine_scaled_reward/mean": 0.03460472822189331, | |
| "rewards/cosine_scaled_reward/std": 0.47199109196662903, | |
| "rewards/format_reward/mean": 0.703125, | |
| "rewards/format_reward/std": 0.4604927599430084, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.71875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1529.0, | |
| "completions/mean_length": 1760.03125, | |
| "completions/mean_terminated_length": 1024.111083984375, | |
| "completions/min_length": 494.0, | |
| "completions/min_terminated_length": 494.0, | |
| "epoch": 0.07657142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.18002018332481384, | |
| "learning_rate": 8.625962667065487e-07, | |
| "loss": 0.0536, | |
| "num_tokens": 8191043.0, | |
| "reward": -0.27919694781303406, | |
| "reward_std": 0.3664131164550781, | |
| "rewards/cosine_scaled_reward/mean": -0.2724109888076782, | |
| "rewards/cosine_scaled_reward/std": 0.16395430266857147, | |
| "rewards/format_reward/mean": 0.265625, | |
| "rewards/format_reward/std": 0.44515693187713623, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1307.0, | |
| "completions/mean_length": 917.34375, | |
| "completions/mean_terminated_length": 600.760009765625, | |
| "completions/min_length": 295.0, | |
| "completions/min_terminated_length": 295.0, | |
| "epoch": 0.07771428571428571, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13612917065620422, | |
| "learning_rate": 8.568992620281243e-07, | |
| "loss": 0.0077, | |
| "num_tokens": 8259009.0, | |
| "reward": 0.6957368850708008, | |
| "reward_std": 0.5402743816375732, | |
| "rewards/cosine_scaled_reward/mean": -0.04275655001401901, | |
| "rewards/cosine_scaled_reward/std": 0.434044748544693, | |
| "rewards/format_reward/mean": 0.78125, | |
| "rewards/format_reward/std": 0.4166666865348816, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.3125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1526.0, | |
| "completions/mean_length": 1233.78125, | |
| "completions/mean_terminated_length": 863.6818237304688, | |
| "completions/min_length": 343.0, | |
| "completions/min_terminated_length": 343.0, | |
| "epoch": 0.07885714285714286, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.19043830037117004, | |
| "learning_rate": 8.511087728614862e-07, | |
| "loss": 0.0558, | |
| "num_tokens": 8348315.0, | |
| "reward": 0.21049074828624725, | |
| "reward_std": 0.5405222177505493, | |
| "rewards/cosine_scaled_reward/mean": -0.24631711840629578, | |
| "rewards/cosine_scaled_reward/std": 0.2778205871582031, | |
| "rewards/format_reward/mean": 0.703125, | |
| "rewards/format_reward/std": 0.4604927599430084, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.796875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2019.0, | |
| "completions/mean_length": 1808.9375, | |
| "completions/mean_terminated_length": 871.0769653320312, | |
| "completions/min_length": 513.0, | |
| "completions/min_terminated_length": 513.0, | |
| "epoch": 0.08, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.19704341888427734, | |
| "learning_rate": 8.452265630457282e-07, | |
| "loss": 0.0391, | |
| "num_tokens": 8475543.0, | |
| "reward": -0.18982277810573578, | |
| "reward_std": 0.5247766971588135, | |
| "rewards/cosine_scaled_reward/mean": -0.2355363965034485, | |
| "rewards/cosine_scaled_reward/std": 0.3067134916782379, | |
| "rewards/format_reward/mean": 0.28125, | |
| "rewards/format_reward/std": 0.4531635046005249, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.609375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1800.0, | |
| "completions/mean_length": 1563.21875, | |
| "completions/mean_terminated_length": 806.9599609375, | |
| "completions/min_length": 315.0, | |
| "completions/min_terminated_length": 315.0, | |
| "epoch": 0.08114285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.18498718738555908, | |
| "learning_rate": 8.392544243589427e-07, | |
| "loss": 0.016, | |
| "num_tokens": 8586309.0, | |
| "reward": 0.19864726066589355, | |
| "reward_std": 0.576451301574707, | |
| "rewards/cosine_scaled_reward/mean": -0.10380134731531143, | |
| "rewards/cosine_scaled_reward/std": 0.476872056722641, | |
| "rewards/format_reward/mean": 0.40625, | |
| "rewards/format_reward/std": 0.49501484632492065, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.4375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2048.0, | |
| "completions/mean_length": 1406.0625, | |
| "completions/mean_terminated_length": 906.7777709960938, | |
| "completions/min_length": 353.0, | |
| "completions/min_terminated_length": 353.0, | |
| "epoch": 0.08228571428571428, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.17368191480636597, | |
| "learning_rate": 8.331941759724268e-07, | |
| "loss": 0.0237, | |
| "num_tokens": 8686649.0, | |
| "reward": 0.22483232617378235, | |
| "reward_std": 0.45926159620285034, | |
| "rewards/cosine_scaled_reward/mean": -0.20789632201194763, | |
| "rewards/cosine_scaled_reward/std": 0.294547975063324, | |
| "rewards/format_reward/mean": 0.640625, | |
| "rewards/format_reward/std": 0.4836103618144989, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.8125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1930.0, | |
| "completions/mean_length": 1912.875, | |
| "completions/mean_terminated_length": 1327.3333740234375, | |
| "completions/min_length": 878.0, | |
| "completions/min_terminated_length": 878.0, | |
| "epoch": 0.08342857142857144, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.20081810653209686, | |
| "learning_rate": 8.270476638965461e-07, | |
| "loss": 0.0223, | |
| "num_tokens": 8819801.0, | |
| "reward": -0.18328779935836792, | |
| "reward_std": 0.5305245518684387, | |
| "rewards/cosine_scaled_reward/mean": -0.20883139967918396, | |
| "rewards/cosine_scaled_reward/std": 0.2695733904838562, | |
| "rewards/format_reward/mean": 0.234375, | |
| "rewards/format_reward/std": 0.42695629596710205, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.5, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2012.0, | |
| "completions/mean_length": 1517.875, | |
| "completions/mean_terminated_length": 987.75, | |
| "completions/min_length": 560.0, | |
| "completions/min_terminated_length": 560.0, | |
| "epoch": 0.08457142857142858, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1813385784626007, | |
| "learning_rate": 8.208167604184217e-07, | |
| "loss": 0.085, | |
| "num_tokens": 8926873.0, | |
| "reward": 0.46356096863746643, | |
| "reward_std": 0.6926693916320801, | |
| "rewards/cosine_scaled_reward/mean": -0.018219511955976486, | |
| "rewards/cosine_scaled_reward/std": 0.47079169750213623, | |
| "rewards/format_reward/mean": 0.5, | |
| "rewards/format_reward/std": 0.5039526224136353, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.46875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1907.0, | |
| "completions/mean_length": 1515.734375, | |
| "completions/mean_terminated_length": 1046.0882568359375, | |
| "completions/min_length": 374.0, | |
| "completions/min_terminated_length": 374.0, | |
| "epoch": 0.08571428571428572, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.18714174628257751, | |
| "learning_rate": 8.145033635316128e-07, | |
| "loss": 0.0989, | |
| "num_tokens": 9034840.0, | |
| "reward": 0.5457433462142944, | |
| "reward_std": 0.6619582176208496, | |
| "rewards/cosine_scaled_reward/mean": -0.00837831199169159, | |
| "rewards/cosine_scaled_reward/std": 0.5059990882873535, | |
| "rewards/format_reward/mean": 0.5625, | |
| "rewards/format_reward/std": 0.5, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.4375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1222.0, | |
| "completions/mean_length": 1340.484375, | |
| "completions/mean_terminated_length": 790.1944580078125, | |
| "completions/min_length": 407.0, | |
| "completions/min_terminated_length": 407.0, | |
| "epoch": 0.08685714285714285, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.17108581960201263, | |
| "learning_rate": 8.081093963579707e-07, | |
| "loss": 0.0209, | |
| "num_tokens": 9131031.0, | |
| "reward": 0.19882698357105255, | |
| "reward_std": 0.5817238092422485, | |
| "rewards/cosine_scaled_reward/mean": -0.18964898586273193, | |
| "rewards/cosine_scaled_reward/std": 0.3000561594963074, | |
| "rewards/format_reward/mean": 0.578125, | |
| "rewards/format_reward/std": 0.49776285886764526, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.46875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1755.0, | |
| "completions/mean_length": 1518.765625, | |
| "completions/mean_terminated_length": 1051.7940673828125, | |
| "completions/min_length": 641.0, | |
| "completions/min_terminated_length": 641.0, | |
| "epoch": 0.088, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1759587675333023, | |
| "learning_rate": 8.01636806561836e-07, | |
| "loss": 0.0869, | |
| "num_tokens": 9239808.0, | |
| "reward": 0.2113216668367386, | |
| "reward_std": 0.5600536465644836, | |
| "rewards/cosine_scaled_reward/mean": -0.1599641740322113, | |
| "rewards/cosine_scaled_reward/std": 0.33541423082351685, | |
| "rewards/format_reward/mean": 0.53125, | |
| "rewards/format_reward/std": 0.5029674172401428, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.640625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1659.0, | |
| "completions/mean_length": 1656.15625, | |
| "completions/mean_terminated_length": 957.6522216796875, | |
| "completions/min_length": 530.0, | |
| "completions/min_terminated_length": 530.0, | |
| "epoch": 0.08914285714285715, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.17662394046783447, | |
| "learning_rate": 7.950875657567621e-07, | |
| "loss": 0.0177, | |
| "num_tokens": 9356522.0, | |
| "reward": 0.25513648986816406, | |
| "reward_std": 0.5462654829025269, | |
| "rewards/cosine_scaled_reward/mean": -0.05993174761533737, | |
| "rewards/cosine_scaled_reward/std": 0.4486319124698639, | |
| "rewards/format_reward/mean": 0.375, | |
| "rewards/format_reward/std": 0.48795005679130554, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1957.0, | |
| "completions/mean_length": 1289.359375, | |
| "completions/mean_terminated_length": 834.1749877929688, | |
| "completions/min_length": 229.0, | |
| "completions/min_terminated_length": 229.0, | |
| "epoch": 0.09028571428571429, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.15610884130001068, | |
| "learning_rate": 7.884636689049422e-07, | |
| "loss": 0.026, | |
| "num_tokens": 9449137.0, | |
| "reward": 0.4372347593307495, | |
| "reward_std": 0.5517712831497192, | |
| "rewards/cosine_scaled_reward/mean": -0.10950762033462524, | |
| "rewards/cosine_scaled_reward/std": 0.3864338994026184, | |
| "rewards/format_reward/mean": 0.65625, | |
| "rewards/format_reward/std": 0.4787135720252991, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.546875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1902.0, | |
| "completions/mean_length": 1623.5, | |
| "completions/mean_terminated_length": 1111.17236328125, | |
| "completions/min_length": 538.0, | |
| "completions/min_terminated_length": 538.0, | |
| "epoch": 0.09142857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.18003858625888824, | |
| "learning_rate": 7.817671337095244e-07, | |
| "loss": 0.017, | |
| "num_tokens": 9563433.0, | |
| "reward": 0.11363417655229568, | |
| "reward_std": 0.5530154705047607, | |
| "rewards/cosine_scaled_reward/mean": -0.16974541544914246, | |
| "rewards/cosine_scaled_reward/std": 0.3006208539009094, | |
| "rewards/format_reward/mean": 0.453125, | |
| "rewards/format_reward/std": 0.501733124256134, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.40625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1987.0, | |
| "completions/mean_length": 1432.125, | |
| "completions/mean_terminated_length": 1010.7368774414062, | |
| "completions/min_length": 287.0, | |
| "completions/min_terminated_length": 287.0, | |
| "epoch": 0.09257142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2004833072423935, | |
| "learning_rate": 7.75e-07, | |
| "loss": 0.0603, | |
| "num_tokens": 9666361.0, | |
| "reward": 0.512394905090332, | |
| "reward_std": 0.7596394419670105, | |
| "rewards/cosine_scaled_reward/mean": -0.05630255863070488, | |
| "rewards/cosine_scaled_reward/std": 0.43662360310554504, | |
| "rewards/format_reward/mean": 0.625, | |
| "rewards/format_reward/std": 0.48795005679130554, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.34375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1721.0, | |
| "completions/mean_length": 1341.03125, | |
| "completions/mean_terminated_length": 970.7142944335938, | |
| "completions/min_length": 431.0, | |
| "completions/min_terminated_length": 431.0, | |
| "epoch": 0.09371428571428571, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1563584953546524, | |
| "learning_rate": 7.681643291108517e-07, | |
| "loss": 0.0182, | |
| "num_tokens": 9762515.0, | |
| "reward": 0.746865451335907, | |
| "reward_std": 0.571272611618042, | |
| "rewards/cosine_scaled_reward/mean": 0.037495262920856476, | |
| "rewards/cosine_scaled_reward/std": 0.5523709654808044, | |
| "rewards/format_reward/mean": 0.671875, | |
| "rewards/format_reward/std": 0.4732423722743988, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.421875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1739.0, | |
| "completions/mean_length": 1357.640625, | |
| "completions/mean_terminated_length": 853.8648681640625, | |
| "completions/min_length": 455.0, | |
| "completions/min_terminated_length": 455.0, | |
| "epoch": 0.09485714285714286, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.17990301549434662, | |
| "learning_rate": 7.612622032536507e-07, | |
| "loss": 0.0313, | |
| "num_tokens": 9860492.0, | |
| "reward": 0.4607480764389038, | |
| "reward_std": 0.4022068381309509, | |
| "rewards/cosine_scaled_reward/mean": -0.0665009543299675, | |
| "rewards/cosine_scaled_reward/std": 0.36611077189445496, | |
| "rewards/format_reward/mean": 0.59375, | |
| "rewards/format_reward/std": 0.49501484632492065, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.390625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2044.0, | |
| "completions/mean_length": 1392.609375, | |
| "completions/mean_terminated_length": 972.4871826171875, | |
| "completions/min_length": 395.0, | |
| "completions/min_terminated_length": 395.0, | |
| "epoch": 0.096, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.16545262932777405, | |
| "learning_rate": 7.54295724882796e-07, | |
| "loss": 0.016, | |
| "num_tokens": 9960315.0, | |
| "reward": 0.3932368755340576, | |
| "reward_std": 0.662509024143219, | |
| "rewards/cosine_scaled_reward/mean": -0.11588154733181, | |
| "rewards/cosine_scaled_reward/std": 0.428220272064209, | |
| "rewards/format_reward/mean": 0.625, | |
| "rewards/format_reward/std": 0.48795005679130554, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2034.0, | |
| "completions/mean_length": 1427.515625, | |
| "completions/mean_terminated_length": 1220.6875, | |
| "completions/min_length": 234.0, | |
| "completions/min_terminated_length": 234.0, | |
| "epoch": 0.09714285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.14229631423950195, | |
| "learning_rate": 7.472670160550848e-07, | |
| "loss": -0.0247, | |
| "num_tokens": 10061996.0, | |
| "reward": 0.7478936910629272, | |
| "reward_std": 0.8706425428390503, | |
| "rewards/cosine_scaled_reward/mean": -0.00886566936969757, | |
| "rewards/cosine_scaled_reward/std": 0.4233645796775818, | |
| "rewards/format_reward/mean": 0.765625, | |
| "rewards/format_reward/std": 0.42695629596710205, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.390625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2023.0, | |
| "completions/mean_length": 1459.28125, | |
| "completions/mean_terminated_length": 1081.8974609375, | |
| "completions/min_length": 496.0, | |
| "completions/min_terminated_length": 496.0, | |
| "epoch": 0.09828571428571428, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.19291459023952484, | |
| "learning_rate": 7.401782177833147e-07, | |
| "loss": 0.0182, | |
| "num_tokens": 10166246.0, | |
| "reward": 0.30948999524116516, | |
| "reward_std": 0.55961012840271, | |
| "rewards/cosine_scaled_reward/mean": -0.1733800172805786, | |
| "rewards/cosine_scaled_reward/std": 0.30220499634742737, | |
| "rewards/format_reward/mean": 0.65625, | |
| "rewards/format_reward/std": 0.4787135720252991, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1866.0, | |
| "completions/mean_length": 1140.828125, | |
| "completions/mean_terminated_length": 972.8333129882812, | |
| "completions/min_length": 353.0, | |
| "completions/min_terminated_length": 353.0, | |
| "epoch": 0.09942857142857142, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.14790062606334686, | |
| "learning_rate": 7.330314893841101e-07, | |
| "loss": 0.0321, | |
| "num_tokens": 10249379.0, | |
| "reward": 0.429340660572052, | |
| "reward_std": 0.47173961997032166, | |
| "rewards/cosine_scaled_reward/mean": -0.207204669713974, | |
| "rewards/cosine_scaled_reward/std": 0.27721449732780457, | |
| "rewards/format_reward/mean": 0.84375, | |
| "rewards/format_reward/std": 0.36596253514289856, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.328125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1519.0, | |
| "completions/mean_length": 1258.484375, | |
| "completions/mean_terminated_length": 872.906982421875, | |
| "completions/min_length": 246.0, | |
| "completions/min_terminated_length": 246.0, | |
| "epoch": 0.10057142857142858, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1598745733499527, | |
| "learning_rate": 7.258290078201731e-07, | |
| "loss": 0.0482, | |
| "num_tokens": 10340434.0, | |
| "reward": 0.8419445157051086, | |
| "reward_std": 0.7817317247390747, | |
| "rewards/cosine_scaled_reward/mean": 0.06940975040197372, | |
| "rewards/cosine_scaled_reward/std": 0.4935828149318695, | |
| "rewards/format_reward/mean": 0.703125, | |
| "rewards/format_reward/std": 0.4604927599430084, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.234375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1919.0, | |
| "completions/mean_length": 1373.1875, | |
| "completions/mean_terminated_length": 1166.6121826171875, | |
| "completions/min_length": 675.0, | |
| "completions/min_terminated_length": 675.0, | |
| "epoch": 0.10171428571428572, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1521584838628769, | |
| "learning_rate": 7.185729670371604e-07, | |
| "loss": 0.0547, | |
| "num_tokens": 10439318.0, | |
| "reward": 0.648002028465271, | |
| "reward_std": 0.6874127984046936, | |
| "rewards/cosine_scaled_reward/mean": -0.0978739783167839, | |
| "rewards/cosine_scaled_reward/std": 0.41632241010665894, | |
| "rewards/format_reward/mean": 0.84375, | |
| "rewards/format_reward/std": 0.36596253514289856, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.234375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2023.0, | |
| "completions/mean_length": 1239.703125, | |
| "completions/mean_terminated_length": 992.2652587890625, | |
| "completions/min_length": 162.0, | |
| "completions/min_terminated_length": 162.0, | |
| "epoch": 0.10285714285714286, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1843656599521637, | |
| "learning_rate": 7.11265577295385e-07, | |
| "loss": 0.0371, | |
| "num_tokens": 10528659.0, | |
| "reward": 0.4645897150039673, | |
| "reward_std": 0.6535974740982056, | |
| "rewards/cosine_scaled_reward/mean": -0.15833015739917755, | |
| "rewards/cosine_scaled_reward/std": 0.3457205295562744, | |
| "rewards/format_reward/mean": 0.78125, | |
| "rewards/format_reward/std": 0.4166666865348816, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.453125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1961.0, | |
| "completions/mean_length": 1610.09375, | |
| "completions/mean_terminated_length": 1247.2572021484375, | |
| "completions/min_length": 205.0, | |
| "completions/min_terminated_length": 205.0, | |
| "epoch": 0.104, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.17640981078147888, | |
| "learning_rate": 7.039090644965509e-07, | |
| "loss": 0.0305, | |
| "num_tokens": 10642273.0, | |
| "reward": 0.5222002267837524, | |
| "reward_std": 0.9113218784332275, | |
| "rewards/cosine_scaled_reward/mean": -0.05139988660812378, | |
| "rewards/cosine_scaled_reward/std": 0.4710950553417206, | |
| "rewards/format_reward/mean": 0.625, | |
| "rewards/format_reward/std": 0.48795005679130554, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.34375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2040.0, | |
| "completions/mean_length": 1320.09375, | |
| "completions/mean_terminated_length": 938.8095703125, | |
| "completions/min_length": 332.0, | |
| "completions/min_terminated_length": 332.0, | |
| "epoch": 0.10514285714285715, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.15313342213630676, | |
| "learning_rate": 6.965056695057204e-07, | |
| "loss": 0.0055, | |
| "num_tokens": 10736751.0, | |
| "reward": 0.4166978597640991, | |
| "reward_std": 0.6364502310752869, | |
| "rewards/cosine_scaled_reward/mean": -0.13540107011795044, | |
| "rewards/cosine_scaled_reward/std": 0.3054071068763733, | |
| "rewards/format_reward/mean": 0.6875, | |
| "rewards/format_reward/std": 0.467176616191864, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.703125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1964.0, | |
| "completions/mean_length": 1770.671875, | |
| "completions/mean_terminated_length": 1113.8421630859375, | |
| "completions/min_length": 632.0, | |
| "completions/min_terminated_length": 632.0, | |
| "epoch": 0.10628571428571429, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.21292737126350403, | |
| "learning_rate": 6.890576474687263e-07, | |
| "loss": 0.0675, | |
| "num_tokens": 10861418.0, | |
| "reward": -0.15841422975063324, | |
| "reward_std": 0.4093279242515564, | |
| "rewards/cosine_scaled_reward/mean": -0.24326962232589722, | |
| "rewards/cosine_scaled_reward/std": 0.16840828955173492, | |
| "rewards/format_reward/mean": 0.328125, | |
| "rewards/format_reward/std": 0.4732423722743988, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.53125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2042.0, | |
| "completions/mean_length": 1532.65625, | |
| "completions/mean_terminated_length": 948.6000366210938, | |
| "completions/min_length": 511.0, | |
| "completions/min_terminated_length": 511.0, | |
| "epoch": 0.10742857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.20122359693050385, | |
| "learning_rate": 6.815672671252315e-07, | |
| "loss": 0.0623, | |
| "num_tokens": 10969276.0, | |
| "reward": 0.20252148807048798, | |
| "reward_std": 0.345744788646698, | |
| "rewards/cosine_scaled_reward/mean": -0.1409267634153366, | |
| "rewards/cosine_scaled_reward/std": 0.4320366382598877, | |
| "rewards/format_reward/mean": 0.484375, | |
| "rewards/format_reward/std": 0.5037065148353577, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.453125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1745.0, | |
| "completions/mean_length": 1530.03125, | |
| "completions/mean_terminated_length": 1100.857177734375, | |
| "completions/min_length": 700.0, | |
| "completions/min_terminated_length": 700.0, | |
| "epoch": 0.10857142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.16728746891021729, | |
| "learning_rate": 6.740368101176495e-07, | |
| "loss": 0.0592, | |
| "num_tokens": 11077726.0, | |
| "reward": 0.05856095254421234, | |
| "reward_std": 0.5498154163360596, | |
| "rewards/cosine_scaled_reward/mean": -0.25196951627731323, | |
| "rewards/cosine_scaled_reward/std": 0.27556198835372925, | |
| "rewards/format_reward/mean": 0.5625, | |
| "rewards/format_reward/std": 0.5, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.234375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1633.0, | |
| "completions/mean_length": 1279.6875, | |
| "completions/mean_terminated_length": 1044.48974609375, | |
| "completions/min_length": 452.0, | |
| "completions/min_terminated_length": 452.0, | |
| "epoch": 0.10971428571428571, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1483285129070282, | |
| "learning_rate": 6.664685702961344e-07, | |
| "loss": 0.0161, | |
| "num_tokens": 11170762.0, | |
| "reward": 0.8373413681983948, | |
| "reward_std": 0.4410895109176636, | |
| "rewards/cosine_scaled_reward/mean": 0.01242067664861679, | |
| "rewards/cosine_scaled_reward/std": 0.46624863147735596, | |
| "rewards/format_reward/mean": 0.8125, | |
| "rewards/format_reward/std": 0.39339789748191833, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.28125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1905.0, | |
| "completions/mean_length": 1312.640625, | |
| "completions/mean_terminated_length": 1024.891357421875, | |
| "completions/min_length": 343.0, | |
| "completions/min_terminated_length": 343.0, | |
| "epoch": 0.11085714285714286, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.16424083709716797, | |
| "learning_rate": 6.588648530198504e-07, | |
| "loss": 0.0535, | |
| "num_tokens": 11265035.0, | |
| "reward": 0.5586233139038086, | |
| "reward_std": 0.7126098871231079, | |
| "rewards/cosine_scaled_reward/mean": -0.11131332814693451, | |
| "rewards/cosine_scaled_reward/std": 0.3577263653278351, | |
| "rewards/format_reward/mean": 0.78125, | |
| "rewards/format_reward/std": 0.4166666865348816, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.34375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1940.0, | |
| "completions/mean_length": 1376.53125, | |
| "completions/mean_terminated_length": 1024.8095703125, | |
| "completions/min_length": 372.0, | |
| "completions/min_terminated_length": 372.0, | |
| "epoch": 0.112, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.17384155094623566, | |
| "learning_rate": 6.512279744547392e-07, | |
| "loss": 0.0164, | |
| "num_tokens": 11364197.0, | |
| "reward": 0.6794039607048035, | |
| "reward_std": 0.4869590997695923, | |
| "rewards/cosine_scaled_reward/mean": -0.02748553454875946, | |
| "rewards/cosine_scaled_reward/std": 0.45645180344581604, | |
| "rewards/format_reward/mean": 0.734375, | |
| "rewards/format_reward/std": 0.44515693187713623, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.296875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2048.0, | |
| "completions/mean_length": 1280.09375, | |
| "completions/mean_terminated_length": 955.86669921875, | |
| "completions/min_length": 415.0, | |
| "completions/min_terminated_length": 415.0, | |
| "epoch": 0.11314285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.17342573404312134, | |
| "learning_rate": 6.435602608679916e-07, | |
| "loss": 0.0816, | |
| "num_tokens": 11457291.0, | |
| "reward": 0.7432724237442017, | |
| "reward_std": 0.6722617745399475, | |
| "rewards/cosine_scaled_reward/mean": -0.003363795578479767, | |
| "rewards/cosine_scaled_reward/std": 0.4415356516838074, | |
| "rewards/format_reward/mean": 0.75, | |
| "rewards/format_reward/std": 0.4364357888698578, | |
| "step": 99 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1965.0, | |
| "completions/mean_length": 1247.765625, | |
| "completions/mean_terminated_length": 1063.09619140625, | |
| "completions/min_length": 520.0, | |
| "completions/min_terminated_length": 520.0, | |
| "epoch": 0.11428571428571428, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.15301530063152313, | |
| "learning_rate": 6.358640479194451e-07, | |
| "loss": 0.0125, | |
| "num_tokens": 11546860.0, | |
| "reward": 0.803851306438446, | |
| "reward_std": 0.6947499513626099, | |
| "rewards/cosine_scaled_reward/mean": -0.019949357956647873, | |
| "rewards/cosine_scaled_reward/std": 0.4705973267555237, | |
| "rewards/format_reward/mean": 0.84375, | |
| "rewards/format_reward/std": 0.36596253514289856, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1911.0, | |
| "completions/mean_length": 1269.671875, | |
| "completions/mean_terminated_length": 1125.5369873046875, | |
| "completions/min_length": 485.0, | |
| "completions/min_terminated_length": 485.0, | |
| "epoch": 0.11542857142857142, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1690932661294937, | |
| "learning_rate": 6.281416799501187e-07, | |
| "loss": 0.013, | |
| "num_tokens": 11639551.0, | |
| "reward": 0.6836185455322266, | |
| "reward_std": 0.5046678781509399, | |
| "rewards/cosine_scaled_reward/mean": -0.08787819743156433, | |
| "rewards/cosine_scaled_reward/std": 0.40181559324264526, | |
| "rewards/format_reward/mean": 0.859375, | |
| "rewards/format_reward/std": 0.3503824472427368, | |
| "step": 101 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1872.0, | |
| "completions/mean_length": 1174.265625, | |
| "completions/mean_terminated_length": 1012.4629516601562, | |
| "completions/min_length": 340.0, | |
| "completions/min_terminated_length": 340.0, | |
| "epoch": 0.11657142857142858, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.16043449938297272, | |
| "learning_rate": 6.203955092681039e-07, | |
| "loss": 0.032, | |
| "num_tokens": 11724856.0, | |
| "reward": 0.67606520652771, | |
| "reward_std": 0.6234960556030273, | |
| "rewards/cosine_scaled_reward/mean": -0.09165491163730621, | |
| "rewards/cosine_scaled_reward/std": 0.37837859988212585, | |
| "rewards/format_reward/mean": 0.859375, | |
| "rewards/format_reward/std": 0.3503824472427368, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.203125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1953.0, | |
| "completions/mean_length": 1157.8125, | |
| "completions/mean_terminated_length": 930.9019775390625, | |
| "completions/min_length": 247.0, | |
| "completions/min_terminated_length": 247.0, | |
| "epoch": 0.11771428571428572, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1574372500181198, | |
| "learning_rate": 6.126278954320294e-07, | |
| "loss": 0.0249, | |
| "num_tokens": 11809308.0, | |
| "reward": 0.4326379895210266, | |
| "reward_std": 0.5444109439849854, | |
| "rewards/cosine_scaled_reward/mean": -0.1977435052394867, | |
| "rewards/cosine_scaled_reward/std": 0.3261271119117737, | |
| "rewards/format_reward/mean": 0.828125, | |
| "rewards/format_reward/std": 0.38025420904159546, | |
| "step": 103 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.34375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1988.0, | |
| "completions/mean_length": 1330.484375, | |
| "completions/mean_terminated_length": 954.6428833007812, | |
| "completions/min_length": 371.0, | |
| "completions/min_terminated_length": 371.0, | |
| "epoch": 0.11885714285714286, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.18211479485034943, | |
| "learning_rate": 6.048412045323164e-07, | |
| "loss": 0.0439, | |
| "num_tokens": 11904923.0, | |
| "reward": 0.4620264172554016, | |
| "reward_std": 0.5293800830841064, | |
| "rewards/cosine_scaled_reward/mean": -0.12054930627346039, | |
| "rewards/cosine_scaled_reward/std": 0.3497216999530792, | |
| "rewards/format_reward/mean": 0.703125, | |
| "rewards/format_reward/std": 0.4604927599430084, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2003.0, | |
| "completions/mean_length": 1224.859375, | |
| "completions/mean_terminated_length": 994.3800048828125, | |
| "completions/min_length": 499.0, | |
| "completions/min_terminated_length": 499.0, | |
| "epoch": 0.12, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1528584063053131, | |
| "learning_rate": 5.97037808470444e-07, | |
| "loss": 0.0233, | |
| "num_tokens": 11994602.0, | |
| "reward": 0.7569347620010376, | |
| "reward_std": 0.6899948120117188, | |
| "rewards/cosine_scaled_reward/mean": -0.027782641351222992, | |
| "rewards/cosine_scaled_reward/std": 0.5096075534820557, | |
| "rewards/format_reward/mean": 0.8125, | |
| "rewards/format_reward/std": 0.39339789748191833, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.296875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2046.0, | |
| "completions/mean_length": 1239.625, | |
| "completions/mean_terminated_length": 898.3111572265625, | |
| "completions/min_length": 293.0, | |
| "completions/min_terminated_length": 293.0, | |
| "epoch": 0.12114285714285715, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1494080275297165, | |
| "learning_rate": 5.892200842364462e-07, | |
| "loss": 0.0226, | |
| "num_tokens": 12084770.0, | |
| "reward": 1.043992519378662, | |
| "reward_std": 0.7194849252700806, | |
| "rewards/cosine_scaled_reward/mean": 0.13918372988700867, | |
| "rewards/cosine_scaled_reward/std": 0.46339961886405945, | |
| "rewards/format_reward/mean": 0.765625, | |
| "rewards/format_reward/std": 0.42695629596710205, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2036.0, | |
| "completions/mean_length": 1139.515625, | |
| "completions/mean_terminated_length": 971.2777709960938, | |
| "completions/min_length": 401.0, | |
| "completions/min_terminated_length": 401.0, | |
| "epoch": 0.12228571428571429, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1769389808177948, | |
| "learning_rate": 5.813904131848564e-07, | |
| "loss": 0.0041, | |
| "num_tokens": 12168851.0, | |
| "reward": 0.46204712986946106, | |
| "reward_std": 0.5935191512107849, | |
| "rewards/cosine_scaled_reward/mean": -0.18303894996643066, | |
| "rewards/cosine_scaled_reward/std": 0.30380427837371826, | |
| "rewards/format_reward/mean": 0.828125, | |
| "rewards/format_reward/std": 0.38025420904159546, | |
| "step": 107 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1791.0, | |
| "completions/mean_length": 1382.375, | |
| "completions/mean_terminated_length": 983.0, | |
| "completions/min_length": 348.0, | |
| "completions/min_terminated_length": 348.0, | |
| "epoch": 0.12342857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.16175994277000427, | |
| "learning_rate": 5.735511803093248e-07, | |
| "loss": 0.0651, | |
| "num_tokens": 12267683.0, | |
| "reward": 0.3516117334365845, | |
| "reward_std": 0.7561339735984802, | |
| "rewards/cosine_scaled_reward/mean": -0.17575663328170776, | |
| "rewards/cosine_scaled_reward/std": 0.35719168186187744, | |
| "rewards/format_reward/mean": 0.703125, | |
| "rewards/format_reward/std": 0.4604927599430084, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.34375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1965.0, | |
| "completions/mean_length": 1392.828125, | |
| "completions/mean_terminated_length": 1049.642822265625, | |
| "completions/min_length": 543.0, | |
| "completions/min_terminated_length": 543.0, | |
| "epoch": 0.12457142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.16766144335269928, | |
| "learning_rate": 5.657047735161255e-07, | |
| "loss": 0.0149, | |
| "num_tokens": 12368072.0, | |
| "reward": 0.7171763181686401, | |
| "reward_std": 0.4656876027584076, | |
| "rewards/cosine_scaled_reward/mean": 0.007025681436061859, | |
| "rewards/cosine_scaled_reward/std": 0.4227021336555481, | |
| "rewards/format_reward/mean": 0.703125, | |
| "rewards/format_reward/std": 0.4604927599430084, | |
| "step": 109 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.28125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1881.0, | |
| "completions/mean_length": 1239.796875, | |
| "completions/mean_terminated_length": 923.5435180664062, | |
| "completions/min_length": 194.0, | |
| "completions/min_terminated_length": 194.0, | |
| "epoch": 0.12571428571428572, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.14556895196437836, | |
| "learning_rate": 5.578535828967777e-07, | |
| "loss": 0.0102, | |
| "num_tokens": 12458195.0, | |
| "reward": 0.3774694800376892, | |
| "reward_std": 0.654548704624176, | |
| "rewards/cosine_scaled_reward/mean": -0.1784527748823166, | |
| "rewards/cosine_scaled_reward/std": 0.331076443195343, | |
| "rewards/format_reward/mean": 0.734375, | |
| "rewards/format_reward/std": 0.44515693187713623, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1808.0, | |
| "completions/mean_length": 1170.40625, | |
| "completions/mean_terminated_length": 1045.0357666015625, | |
| "completions/min_length": 508.0, | |
| "completions/min_terminated_length": 508.0, | |
| "epoch": 0.12685714285714286, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1637505292892456, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.0741, | |
| "num_tokens": 12543221.0, | |
| "reward": 0.6489747762680054, | |
| "reward_std": 0.654654860496521, | |
| "rewards/cosine_scaled_reward/mean": -0.12082511186599731, | |
| "rewards/cosine_scaled_reward/std": 0.34212014079093933, | |
| "rewards/format_reward/mean": 0.890625, | |
| "rewards/format_reward/std": 0.3145764470100403, | |
| "step": 111 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.421875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1831.0, | |
| "completions/mean_length": 1475.09375, | |
| "completions/mean_terminated_length": 1057.027099609375, | |
| "completions/min_length": 351.0, | |
| "completions/min_terminated_length": 351.0, | |
| "epoch": 0.128, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1809089183807373, | |
| "learning_rate": 5.421464171032224e-07, | |
| "loss": 0.0187, | |
| "num_tokens": 12648723.0, | |
| "reward": 0.6672303676605225, | |
| "reward_std": 0.7431913614273071, | |
| "rewards/cosine_scaled_reward/mean": 0.01330268383026123, | |
| "rewards/cosine_scaled_reward/std": 0.4883294403553009, | |
| "rewards/format_reward/mean": 0.640625, | |
| "rewards/format_reward/std": 0.4836103618144989, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1961.0, | |
| "completions/mean_length": 1056.4375, | |
| "completions/mean_terminated_length": 914.7857666015625, | |
| "completions/min_length": 340.0, | |
| "completions/min_terminated_length": 340.0, | |
| "epoch": 0.12914285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1637895107269287, | |
| "learning_rate": 5.342952264838747e-07, | |
| "loss": 0.0644, | |
| "num_tokens": 12726631.0, | |
| "reward": 0.6515660881996155, | |
| "reward_std": 0.5848349332809448, | |
| "rewards/cosine_scaled_reward/mean": -0.11952944099903107, | |
| "rewards/cosine_scaled_reward/std": 0.4174686074256897, | |
| "rewards/format_reward/mean": 0.890625, | |
| "rewards/format_reward/std": 0.3145764470100403, | |
| "step": 113 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1711.0, | |
| "completions/mean_length": 1097.859375, | |
| "completions/mean_terminated_length": 962.1250610351562, | |
| "completions/min_length": 141.0, | |
| "completions/min_terminated_length": 141.0, | |
| "epoch": 0.13028571428571428, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1319950371980667, | |
| "learning_rate": 5.264488196906752e-07, | |
| "loss": 0.0226, | |
| "num_tokens": 12806742.0, | |
| "reward": 0.6668691635131836, | |
| "reward_std": 0.6580501794815063, | |
| "rewards/cosine_scaled_reward/mean": -0.1431279182434082, | |
| "rewards/cosine_scaled_reward/std": 0.378142774105072, | |
| "rewards/format_reward/mean": 0.953125, | |
| "rewards/format_reward/std": 0.21304203569889069, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.3125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1997.0, | |
| "completions/mean_length": 1380.0625, | |
| "completions/mean_terminated_length": 1076.45458984375, | |
| "completions/min_length": 322.0, | |
| "completions/min_terminated_length": 322.0, | |
| "epoch": 0.13142857142857142, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1882496327161789, | |
| "learning_rate": 5.186095868151436e-07, | |
| "loss": 0.04, | |
| "num_tokens": 12906282.0, | |
| "reward": 0.43996283411979675, | |
| "reward_std": 0.6503387093544006, | |
| "rewards/cosine_scaled_reward/mean": -0.13939358294010162, | |
| "rewards/cosine_scaled_reward/std": 0.3781909942626953, | |
| "rewards/format_reward/mean": 0.71875, | |
| "rewards/format_reward/std": 0.4531635046005249, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1852.0, | |
| "completions/mean_length": 1364.125, | |
| "completions/mean_terminated_length": 953.7999877929688, | |
| "completions/min_length": 343.0, | |
| "completions/min_terminated_length": 343.0, | |
| "epoch": 0.13257142857142856, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1886526346206665, | |
| "learning_rate": 5.107799157635538e-07, | |
| "loss": 0.1079, | |
| "num_tokens": 13004970.0, | |
| "reward": 0.5331847667694092, | |
| "reward_std": 0.7935209274291992, | |
| "rewards/cosine_scaled_reward/mean": -0.08497010916471481, | |
| "rewards/cosine_scaled_reward/std": 0.4501515328884125, | |
| "rewards/format_reward/mean": 0.703125, | |
| "rewards/format_reward/std": 0.4604927599430084, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1944.0, | |
| "completions/mean_length": 1136.4375, | |
| "completions/mean_terminated_length": 1024.4912109375, | |
| "completions/min_length": 505.0, | |
| "completions/min_terminated_length": 505.0, | |
| "epoch": 0.1337142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1523984968662262, | |
| "learning_rate": 5.02962191529556e-07, | |
| "loss": 0.0671, | |
| "num_tokens": 13088726.0, | |
| "reward": 0.7468037009239197, | |
| "reward_std": 0.7615803480148315, | |
| "rewards/cosine_scaled_reward/mean": -0.08753564208745956, | |
| "rewards/cosine_scaled_reward/std": 0.44001707434654236, | |
| "rewards/format_reward/mean": 0.921875, | |
| "rewards/format_reward/std": 0.27048972249031067, | |
| "step": 117 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.28125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1918.0, | |
| "completions/mean_length": 1274.578125, | |
| "completions/mean_terminated_length": 971.934814453125, | |
| "completions/min_length": 474.0, | |
| "completions/min_terminated_length": 474.0, | |
| "epoch": 0.13485714285714287, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.14875811338424683, | |
| "learning_rate": 4.951587954676837e-07, | |
| "loss": 0.0166, | |
| "num_tokens": 13180835.0, | |
| "reward": 0.6522707939147949, | |
| "reward_std": 0.589940071105957, | |
| "rewards/cosine_scaled_reward/mean": -0.041052110493183136, | |
| "rewards/cosine_scaled_reward/std": 0.5126345157623291, | |
| "rewards/format_reward/mean": 0.734375, | |
| "rewards/format_reward/std": 0.44515693187713623, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1962.0, | |
| "completions/mean_length": 1070.359375, | |
| "completions/mean_terminated_length": 844.7500610351562, | |
| "completions/min_length": 333.0, | |
| "completions/min_terminated_length": 333.0, | |
| "epoch": 0.136, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.15418609976768494, | |
| "learning_rate": 4.873721045679706e-07, | |
| "loss": 0.0135, | |
| "num_tokens": 13259746.0, | |
| "reward": 0.8924436569213867, | |
| "reward_std": 0.6925675272941589, | |
| "rewards/cosine_scaled_reward/mean": 0.00872182846069336, | |
| "rewards/cosine_scaled_reward/std": 0.49334391951560974, | |
| "rewards/format_reward/mean": 0.875, | |
| "rewards/format_reward/std": 0.3333333432674408, | |
| "step": 119 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1899.0, | |
| "completions/mean_length": 868.90625, | |
| "completions/mean_terminated_length": 850.1905517578125, | |
| "completions/min_length": 137.0, | |
| "completions/min_terminated_length": 137.0, | |
| "epoch": 0.13714285714285715, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12889909744262695, | |
| "learning_rate": 4.79604490731896e-07, | |
| "loss": -0.0038, | |
| "num_tokens": 13325812.0, | |
| "reward": 0.833016574382782, | |
| "reward_std": 0.6583147048950195, | |
| "rewards/cosine_scaled_reward/mean": -0.08349171280860901, | |
| "rewards/cosine_scaled_reward/std": 0.43434619903564453, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1890.0, | |
| "completions/mean_length": 792.359375, | |
| "completions/mean_terminated_length": 730.6065063476562, | |
| "completions/min_length": 246.0, | |
| "completions/min_terminated_length": 246.0, | |
| "epoch": 0.1382857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.14095140993595123, | |
| "learning_rate": 4.7185832004988133e-07, | |
| "loss": 0.0479, | |
| "num_tokens": 13386219.0, | |
| "reward": 1.289149284362793, | |
| "reward_std": 0.6984070539474487, | |
| "rewards/cosine_scaled_reward/mean": 0.16801217198371887, | |
| "rewards/cosine_scaled_reward/std": 0.5607498288154602, | |
| "rewards/format_reward/mean": 0.953125, | |
| "rewards/format_reward/std": 0.21304203569889069, | |
| "step": 121 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.328125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1805.0, | |
| "completions/mean_length": 1384.5, | |
| "completions/mean_terminated_length": 1060.465087890625, | |
| "completions/min_length": 334.0, | |
| "completions/min_terminated_length": 334.0, | |
| "epoch": 0.13942857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.17925438284873962, | |
| "learning_rate": 4.641359520805548e-07, | |
| "loss": 0.0462, | |
| "num_tokens": 13486387.0, | |
| "reward": 0.4263126254081726, | |
| "reward_std": 0.6481289267539978, | |
| "rewards/cosine_scaled_reward/mean": -0.1462186872959137, | |
| "rewards/cosine_scaled_reward/std": 0.3027765154838562, | |
| "rewards/format_reward/mean": 0.71875, | |
| "rewards/format_reward/std": 0.4531635046005249, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1833.0, | |
| "completions/mean_length": 1208.140625, | |
| "completions/mean_terminated_length": 1014.3269653320312, | |
| "completions/min_length": 519.0, | |
| "completions/min_terminated_length": 519.0, | |
| "epoch": 0.14057142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.14084899425506592, | |
| "learning_rate": 4.5643973913200837e-07, | |
| "loss": -0.0087, | |
| "num_tokens": 13573940.0, | |
| "reward": 0.5345523357391357, | |
| "reward_std": 0.35669955611228943, | |
| "rewards/cosine_scaled_reward/mean": -0.16241134703159332, | |
| "rewards/cosine_scaled_reward/std": 0.3877701759338379, | |
| "rewards/format_reward/mean": 0.859375, | |
| "rewards/format_reward/std": 0.3503824472427368, | |
| "step": 123 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2019.0, | |
| "completions/mean_length": 1185.703125, | |
| "completions/mean_terminated_length": 1026.0185546875, | |
| "completions/min_length": 455.0, | |
| "completions/min_terminated_length": 455.0, | |
| "epoch": 0.1417142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13726963102817535, | |
| "learning_rate": 4.4877202554526084e-07, | |
| "loss": 0.0309, | |
| "num_tokens": 13660777.0, | |
| "reward": 0.802190363407135, | |
| "reward_std": 0.6432194709777832, | |
| "rewards/cosine_scaled_reward/mean": -0.044217295944690704, | |
| "rewards/cosine_scaled_reward/std": 0.4381820559501648, | |
| "rewards/format_reward/mean": 0.890625, | |
| "rewards/format_reward/std": 0.3145764470100403, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.203125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2033.0, | |
| "completions/mean_length": 1272.40625, | |
| "completions/mean_terminated_length": 1074.7059326171875, | |
| "completions/min_length": 451.0, | |
| "completions/min_terminated_length": 451.0, | |
| "epoch": 0.14285714285714285, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1567695438861847, | |
| "learning_rate": 4.4113514698014953e-07, | |
| "loss": -0.0098, | |
| "num_tokens": 13753139.0, | |
| "reward": 0.8288029432296753, | |
| "reward_std": 0.6727226972579956, | |
| "rewards/cosine_scaled_reward/mean": 0.0003389418125152588, | |
| "rewards/cosine_scaled_reward/std": 0.501276433467865, | |
| "rewards/format_reward/mean": 0.828125, | |
| "rewards/format_reward/std": 0.38025420904159546, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.234375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1738.0, | |
| "completions/mean_length": 1239.6875, | |
| "completions/mean_terminated_length": 992.244873046875, | |
| "completions/min_length": 609.0, | |
| "completions/min_terminated_length": 609.0, | |
| "epoch": 0.144, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1638517528772354, | |
| "learning_rate": 4.3353142970386557e-07, | |
| "loss": 0.0357, | |
| "num_tokens": 13843775.0, | |
| "reward": 0.8066681623458862, | |
| "reward_std": 0.8093670010566711, | |
| "rewards/cosine_scaled_reward/mean": -0.0029159002006053925, | |
| "rewards/cosine_scaled_reward/std": 0.40039899945259094, | |
| "rewards/format_reward/mean": 0.8125, | |
| "rewards/format_reward/std": 0.39339789748191833, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1968.0, | |
| "completions/mean_length": 1259.078125, | |
| "completions/mean_terminated_length": 1038.179931640625, | |
| "completions/min_length": 430.0, | |
| "completions/min_terminated_length": 430.0, | |
| "epoch": 0.14514285714285713, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.18749745190143585, | |
| "learning_rate": 4.2596318988235037e-07, | |
| "loss": 0.1248, | |
| "num_tokens": 13935452.0, | |
| "reward": 0.3689166009426117, | |
| "reward_std": 0.5908951759338379, | |
| "rewards/cosine_scaled_reward/mean": -0.22960419952869415, | |
| "rewards/cosine_scaled_reward/std": 0.2868925929069519, | |
| "rewards/format_reward/mean": 0.828125, | |
| "rewards/format_reward/std": 0.38025420904159546, | |
| "step": 127 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1942.0, | |
| "completions/mean_length": 1131.578125, | |
| "completions/mean_terminated_length": 1070.4833984375, | |
| "completions/min_length": 449.0, | |
| "completions/min_terminated_length": 449.0, | |
| "epoch": 0.1462857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.14950093626976013, | |
| "learning_rate": 4.1843273287476854e-07, | |
| "loss": 0.0065, | |
| "num_tokens": 14018225.0, | |
| "reward": 0.9214786291122437, | |
| "reward_std": 0.7154524922370911, | |
| "rewards/cosine_scaled_reward/mean": -0.01582319289445877, | |
| "rewards/cosine_scaled_reward/std": 0.47363659739494324, | |
| "rewards/format_reward/mean": 0.953125, | |
| "rewards/format_reward/std": 0.21304203569889069, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.234375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1927.0, | |
| "completions/mean_length": 1407.71875, | |
| "completions/mean_terminated_length": 1211.7142333984375, | |
| "completions/min_length": 455.0, | |
| "completions/min_terminated_length": 455.0, | |
| "epoch": 0.14742857142857144, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.16768500208854675, | |
| "learning_rate": 4.1094235253127374e-07, | |
| "loss": 0.0736, | |
| "num_tokens": 14119023.0, | |
| "reward": 0.5007042288780212, | |
| "reward_std": 0.6594030261039734, | |
| "rewards/cosine_scaled_reward/mean": -0.14808538556098938, | |
| "rewards/cosine_scaled_reward/std": 0.3597432076931, | |
| "rewards/format_reward/mean": 0.796875, | |
| "rewards/format_reward/std": 0.40550529956817627, | |
| "step": 129 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.359375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1705.0, | |
| "completions/mean_length": 1331.546875, | |
| "completions/mean_terminated_length": 929.6340942382812, | |
| "completions/min_length": 364.0, | |
| "completions/min_terminated_length": 364.0, | |
| "epoch": 0.14857142857142858, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1533743441104889, | |
| "learning_rate": 4.034943304942796e-07, | |
| "loss": 0.065, | |
| "num_tokens": 14214746.0, | |
| "reward": 0.18521776795387268, | |
| "reward_std": 0.4527278244495392, | |
| "rewards/cosine_scaled_reward/mean": -0.25895363092422485, | |
| "rewards/cosine_scaled_reward/std": 0.2297503650188446, | |
| "rewards/format_reward/mean": 0.703125, | |
| "rewards/format_reward/std": 0.4604927599430084, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1983.0, | |
| "completions/mean_length": 1240.703125, | |
| "completions/mean_terminated_length": 971.6041870117188, | |
| "completions/min_length": 348.0, | |
| "completions/min_terminated_length": 348.0, | |
| "epoch": 0.14971428571428572, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.16382953524589539, | |
| "learning_rate": 3.9609093550344907e-07, | |
| "loss": 0.0716, | |
| "num_tokens": 14303887.0, | |
| "reward": 1.0216246843338013, | |
| "reward_std": 0.8127155303955078, | |
| "rewards/cosine_scaled_reward/mean": 0.10456232726573944, | |
| "rewards/cosine_scaled_reward/std": 0.48323893547058105, | |
| "rewards/format_reward/mean": 0.8125, | |
| "rewards/format_reward/std": 0.39339789748191833, | |
| "step": 131 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.40625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1521.0, | |
| "completions/mean_length": 1340.046875, | |
| "completions/mean_terminated_length": 855.6578979492188, | |
| "completions/min_length": 297.0, | |
| "completions/min_terminated_length": 297.0, | |
| "epoch": 0.15085714285714286, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1577305793762207, | |
| "learning_rate": 3.8873442270461485e-07, | |
| "loss": 0.0288, | |
| "num_tokens": 14400714.0, | |
| "reward": 0.4232841432094574, | |
| "reward_std": 0.6519888639450073, | |
| "rewards/cosine_scaled_reward/mean": -0.1008579432964325, | |
| "rewards/cosine_scaled_reward/std": 0.42636433243751526, | |
| "rewards/format_reward/mean": 0.625, | |
| "rewards/format_reward/std": 0.48795005679130554, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.234375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1879.0, | |
| "completions/mean_length": 1292.671875, | |
| "completions/mean_terminated_length": 1061.448974609375, | |
| "completions/min_length": 459.0, | |
| "completions/min_terminated_length": 459.0, | |
| "epoch": 0.152, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1743871122598648, | |
| "learning_rate": 3.8142703296283953e-07, | |
| "loss": 0.0212, | |
| "num_tokens": 14494669.0, | |
| "reward": 0.6165566444396973, | |
| "reward_std": 0.5660312175750732, | |
| "rewards/cosine_scaled_reward/mean": -0.08234670013189316, | |
| "rewards/cosine_scaled_reward/std": 0.31525060534477234, | |
| "rewards/format_reward/mean": 0.78125, | |
| "rewards/format_reward/std": 0.4166666865348816, | |
| "step": 133 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2039.0, | |
| "completions/mean_length": 1096.953125, | |
| "completions/mean_terminated_length": 998.5689697265625, | |
| "completions/min_length": 364.0, | |
| "completions/min_terminated_length": 364.0, | |
| "epoch": 0.15314285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.16208074986934662, | |
| "learning_rate": 3.7417099217982686e-07, | |
| "loss": 0.0283, | |
| "num_tokens": 14575442.0, | |
| "reward": 1.0213682651519775, | |
| "reward_std": 0.6743905544281006, | |
| "rewards/cosine_scaled_reward/mean": 0.041934188455343246, | |
| "rewards/cosine_scaled_reward/std": 0.5223273038864136, | |
| "rewards/format_reward/mean": 0.9375, | |
| "rewards/format_reward/std": 0.24397502839565277, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1720.0, | |
| "completions/mean_length": 861.328125, | |
| "completions/mean_terminated_length": 738.5689697265625, | |
| "completions/min_length": 284.0, | |
| "completions/min_terminated_length": 284.0, | |
| "epoch": 0.15428571428571428, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.15264089405536652, | |
| "learning_rate": 3.6696851061588994e-07, | |
| "loss": 0.0412, | |
| "num_tokens": 14641039.0, | |
| "reward": 1.173776388168335, | |
| "reward_std": 0.741400957107544, | |
| "rewards/cosine_scaled_reward/mean": 0.12595069408416748, | |
| "rewards/cosine_scaled_reward/std": 0.5099307298660278, | |
| "rewards/format_reward/mean": 0.921875, | |
| "rewards/format_reward/std": 0.27048972249031067, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2047.0, | |
| "completions/mean_length": 1234.375, | |
| "completions/mean_terminated_length": 1118.1429443359375, | |
| "completions/min_length": 429.0, | |
| "completions/min_terminated_length": 429.0, | |
| "epoch": 0.15542857142857142, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.14350080490112305, | |
| "learning_rate": 3.5982178221668533e-07, | |
| "loss": 0.0484, | |
| "num_tokens": 14730711.0, | |
| "reward": 0.7637453675270081, | |
| "reward_std": 0.6790728569030762, | |
| "rewards/cosine_scaled_reward/mean": -0.10250230133533478, | |
| "rewards/cosine_scaled_reward/std": 0.4094173312187195, | |
| "rewards/format_reward/mean": 0.96875, | |
| "rewards/format_reward/std": 0.17536810040473938, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1865.0, | |
| "completions/mean_length": 1222.53125, | |
| "completions/mean_terminated_length": 1152.5762939453125, | |
| "completions/min_length": 555.0, | |
| "completions/min_terminated_length": 555.0, | |
| "epoch": 0.15657142857142858, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1349533200263977, | |
| "learning_rate": 3.5273298394491515e-07, | |
| "loss": -0.0333, | |
| "num_tokens": 14819561.0, | |
| "reward": 0.6314640641212463, | |
| "reward_std": 0.6037685871124268, | |
| "rewards/cosine_scaled_reward/mean": -0.16083045303821564, | |
| "rewards/cosine_scaled_reward/std": 0.3636666238307953, | |
| "rewards/format_reward/mean": 0.953125, | |
| "rewards/format_reward/std": 0.21304203569889069, | |
| "step": 137 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1955.0, | |
| "completions/mean_length": 1173.65625, | |
| "completions/mean_terminated_length": 1048.75, | |
| "completions/min_length": 320.0, | |
| "completions/min_terminated_length": 320.0, | |
| "epoch": 0.15771428571428572, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13684934377670288, | |
| "learning_rate": 3.45704275117204e-07, | |
| "loss": 0.0176, | |
| "num_tokens": 14905987.0, | |
| "reward": 0.8157724142074585, | |
| "reward_std": 0.7757042646408081, | |
| "rewards/cosine_scaled_reward/mean": -0.04523882642388344, | |
| "rewards/cosine_scaled_reward/std": 0.4742158055305481, | |
| "rewards/format_reward/mean": 0.90625, | |
| "rewards/format_reward/std": 0.29378482699394226, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2009.0, | |
| "completions/mean_length": 1268.453125, | |
| "completions/mean_terminated_length": 1124.0926513671875, | |
| "completions/min_length": 291.0, | |
| "completions/min_terminated_length": 291.0, | |
| "epoch": 0.15885714285714286, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.14245833456516266, | |
| "learning_rate": 3.387377967463493e-07, | |
| "loss": -0.0152, | |
| "num_tokens": 14997808.0, | |
| "reward": 0.7688822746276855, | |
| "reward_std": 0.5957136750221252, | |
| "rewards/cosine_scaled_reward/mean": -0.09212135523557663, | |
| "rewards/cosine_scaled_reward/std": 0.42672204971313477, | |
| "rewards/format_reward/mean": 0.953125, | |
| "rewards/format_reward/std": 0.21304203569889069, | |
| "step": 139 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.140625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1748.0, | |
| "completions/mean_length": 1121.875, | |
| "completions/mean_terminated_length": 970.3272705078125, | |
| "completions/min_length": 471.0, | |
| "completions/min_terminated_length": 471.0, | |
| "epoch": 0.16, | |
| "frac_reward_zero_std": 0.125, | |
| "grad_norm": 0.14092370867729187, | |
| "learning_rate": 3.3183567088914833e-07, | |
| "loss": 0.0336, | |
| "num_tokens": 15079832.0, | |
| "reward": 0.6852799654006958, | |
| "reward_std": 0.412535697221756, | |
| "rewards/cosine_scaled_reward/mean": -0.0948600098490715, | |
| "rewards/cosine_scaled_reward/std": 0.46610429883003235, | |
| "rewards/format_reward/mean": 0.875, | |
| "rewards/format_reward/std": 0.3333333432674408, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1891.0, | |
| "completions/mean_length": 1039.921875, | |
| "completions/mean_terminated_length": 954.4915161132812, | |
| "completions/min_length": 442.0, | |
| "completions/min_terminated_length": 442.0, | |
| "epoch": 0.16114285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1313817948102951, | |
| "learning_rate": 3.250000000000001e-07, | |
| "loss": 0.0563, | |
| "num_tokens": 15156947.0, | |
| "reward": 1.052842140197754, | |
| "reward_std": 0.7119845151901245, | |
| "rewards/cosine_scaled_reward/mean": 0.03423358500003815, | |
| "rewards/cosine_scaled_reward/std": 0.4524931311607361, | |
| "rewards/format_reward/mean": 0.984375, | |
| "rewards/format_reward/std": 0.125, | |
| "step": 141 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.296875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1770.0, | |
| "completions/mean_length": 1376.234375, | |
| "completions/mean_terminated_length": 1092.5999755859375, | |
| "completions/min_length": 326.0, | |
| "completions/min_terminated_length": 326.0, | |
| "epoch": 0.16228571428571428, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1682240515947342, | |
| "learning_rate": 3.182328662904756e-07, | |
| "loss": 0.0936, | |
| "num_tokens": 15255530.0, | |
| "reward": 0.44548213481903076, | |
| "reward_std": 0.7928640842437744, | |
| "rewards/cosine_scaled_reward/mean": -0.18350891768932343, | |
| "rewards/cosine_scaled_reward/std": 0.36820653080940247, | |
| "rewards/format_reward/mean": 0.8125, | |
| "rewards/format_reward/std": 0.39339789748191833, | |
| "step": 142 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1989.0, | |
| "completions/mean_length": 1057.03125, | |
| "completions/mean_terminated_length": 1008.2950439453125, | |
| "completions/min_length": 416.0, | |
| "completions/min_terminated_length": 416.0, | |
| "epoch": 0.16342857142857142, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.15701259672641754, | |
| "learning_rate": 3.115363310950578e-07, | |
| "loss": 0.032, | |
| "num_tokens": 15333996.0, | |
| "reward": 0.681940495967865, | |
| "reward_std": 0.6061316728591919, | |
| "rewards/cosine_scaled_reward/mean": -0.1434047520160675, | |
| "rewards/cosine_scaled_reward/std": 0.31647545099258423, | |
| "rewards/format_reward/mean": 0.96875, | |
| "rewards/format_reward/std": 0.17536810040473938, | |
| "step": 143 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.203125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1879.0, | |
| "completions/mean_length": 1187.875, | |
| "completions/mean_terminated_length": 968.6275024414062, | |
| "completions/min_length": 316.0, | |
| "completions/min_terminated_length": 316.0, | |
| "epoch": 0.16457142857142856, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1424182802438736, | |
| "learning_rate": 3.0491243424323783e-07, | |
| "loss": 0.0431, | |
| "num_tokens": 15421508.0, | |
| "reward": 1.0751841068267822, | |
| "reward_std": 0.7788275480270386, | |
| "rewards/cosine_scaled_reward/mean": 0.12352952361106873, | |
| "rewards/cosine_scaled_reward/std": 0.5238592028617859, | |
| "rewards/format_reward/mean": 0.828125, | |
| "rewards/format_reward/std": 0.38025420904159546, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1908.0, | |
| "completions/mean_length": 908.546875, | |
| "completions/mean_terminated_length": 852.5081787109375, | |
| "completions/min_length": 261.0, | |
| "completions/min_terminated_length": 261.0, | |
| "epoch": 0.1657142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1289522349834442, | |
| "learning_rate": 2.9836319343816397e-07, | |
| "loss": 0.0264, | |
| "num_tokens": 15489599.0, | |
| "reward": 1.0159393548965454, | |
| "reward_std": 0.6956236958503723, | |
| "rewards/cosine_scaled_reward/mean": 0.023594655096530914, | |
| "rewards/cosine_scaled_reward/std": 0.472563236951828, | |
| "rewards/format_reward/mean": 0.96875, | |
| "rewards/format_reward/std": 0.17536810040473938, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1742.0, | |
| "completions/mean_length": 1074.765625, | |
| "completions/mean_terminated_length": 1009.8833618164062, | |
| "completions/min_length": 457.0, | |
| "completions/min_terminated_length": 457.0, | |
| "epoch": 0.16685714285714287, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12770003080368042, | |
| "learning_rate": 2.918906036420294e-07, | |
| "loss": 0.0393, | |
| "num_tokens": 15569000.0, | |
| "reward": 0.5655175447463989, | |
| "reward_std": 0.5674481987953186, | |
| "rewards/cosine_scaled_reward/mean": -0.19380369782447815, | |
| "rewards/cosine_scaled_reward/std": 0.32235828042030334, | |
| "rewards/format_reward/mean": 0.953125, | |
| "rewards/format_reward/std": 0.21304203569889069, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.171875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2029.0, | |
| "completions/mean_length": 1272.375, | |
| "completions/mean_terminated_length": 1111.396240234375, | |
| "completions/min_length": 399.0, | |
| "completions/min_terminated_length": 399.0, | |
| "epoch": 0.168, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.160521000623703, | |
| "learning_rate": 2.854966364683872e-07, | |
| "loss": 0.017, | |
| "num_tokens": 15661216.0, | |
| "reward": 0.5459345579147339, | |
| "reward_std": 0.7825783491134644, | |
| "rewards/cosine_scaled_reward/mean": -0.14890772104263306, | |
| "rewards/cosine_scaled_reward/std": 0.4268314838409424, | |
| "rewards/format_reward/mean": 0.84375, | |
| "rewards/format_reward/std": 0.36596253514289856, | |
| "step": 147 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1729.0, | |
| "completions/mean_length": 1068.40625, | |
| "completions/mean_terminated_length": 948.1052856445312, | |
| "completions/min_length": 388.0, | |
| "completions/min_terminated_length": 388.0, | |
| "epoch": 0.16914285714285715, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13231147825717926, | |
| "learning_rate": 2.791832395815782e-07, | |
| "loss": 0.0355, | |
| "num_tokens": 15740778.0, | |
| "reward": 0.8093540668487549, | |
| "reward_std": 0.5906412601470947, | |
| "rewards/cosine_scaled_reward/mean": -0.08751046657562256, | |
| "rewards/cosine_scaled_reward/std": 0.40702494978904724, | |
| "rewards/format_reward/mean": 0.984375, | |
| "rewards/format_reward/std": 0.125, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1865.0, | |
| "completions/mean_length": 1392.28125, | |
| "completions/mean_terminated_length": 998.8500366210938, | |
| "completions/min_length": 559.0, | |
| "completions/min_terminated_length": 559.0, | |
| "epoch": 0.1702857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1449200063943863, | |
| "learning_rate": 2.729523361034538e-07, | |
| "loss": 0.0033, | |
| "num_tokens": 15841780.0, | |
| "reward": 0.4228026866912842, | |
| "reward_std": 0.745114266872406, | |
| "rewards/cosine_scaled_reward/mean": -0.1323486566543579, | |
| "rewards/cosine_scaled_reward/std": 0.37805312871932983, | |
| "rewards/format_reward/mean": 0.6875, | |
| "rewards/format_reward/std": 0.467176616191864, | |
| "step": 149 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2032.0, | |
| "completions/mean_length": 1004.125, | |
| "completions/mean_terminated_length": 952.7868041992188, | |
| "completions/min_length": 378.0, | |
| "completions/min_terminated_length": 378.0, | |
| "epoch": 0.17142857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13021093606948853, | |
| "learning_rate": 2.6680582402757324e-07, | |
| "loss": 0.0355, | |
| "num_tokens": 15916548.0, | |
| "reward": 0.7599377632141113, | |
| "reward_std": 0.5821801424026489, | |
| "rewards/cosine_scaled_reward/mean": -0.11221860349178314, | |
| "rewards/cosine_scaled_reward/std": 0.3788122236728668, | |
| "rewards/format_reward/mean": 0.984375, | |
| "rewards/format_reward/std": 0.125, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.171875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1859.0, | |
| "completions/mean_length": 1107.015625, | |
| "completions/mean_terminated_length": 911.7169799804688, | |
| "completions/min_length": 179.0, | |
| "completions/min_terminated_length": 179.0, | |
| "epoch": 0.17257142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.14420656859874725, | |
| "learning_rate": 2.6074557564105724e-07, | |
| "loss": 0.0295, | |
| "num_tokens": 15998077.0, | |
| "reward": 1.2211229801177979, | |
| "reward_std": 0.7430520057678223, | |
| "rewards/cosine_scaled_reward/mean": 0.18087396025657654, | |
| "rewards/cosine_scaled_reward/std": 0.5226595401763916, | |
| "rewards/format_reward/mean": 0.859375, | |
| "rewards/format_reward/std": 0.3503824472427368, | |
| "step": 151 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.28125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2028.0, | |
| "completions/mean_length": 1256.234375, | |
| "completions/mean_terminated_length": 946.4130859375, | |
| "completions/min_length": 167.0, | |
| "completions/min_terminated_length": 167.0, | |
| "epoch": 0.1737142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.15784548223018646, | |
| "learning_rate": 2.547734369542718e-07, | |
| "loss": 0.0658, | |
| "num_tokens": 16089140.0, | |
| "reward": 0.6517580151557922, | |
| "reward_std": 0.7057055830955505, | |
| "rewards/cosine_scaled_reward/mean": -0.056933484971523285, | |
| "rewards/cosine_scaled_reward/std": 0.403768390417099, | |
| "rewards/format_reward/mean": 0.765625, | |
| "rewards/format_reward/std": 0.42695629596710205, | |
| "step": 152 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2045.0, | |
| "completions/mean_length": 1169.125, | |
| "completions/mean_terminated_length": 1043.571533203125, | |
| "completions/min_length": 332.0, | |
| "completions/min_terminated_length": 332.0, | |
| "epoch": 0.17485714285714285, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13566994667053223, | |
| "learning_rate": 2.488912271385139e-07, | |
| "loss": 0.0529, | |
| "num_tokens": 16175108.0, | |
| "reward": 0.4462122321128845, | |
| "reward_std": 0.4172056317329407, | |
| "rewards/cosine_scaled_reward/mean": -0.22220639884471893, | |
| "rewards/cosine_scaled_reward/std": 0.19565363228321075, | |
| "rewards/format_reward/mean": 0.890625, | |
| "rewards/format_reward/std": 0.3145764470100403, | |
| "step": 153 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.234375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1909.0, | |
| "completions/mean_length": 1226.546875, | |
| "completions/mean_terminated_length": 975.0816040039062, | |
| "completions/min_length": 443.0, | |
| "completions/min_terminated_length": 443.0, | |
| "epoch": 0.176, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.15715065598487854, | |
| "learning_rate": 2.4310073797187573e-07, | |
| "loss": 0.0615, | |
| "num_tokens": 16264671.0, | |
| "reward": 0.6308701038360596, | |
| "reward_std": 0.6271623373031616, | |
| "rewards/cosine_scaled_reward/mean": -0.11425244808197021, | |
| "rewards/cosine_scaled_reward/std": 0.37054499983787537, | |
| "rewards/format_reward/mean": 0.859375, | |
| "rewards/format_reward/std": 0.3503824472427368, | |
| "step": 154 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.140625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2018.0, | |
| "completions/mean_length": 995.125, | |
| "completions/mean_terminated_length": 822.8363647460938, | |
| "completions/min_length": 273.0, | |
| "completions/min_terminated_length": 273.0, | |
| "epoch": 0.17714285714285713, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13968248665332794, | |
| "learning_rate": 2.374037332934512e-07, | |
| "loss": -0.0003, | |
| "num_tokens": 16338983.0, | |
| "reward": 0.7562404870986938, | |
| "reward_std": 0.70821213722229, | |
| "rewards/cosine_scaled_reward/mean": -0.08281721919775009, | |
| "rewards/cosine_scaled_reward/std": 0.44696903228759766, | |
| "rewards/format_reward/mean": 0.921875, | |
| "rewards/format_reward/std": 0.27048972249031067, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.34375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1915.0, | |
| "completions/mean_length": 1327.6875, | |
| "completions/mean_terminated_length": 950.3809814453125, | |
| "completions/min_length": 284.0, | |
| "completions/min_terminated_length": 284.0, | |
| "epoch": 0.1782857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.15466168522834778, | |
| "learning_rate": 2.3180194846605364e-07, | |
| "loss": 0.0102, | |
| "num_tokens": 16434059.0, | |
| "reward": 0.6187171936035156, | |
| "reward_std": 0.7333636283874512, | |
| "rewards/cosine_scaled_reward/mean": -0.026578888297080994, | |
| "rewards/cosine_scaled_reward/std": 0.49515098333358765, | |
| "rewards/format_reward/mean": 0.671875, | |
| "rewards/format_reward/std": 0.4732423722743988, | |
| "step": 156 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2021.0, | |
| "completions/mean_length": 1259.796875, | |
| "completions/mean_terminated_length": 1077.9039306640625, | |
| "completions/min_length": 471.0, | |
| "completions/min_terminated_length": 471.0, | |
| "epoch": 0.17942857142857144, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.15207421779632568, | |
| "learning_rate": 2.2629708984760706e-07, | |
| "loss": 0.002, | |
| "num_tokens": 16524646.0, | |
| "reward": 0.48799604177474976, | |
| "reward_std": 0.5923628211021423, | |
| "rewards/cosine_scaled_reward/mean": -0.18568949401378632, | |
| "rewards/cosine_scaled_reward/std": 0.28887510299682617, | |
| "rewards/format_reward/mean": 0.859375, | |
| "rewards/format_reward/std": 0.3503824472427368, | |
| "step": 157 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1980.0, | |
| "completions/mean_length": 1055.890625, | |
| "completions/mean_terminated_length": 971.8135375976562, | |
| "completions/min_length": 132.0, | |
| "completions/min_terminated_length": 132.0, | |
| "epoch": 0.18057142857142858, | |
| "frac_reward_zero_std": 0.125, | |
| "grad_norm": 0.13212887942790985, | |
| "learning_rate": 2.2089083427137329e-07, | |
| "loss": 0.0164, | |
| "num_tokens": 16602343.0, | |
| "reward": 0.9118403196334839, | |
| "reward_std": 0.5433474779129028, | |
| "rewards/cosine_scaled_reward/mean": -0.03626735508441925, | |
| "rewards/cosine_scaled_reward/std": 0.5205101370811462, | |
| "rewards/format_reward/mean": 0.984375, | |
| "rewards/format_reward/std": 0.125, | |
| "step": 158 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.140625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2008.0, | |
| "completions/mean_length": 1232.875, | |
| "completions/mean_terminated_length": 1099.4908447265625, | |
| "completions/min_length": 426.0, | |
| "completions/min_terminated_length": 426.0, | |
| "epoch": 0.18171428571428572, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1433304101228714, | |
| "learning_rate": 2.1558482853517253e-07, | |
| "loss": 0.066, | |
| "num_tokens": 16692927.0, | |
| "reward": 0.4464070796966553, | |
| "reward_std": 0.5299515128135681, | |
| "rewards/cosine_scaled_reward/mean": -0.22210896015167236, | |
| "rewards/cosine_scaled_reward/std": 0.27688807249069214, | |
| "rewards/format_reward/mean": 0.890625, | |
| "rewards/format_reward/std": 0.3145764470100403, | |
| "step": 159 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.140625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1549.0, | |
| "completions/mean_length": 1081.25, | |
| "completions/mean_terminated_length": 923.0545043945312, | |
| "completions/min_length": 361.0, | |
| "completions/min_terminated_length": 361.0, | |
| "epoch": 0.18285714285714286, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.15510301291942596, | |
| "learning_rate": 2.1038068889975259e-07, | |
| "loss": 0.071, | |
| "num_tokens": 16773711.0, | |
| "reward": 0.8579483032226562, | |
| "reward_std": 0.7331453561782837, | |
| "rewards/cosine_scaled_reward/mean": -0.024150855839252472, | |
| "rewards/cosine_scaled_reward/std": 0.43525949120521545, | |
| "rewards/format_reward/mean": 0.90625, | |
| "rewards/format_reward/std": 0.29378482699394226, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1896.0, | |
| "completions/mean_length": 1213.984375, | |
| "completions/mean_terminated_length": 1059.5369873046875, | |
| "completions/min_length": 421.0, | |
| "completions/min_terminated_length": 421.0, | |
| "epoch": 0.184, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1579800397157669, | |
| "learning_rate": 2.0528000059645995e-07, | |
| "loss": 0.0563, | |
| "num_tokens": 16861398.0, | |
| "reward": 0.790129542350769, | |
| "reward_std": 0.8190513849258423, | |
| "rewards/cosine_scaled_reward/mean": -0.04243520647287369, | |
| "rewards/cosine_scaled_reward/std": 0.4257972538471222, | |
| "rewards/format_reward/mean": 0.875, | |
| "rewards/format_reward/std": 0.3333333432674408, | |
| "step": 161 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1871.0, | |
| "completions/mean_length": 1173.828125, | |
| "completions/mean_terminated_length": 972.09619140625, | |
| "completions/min_length": 513.0, | |
| "completions/min_terminated_length": 513.0, | |
| "epoch": 0.18514285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.14933519065380096, | |
| "learning_rate": 2.0028431734436308e-07, | |
| "loss": 0.0335, | |
| "num_tokens": 16946827.0, | |
| "reward": 0.6066349744796753, | |
| "reward_std": 0.807995080947876, | |
| "rewards/cosine_scaled_reward/mean": -0.11855749785900116, | |
| "rewards/cosine_scaled_reward/std": 0.40160706639289856, | |
| "rewards/format_reward/mean": 0.84375, | |
| "rewards/format_reward/std": 0.36596253514289856, | |
| "step": 162 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1776.0, | |
| "completions/mean_length": 1202.625, | |
| "completions/mean_terminated_length": 920.8333740234375, | |
| "completions/min_length": 410.0, | |
| "completions/min_terminated_length": 410.0, | |
| "epoch": 0.18628571428571428, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.17073573172092438, | |
| "learning_rate": 1.9539516087697517e-07, | |
| "loss": 0.008, | |
| "num_tokens": 17034979.0, | |
| "reward": 1.164199709892273, | |
| "reward_std": 0.6732690930366516, | |
| "rewards/cosine_scaled_reward/mean": 0.22272484004497528, | |
| "rewards/cosine_scaled_reward/std": 0.5151689648628235, | |
| "rewards/format_reward/mean": 0.71875, | |
| "rewards/format_reward/std": 0.4531635046005249, | |
| "step": 163 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2032.0, | |
| "completions/mean_length": 1055.265625, | |
| "completions/mean_terminated_length": 933.3508911132812, | |
| "completions/min_length": 278.0, | |
| "completions/min_terminated_length": 278.0, | |
| "epoch": 0.18742857142857142, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.14955352246761322, | |
| "learning_rate": 1.9061402047871833e-07, | |
| "loss": 0.0617, | |
| "num_tokens": 17113044.0, | |
| "reward": 1.0745567083358765, | |
| "reward_std": 0.44688692688941956, | |
| "rewards/cosine_scaled_reward/mean": 0.07634085416793823, | |
| "rewards/cosine_scaled_reward/std": 0.45942261815071106, | |
| "rewards/format_reward/mean": 0.921875, | |
| "rewards/format_reward/std": 0.27048972249031067, | |
| "step": 164 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1950.0, | |
| "completions/mean_length": 1082.15625, | |
| "completions/mean_terminated_length": 982.2413940429688, | |
| "completions/min_length": 425.0, | |
| "completions/min_terminated_length": 425.0, | |
| "epoch": 0.18857142857142858, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.15888644754886627, | |
| "learning_rate": 1.8594235253127372e-07, | |
| "loss": 0.0088, | |
| "num_tokens": 17193718.0, | |
| "reward": 0.9942861199378967, | |
| "reward_std": 0.6299077272415161, | |
| "rewards/cosine_scaled_reward/mean": 0.02058056741952896, | |
| "rewards/cosine_scaled_reward/std": 0.46080252528190613, | |
| "rewards/format_reward/mean": 0.953125, | |
| "rewards/format_reward/std": 0.21304203569889069, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1826.0, | |
| "completions/mean_length": 1271.890625, | |
| "completions/mean_terminated_length": 1013.1875, | |
| "completions/min_length": 363.0, | |
| "completions/min_terminated_length": 363.0, | |
| "epoch": 0.18971428571428572, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.14011530578136444, | |
| "learning_rate": 1.8138158006995363e-07, | |
| "loss": 0.0285, | |
| "num_tokens": 17286695.0, | |
| "reward": 0.5431326627731323, | |
| "reward_std": 0.6457577347755432, | |
| "rewards/cosine_scaled_reward/mean": -0.13468365371227264, | |
| "rewards/cosine_scaled_reward/std": 0.3553418219089508, | |
| "rewards/format_reward/mean": 0.8125, | |
| "rewards/format_reward/std": 0.39339789748191833, | |
| "step": 166 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1882.0, | |
| "completions/mean_length": 1100.046875, | |
| "completions/mean_terminated_length": 1001.9827270507812, | |
| "completions/min_length": 476.0, | |
| "completions/min_terminated_length": 476.0, | |
| "epoch": 0.19085714285714286, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.14937180280685425, | |
| "learning_rate": 1.7693309235023127e-07, | |
| "loss": 0.0807, | |
| "num_tokens": 17368642.0, | |
| "reward": 0.6264936923980713, | |
| "reward_std": 0.5748982429504395, | |
| "rewards/cosine_scaled_reward/mean": -0.14769065380096436, | |
| "rewards/cosine_scaled_reward/std": 0.2645467221736908, | |
| "rewards/format_reward/mean": 0.921875, | |
| "rewards/format_reward/std": 0.27048972249031067, | |
| "step": 167 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1992.0, | |
| "completions/mean_length": 1130.84375, | |
| "completions/mean_terminated_length": 1116.2857666015625, | |
| "completions/min_length": 437.0, | |
| "completions/min_terminated_length": 437.0, | |
| "epoch": 0.192, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13785859942436218, | |
| "learning_rate": 1.7259824442455923e-07, | |
| "loss": -0.0247, | |
| "num_tokens": 17451856.0, | |
| "reward": 1.0183875560760498, | |
| "reward_std": 0.7866266965866089, | |
| "rewards/cosine_scaled_reward/mean": 0.017006313428282738, | |
| "rewards/cosine_scaled_reward/std": 0.48554277420043945, | |
| "rewards/format_reward/mean": 0.984375, | |
| "rewards/format_reward/std": 0.125, | |
| "step": 168 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1877.0, | |
| "completions/mean_length": 963.734375, | |
| "completions/mean_terminated_length": 928.758056640625, | |
| "completions/min_length": 498.0, | |
| "completions/min_terminated_length": 498.0, | |
| "epoch": 0.19314285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.132929727435112, | |
| "learning_rate": 1.6837835672960831e-07, | |
| "loss": 0.0452, | |
| "num_tokens": 17524159.0, | |
| "reward": 1.5141942501068115, | |
| "reward_std": 0.7578620910644531, | |
| "rewards/cosine_scaled_reward/mean": 0.26490968465805054, | |
| "rewards/cosine_scaled_reward/std": 0.53211909532547, | |
| "rewards/format_reward/mean": 0.984375, | |
| "rewards/format_reward/std": 0.125, | |
| "step": 169 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1697.0, | |
| "completions/mean_length": 1058.453125, | |
| "completions/mean_terminated_length": 781.3800048828125, | |
| "completions/min_length": 337.0, | |
| "completions/min_terminated_length": 337.0, | |
| "epoch": 0.19428571428571428, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.14723296463489532, | |
| "learning_rate": 1.6427471468404952e-07, | |
| "loss": 0.0659, | |
| "num_tokens": 17601684.0, | |
| "reward": 0.8584200739860535, | |
| "reward_std": 0.4904913902282715, | |
| "rewards/cosine_scaled_reward/mean": 0.007335059344768524, | |
| "rewards/cosine_scaled_reward/std": 0.44158241152763367, | |
| "rewards/format_reward/mean": 0.84375, | |
| "rewards/format_reward/std": 0.36596253514289856, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1732.0, | |
| "completions/mean_length": 1232.28125, | |
| "completions/mean_terminated_length": 960.375, | |
| "completions/min_length": 414.0, | |
| "completions/min_terminated_length": 414.0, | |
| "epoch": 0.19542857142857142, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.16656361520290375, | |
| "learning_rate": 1.6028856829700258e-07, | |
| "loss": -0.0264, | |
| "num_tokens": 17690942.0, | |
| "reward": 0.6898657083511353, | |
| "reward_std": 0.6278946399688721, | |
| "rewards/cosine_scaled_reward/mean": -0.030067168176174164, | |
| "rewards/cosine_scaled_reward/std": 0.45971429347991943, | |
| "rewards/format_reward/mean": 0.75, | |
| "rewards/format_reward/std": 0.4364357888698578, | |
| "step": 171 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.140625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1805.0, | |
| "completions/mean_length": 1040.625, | |
| "completions/mean_terminated_length": 875.7817993164062, | |
| "completions/min_length": 162.0, | |
| "completions/min_terminated_length": 162.0, | |
| "epoch": 0.19657142857142856, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.15691731870174408, | |
| "learning_rate": 1.5642113178727193e-07, | |
| "loss": 0.0625, | |
| "num_tokens": 17768158.0, | |
| "reward": 1.2213534116744995, | |
| "reward_std": 0.6515992879867554, | |
| "rewards/cosine_scaled_reward/mean": 0.17317672073841095, | |
| "rewards/cosine_scaled_reward/std": 0.5265737771987915, | |
| "rewards/format_reward/mean": 0.875, | |
| "rewards/format_reward/std": 0.3333333432674408, | |
| "step": 172 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1873.0, | |
| "completions/mean_length": 899.28125, | |
| "completions/mean_terminated_length": 758.2105102539062, | |
| "completions/min_length": 292.0, | |
| "completions/min_terminated_length": 292.0, | |
| "epoch": 0.1977142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1264735609292984, | |
| "learning_rate": 1.5267358321348285e-07, | |
| "loss": 0.0532, | |
| "num_tokens": 17837024.0, | |
| "reward": 0.7364885210990906, | |
| "reward_std": 0.6678578853607178, | |
| "rewards/cosine_scaled_reward/mean": -0.0848807543516159, | |
| "rewards/cosine_scaled_reward/std": 0.4483066201210022, | |
| "rewards/format_reward/mean": 0.90625, | |
| "rewards/format_reward/std": 0.29378482699394226, | |
| "step": 173 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1777.0, | |
| "completions/max_terminated_length": 1777.0, | |
| "completions/mean_length": 953.328125, | |
| "completions/mean_terminated_length": 953.328125, | |
| "completions/min_length": 508.0, | |
| "completions/min_terminated_length": 508.0, | |
| "epoch": 0.19885714285714284, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13611741364002228, | |
| "learning_rate": 1.4904706411523448e-07, | |
| "loss": 0.0037, | |
| "num_tokens": 17908373.0, | |
| "reward": 0.9751720428466797, | |
| "reward_std": 0.5935230255126953, | |
| "rewards/cosine_scaled_reward/mean": -0.012413978576660156, | |
| "rewards/cosine_scaled_reward/std": 0.4495556354522705, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 174 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1927.0, | |
| "completions/mean_length": 1108.25, | |
| "completions/mean_terminated_length": 974.0000610351562, | |
| "completions/min_length": 390.0, | |
| "completions/min_terminated_length": 390.0, | |
| "epoch": 0.2, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.15151762962341309, | |
| "learning_rate": 1.4554267916537495e-07, | |
| "loss": 0.0591, | |
| "num_tokens": 17990125.0, | |
| "reward": 0.881943941116333, | |
| "reward_std": 0.575822114944458, | |
| "rewards/cosine_scaled_reward/mean": -0.0121530219912529, | |
| "rewards/cosine_scaled_reward/std": 0.49256107211112976, | |
| "rewards/format_reward/mean": 0.90625, | |
| "rewards/format_reward/std": 0.29378482699394226, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.171875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1940.0, | |
| "completions/mean_length": 1168.84375, | |
| "completions/mean_terminated_length": 986.3773803710938, | |
| "completions/min_length": 257.0, | |
| "completions/min_terminated_length": 257.0, | |
| "epoch": 0.20114285714285715, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.15197090804576874, | |
| "learning_rate": 1.4216149583350755e-07, | |
| "loss": 0.0193, | |
| "num_tokens": 18076099.0, | |
| "reward": 0.5906968712806702, | |
| "reward_std": 0.5817879438400269, | |
| "rewards/cosine_scaled_reward/mean": -0.12652656435966492, | |
| "rewards/cosine_scaled_reward/std": 0.3300129473209381, | |
| "rewards/format_reward/mean": 0.84375, | |
| "rewards/format_reward/std": 0.36596253514289856, | |
| "step": 176 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1916.0, | |
| "completions/mean_length": 1038.0625, | |
| "completions/mean_terminated_length": 970.7333984375, | |
| "completions/min_length": 390.0, | |
| "completions/min_terminated_length": 390.0, | |
| "epoch": 0.2022857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.15584589540958405, | |
| "learning_rate": 1.3890454406082956e-07, | |
| "loss": 0.0401, | |
| "num_tokens": 18152759.0, | |
| "reward": 1.083601474761963, | |
| "reward_std": 0.8219331502914429, | |
| "rewards/cosine_scaled_reward/mean": 0.08086325228214264, | |
| "rewards/cosine_scaled_reward/std": 0.47295841574668884, | |
| "rewards/format_reward/mean": 0.921875, | |
| "rewards/format_reward/std": 0.27048972249031067, | |
| "step": 177 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1839.0, | |
| "completions/mean_length": 1048.5625, | |
| "completions/mean_terminated_length": 905.7857666015625, | |
| "completions/min_length": 300.0, | |
| "completions/min_terminated_length": 300.0, | |
| "epoch": 0.20342857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.14128631353378296, | |
| "learning_rate": 1.3577281594640182e-07, | |
| "loss": 0.0298, | |
| "num_tokens": 18231403.0, | |
| "reward": 0.9733308553695679, | |
| "reward_std": 0.6629190444946289, | |
| "rewards/cosine_scaled_reward/mean": 0.02572791464626789, | |
| "rewards/cosine_scaled_reward/std": 0.47114452719688416, | |
| "rewards/format_reward/mean": 0.921875, | |
| "rewards/format_reward/std": 0.27048972249031067, | |
| "step": 178 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.265625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1927.0, | |
| "completions/mean_length": 1293.578125, | |
| "completions/mean_terminated_length": 1020.7020874023438, | |
| "completions/min_length": 245.0, | |
| "completions/min_terminated_length": 245.0, | |
| "epoch": 0.20457142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.16287265717983246, | |
| "learning_rate": 1.3276726544494571e-07, | |
| "loss": 0.0196, | |
| "num_tokens": 18325024.0, | |
| "reward": 0.5872488617897034, | |
| "reward_std": 0.6428846120834351, | |
| "rewards/cosine_scaled_reward/mean": -0.08137557655572891, | |
| "rewards/cosine_scaled_reward/std": 0.3453543484210968, | |
| "rewards/format_reward/mean": 0.75, | |
| "rewards/format_reward/std": 0.4364357888698578, | |
| "step": 179 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1951.0, | |
| "completions/mean_length": 1036.78125, | |
| "completions/mean_terminated_length": 932.1724243164062, | |
| "completions/min_length": 262.0, | |
| "completions/min_terminated_length": 262.0, | |
| "epoch": 0.2057142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.14584973454475403, | |
| "learning_rate": 1.2988880807625927e-07, | |
| "loss": 0.0066, | |
| "num_tokens": 18402554.0, | |
| "reward": 1.347097396850586, | |
| "reward_std": 0.8030112385749817, | |
| "rewards/cosine_scaled_reward/mean": 0.19698619842529297, | |
| "rewards/cosine_scaled_reward/std": 0.48687708377838135, | |
| "rewards/format_reward/mean": 0.953125, | |
| "rewards/format_reward/std": 0.21304203569889069, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.203125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1869.0, | |
| "completions/mean_length": 1165.484375, | |
| "completions/mean_terminated_length": 940.5294189453125, | |
| "completions/min_length": 442.0, | |
| "completions/min_terminated_length": 442.0, | |
| "epoch": 0.20685714285714285, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1534472554922104, | |
| "learning_rate": 1.2713832064634125e-07, | |
| "loss": 0.0795, | |
| "num_tokens": 18488617.0, | |
| "reward": 0.6842443346977234, | |
| "reward_std": 0.6290575265884399, | |
| "rewards/cosine_scaled_reward/mean": -0.0563153512775898, | |
| "rewards/cosine_scaled_reward/std": 0.5009898543357849, | |
| "rewards/format_reward/mean": 0.796875, | |
| "rewards/format_reward/std": 0.40550529956817627, | |
| "step": 181 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.265625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2037.0, | |
| "completions/mean_length": 1240.1875, | |
| "completions/mean_terminated_length": 948.0, | |
| "completions/min_length": 264.0, | |
| "completions/min_terminated_length": 264.0, | |
| "epoch": 0.208, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13502204418182373, | |
| "learning_rate": 1.2451664098030743e-07, | |
| "loss": 0.0042, | |
| "num_tokens": 18577781.0, | |
| "reward": 0.5206961631774902, | |
| "reward_std": 0.6657352447509766, | |
| "rewards/cosine_scaled_reward/mean": -0.1380893886089325, | |
| "rewards/cosine_scaled_reward/std": 0.3631601333618164, | |
| "rewards/format_reward/mean": 0.796875, | |
| "rewards/format_reward/std": 0.40550529956817627, | |
| "step": 182 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1898.0, | |
| "completions/mean_length": 1029.140625, | |
| "completions/mean_terminated_length": 942.796630859375, | |
| "completions/min_length": 459.0, | |
| "completions/min_terminated_length": 459.0, | |
| "epoch": 0.20914285714285713, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12827463448047638, | |
| "learning_rate": 1.220245676671809e-07, | |
| "loss": 0.0038, | |
| "num_tokens": 18654262.0, | |
| "reward": 1.1728923320770264, | |
| "reward_std": 0.6444723010063171, | |
| "rewards/cosine_scaled_reward/mean": 0.08644616603851318, | |
| "rewards/cosine_scaled_reward/std": 0.49451789259910583, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 183 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.171875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2041.0, | |
| "completions/mean_length": 1162.984375, | |
| "completions/mean_terminated_length": 979.3018798828125, | |
| "completions/min_length": 100.0, | |
| "completions/min_terminated_length": 100.0, | |
| "epoch": 0.2102857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1496909260749817, | |
| "learning_rate": 1.1966285981663407e-07, | |
| "loss": 0.0474, | |
| "num_tokens": 18740045.0, | |
| "reward": 0.738210916519165, | |
| "reward_std": 0.540239155292511, | |
| "rewards/cosine_scaled_reward/mean": -0.07620704174041748, | |
| "rewards/cosine_scaled_reward/std": 0.37467995285987854, | |
| "rewards/format_reward/mean": 0.890625, | |
| "rewards/format_reward/std": 0.3145764470100403, | |
| "step": 184 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1991.0, | |
| "completions/mean_length": 1057.15625, | |
| "completions/mean_terminated_length": 991.1000366210938, | |
| "completions/min_length": 290.0, | |
| "completions/min_terminated_length": 290.0, | |
| "epoch": 0.21142857142857144, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13016612827777863, | |
| "learning_rate": 1.1743223682775649e-07, | |
| "loss": 0.0175, | |
| "num_tokens": 18817887.0, | |
| "reward": 0.5949590802192688, | |
| "reward_std": 0.6293296813964844, | |
| "rewards/cosine_scaled_reward/mean": -0.1868954598903656, | |
| "rewards/cosine_scaled_reward/std": 0.4017287492752075, | |
| "rewards/format_reward/mean": 0.96875, | |
| "rewards/format_reward/std": 0.17536810040473938, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1410.0, | |
| "completions/mean_length": 1108.890625, | |
| "completions/mean_terminated_length": 974.732177734375, | |
| "completions/min_length": 354.0, | |
| "completions/min_terminated_length": 354.0, | |
| "epoch": 0.21257142857142858, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.14262138307094574, | |
| "learning_rate": 1.1533337816991931e-07, | |
| "loss": 0.0015, | |
| "num_tokens": 18899552.0, | |
| "reward": 0.6897875070571899, | |
| "reward_std": 0.5968158841133118, | |
| "rewards/cosine_scaled_reward/mean": -0.08479373157024384, | |
| "rewards/cosine_scaled_reward/std": 0.4098339378833771, | |
| "rewards/format_reward/mean": 0.859375, | |
| "rewards/format_reward/std": 0.3503824472427368, | |
| "step": 186 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2022.0, | |
| "completions/mean_length": 1067.40625, | |
| "completions/mean_terminated_length": 1002.0333862304688, | |
| "completions/min_length": 408.0, | |
| "completions/min_terminated_length": 408.0, | |
| "epoch": 0.21371428571428572, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.16053920984268188, | |
| "learning_rate": 1.1336692317580158e-07, | |
| "loss": 0.0076, | |
| "num_tokens": 18978290.0, | |
| "reward": 0.7425481677055359, | |
| "reward_std": 0.5081203579902649, | |
| "rewards/cosine_scaled_reward/mean": -0.12091340124607086, | |
| "rewards/cosine_scaled_reward/std": 0.43119898438453674, | |
| "rewards/format_reward/mean": 0.984375, | |
| "rewards/format_reward/std": 0.125, | |
| "step": 187 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1906.0, | |
| "completions/mean_length": 1119.453125, | |
| "completions/mean_terminated_length": 1005.4210815429688, | |
| "completions/min_length": 563.0, | |
| "completions/min_terminated_length": 563.0, | |
| "epoch": 0.21485714285714286, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1582225263118744, | |
| "learning_rate": 1.1153347084664419e-07, | |
| "loss": 0.0305, | |
| "num_tokens": 19061735.0, | |
| "reward": 0.5219712257385254, | |
| "reward_std": 0.5593596696853638, | |
| "rewards/cosine_scaled_reward/mean": -0.1999519169330597, | |
| "rewards/cosine_scaled_reward/std": 0.32119491696357727, | |
| "rewards/format_reward/mean": 0.921875, | |
| "rewards/format_reward/std": 0.27048972249031067, | |
| "step": 188 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1796.0, | |
| "completions/mean_length": 872.5625, | |
| "completions/mean_terminated_length": 814.7540283203125, | |
| "completions/min_length": 287.0, | |
| "completions/min_terminated_length": 287.0, | |
| "epoch": 0.216, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13409367203712463, | |
| "learning_rate": 1.0983357966978745e-07, | |
| "loss": 0.0456, | |
| "num_tokens": 19126867.0, | |
| "reward": 0.7454105615615845, | |
| "reward_std": 0.605484127998352, | |
| "rewards/cosine_scaled_reward/mean": -0.11166971176862717, | |
| "rewards/cosine_scaled_reward/std": 0.4444236159324646, | |
| "rewards/format_reward/mean": 0.96875, | |
| "rewards/format_reward/std": 0.17536810040473938, | |
| "step": 189 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2046.0, | |
| "completions/mean_length": 1080.25, | |
| "completions/mean_terminated_length": 998.2373046875, | |
| "completions/min_length": 423.0, | |
| "completions/min_terminated_length": 423.0, | |
| "epoch": 0.21714285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12813109159469604, | |
| "learning_rate": 1.0826776744855121e-07, | |
| "loss": -0.0287, | |
| "num_tokens": 19205771.0, | |
| "reward": 1.0522401332855225, | |
| "reward_std": 0.5290870070457458, | |
| "rewards/cosine_scaled_reward/mean": 0.026120096445083618, | |
| "rewards/cosine_scaled_reward/std": 0.4774343967437744, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1906.0, | |
| "completions/mean_length": 952.4375, | |
| "completions/mean_terminated_length": 898.5573120117188, | |
| "completions/min_length": 284.0, | |
| "completions/min_terminated_length": 284.0, | |
| "epoch": 0.21828571428571428, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13046617805957794, | |
| "learning_rate": 1.068365111445064e-07, | |
| "loss": 0.0316, | |
| "num_tokens": 19277015.0, | |
| "reward": 1.01558518409729, | |
| "reward_std": 0.6485674381256104, | |
| "rewards/cosine_scaled_reward/mean": 0.023417577147483826, | |
| "rewards/cosine_scaled_reward/std": 0.4800501763820648, | |
| "rewards/format_reward/mean": 0.96875, | |
| "rewards/format_reward/std": 0.17536810040473938, | |
| "step": 191 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.328125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1859.0, | |
| "completions/mean_length": 1371.515625, | |
| "completions/mean_terminated_length": 1041.1395263671875, | |
| "completions/min_length": 382.0, | |
| "completions/min_terminated_length": 382.0, | |
| "epoch": 0.21942857142857142, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.14968900382518768, | |
| "learning_rate": 1.0554024673218806e-07, | |
| "loss": 0.0953, | |
| "num_tokens": 19376088.0, | |
| "reward": 0.3939949572086334, | |
| "reward_std": 0.577399730682373, | |
| "rewards/cosine_scaled_reward/mean": -0.19362753629684448, | |
| "rewards/cosine_scaled_reward/std": 0.30269211530685425, | |
| "rewards/format_reward/mean": 0.78125, | |
| "rewards/format_reward/std": 0.4166666865348816, | |
| "step": 192 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1859.0, | |
| "completions/mean_length": 1186.921875, | |
| "completions/mean_terminated_length": 945.8200073242188, | |
| "completions/min_length": 493.0, | |
| "completions/min_terminated_length": 493.0, | |
| "epoch": 0.22057142857142858, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.16263115406036377, | |
| "learning_rate": 1.0437936906629334e-07, | |
| "loss": 0.0846, | |
| "num_tokens": 19463195.0, | |
| "reward": 0.6804449558258057, | |
| "reward_std": 0.794600248336792, | |
| "rewards/cosine_scaled_reward/mean": -0.058215029537677765, | |
| "rewards/cosine_scaled_reward/std": 0.45185160636901855, | |
| "rewards/format_reward/mean": 0.796875, | |
| "rewards/format_reward/std": 0.40550529956817627, | |
| "step": 193 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.34375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1870.0, | |
| "completions/mean_length": 1393.921875, | |
| "completions/mean_terminated_length": 1051.3095703125, | |
| "completions/min_length": 483.0, | |
| "completions/min_terminated_length": 483.0, | |
| "epoch": 0.22171428571428572, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1629265695810318, | |
| "learning_rate": 1.0335423176140511e-07, | |
| "loss": -0.0049, | |
| "num_tokens": 19563766.0, | |
| "reward": 0.7986553907394409, | |
| "reward_std": 0.874267578125, | |
| "rewards/cosine_scaled_reward/mean": 0.03214021399617195, | |
| "rewards/cosine_scaled_reward/std": 0.47694674134254456, | |
| "rewards/format_reward/mean": 0.734375, | |
| "rewards/format_reward/std": 0.44515693187713623, | |
| "step": 194 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1993.0, | |
| "completions/mean_length": 1115.015625, | |
| "completions/mean_terminated_length": 1035.9490966796875, | |
| "completions/min_length": 458.0, | |
| "completions/min_terminated_length": 458.0, | |
| "epoch": 0.22285714285714286, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.139028862118721, | |
| "learning_rate": 1.0246514708427701e-07, | |
| "loss": -0.0035, | |
| "num_tokens": 19646271.0, | |
| "reward": 0.7042949795722961, | |
| "reward_std": 0.5829262733459473, | |
| "rewards/cosine_scaled_reward/mean": -0.10879002511501312, | |
| "rewards/cosine_scaled_reward/std": 0.38450202345848083, | |
| "rewards/format_reward/mean": 0.921875, | |
| "rewards/format_reward/std": 0.27048972249031067, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1984.0, | |
| "completions/mean_length": 1338.078125, | |
| "completions/mean_terminated_length": 1139.2999267578125, | |
| "completions/min_length": 390.0, | |
| "completions/min_terminated_length": 390.0, | |
| "epoch": 0.224, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.17117220163345337, | |
| "learning_rate": 1.017123858587145e-07, | |
| "loss": 0.0298, | |
| "num_tokens": 19743500.0, | |
| "reward": 0.3932352066040039, | |
| "reward_std": 0.6573115587234497, | |
| "rewards/cosine_scaled_reward/mean": -0.20181991159915924, | |
| "rewards/cosine_scaled_reward/std": 0.3404424488544464, | |
| "rewards/format_reward/mean": 0.796875, | |
| "rewards/format_reward/std": 0.40550529956817627, | |
| "step": 196 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1562.0, | |
| "completions/mean_length": 872.078125, | |
| "completions/mean_terminated_length": 853.4127197265625, | |
| "completions/min_length": 416.0, | |
| "completions/min_terminated_length": 416.0, | |
| "epoch": 0.22514285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12287131696939468, | |
| "learning_rate": 1.0109617738307911e-07, | |
| "loss": -0.0059, | |
| "num_tokens": 19809681.0, | |
| "reward": 1.2395715713500977, | |
| "reward_std": 0.6934706568717957, | |
| "rewards/cosine_scaled_reward/mean": 0.11978581547737122, | |
| "rewards/cosine_scaled_reward/std": 0.5448962450027466, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 197 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2024.0, | |
| "completions/mean_length": 1120.28125, | |
| "completions/mean_terminated_length": 987.7500610351562, | |
| "completions/min_length": 320.0, | |
| "completions/min_terminated_length": 320.0, | |
| "epoch": 0.22628571428571428, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.15039725601673126, | |
| "learning_rate": 1.0061670936044178e-07, | |
| "loss": 0.0362, | |
| "num_tokens": 19892883.0, | |
| "reward": 1.0277272462844849, | |
| "reward_std": 0.74528968334198, | |
| "rewards/cosine_scaled_reward/mean": 0.021676115691661835, | |
| "rewards/cosine_scaled_reward/std": 0.5368949174880981, | |
| "rewards/format_reward/mean": 0.984375, | |
| "rewards/format_reward/std": 0.125, | |
| "step": 198 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1958.0, | |
| "completions/mean_length": 1294.71875, | |
| "completions/mean_terminated_length": 1187.107177734375, | |
| "completions/min_length": 577.0, | |
| "completions/min_terminated_length": 577.0, | |
| "epoch": 0.22742857142857142, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.15263773500919342, | |
| "learning_rate": 1.002741278414069e-07, | |
| "loss": 0.0084, | |
| "num_tokens": 19987249.0, | |
| "reward": 0.6131043434143066, | |
| "reward_std": 0.7018917798995972, | |
| "rewards/cosine_scaled_reward/mean": -0.1543852984905243, | |
| "rewards/cosine_scaled_reward/std": 0.35418131947517395, | |
| "rewards/format_reward/mean": 0.921875, | |
| "rewards/format_reward/std": 0.27048972249031067, | |
| "step": 199 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.171875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1991.0, | |
| "completions/mean_length": 1115.796875, | |
| "completions/mean_terminated_length": 922.3207397460938, | |
| "completions/min_length": 509.0, | |
| "completions/min_terminated_length": 509.0, | |
| "epoch": 0.22857142857142856, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.16787739098072052, | |
| "learning_rate": 1.0006853717962393e-07, | |
| "loss": 0.0407, | |
| "num_tokens": 20068780.0, | |
| "reward": 0.9602231979370117, | |
| "reward_std": 0.8039394617080688, | |
| "rewards/cosine_scaled_reward/mean": 0.05823659524321556, | |
| "rewards/cosine_scaled_reward/std": 0.5022075772285461, | |
| "rewards/format_reward/mean": 0.84375, | |
| "rewards/format_reward/std": 0.36596253514289856, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.22857142857142856, | |
| "step": 200, | |
| "total_flos": 0.0, | |
| "train_loss": 0.03711814505979419, | |
| "train_runtime": 10340.5912, | |
| "train_samples_per_second": 1.238, | |
| "train_steps_per_second": 0.019 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 200, | |
| "num_input_tokens_seen": 20068780, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |