{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.11048059056897504, "eval_steps": 500, "global_step": 1100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 459.2, "completions/max_terminated_length": 272.7, "completions/mean_length": 76.24375, "completions/mean_terminated_length": 64.11458358764648, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.0010043690051725004, "frac_reward_zero_std": 0.875, "grad_norm": 0.0, "learning_rate": 1.1999999999999998e-08, "loss": 0.0208, "num_tokens": 108131.0, "reward": 1.2312812566757203, "reward_std": 0.05931956073036417, "rewards/combined_reward/mean": 1.2312812566757203, "rewards/combined_reward/std": 0.4361365109682083, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01875, "completions/max_length": 330.9, "completions/max_terminated_length": 147.6, "completions/mean_length": 75.425, "completions/mean_terminated_length": 61.425418090820315, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.002008738010345001, "frac_reward_zero_std": 0.925, "grad_norm": 0.0, "learning_rate": 2.5333333333333335e-08, "loss": 0.0279, "num_tokens": 233579.0, "reward": 1.3428645849227905, "reward_std": 0.029872814007103444, "rewards/combined_reward/mean": 1.3428645849227905, "rewards/combined_reward/std": 0.3860916443169117, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 110.6, "completions/max_terminated_length": 110.6, "completions/mean_length": 51.04375, "completions/mean_terminated_length": 51.04375, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.003013107015517501, "frac_reward_zero_std": 0.875, "grad_norm": 3.3646392822265625, "learning_rate": 3.866666666666666e-08, "loss": -0.0132, "num_tokens": 352258.0, "reward": 1.323312509059906, "reward_std": 0.05337500050663948, "rewards/combined_reward/mean": 1.323312509059906, "rewards/combined_reward/std": 0.39539981335401536, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01875, "completions/max_length": 307.1, "completions/max_terminated_length": 211.8, "completions/mean_length": 95.83125, "completions/mean_terminated_length": 60.24375, "completions/min_length": 11.5, "completions/min_terminated_length": 11.5, "epoch": 0.004017476020690002, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 5.2e-08, "loss": 0.0143, "num_tokens": 485155.0, "reward": 1.2628658890724183, "reward_std": 0.03280075653456151, "rewards/combined_reward/mean": 1.2628658890724183, "rewards/combined_reward/std": 0.4110621690750122, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.5, "completions/max_terminated_length": 140.5, "completions/mean_length": 61.7875, "completions/mean_terminated_length": 61.7875, "completions/min_length": 23.7, "completions/min_terminated_length": 23.7, "epoch": 0.005021845025862502, "frac_reward_zero_std": 0.95, "grad_norm": 1.8525996208190918, "learning_rate": 6.533333333333332e-08, "loss": 0.0147, "num_tokens": 607629.0, "reward": 1.3795833349227906, "reward_std": 0.00583496168255806, "rewards/combined_reward/mean": 1.3795833349227906, "rewards/combined_reward/std": 0.30837071537971494, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 338.5, "completions/max_terminated_length": 238.4, "completions/mean_length": 102.60625, "completions/mean_terminated_length": 91.32791748046876, "completions/min_length": 21.6, "completions/min_terminated_length": 21.6, "epoch": 0.006026214031035002, "frac_reward_zero_std": 0.9, "grad_norm": 2.992983818054199, "learning_rate": 7.866666666666666e-08, "loss": 0.0045, "num_tokens": 728802.0, "reward": 1.3164896011352538, "reward_std": 0.02619450243655592, "rewards/combined_reward/mean": 1.3164896011352538, "rewards/combined_reward/std": 0.3474510669708252, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 62.81875, "completions/mean_terminated_length": 61.769583511352536, "completions/min_length": 20.2, "completions/min_terminated_length": 20.2, "epoch": 0.007030583036207502, "frac_reward_zero_std": 0.95, "grad_norm": 0.0, "learning_rate": 9.2e-08, "loss": 0.0098, "num_tokens": 836341.0, "reward": 1.355798614025116, "reward_std": 0.004375000763684511, "rewards/combined_reward/mean": 1.355798614025116, "rewards/combined_reward/std": 0.29267608374357224, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 110.2, "completions/max_terminated_length": 110.2, "completions/mean_length": 54.3375, "completions/mean_terminated_length": 54.3375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.008034952041380003, "frac_reward_zero_std": 0.925, "grad_norm": 13.820528984069824, "learning_rate": 1.0533333333333332e-07, "loss": 0.0119, "num_tokens": 945703.0, "reward": 1.4564843893051147, "reward_std": 0.003906251955777406, "rewards/combined_reward/mean": 1.4564843893051147, "rewards/combined_reward/std": 0.1776508768554777, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.9, "completions/max_terminated_length": 201.9, "completions/mean_length": 70.6125, "completions/mean_terminated_length": 70.6125, "completions/min_length": 21.5, "completions/min_terminated_length": 21.5, "epoch": 0.009039321046552503, "frac_reward_zero_std": 0.925, "grad_norm": 0.0, "learning_rate": 1.1866666666666667e-07, "loss": 0.0195, "num_tokens": 1062961.0, "reward": 1.3238854348659514, "reward_std": 0.005562501423992216, "rewards/combined_reward/mean": 1.3238854348659514, "rewards/combined_reward/std": 0.22054901346564293, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 113.7, "completions/max_terminated_length": 113.7, "completions/mean_length": 60.275, "completions/mean_terminated_length": 60.275, "completions/min_length": 24.1, "completions/min_terminated_length": 24.1, "epoch": 0.010043690051725004, "frac_reward_zero_std": 0.875, "grad_norm": 0.0, "learning_rate": 1.32e-07, "loss": 0.0058, "num_tokens": 1175365.0, "reward": 1.4070937514305115, "reward_std": 0.034517763555049895, "rewards/combined_reward/mean": 1.4070937514305115, "rewards/combined_reward/std": 0.26661672741174697, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 158.5, "completions/max_terminated_length": 158.5, "completions/mean_length": 65.46875, "completions/mean_terminated_length": 64.41750030517578, "completions/min_length": 19.1, "completions/min_terminated_length": 19.1, "epoch": 0.011048059056897505, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.4533333333333334e-07, "loss": 0.0019, "num_tokens": 1288772.0, "reward": 1.2793750286102294, "reward_std": 0.0024999996647238733, "rewards/combined_reward/mean": 1.2793750286102294, "rewards/combined_reward/std": 0.31086390763521193, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 322.2, "completions/max_terminated_length": 134.1, "completions/mean_length": 77.01875, "completions/mean_terminated_length": 64.77458343505859, "completions/min_length": 20.7, "completions/min_terminated_length": 20.7, "epoch": 0.012052428062070004, "frac_reward_zero_std": 0.9, "grad_norm": 8.276171684265137, "learning_rate": 1.5866666666666666e-07, "loss": 0.0134, "num_tokens": 1403035.0, "reward": 1.3504362106323242, "reward_std": 0.030459362699184568, "rewards/combined_reward/mean": 1.3504362106323242, "rewards/combined_reward/std": 0.309928272664547, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 102.1, "completions/max_terminated_length": 102.1, "completions/mean_length": 61.0625, "completions/mean_terminated_length": 61.0625, "completions/min_length": 31.5, "completions/min_terminated_length": 31.5, "epoch": 0.013056797067242505, "frac_reward_zero_std": 0.95, "grad_norm": 0.0, "learning_rate": 1.7199999999999998e-07, "loss": -0.0027, "num_tokens": 1524697.0, "reward": 1.361527794599533, "reward_std": 0.008749999664723873, "rewards/combined_reward/mean": 1.361527794599533, "rewards/combined_reward/std": 0.2736371263861656, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.4, "completions/max_terminated_length": 123.4, "completions/mean_length": 58.05, "completions/mean_terminated_length": 58.05, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.014061166072415004, "frac_reward_zero_std": 0.95, "grad_norm": 3.9411776065826416, "learning_rate": 1.8533333333333333e-07, "loss": 0.0062, "num_tokens": 1622389.0, "reward": 1.3123229265213012, "reward_std": 0.03212499991059303, "rewards/combined_reward/mean": 1.3123229265213012, "rewards/combined_reward/std": 0.35334871551021935, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 333.1, "completions/max_terminated_length": 135.6, "completions/mean_length": 111.125, "completions/mean_terminated_length": 61.191666793823245, "completions/min_length": 21.8, "completions/min_terminated_length": 21.8, "epoch": 0.015065535077587506, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.9866666666666665e-07, "loss": 0.0039, "num_tokens": 1734901.0, "reward": 1.2678720355033875, "reward_std": 0.0006250014062970877, "rewards/combined_reward/mean": 1.2678720355033875, "rewards/combined_reward/std": 0.2531693406403065, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.7, "completions/max_terminated_length": 85.7, "completions/mean_length": 48.81875, "completions/mean_terminated_length": 48.81875, "completions/min_length": 17.9, "completions/min_terminated_length": 17.9, "epoch": 0.016069904082760007, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.9999507890797406e-07, "loss": 0.0046, "num_tokens": 1847536.0, "reward": 1.345395851135254, "reward_std": 0.0016666671261191368, "rewards/combined_reward/mean": 1.345395851135254, "rewards/combined_reward/std": 0.29257251909002663, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 467.7, "completions/max_terminated_length": 277.7, "completions/mean_length": 144.23125, "completions/mean_terminated_length": 95.81041717529297, "completions/min_length": 28.9, "completions/min_terminated_length": 28.9, "epoch": 0.017074273087932506, "frac_reward_zero_std": 0.95, "grad_norm": 0.0, "learning_rate": 1.9997806834748455e-07, "loss": -0.0018, "num_tokens": 1970837.0, "reward": 1.3027083039283753, "reward_std": 0.004424501396715641, "rewards/combined_reward/mean": 1.3027083039283753, "rewards/combined_reward/std": 0.4294335596263409, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 99.5, "completions/max_terminated_length": 99.5, "completions/mean_length": 50.44375, "completions/mean_terminated_length": 50.44375, "completions/min_length": 14.6, "completions/min_terminated_length": 14.6, "epoch": 0.018078642093105005, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.9994890963073946e-07, "loss": 0.0059, "num_tokens": 2088820.0, "reward": 1.2765364408493043, "reward_std": 0.00015624959487468005, "rewards/combined_reward/mean": 1.2765364408493043, "rewards/combined_reward/std": 0.3481216669082642, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 67.0, "completions/mean_terminated_length": 67.0, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.019083011098277508, "frac_reward_zero_std": 0.95, "grad_norm": 0.0, "learning_rate": 1.9990760630076236e-07, "loss": -0.0197, "num_tokens": 2217116.0, "reward": 1.3771250247955322, "reward_std": 0.001916667865589261, "rewards/combined_reward/mean": 1.3771250247955322, "rewards/combined_reward/std": 0.29997652024030685, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 99.2, "completions/max_terminated_length": 99.2, "completions/mean_length": 41.91875, "completions/mean_terminated_length": 41.91875, "completions/min_length": 12.8, "completions/min_terminated_length": 12.8, "epoch": 0.020087380103450007, "frac_reward_zero_std": 0.95, "grad_norm": 3.989150047302246, "learning_rate": 1.99854163376247e-07, "loss": 0.0011, "num_tokens": 2329863.0, "reward": 1.1117187559604644, "reward_std": 0.02916821506805718, "rewards/combined_reward/mean": 1.1117187559604644, "rewards/combined_reward/std": 0.37413454949855807, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 408.0, "completions/max_terminated_length": 220.7, "completions/mean_length": 133.575, "completions/mean_terminated_length": 84.2875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.021091749108622507, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 1.9978858735094754e-07, "loss": 0.0285, "num_tokens": 2457743.0, "reward": 1.3693958520889282, "reward_std": 0.004563984216656536, "rewards/combined_reward/mean": 1.3693958520889282, "rewards/combined_reward/std": 0.33579447590745987, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.4, "completions/max_terminated_length": 115.4, "completions/mean_length": 60.24375, "completions/mean_terminated_length": 60.24375, "completions/min_length": 20.8, "completions/min_terminated_length": 20.8, "epoch": 0.02209611811379501, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9971088619288948e-07, "loss": 0.0, "num_tokens": 2581282.0, "reward": 1.284375011920929, "reward_std": 0.0, "rewards/combined_reward/mean": 1.284375011920929, "rewards/combined_reward/std": 0.3291483834385872, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 110.9, "completions/max_terminated_length": 110.9, "completions/mean_length": 52.08125, "completions/mean_terminated_length": 51.73625030517578, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.02310048711896751, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.996210693434016e-07, "loss": 0.0, "num_tokens": 2716695.0, "reward": 1.3078229188919068, "reward_std": 0.0, "rewards/combined_reward/mean": 1.3078229188919068, "rewards/combined_reward/std": 0.3146174341440201, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01875, "completions/max_length": 316.8, "completions/max_terminated_length": 296.7, "completions/mean_length": 106.325, "completions/mean_terminated_length": 71.55961608886719, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.024104856124140008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9951914771596858e-07, "loss": 0.0, "num_tokens": 2820347.0, "reward": 1.2994583308696748, "reward_std": 0.0, "rewards/combined_reward/mean": 1.2994583308696748, "rewards/combined_reward/std": 0.35011555850505827, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 105.0, "completions/max_terminated_length": 105.0, "completions/mean_length": 58.80625, "completions/mean_terminated_length": 57.67589340209961, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.02510922512931251, "frac_reward_zero_std": 0.925, "grad_norm": 0.0, "learning_rate": 1.9940513369490513e-07, "loss": 0.0119, "num_tokens": 2937640.0, "reward": 1.2942708253860473, "reward_std": 0.0020473659737035633, "rewards/combined_reward/mean": 1.2942708253860473, "rewards/combined_reward/std": 0.34473495446145536, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.2, "completions/max_terminated_length": 136.2, "completions/mean_length": 68.56875, "completions/mean_terminated_length": 68.56875, "completions/min_length": 27.3, "completions/min_terminated_length": 27.3, "epoch": 0.02611359413448501, "frac_reward_zero_std": 0.925, "grad_norm": 0.0, "learning_rate": 1.9927904113385096e-07, "loss": 0.0134, "num_tokens": 3051799.0, "reward": 1.3380468726158141, "reward_std": 0.00270459558814764, "rewards/combined_reward/mean": 1.3380468726158141, "rewards/combined_reward/std": 0.28382683396339414, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.2, "completions/max_terminated_length": 172.2, "completions/mean_length": 72.875, "completions/mean_terminated_length": 72.875, "completions/min_length": 26.4, "completions/min_terminated_length": 26.4, "epoch": 0.02711796313965751, "frac_reward_zero_std": 0.95, "grad_norm": 0.0, "learning_rate": 1.9914088535408765e-07, "loss": -0.0019, "num_tokens": 3164803.0, "reward": 1.4464478969573975, "reward_std": 0.0021736113354563712, "rewards/combined_reward/mean": 1.4464478969573975, "rewards/combined_reward/std": 0.19929498732089995, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.5, "completions/max_terminated_length": 140.5, "completions/mean_length": 59.38125, "completions/mean_terminated_length": 59.38125, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.02812233214483001, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.9899068314267685e-07, "loss": 0.001, "num_tokens": 3280220.0, "reward": 1.3454687356948853, "reward_std": 0.004999999329447747, "rewards/combined_reward/mean": 1.3454687356948853, "rewards/combined_reward/std": 0.31286893486976625, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 94.7, "completions/max_terminated_length": 94.7, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.029126701150002512, "frac_reward_zero_std": 0.95, "grad_norm": 0.0, "learning_rate": 1.9882845275042067e-07, "loss": 0.0065, "num_tokens": 3385228.0, "reward": 1.4142057299613953, "reward_std": 0.00044270951766520736, "rewards/combined_reward/mean": 1.4142057299613953, "rewards/combined_reward/std": 0.20944447480142117, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.3, "completions/max_terminated_length": 173.3, "completions/mean_length": 76.13125, "completions/mean_terminated_length": 76.13125, "completions/min_length": 23.4, "completions/min_terminated_length": 23.4, "epoch": 0.03013107015517501, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.9865421388964382e-07, "loss": -0.0017, "num_tokens": 3496189.0, "reward": 1.3910624980926514, "reward_std": 0.0021650632843375206, "rewards/combined_reward/mean": 1.3910624980926514, "rewards/combined_reward/std": 0.28597628474235537, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.4, "completions/max_terminated_length": 315.4, "completions/mean_length": 99.93125, "completions/mean_terminated_length": 99.93125, "completions/min_length": 18.4, "completions/min_terminated_length": 18.4, "epoch": 0.03113543916034751, "frac_reward_zero_std": 0.9, "grad_norm": 3.8047702312469482, "learning_rate": 1.9846798773179865e-07, "loss": 0.0118, "num_tokens": 3602282.0, "reward": 1.2963680744171142, "reward_std": 0.01609460562467575, "rewards/combined_reward/mean": 1.2963680744171142, "rewards/combined_reward/std": 0.3926819786429405, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 103.8, "completions/max_terminated_length": 103.8, "completions/mean_length": 52.2875, "completions/mean_terminated_length": 52.2875, "completions/min_length": 20.6, "completions/min_terminated_length": 20.6, "epoch": 0.03213980816552001, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.9826979690489249e-07, "loss": 0.0014, "num_tokens": 3717904.0, "reward": 1.403697907924652, "reward_std": 0.0003125001909211278, "rewards/combined_reward/mean": 1.403697907924652, "rewards/combined_reward/std": 0.24410614371299744, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.8, "completions/max_terminated_length": 79.8, "completions/mean_length": 44.49375, "completions/mean_terminated_length": 44.49375, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.03314417717069251, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 1.9805966549073822e-07, "loss": 0.0057, "num_tokens": 3825867.0, "reward": 1.3135937452316284, "reward_std": 0.007812501117587089, "rewards/combined_reward/mean": 1.3135937452316284, "rewards/combined_reward/std": 0.3756252348423004, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.9, "completions/max_terminated_length": 117.9, "completions/mean_length": 54.15, "completions/mean_terminated_length": 54.15, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.03414854617586501, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.9783761902202812e-07, "loss": 0.0067, "num_tokens": 3942087.0, "reward": 1.290208351612091, "reward_std": 0.0010206203907728196, "rewards/combined_reward/mean": 1.290208351612091, "rewards/combined_reward/std": 0.27491325289011004, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.2, "completions/max_terminated_length": 89.2, "completions/mean_length": 45.46875, "completions/mean_terminated_length": 45.46875, "completions/min_length": 12.9, "completions/min_terminated_length": 12.9, "epoch": 0.03515291518103751, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9760368447923143e-07, "loss": 0.0, "num_tokens": 4077218.0, "reward": 1.271875011920929, "reward_std": 0.0, "rewards/combined_reward/mean": 1.271875011920929, "rewards/combined_reward/std": 0.3903637401759624, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.1, "completions/max_terminated_length": 86.1, "completions/mean_length": 47.9125, "completions/mean_terminated_length": 47.9125, "completions/min_length": 19.1, "completions/min_terminated_length": 19.1, "epoch": 0.03615728418621001, "frac_reward_zero_std": 0.925, "grad_norm": 5.847682952880859, "learning_rate": 1.9735789028731602e-07, "loss": -0.0023, "num_tokens": 4189144.0, "reward": 1.3238541960716248, "reward_std": 0.03020833432674408, "rewards/combined_reward/mean": 1.3238541960716248, "rewards/combined_reward/std": 0.32445888966321945, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 103.3, "completions/max_terminated_length": 103.3, "completions/mean_length": 55.5, "completions/mean_terminated_length": 55.5, "completions/min_length": 20.6, "completions/min_terminated_length": 20.6, "epoch": 0.03716165319138252, "frac_reward_zero_std": 0.95, "grad_norm": 0.0, "learning_rate": 1.9710026631229448e-07, "loss": 0.0001, "num_tokens": 4294100.0, "reward": 1.3909027934074403, "reward_std": 0.00034722290001809597, "rewards/combined_reward/mean": 1.3909027934074403, "rewards/combined_reward/std": 0.2816110193729401, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 119.4, "completions/max_terminated_length": 119.4, "completions/mean_length": 57.65625, "completions/mean_terminated_length": 57.65625, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.038166022196555016, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.9683084385759522e-07, "loss": -0.0002, "num_tokens": 4400477.0, "reward": 1.333958351612091, "reward_std": 0.0012500007636845113, "rewards/combined_reward/mean": 1.333958351612091, "rewards/combined_reward/std": 0.2801030218601227, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.6, "completions/max_terminated_length": 112.6, "completions/mean_length": 55.225, "completions/mean_terminated_length": 55.225, "completions/min_length": 18.2, "completions/min_terminated_length": 18.2, "epoch": 0.039170391201727515, "frac_reward_zero_std": 0.925, "grad_norm": 4.480510234832764, "learning_rate": 1.9654965566025878e-07, "loss": 0.006, "num_tokens": 4516865.0, "reward": 1.370369803905487, "reward_std": 0.002187502384185791, "rewards/combined_reward/mean": 1.370369803905487, "rewards/combined_reward/std": 0.27093904092907906, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.2, "completions/max_terminated_length": 139.2, "completions/mean_length": 55.54375, "completions/mean_terminated_length": 55.54375, "completions/min_length": 12.2, "completions/min_terminated_length": 12.2, "epoch": 0.040174760206900015, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9625673588696007e-07, "loss": 0.0, "num_tokens": 4634776.0, "reward": 1.2619999647140503, "reward_std": 0.0, "rewards/combined_reward/mean": 1.2619999647140503, "rewards/combined_reward/std": 0.3673270642757416, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.5, "completions/max_terminated_length": 106.5, "completions/mean_length": 52.2875, "completions/mean_terminated_length": 52.2875, "completions/min_length": 13.1, "completions/min_terminated_length": 13.1, "epoch": 0.041179129212072514, "frac_reward_zero_std": 0.95, "grad_norm": 5.624104022979736, "learning_rate": 1.959521201298568e-07, "loss": 0.0061, "num_tokens": 4766894.0, "reward": 1.3308506846427917, "reward_std": 0.003342500701546669, "rewards/combined_reward/mean": 1.3308506846427917, "rewards/combined_reward/std": 0.37019643262028695, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.9, "completions/max_terminated_length": 144.9, "completions/mean_length": 63.63125, "completions/mean_terminated_length": 63.63125, "completions/min_length": 18.3, "completions/min_terminated_length": 18.3, "epoch": 0.042183498217245013, "frac_reward_zero_std": 0.95, "grad_norm": 0.0, "learning_rate": 1.956358454022648e-07, "loss": -0.0011, "num_tokens": 4887883.0, "reward": 1.3249478936195374, "reward_std": 0.016550703253597022, "rewards/combined_reward/mean": 1.3249478936195374, "rewards/combined_reward/std": 0.31248683035373687, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.8, "completions/max_terminated_length": 70.8, "completions/mean_length": 40.03125, "completions/mean_terminated_length": 40.03125, "completions/min_length": 21.5, "completions/min_terminated_length": 21.5, "epoch": 0.04318786722241751, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.9530795013416043e-07, "loss": -0.0062, "num_tokens": 5017432.0, "reward": 1.2040624856948852, "reward_std": 0.003125, "rewards/combined_reward/mean": 1.2040624856948852, "rewards/combined_reward/std": 0.28724531903862954, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 95.9, "completions/max_terminated_length": 95.9, "completions/mean_length": 47.64375, "completions/mean_terminated_length": 46.64416732788086, "completions/min_length": 14.4, "completions/min_terminated_length": 14.4, "epoch": 0.04419223622759002, "frac_reward_zero_std": 0.95, "grad_norm": 0.0, "learning_rate": 1.9496847416751122e-07, "loss": -0.0055, "num_tokens": 5127539.0, "reward": 1.3247395992279052, "reward_std": 0.005520834401249885, "rewards/combined_reward/mean": 1.3247395992279052, "rewards/combined_reward/std": 0.353334778547287, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 101.6, "completions/max_terminated_length": 101.6, "completions/mean_length": 53.95625, "completions/mean_terminated_length": 53.95625, "completions/min_length": 21.6, "completions/min_terminated_length": 21.6, "epoch": 0.04519660523276252, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.9461745875143477e-07, "loss": -0.0013, "num_tokens": 5239592.0, "reward": 1.2362499833106995, "reward_std": 0.0016666660085320473, "rewards/combined_reward/mean": 1.2362499833106995, "rewards/combined_reward/std": 0.33721971064805983, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.9, "completions/max_terminated_length": 156.9, "completions/mean_length": 73.56875, "completions/mean_terminated_length": 73.56875, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.04620097423793502, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.942549465371863e-07, "loss": -0.0051, "num_tokens": 5360759.0, "reward": 1.364300584793091, "reward_std": 0.0033333331346511843, "rewards/combined_reward/mean": 1.364300584793091, "rewards/combined_reward/std": 0.29198225438594816, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.4, "completions/max_terminated_length": 89.4, "completions/mean_length": 49.9, "completions/mean_terminated_length": 49.9, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.04720534324310752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.938809815729766e-07, "loss": 0.0, "num_tokens": 5489735.0, "reward": 1.2914583563804627, "reward_std": 0.0, "rewards/combined_reward/mean": 1.2914583563804627, "rewards/combined_reward/std": 0.32128691375255586, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 116.8, "completions/max_terminated_length": 116.8, "completions/mean_length": 54.26875, "completions/mean_terminated_length": 54.26875, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.048209712248280016, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.9349560929861957e-07, "loss": 0.0036, "num_tokens": 5618126.0, "reward": 1.2964062452316285, "reward_std": 0.0034375011920928953, "rewards/combined_reward/mean": 1.2964062452316285, "rewards/combined_reward/std": 0.3410232897847891, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.5, "completions/max_terminated_length": 138.5, "completions/mean_length": 63.425, "completions/mean_terminated_length": 63.425, "completions/min_length": 17.2, "completions/min_terminated_length": 17.2, "epoch": 0.049214081253452516, "frac_reward_zero_std": 0.975, "grad_norm": 5.859716892242432, "learning_rate": 1.9309887654001093e-07, "loss": -0.0122, "num_tokens": 5732858.0, "reward": 1.3710416555404663, "reward_std": 0.005000000074505806, "rewards/combined_reward/mean": 1.3710416555404663, "rewards/combined_reward/std": 0.2569635409861803, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.1, "completions/max_terminated_length": 71.1, "completions/mean_length": 37.5125, "completions/mean_terminated_length": 37.5125, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.05021845025862502, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9269083150343857e-07, "loss": 0.0, "num_tokens": 5827508.0, "reward": 1.2737499952316285, "reward_std": 0.0, "rewards/combined_reward/mean": 1.2737499952316285, "rewards/combined_reward/std": 0.36351585388183594, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 92.3, "completions/max_terminated_length": 92.3, "completions/mean_length": 49.31875, "completions/mean_terminated_length": 49.31875, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.05122281926379752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9227152376972505e-07, "loss": 0.0, "num_tokens": 5940043.0, "reward": 1.3223958492279053, "reward_std": 0.0, "rewards/combined_reward/mean": 1.3223958492279053, "rewards/combined_reward/std": 0.32680114805698396, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.1, "completions/max_terminated_length": 112.1, "completions/mean_length": 60.84375, "completions/mean_terminated_length": 60.84375, "completions/min_length": 22.7, "completions/min_terminated_length": 22.7, "epoch": 0.05222718826897002, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.91841004288203e-07, "loss": 0.0, "num_tokens": 6061038.0, "reward": 1.3749479293823241, "reward_std": 0.0, "rewards/combined_reward/mean": 1.3749479293823241, "rewards/combined_reward/std": 0.2760587348602712, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 383.1, "completions/max_terminated_length": 211.9, "completions/mean_length": 101.45, "completions/mean_terminated_length": 89.37000045776367, "completions/min_length": 29.4, "completions/min_terminated_length": 29.4, "epoch": 0.05323155727414252, "frac_reward_zero_std": 0.875, "grad_norm": 0.0, "learning_rate": 1.913993253705246e-07, "loss": 0.0182, "num_tokens": 6172502.0, "reward": 1.3482013940811157, "reward_std": 0.004686582600697875, "rewards/combined_reward/mean": 1.3482013940811157, "rewards/combined_reward/std": 0.26615125834941866, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 116.0, "completions/max_terminated_length": 116.0, "completions/mean_length": 61.33125, "completions/mean_terminated_length": 61.33125, "completions/min_length": 26.7, "completions/min_terminated_length": 26.7, "epoch": 0.05423592627931502, "frac_reward_zero_std": 0.95, "grad_norm": 6.519238471984863, "learning_rate": 1.9094654068430515e-07, "loss": -0.014, "num_tokens": 6279539.0, "reward": 1.456402564048767, "reward_std": 0.0006212619598954916, "rewards/combined_reward/mean": 1.456402564048767, "rewards/combined_reward/std": 0.17502975650131702, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 319.8, "completions/max_terminated_length": 302.8, "completions/mean_length": 102.7, "completions/mean_terminated_length": 92.22833557128907, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.05524029528448752, "frac_reward_zero_std": 0.9, "grad_norm": 6.630038738250732, "learning_rate": 1.9048270524660196e-07, "loss": 0.0001, "num_tokens": 6401355.0, "reward": 1.2464791774749755, "reward_std": 0.016750000603497028, "rewards/combined_reward/mean": 1.2464791774749755, "rewards/combined_reward/std": 0.43877428472042085, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 108.4, "completions/max_terminated_length": 108.4, "completions/mean_length": 57.21875, "completions/mean_terminated_length": 57.21875, "completions/min_length": 22.1, "completions/min_terminated_length": 22.1, "epoch": 0.05624466428966002, "frac_reward_zero_std": 0.95, "grad_norm": 4.464468955993652, "learning_rate": 1.9000787541722936e-07, "loss": -0.0008, "num_tokens": 6512806.0, "reward": 1.3637500047683715, "reward_std": 0.0056250004563480616, "rewards/combined_reward/mean": 1.3637500047683715, "rewards/combined_reward/std": 0.25516389338299633, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 296.2, "completions/max_terminated_length": 201.7, "completions/mean_length": 86.75625, "completions/mean_terminated_length": 75.22125091552735, "completions/min_length": 24.9, "completions/min_terminated_length": 24.9, "epoch": 0.057249033294832524, "frac_reward_zero_std": 0.925, "grad_norm": 0.0, "learning_rate": 1.8952210889191065e-07, "loss": -0.0016, "num_tokens": 6619515.0, "reward": 1.3538541674613953, "reward_std": 0.009270833618938924, "rewards/combined_reward/mean": 1.3538541674613953, "rewards/combined_reward/std": 0.35525577939115466, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.1, "completions/max_terminated_length": 104.1, "completions/mean_length": 48.9, "completions/mean_terminated_length": 48.9, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.058253402300005024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.890254646952674e-07, "loss": 0.0, "num_tokens": 6728163.0, "reward": 1.2268749833106996, "reward_std": 0.0, "rewards/combined_reward/mean": 1.2268749833106996, "rewards/combined_reward/std": 0.33372554890811446, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.7, "completions/max_terminated_length": 115.7, "completions/mean_length": 61.34375, "completions/mean_terminated_length": 61.34375, "completions/min_length": 20.3, "completions/min_terminated_length": 20.3, "epoch": 0.05925777130517752, "frac_reward_zero_std": 0.95, "grad_norm": 3.076678991317749, "learning_rate": 1.885180031736477e-07, "loss": -0.0013, "num_tokens": 6845358.0, "reward": 1.3715885639190675, "reward_std": 0.0037068985402584076, "rewards/combined_reward/mean": 1.3715885639190675, "rewards/combined_reward/std": 0.3188589945435524, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.6, "completions/max_terminated_length": 122.6, "completions/mean_length": 55.81875, "completions/mean_terminated_length": 55.81875, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.06026214031035002, "frac_reward_zero_std": 0.95, "grad_norm": 0.0, "learning_rate": 1.879997859877932e-07, "loss": 0.0032, "num_tokens": 6971649.0, "reward": 1.280833327770233, "reward_std": 0.0006132050417363644, "rewards/combined_reward/mean": 1.280833327770233, "rewards/combined_reward/std": 0.338599956035614, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 387.6, "completions/max_terminated_length": 192.1, "completions/mean_length": 122.46875, "completions/mean_terminated_length": 72.42708358764648, "completions/min_length": 23.3, "completions/min_terminated_length": 23.3, "epoch": 0.06126650931552252, "frac_reward_zero_std": 0.925, "grad_norm": 0.0, "learning_rate": 1.8747087610534734e-07, "loss": 0.019, "num_tokens": 7087600.0, "reward": 1.338072907924652, "reward_std": 0.013132144883275031, "rewards/combined_reward/mean": 1.338072907924652, "rewards/combined_reward/std": 0.30777021273970606, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.7, "completions/max_terminated_length": 112.7, "completions/mean_length": 58.44375, "completions/mean_terminated_length": 58.44375, "completions/min_length": 23.9, "completions/min_terminated_length": 23.9, "epoch": 0.06227087832069502, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.8693133779320382e-07, "loss": -0.0031, "num_tokens": 7191467.0, "reward": 1.3348880290985108, "reward_std": 0.007124999910593033, "rewards/combined_reward/mean": 1.3348880290985108, "rewards/combined_reward/std": 0.2751554258167744, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.8, "completions/max_terminated_length": 194.8, "completions/mean_length": 84.76875, "completions/mean_terminated_length": 84.76875, "completions/min_length": 21.4, "completions/min_terminated_length": 21.4, "epoch": 0.06327524732586752, "frac_reward_zero_std": 0.925, "grad_norm": 0.0, "learning_rate": 1.8638123660969793e-07, "loss": -0.0084, "num_tokens": 7304146.0, "reward": 1.3757467865943909, "reward_std": 0.0030034731142222883, "rewards/combined_reward/mean": 1.3757467865943909, "rewards/combined_reward/std": 0.28882216811180117, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 101.9, "completions/max_terminated_length": 101.9, "completions/mean_length": 56.925, "completions/mean_terminated_length": 56.925, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.06427961633104003, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.858206393966405e-07, "loss": 0.0, "num_tokens": 7415006.0, "reward": 1.3215104341506958, "reward_std": 0.0, "rewards/combined_reward/mean": 1.3215104341506958, "rewards/combined_reward/std": 0.33309968262910844, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.5, "completions/max_terminated_length": 106.5, "completions/mean_length": 58.26875, "completions/mean_terminated_length": 58.26875, "completions/min_length": 14.2, "completions/min_terminated_length": 14.2, "epoch": 0.06528398533621252, "frac_reward_zero_std": 0.95, "grad_norm": 0.0, "learning_rate": 1.8524961427119615e-07, "loss": -0.009, "num_tokens": 7546381.0, "reward": 1.3129427313804627, "reward_std": 0.002951054647564888, "rewards/combined_reward/mean": 1.3129427313804627, "rewards/combined_reward/std": 0.3575292468070984, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 46.75625, "completions/mean_terminated_length": 46.75625, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.06628835434138503, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.846682306176065e-07, "loss": 0.0, "num_tokens": 7668158.0, "reward": 1.3184374928474427, "reward_std": 0.0, "rewards/combined_reward/mean": 1.3184374928474427, "rewards/combined_reward/std": 0.35122168958187105, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 101.7, "completions/max_terminated_length": 101.7, "completions/mean_length": 56.3375, "completions/mean_terminated_length": 56.3375, "completions/min_length": 17.9, "completions/min_terminated_length": 17.9, "epoch": 0.06729272334655753, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.8407655907875938e-07, "loss": 0.0006, "num_tokens": 7794644.0, "reward": 1.331454861164093, "reward_std": 0.007124999910593033, "rewards/combined_reward/mean": 1.331454861164093, "rewards/combined_reward/std": 0.3434182394295931, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.7, "completions/max_terminated_length": 135.7, "completions/mean_length": 68.90625, "completions/mean_terminated_length": 68.90625, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.06829709235173002, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 1.8347467154760515e-07, "loss": 0.0079, "num_tokens": 7913933.0, "reward": 1.3356944441795349, "reward_std": 0.0053335148841142654, "rewards/combined_reward/mean": 1.3356944441795349, "rewards/combined_reward/std": 0.3590264985337853, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.9, "completions/max_terminated_length": 265.9, "completions/mean_length": 91.5, "completions/mean_terminated_length": 91.5, "completions/min_length": 24.8, "completions/min_terminated_length": 24.8, "epoch": 0.06930146135690253, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.8286264115842114e-07, "loss": 0.0017, "num_tokens": 8033153.0, "reward": 1.3431249916553498, "reward_std": 0.0044791650027036665, "rewards/combined_reward/mean": 1.3431249916553498, "rewards/combined_reward/std": 0.3242304854094982, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.2, "completions/max_terminated_length": 73.2, "completions/mean_length": 39.55625, "completions/mean_terminated_length": 39.55625, "completions/min_length": 17.7, "completions/min_terminated_length": 17.7, "epoch": 0.07030583036207502, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.8224054227792522e-07, "loss": -0.003, "num_tokens": 8147198.0, "reward": 1.3440885424613953, "reward_std": 0.0002604176523163915, "rewards/combined_reward/mean": 1.3440885424613953, "rewards/combined_reward/std": 0.3006736177019775, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.6, "completions/max_terminated_length": 123.6, "completions/mean_length": 67.76875, "completions/mean_terminated_length": 67.76875, "completions/min_length": 25.8, "completions/min_terminated_length": 25.8, "epoch": 0.07131019936724753, "frac_reward_zero_std": 0.975, "grad_norm": 1.436936616897583, "learning_rate": 1.816084504962396e-07, "loss": 0.0009, "num_tokens": 8248985.0, "reward": 1.459496557712555, "reward_std": 0.002500000596046448, "rewards/combined_reward/mean": 1.459496557712555, "rewards/combined_reward/std": 0.15663873171433806, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.1, "completions/max_terminated_length": 172.1, "completions/mean_length": 76.96875, "completions/mean_terminated_length": 76.96875, "completions/min_length": 24.5, "completions/min_terminated_length": 24.5, "epoch": 0.07231456837242002, "frac_reward_zero_std": 0.95, "grad_norm": 0.0, "learning_rate": 1.8096644261770608e-07, "loss": 0.0179, "num_tokens": 8373128.0, "reward": 1.3943750143051148, "reward_std": 0.005624998733401299, "rewards/combined_reward/mean": 1.3943750143051148, "rewards/combined_reward/std": 0.24296645894646646, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/max_terminated_length": 120.0, "completions/mean_length": 60.15625, "completions/mean_terminated_length": 60.15625, "completions/min_length": 18.8, "completions/min_terminated_length": 18.8, "epoch": 0.07331893737759253, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.8031459665155363e-07, "loss": -0.001, "num_tokens": 8487649.0, "reward": 1.4223046898841858, "reward_std": 0.0001302093267440796, "rewards/combined_reward/mean": 1.4223046898841858, "rewards/combined_reward/std": 0.2848698660731316, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.8, "completions/max_terminated_length": 77.8, "completions/mean_length": 45.84375, "completions/mean_terminated_length": 45.84375, "completions/min_length": 18.4, "completions/min_terminated_length": 18.4, "epoch": 0.07432330638276503, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.796529918024196e-07, "loss": 0.0, "num_tokens": 8603284.0, "reward": 1.37947918176651, "reward_std": 0.0, "rewards/combined_reward/mean": 1.37947918176651, "rewards/combined_reward/std": 0.27231944501399996, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.9, "completions/max_terminated_length": 204.9, "completions/mean_length": 76.9375, "completions/mean_terminated_length": 76.9375, "completions/min_length": 18.3, "completions/min_terminated_length": 18.3, "epoch": 0.07532767538793753, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.7898170846072592e-07, "loss": 0.0009, "num_tokens": 8718758.0, "reward": 1.32010418176651, "reward_std": 0.002500000596046448, "rewards/combined_reward/mean": 1.32010418176651, "rewards/combined_reward/std": 0.34439257588237526, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 64.11875, "completions/mean_terminated_length": 64.11875, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.07633204439311003, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 1.783008281929106e-07, "loss": -0.0051, "num_tokens": 8833993.0, "reward": 1.3178860425949097, "reward_std": 0.016688717156648637, "rewards/combined_reward/mean": 1.3178860425949097, "rewards/combined_reward/std": 0.3388564258813858, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.4, "completions/max_terminated_length": 122.4, "completions/mean_length": 62.99375, "completions/mean_terminated_length": 62.99375, "completions/min_length": 21.2, "completions/min_terminated_length": 21.2, "epoch": 0.07733641339828252, "frac_reward_zero_std": 0.975, "grad_norm": 1.1234172582626343, "learning_rate": 1.7761043373151713e-07, "loss": -0.0046, "num_tokens": 8950896.0, "reward": 1.3376388788223266, "reward_std": 0.00034722290001809597, "rewards/combined_reward/mean": 1.3376388788223266, "rewards/combined_reward/std": 0.34661323949694633, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/max_terminated_length": 111.0, "completions/mean_length": 56.3, "completions/mean_terminated_length": 56.3, "completions/min_length": 20.3, "completions/min_terminated_length": 20.3, "epoch": 0.07834078240345503, "frac_reward_zero_std": 0.95, "grad_norm": 0.0, "learning_rate": 1.7691060896514168e-07, "loss": -0.0003, "num_tokens": 9071600.0, "reward": 1.3996267199516297, "reward_std": 0.002080751396715641, "rewards/combined_reward/mean": 1.3996267199516297, "rewards/combined_reward/std": 0.26108508543111386, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.4, "completions/max_terminated_length": 79.4, "completions/mean_length": 45.76875, "completions/mean_terminated_length": 45.76875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.07934515140862752, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.7620143892823975e-07, "loss": -0.0062, "num_tokens": 9174599.0, "reward": 1.378697919845581, "reward_std": 0.0003125001909211278, "rewards/combined_reward/mean": 1.378697919845581, "rewards/combined_reward/std": 0.2739857309497893, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 98.3, "completions/max_terminated_length": 98.3, "completions/mean_length": 50.98125, "completions/mean_terminated_length": 50.98125, "completions/min_length": 19.2, "completions/min_terminated_length": 19.2, "epoch": 0.08034952041380003, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.7548300979079413e-07, "loss": -0.0008, "num_tokens": 9284796.0, "reward": 1.368190097808838, "reward_std": 0.004609373956918716, "rewards/combined_reward/mean": 1.368190097808838, "rewards/combined_reward/std": 0.25843119765631856, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 41.175, "completions/mean_terminated_length": 41.175, "completions/min_length": 12.8, "completions/min_terminated_length": 12.8, "epoch": 0.08135388941897254, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7475540884784422e-07, "loss": 0.0, "num_tokens": 9398356.0, "reward": 1.2378819465637207, "reward_std": 0.0, "rewards/combined_reward/mean": 1.2378819465637207, "rewards/combined_reward/std": 0.3914600659161806, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 96.3, "completions/max_terminated_length": 96.3, "completions/mean_length": 54.50625, "completions/mean_terminated_length": 54.50625, "completions/min_length": 19.6, "completions/min_terminated_length": 19.6, "epoch": 0.08235825842414503, "frac_reward_zero_std": 0.95, "grad_norm": 0.0, "learning_rate": 1.7401872450887915e-07, "loss": -0.0007, "num_tokens": 9497821.0, "reward": 1.3947187542915345, "reward_std": 0.0015624999767169356, "rewards/combined_reward/mean": 1.3947187542915345, "rewards/combined_reward/std": 0.2990885377395898, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 96.7, "completions/max_terminated_length": 96.7, "completions/mean_length": 49.1875, "completions/mean_terminated_length": 49.1875, "completions/min_length": 17.9, "completions/min_terminated_length": 17.9, "epoch": 0.08336262742931753, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7327304628709528e-07, "loss": 0.0, "num_tokens": 9641355.0, "reward": 1.3011458396911622, "reward_std": 0.0, "rewards/combined_reward/mean": 1.3011458396911622, "rewards/combined_reward/std": 0.2698082665912807, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 99.3, "completions/max_terminated_length": 99.3, "completions/mean_length": 54.9125, "completions/mean_terminated_length": 54.9125, "completions/min_length": 21.7, "completions/min_terminated_length": 21.7, "epoch": 0.08436699643449003, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.7251846478851951e-07, "loss": 0.0083, "num_tokens": 9759969.0, "reward": 1.2925694584846497, "reward_std": 0.0019245008006691933, "rewards/combined_reward/mean": 1.2925694584846497, "rewards/combined_reward/std": 0.26882885694503783, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.1, "completions/max_terminated_length": 168.1, "completions/mean_length": 66.68125, "completions/mean_terminated_length": 66.68125, "completions/min_length": 19.3, "completions/min_terminated_length": 19.3, "epoch": 0.08537136543966253, "frac_reward_zero_std": 0.925, "grad_norm": 6.147635459899902, "learning_rate": 1.7175507170100008e-07, "loss": -0.0077, "num_tokens": 9881310.0, "reward": 1.2720364809036255, "reward_std": 0.011238560592755676, "rewards/combined_reward/mean": 1.2720364809036255, "rewards/combined_reward/std": 0.31835093796253205, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 91.0, "completions/max_terminated_length": 91.0, "completions/mean_length": 47.25, "completions/mean_terminated_length": 47.25, "completions/min_length": 23.2, "completions/min_terminated_length": 23.2, "epoch": 0.08637573444483503, "frac_reward_zero_std": 0.95, "grad_norm": 1.287226676940918, "learning_rate": 1.7098295978306552e-07, "loss": -0.012, "num_tokens": 9981046.0, "reward": 1.322606337070465, "reward_std": 0.0022470591589808463, "rewards/combined_reward/mean": 1.322606337070465, "rewards/combined_reward/std": 0.3106359137222171, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 91.9, "completions/max_terminated_length": 91.9, "completions/mean_length": 46.50625, "completions/mean_terminated_length": 46.50625, "completions/min_length": 13.9, "completions/min_terminated_length": 13.9, "epoch": 0.08738010345000753, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7020222285265395e-07, "loss": 0.0, "num_tokens": 10089371.0, "reward": 1.2643750071525575, "reward_std": 0.0, "rewards/combined_reward/mean": 1.2643750071525575, "rewards/combined_reward/std": 0.4044176399707794, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.1, "completions/max_terminated_length": 217.1, "completions/mean_length": 70.81875, "completions/mean_terminated_length": 70.81875, "completions/min_length": 18.9, "completions/min_terminated_length": 18.9, "epoch": 0.08838447245518004, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.6941295577571328e-07, "loss": 0.0079, "num_tokens": 10197254.0, "reward": 1.309374988079071, "reward_std": 0.002500000596046448, "rewards/combined_reward/mean": 1.309374988079071, "rewards/combined_reward/std": 0.325995758920908, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 94.7, "completions/max_terminated_length": 94.7, "completions/mean_length": 53.04375, "completions/mean_terminated_length": 53.04375, "completions/min_length": 22.5, "completions/min_terminated_length": 22.5, "epoch": 0.08938884146035253, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.686152544546743e-07, "loss": 0.0008, "num_tokens": 10316525.0, "reward": 1.3464062690734864, "reward_std": 0.00416666641831398, "rewards/combined_reward/mean": 1.3464062690734864, "rewards/combined_reward/std": 0.2880703628063202, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 97.0, "completions/max_terminated_length": 97.0, "completions/mean_length": 50.95625, "completions/mean_terminated_length": 50.95625, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.09039321046552504, "frac_reward_zero_std": 0.925, "grad_norm": 0.0, "learning_rate": 1.6780921581679763e-07, "loss": 0.0021, "num_tokens": 10435242.0, "reward": 1.2726041793823242, "reward_std": 0.009523502597585321, "rewards/combined_reward/mean": 1.2726041793823242, "rewards/combined_reward/std": 0.33535852897912266, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.1, "completions/max_terminated_length": 104.1, "completions/mean_length": 57.20625, "completions/mean_terminated_length": 57.20625, "completions/min_length": 19.4, "completions/min_terminated_length": 19.4, "epoch": 0.09139757947069753, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6699493780239649e-07, "loss": 0.0, "num_tokens": 10548043.0, "reward": 1.3535937666893005, "reward_std": 0.0, "rewards/combined_reward/mean": 1.3535937666893005, "rewards/combined_reward/std": 0.33704030215740205, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 107.5, "completions/max_terminated_length": 107.5, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.09240194847587004, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.6617251935293588e-07, "loss": -0.0028, "num_tokens": 10675027.0, "reward": 1.3419270992279053, "reward_std": 0.0015625, "rewards/combined_reward/mean": 1.3419270992279053, "rewards/combined_reward/std": 0.32070667631924155, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.1, "completions/max_terminated_length": 104.1, "completions/mean_length": 58.05625, "completions/mean_terminated_length": 58.05625, "completions/min_length": 25.7, "completions/min_terminated_length": 25.7, "epoch": 0.09340631748104254, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.6534206039901054e-07, "loss": 0.0, "num_tokens": 10805048.0, "reward": 1.4538020730018615, "reward_std": 0.0005208343267440796, "rewards/combined_reward/mean": 1.4538020730018615, "rewards/combined_reward/std": 0.17151957787573338, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.8, "completions/max_terminated_length": 79.8, "completions/mean_length": 39.75, "completions/mean_terminated_length": 39.75, "completions/min_length": 12.6, "completions/min_terminated_length": 12.6, "epoch": 0.09441068648621503, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6450366184820256e-07, "loss": 0.0, "num_tokens": 10906272.0, "reward": 1.258458322286606, "reward_std": 0.0, "rewards/combined_reward/mean": 1.258458322286606, "rewards/combined_reward/std": 0.3260463088750839, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.2, "completions/max_terminated_length": 118.2, "completions/mean_length": 61.65625, "completions/mean_terminated_length": 61.65625, "completions/min_length": 22.5, "completions/min_terminated_length": 22.5, "epoch": 0.09541505549138754, "frac_reward_zero_std": 0.95, "grad_norm": 0.0, "learning_rate": 1.6365742557282017e-07, "loss": 0.0091, "num_tokens": 11023301.0, "reward": 1.3930208325386046, "reward_std": 0.0050495008006691934, "rewards/combined_reward/mean": 1.3930208325386046, "rewards/combined_reward/std": 0.30010328590869906, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 105.4, "completions/max_terminated_length": 105.4, "completions/mean_length": 55.79375, "completions/mean_terminated_length": 55.79375, "completions/min_length": 23.8, "completions/min_terminated_length": 23.8, "epoch": 0.09641942449656003, "frac_reward_zero_std": 0.925, "grad_norm": 0.0, "learning_rate": 1.6280345439751956e-07, "loss": 0.0044, "num_tokens": 11148588.0, "reward": 1.3295885443687439, "reward_std": 0.024523502215743065, "rewards/combined_reward/mean": 1.3295885443687439, "rewards/combined_reward/std": 0.2928910902235657, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.7, "completions/max_terminated_length": 121.7, "completions/mean_length": 57.56875, "completions/mean_terminated_length": 57.56875, "completions/min_length": 14.2, "completions/min_terminated_length": 14.2, "epoch": 0.09742379350173254, "frac_reward_zero_std": 0.925, "grad_norm": 0.0, "learning_rate": 1.6194185208681082e-07, "loss": -0.0043, "num_tokens": 11268271.0, "reward": 1.2413020730018616, "reward_std": 0.005312500335276127, "rewards/combined_reward/mean": 1.2413020730018616, "rewards/combined_reward/std": 0.3525692358613014, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.1, "completions/max_terminated_length": 117.1, "completions/mean_length": 57.45625, "completions/mean_terminated_length": 57.45625, "completions/min_length": 19.1, "completions/min_terminated_length": 19.1, "epoch": 0.09842816250690503, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.610727233324495e-07, "loss": 0.0, "num_tokens": 11388376.0, "reward": 1.2743749976158143, "reward_std": 0.0, "rewards/combined_reward/mean": 1.2743749976158143, "rewards/combined_reward/std": 0.2959941983222961, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 99.0, "completions/max_terminated_length": 99.0, "completions/mean_length": 51.1875, "completions/mean_terminated_length": 51.1875, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.09943253151207754, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.6019617374071597e-07, "loss": 0.0001, "num_tokens": 11503346.0, "reward": 1.3223437547683716, "reward_std": 0.0028867511078715324, "rewards/combined_reward/mean": 1.3223437547683716, "rewards/combined_reward/std": 0.37292833551764487, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.8, "completions/max_terminated_length": 146.8, "completions/mean_length": 64.61875, "completions/mean_terminated_length": 64.61875, "completions/min_length": 23.9, "completions/min_terminated_length": 23.9, "epoch": 0.10043690051725004, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5931230981958326e-07, "loss": 0.0, "num_tokens": 11600585.0, "reward": 1.3246874928474426, "reward_std": 0.0, "rewards/combined_reward/mean": 1.3246874928474426, "rewards/combined_reward/std": 0.23927139891311527, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.5, "completions/max_terminated_length": 118.5, "completions/mean_length": 65.5, "completions/mean_terminated_length": 65.5, "completions/min_length": 19.9, "completions/min_terminated_length": 19.9, "epoch": 0.10144126952242254, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.5842123896577543e-07, "loss": -0.0036, "num_tokens": 11737513.0, "reward": 1.4228541851043701, "reward_std": 0.001154701132327318, "rewards/combined_reward/mean": 1.4228541851043701, "rewards/combined_reward/std": 0.25313766626641154, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01875, "completions/max_length": 311.5, "completions/max_terminated_length": 224.3, "completions/mean_length": 90.28125, "completions/mean_terminated_length": 54.49903869628906, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.10244563852759504, "frac_reward_zero_std": 0.925, "grad_norm": 0.0, "learning_rate": 1.5752306945171818e-07, "loss": -0.0115, "num_tokens": 11875626.0, "reward": 1.2103593707084657, "reward_std": 0.004468750953674316, "rewards/combined_reward/mean": 1.2103593707084657, "rewards/combined_reward/std": 0.40379793345928194, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.9, "completions/max_terminated_length": 128.9, "completions/mean_length": 59.56875, "completions/mean_terminated_length": 59.56875, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.10345000753276754, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.5661791041238254e-07, "loss": 0.0054, "num_tokens": 11995581.0, "reward": 1.3099791407585144, "reward_std": 0.00020833313465118408, "rewards/combined_reward/mean": 1.3099791407585144, "rewards/combined_reward/std": 0.33452749061398207, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01875, "completions/max_length": 343.2, "completions/max_terminated_length": 228.1, "completions/mean_length": 114.825, "completions/mean_terminated_length": 78.1860580444336, "completions/min_length": 25.7, "completions/min_terminated_length": 25.7, "epoch": 0.10445437653794004, "frac_reward_zero_std": 0.95, "grad_norm": 0.0, "learning_rate": 1.5570587183202433e-07, "loss": -0.0099, "num_tokens": 12114797.0, "reward": 1.2818815290927887, "reward_std": 0.0018619796261191367, "rewards/combined_reward/mean": 1.2818815290927887, "rewards/combined_reward/std": 0.31765228807926177, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 113.8, "completions/max_terminated_length": 113.8, "completions/mean_length": 55.68125, "completions/mean_terminated_length": 55.68125, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.10545874554311253, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.5478706453082016e-07, "loss": -0.0016, "num_tokens": 12246978.0, "reward": 1.3307923913002013, "reward_std": 0.0002604176523163915, "rewards/combined_reward/mean": 1.3307923913002013, "rewards/combined_reward/std": 0.3518651008605957, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.1, "completions/max_terminated_length": 144.1, "completions/mean_length": 69.0, "completions/mean_terminated_length": 69.0, "completions/min_length": 17.6, "completions/min_terminated_length": 17.6, "epoch": 0.10646311454828504, "frac_reward_zero_std": 0.95, "grad_norm": 0.0, "learning_rate": 1.5386160015140167e-07, "loss": 0.0061, "num_tokens": 12363690.0, "reward": 1.3816666841506957, "reward_std": 0.00692450013011694, "rewards/combined_reward/mean": 1.3816666841506957, "rewards/combined_reward/std": 0.2784981057047844, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 94.3, "completions/max_terminated_length": 94.3, "completions/mean_length": 49.63125, "completions/mean_terminated_length": 49.63125, "completions/min_length": 13.9, "completions/min_terminated_length": 13.9, "epoch": 0.10746748355345755, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.5292959114529024e-07, "loss": 0.0011, "num_tokens": 12481815.0, "reward": 1.3338541746139527, "reward_std": 0.002886752039194107, "rewards/combined_reward/mean": 1.3338541746139527, "rewards/combined_reward/std": 0.3240374196320772, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 97.0, "completions/max_terminated_length": 97.0, "completions/mean_length": 49.3375, "completions/mean_terminated_length": 49.3375, "completions/min_length": 19.6, "completions/min_terminated_length": 19.6, "epoch": 0.10847185255863004, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.5199115075923323e-07, "loss": -0.0008, "num_tokens": 12604637.0, "reward": 1.2796875, "reward_std": 0.0003608435858041048, "rewards/combined_reward/mean": 1.2796875, "rewards/combined_reward/std": 0.3038814663887024, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 313.5, "completions/max_terminated_length": 115.4, "completions/mean_length": 112.05, "completions/mean_terminated_length": 61.88333358764648, "completions/min_length": 20.3, "completions/min_terminated_length": 20.3, "epoch": 0.10947622156380254, "frac_reward_zero_std": 0.975, "grad_norm": 0.0, "learning_rate": 1.5104639302144326e-07, "loss": 0.0052, "num_tokens": 12735697.0, "reward": 1.342291682958603, "reward_std": 0.0007216888945549727, "rewards/combined_reward/mean": 1.342291682958603, "rewards/combined_reward/std": 0.31657470017671585, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.9, "completions/max_terminated_length": 127.9, "completions/mean_length": 61.70625, "completions/mean_terminated_length": 61.70625, "completions/min_length": 17.8, "completions/min_terminated_length": 17.8, "epoch": 0.11048059056897504, "frac_reward_zero_std": 0.95, "grad_norm": 0.0, "learning_rate": 1.5009543272774323e-07, "loss": 0.0029, "num_tokens": 12842590.0, "reward": 1.3991406440734864, "reward_std": 0.000572918844409287, "rewards/combined_reward/mean": 1.3991406440734864, "rewards/combined_reward/std": 0.27981497598811983, "step": 1100 } ], "logging_steps": 10, "max_steps": 3000, "num_input_tokens_seen": 12842590, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }