{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 35.689655172413794, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 3608.0, "completions/max_terminated_length": 3008.0, "completions/mean_length": 836.2421875, "completions/mean_terminated_length": 763.040283203125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.06896551724137931, "frac_reward_zero_std": 0.875, "grad_norm": 7.329275313378857, "kl": 0.0007262229919433594, "learning_rate": 0.0, "loss": -0.0208, "num_tokens": 237015.0, "reward": 0.0023437500931322575, "reward_std": 0.00662912568077445, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1679.0, "completions/max_terminated_length": 1679.0, "completions/mean_length": 738.0234375, "completions/mean_terminated_length": 690.4166870117188, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.13793103448275862, "frac_reward_zero_std": 1.0, "grad_norm": 0.03884243258483543, "kl": 0.0013523101806640625, "learning_rate": 3.3333333333333335e-07, "loss": 0.0, "num_tokens": 462554.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "completions/max_length": 3194.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 859.375, "completions/mean_terminated_length": 734.9915161132812, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.20689655172413793, "frac_reward_zero_std": 0.75, "grad_norm": 0.2713581262227546, "kl": 0.0028324127197265625, "learning_rate": 6.666666666666667e-07, "loss": 0.0134, "num_tokens": 703626.0, "reward": 0.0062500000931322575, "reward_std": 0.0176776684820652, "rewards/code_format_reward/mean": 0.03125, "rewards/code_format_reward/std": 0.1746762990951538, "rewards/format_reward/mean": 0.03125, "rewards/format_reward/std": 0.1746762990951538, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.53125, "completions/max_length": 3311.0, "completions/max_terminated_length": 2429.0, "completions/mean_length": 829.578125, "completions/mean_terminated_length": 674.7344970703125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.27586206896551724, "frac_reward_zero_std": 0.875, "grad_norm": 0.29708280749661614, "kl": 0.0035877227783203125, "learning_rate": 1.0000000000000002e-06, "loss": -0.0728, "num_tokens": 940884.0, "reward": 0.0031250000465661287, "reward_std": 0.0088388342410326, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 2788.0, "completions/max_terminated_length": 2788.0, "completions/mean_length": 816.3515625, "completions/mean_terminated_length": 765.6854858398438, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.3448275862068966, "frac_reward_zero_std": 1.0, "grad_norm": 0.06949683850604436, "kl": 0.001728057861328125, "learning_rate": 1.3333333333333334e-06, "loss": 0.0, "num_tokens": 1176009.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "completions/max_length": 3751.0, "completions/max_terminated_length": 2844.0, "completions/mean_length": 866.28125, "completions/mean_terminated_length": 833.5556030273438, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.41379310344827586, "frac_reward_zero_std": 0.9375, "grad_norm": 0.1143311609167342, "kl": 0.0008478164672851562, "learning_rate": 1.6666666666666667e-06, "loss": -0.0167, "num_tokens": 1415893.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "completions/max_length": 3055.0, "completions/max_terminated_length": 3055.0, "completions/mean_length": 805.8359375, "completions/mean_terminated_length": 770.1349487304688, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.4827586206896552, "frac_reward_zero_std": 1.0, "grad_norm": 0.07492278282994858, "kl": 0.0012269020080566406, "learning_rate": 2.0000000000000003e-06, "loss": 0.0, "num_tokens": 1650112.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 3898.0, "completions/max_terminated_length": 2050.0, "completions/mean_length": 723.9296875, "completions/mean_terminated_length": 688.2177124023438, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.5517241379310345, "frac_reward_zero_std": 1.0, "grad_norm": 0.04363979198005147, "kl": 0.0010972023010253906, "learning_rate": 2.3333333333333336e-06, "loss": 0.0, "num_tokens": 1872679.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 3639.0, "completions/max_terminated_length": 2242.0, "completions/mean_length": 739.3828125, "completions/mean_terminated_length": 699.8225708007812, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.6206896551724138, "frac_reward_zero_std": 0.9375, "grad_norm": 0.11751018346403032, "kl": 0.0007791519165039062, "learning_rate": 2.666666666666667e-06, "loss": 0.0139, "num_tokens": 2098392.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "completions/max_length": 2701.0, "completions/max_terminated_length": 2701.0, "completions/mean_length": 843.828125, "completions/mean_terminated_length": 818.0400390625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.6896551724137931, "frac_reward_zero_std": 0.8125, "grad_norm": 0.282798190145457, "kl": 0.009571075439453125, "learning_rate": 3e-06, "loss": 0.0048, "num_tokens": 2337474.0, "reward": 0.0062500000931322575, "reward_std": 0.01462521031498909, "rewards/code_format_reward/mean": 0.03125, "rewards/code_format_reward/std": 0.1746762990951538, "rewards/format_reward/mean": 0.03125, "rewards/format_reward/std": 0.1746762990951538, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "completions/max_length": 3478.0, "completions/max_terminated_length": 2911.0, "completions/mean_length": 788.875, "completions/mean_terminated_length": 751.2222900390625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.7586206896551724, "frac_reward_zero_std": 0.8125, "grad_norm": 0.2217463855001291, "kl": 0.0009412765502929688, "learning_rate": 3.3333333333333333e-06, "loss": -0.0439, "num_tokens": 2568146.0, "reward": 0.0062500000931322575, "reward_std": 0.01462521031498909, "rewards/code_format_reward/mean": 0.03125, "rewards/code_format_reward/std": 0.1746762990951538, "rewards/format_reward/mean": 0.03125, "rewards/format_reward/std": 0.1746762990951538, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "completions/max_length": 3367.0, "completions/max_terminated_length": 3367.0, "completions/mean_length": 872.359375, "completions/mean_terminated_length": 821.3901977539062, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.8275862068965517, "frac_reward_zero_std": 0.9375, "grad_norm": 1.9919377838221575, "kl": 0.0614471435546875, "learning_rate": 3.6666666666666666e-06, "loss": -0.0038, "num_tokens": 2810880.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "completions/max_length": 3649.0, "completions/max_terminated_length": 3049.0, "completions/mean_length": 799.0390625, "completions/mean_terminated_length": 745.2000122070312, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.896551724137931, "frac_reward_zero_std": 0.8125, "grad_norm": 0.8404371378196254, "kl": 0.021940231323242188, "learning_rate": 4.000000000000001e-06, "loss": -0.0194, "num_tokens": 3044229.0, "reward": 0.004687500186264515, "reward_std": 0.0132582513615489, "rewards/code_format_reward/mean": 0.0234375, "rewards/code_format_reward/std": 0.15188287198543549, "rewards/format_reward/mean": 0.0234375, "rewards/format_reward/std": 0.15188287198543549, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "completions/max_length": 3196.0, "completions/max_terminated_length": 2418.0, "completions/mean_length": 820.6953125, "completions/mean_terminated_length": 801.9921264648438, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.9655172413793104, "frac_reward_zero_std": 0.75, "grad_norm": 0.29176753147358486, "kl": 0.004749298095703125, "learning_rate": 4.333333333333334e-06, "loss": -0.0113, "num_tokens": 3280350.0, "reward": 0.0078125, "reward_std": 0.019044626504182816, "rewards/code_format_reward/mean": 0.0390625, "rewards/code_format_reward/std": 0.194504976272583, "rewards/format_reward/mean": 0.0390625, "rewards/format_reward/std": 0.194504976272583, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 3712.0, "completions/max_terminated_length": 2837.0, "completions/mean_length": 984.875, "completions/mean_terminated_length": 850.49169921875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 1.0689655172413792, "frac_reward_zero_std": 0.375, "grad_norm": 0.6613227639845897, "kl": 0.13519287109375, "learning_rate": 4.666666666666667e-06, "loss": 0.0119, "num_tokens": 3536582.0, "reward": 0.02031249925494194, "reward_std": 0.047098226845264435, "rewards/code_format_reward/mean": 0.09375, "rewards/code_format_reward/std": 0.29262590408325195, "rewards/format_reward/mean": 0.109375, "rewards/format_reward/std": 0.31333550810813904, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5625, "completions/max_length": 3213.0, "completions/max_terminated_length": 3158.0, "completions/mean_length": 1020.6484375, "completions/mean_terminated_length": 842.5438842773438, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.1379310344827587, "frac_reward_zero_std": 0.125, "grad_norm": 43.83469868815612, "kl": 1.2880859375, "learning_rate": 5e-06, "loss": 0.0709, "num_tokens": 3798297.0, "reward": 0.03125, "reward_std": 0.06938961893320084, "rewards/code_format_reward/mean": 0.15625, "rewards/code_format_reward/std": 0.3645188808441162, "rewards/format_reward/mean": 0.15625, "rewards/format_reward/std": 0.3645188808441162, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 3634.0, "completions/max_terminated_length": 2728.0, "completions/mean_length": 816.6875, "completions/mean_terminated_length": 726.9193115234375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.206896551724138, "frac_reward_zero_std": 0.1875, "grad_norm": 995.2557894224733, "kl": 23.7371826171875, "learning_rate": 4.999952797253148e-06, "loss": 0.277, "num_tokens": 4033905.0, "reward": 0.03671874850988388, "reward_std": 0.06861686706542969, "rewards/code_format_reward/mean": 0.1796875, "rewards/code_format_reward/std": 0.3854354918003082, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.39184603095054626, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "completions/max_length": 3677.0, "completions/max_terminated_length": 3677.0, "completions/mean_length": 828.2109375, "completions/mean_terminated_length": 792.1817626953125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 1.2758620689655173, "frac_reward_zero_std": 0.125, "grad_norm": 15620.298455105083, "kl": 711.0061645507812, "learning_rate": 4.9998111909931225e-06, "loss": 7.2128, "num_tokens": 4270988.0, "reward": 0.05546875298023224, "reward_std": 0.08068342506885529, "rewards/code_format_reward/mean": 0.2734375, "rewards/code_format_reward/std": 0.447474867105484, "rewards/format_reward/mean": 0.28125, "rewards/format_reward/std": 0.4513758420944214, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "completions/max_length": 3792.0, "completions/max_terminated_length": 3792.0, "completions/mean_length": 876.3046875, "completions/mean_terminated_length": 780.710693359375, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 1.3448275862068966, "frac_reward_zero_std": 0.0, "grad_norm": 65.50568634933693, "kl": 5.886474609375, "learning_rate": 4.999575187161439e-06, "loss": 0.1377, "num_tokens": 4513995.0, "reward": 0.07265625149011612, "reward_std": 0.09746605902910233, "rewards/code_format_reward/mean": 0.359375, "rewards/code_format_reward/std": 0.481702595949173, "rewards/format_reward/mean": 0.3671875, "rewards/format_reward/std": 0.4839322865009308, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "completions/max_length": 2720.0, "completions/max_terminated_length": 2720.0, "completions/mean_length": 848.6328125, "completions/mean_terminated_length": 777.4343872070312, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 1.4137931034482758, "frac_reward_zero_std": 0.0, "grad_norm": 151.71342486055147, "kl": 5.810791015625, "learning_rate": 4.9992447956603455e-06, "loss": 0.1517, "num_tokens": 4753692.0, "reward": 0.10703124850988388, "reward_std": 0.10140325874090195, "rewards/code_format_reward/mean": 0.515625, "rewards/code_format_reward/std": 0.5017194747924805, "rewards/format_reward/mean": 0.5546875, "rewards/format_reward/std": 0.4989531338214874, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "completions/max_length": 2993.0, "completions/max_terminated_length": 2561.0, "completions/mean_length": 744.5625, "completions/mean_terminated_length": 690.6400146484375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 1.4827586206896552, "frac_reward_zero_std": 0.0, "grad_norm": 1.4736174877248536, "kl": 0.04266357421875, "learning_rate": 4.998820030352409e-06, "loss": 0.0836, "num_tokens": 4978900.0, "reward": 0.12890625, "reward_std": 0.0890723466873169, "rewards/code_format_reward/mean": 0.625, "rewards/code_format_reward/std": 0.4860251843929291, "rewards/format_reward/mean": 0.6640625, "rewards/format_reward/std": 0.47417303919792175, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1963.0, "completions/max_terminated_length": 1963.0, "completions/mean_length": 710.6171875, "completions/mean_terminated_length": 666.0000610351562, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 1.5517241379310345, "frac_reward_zero_std": 0.0625, "grad_norm": 38.20579455574688, "kl": 0.62890625, "learning_rate": 4.998300909059929e-06, "loss": 0.1571, "num_tokens": 5200931.0, "reward": 0.13750000298023224, "reward_std": 0.08701484650373459, "rewards/code_format_reward/mean": 0.671875, "rewards/code_format_reward/std": 0.4713755249977112, "rewards/format_reward/mean": 0.703125, "rewards/format_reward/std": 0.45867621898651123, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "completions/max_length": 2847.0, "completions/max_terminated_length": 2847.0, "completions/mean_length": 722.390625, "completions/mean_terminated_length": 711.5556030273438, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 1.6206896551724137, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5678012619151958, "kl": 0.09613037109375, "learning_rate": 4.997687453564198e-06, "loss": 0.0384, "num_tokens": 5423325.0, "reward": 0.15312500298023224, "reward_std": 0.07640768587589264, "rewards/code_format_reward/mean": 0.7265625, "rewards/code_format_reward/std": 0.447474867105484, "rewards/format_reward/mean": 0.8046875, "rewards/format_reward/std": 0.3979988098144531, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "completions/max_length": 2973.0, "completions/max_terminated_length": 1558.0, "completions/mean_length": 717.1640625, "completions/mean_terminated_length": 662.1156616210938, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 1.6896551724137931, "frac_reward_zero_std": 0.1875, "grad_norm": 195.2594908645977, "kl": 7.546875, "learning_rate": 4.9969796896045775e-06, "loss": 0.234, "num_tokens": 5645026.0, "reward": 0.15937501192092896, "reward_std": 0.06703707575798035, "rewards/code_format_reward/mean": 0.765625, "rewards/code_format_reward/std": 0.42527204751968384, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.3787541687488556, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "completions/max_length": 3010.0, "completions/max_terminated_length": 3010.0, "completions/mean_length": 677.21875, "completions/mean_terminated_length": 664.8897705078125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 1.7586206896551724, "frac_reward_zero_std": 0.3125, "grad_norm": 0.4865500559586326, "kl": 0.05767822265625, "learning_rate": 4.996177646877426e-06, "loss": 0.0349, "num_tokens": 5862782.0, "reward": 0.17734375596046448, "reward_std": 0.04920876771211624, "rewards/code_format_reward/mean": 0.875, "rewards/code_format_reward/std": 0.3320184051990509, "rewards/format_reward/mean": 0.8984375, "rewards/format_reward/std": 0.3032590448856354, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "completions/max_length": 2240.0, "completions/max_terminated_length": 2240.0, "completions/mean_length": 691.984375, "completions/mean_terminated_length": 663.6400146484375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 1.8275862068965516, "frac_reward_zero_std": 0.25, "grad_norm": 19.735744331019188, "kl": 1.9664306640625, "learning_rate": 4.995281359034851e-06, "loss": 0.1256, "num_tokens": 6081988.0, "reward": 0.17656250298023224, "reward_std": 0.04989224672317505, "rewards/code_format_reward/mean": 0.859375, "rewards/code_format_reward/std": 0.3490002751350403, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29262590408325195, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 2768.0, "completions/max_terminated_length": 2768.0, "completions/mean_length": 612.3046875, "completions/mean_terminated_length": 612.3046875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 1.896551724137931, "frac_reward_zero_std": 0.5, "grad_norm": 0.4611562798663595, "kl": 0.07080078125, "learning_rate": 4.994290863683296e-06, "loss": 0.0295, "num_tokens": 6290339.0, "reward": 0.18828123807907104, "reward_std": 0.029640421271324158, "rewards/code_format_reward/mean": 0.921875, "rewards/code_format_reward/std": 0.2694226801395416, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.194504976272583, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 2192.0, "completions/max_terminated_length": 1643.0, "completions/mean_length": 700.1875, "completions/mean_terminated_length": 656.491943359375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 1.9655172413793105, "frac_reward_zero_std": 0.3125, "grad_norm": 1833.9655339013984, "kl": 105.04736328125, "learning_rate": 4.99320620238196e-06, "loss": 1.1265, "num_tokens": 6511035.0, "reward": 0.17890626192092896, "reward_std": 0.04762028157711029, "rewards/code_format_reward/mean": 0.875, "rewards/code_format_reward/std": 0.3320184051990509, "rewards/format_reward/mean": 0.9140625, "rewards/format_reward/std": 0.2813730239868164, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "completions/max_length": 3102.0, "completions/max_terminated_length": 3102.0, "completions/mean_length": 683.8359375, "completions/mean_terminated_length": 625.14404296875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 2.0689655172413794, "frac_reward_zero_std": 0.5625, "grad_norm": 6.485350364733725, "kl": 0.551513671875, "learning_rate": 4.99202742064106e-06, "loss": 0.1122, "num_tokens": 6729198.0, "reward": 0.17421875894069672, "reward_std": 0.03528411686420441, "rewards/code_format_reward/mean": 0.859375, "rewards/code_format_reward/std": 0.3490002751350403, "rewards/format_reward/mean": 0.8828125, "rewards/format_reward/std": 0.322907418012619, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "completions/max_length": 1808.0, "completions/max_terminated_length": 1808.0, "completions/mean_length": 561.75, "completions/mean_terminated_length": 551.93701171875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 2.1379310344827585, "frac_reward_zero_std": 0.6875, "grad_norm": 42.970609061888695, "kl": 3.5179443359375, "learning_rate": 4.990754567919917e-06, "loss": 0.0717, "num_tokens": 6932174.0, "reward": 0.18828123807907104, "reward_std": 0.02506173402070999, "rewards/code_format_reward/mean": 0.9375, "rewards/code_format_reward/std": 0.24301259219646454, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.22826264798641205, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1528.0, "completions/max_terminated_length": 1528.0, "completions/mean_length": 560.6953125, "completions/mean_terminated_length": 544.1370849609375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 2.206896551724138, "frac_reward_zero_std": 0.75, "grad_norm": 2.8351996778137165, "kl": 0.4176025390625, "learning_rate": 4.989387697624881e-06, "loss": 0.0621, "num_tokens": 7135015.0, "reward": 0.1875, "reward_std": 0.02130674012005329, "rewards/code_format_reward/mean": 0.9375, "rewards/code_format_reward/std": 0.24301259219646454, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24301259219646454, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "completions/max_length": 3781.0, "completions/max_terminated_length": 1634.0, "completions/mean_length": 605.7578125, "completions/mean_terminated_length": 553.1869506835938, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 2.2758620689655173, "frac_reward_zero_std": 0.375, "grad_norm": 9.39713171992309, "kl": 0.841064453125, "learning_rate": 4.987926867107095e-06, "loss": 0.0963, "num_tokens": 7342528.0, "reward": 0.18203124403953552, "reward_std": 0.04166591912508011, "rewards/code_format_reward/mean": 0.890625, "rewards/code_format_reward/std": 0.31333550810813904, "rewards/format_reward/mean": 0.9296875, "rewards/format_reward/std": 0.2566775679588318, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "completions/max_length": 1575.0, "completions/max_terminated_length": 1575.0, "completions/mean_length": 561.1953125, "completions/mean_terminated_length": 548.1825561523438, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 2.344827586206897, "frac_reward_zero_std": 0.625, "grad_norm": 18.13031332736665, "kl": 0.6373291015625, "learning_rate": 4.986372137660078e-06, "loss": 0.0154, "num_tokens": 7544289.0, "reward": 0.18906250596046448, "reward_std": 0.024831000715494156, "rewards/code_format_reward/mean": 0.9375, "rewards/code_format_reward/std": 0.24301259219646454, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21220162510871887, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1737.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 586.734375, "completions/mean_terminated_length": 562.1370849609375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 2.413793103448276, "frac_reward_zero_std": 0.4375, "grad_norm": 615.3181219972676, "kl": 37.55224609375, "learning_rate": 4.984723574517165e-06, "loss": 0.4356, "num_tokens": 7749295.0, "reward": 0.18125000596046448, "reward_std": 0.043191660195589066, "rewards/code_format_reward/mean": 0.90625, "rewards/code_format_reward/std": 0.29262590408325195, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29262590408325195, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "completions/max_length": 1358.0, "completions/max_terminated_length": 1330.0, "completions/mean_length": 563.5, "completions/mean_terminated_length": 550.9761962890625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 2.4827586206896552, "frac_reward_zero_std": 0.3125, "grad_norm": 0.5637978091750905, "kl": 0.08837890625, "learning_rate": 4.9829812468487655e-06, "loss": 0.0903, "num_tokens": 7952495.0, "reward": 0.18125000596046448, "reward_std": 0.047710247337818146, "rewards/code_format_reward/mean": 0.8984375, "rewards/code_format_reward/std": 0.3032590448856354, "rewards/format_reward/mean": 0.9140625, "rewards/format_reward/std": 0.2813730239868164, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1187.0, "completions/max_terminated_length": 1187.0, "completions/mean_length": 517.3671875, "completions/mean_terminated_length": 517.3671875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 2.5517241379310347, "frac_reward_zero_std": 0.4375, "grad_norm": 0.5349360457928953, "kl": 0.1214599609375, "learning_rate": 4.981145227759457e-06, "loss": -0.0018, "num_tokens": 8149790.0, "reward": 0.18828125298023224, "reward_std": 0.03314562886953354, "rewards/code_format_reward/mean": 0.9296875, "rewards/code_format_reward/std": 0.2566775679588318, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21220162510871887, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "completions/max_length": 2934.0, "completions/max_terminated_length": 2934.0, "completions/mean_length": 626.6328125, "completions/mean_terminated_length": 573.9173583984375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 2.6206896551724137, "frac_reward_zero_std": 0.25, "grad_norm": 125.2304296272452, "kl": 20.3221435546875, "learning_rate": 4.979215594284924e-06, "loss": 0.3347, "num_tokens": 8361071.0, "reward": 0.16093750298023224, "reward_std": 0.0606289878487587, "rewards/code_format_reward/mean": 0.78125, "rewards/code_format_reward/std": 0.41502299904823303, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.3787541687488556, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "completions/max_length": 2936.0, "completions/max_terminated_length": 2936.0, "completions/mean_length": 516.2265625, "completions/mean_terminated_length": 511.8582763671875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 2.689655172413793, "frac_reward_zero_std": 0.4375, "grad_norm": 9.576331967136065, "kl": 0.2928466796875, "learning_rate": 4.977192427388722e-06, "loss": 0.096, "num_tokens": 8558220.0, "reward": 0.17890626192092896, "reward_std": 0.0444980263710022, "rewards/code_format_reward/mean": 0.890625, "rewards/code_format_reward/std": 0.31333550810813904, "rewards/format_reward/mean": 0.8984375, "rewards/format_reward/std": 0.3032590448856354, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "completions/max_length": 2858.0, "completions/max_terminated_length": 2858.0, "completions/mean_length": 571.1640625, "completions/mean_terminated_length": 516.280029296875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 2.7586206896551726, "frac_reward_zero_std": 0.375, "grad_norm": 85.00285353330676, "kl": 14.6357421875, "learning_rate": 4.9750758119588824e-06, "loss": 0.3169, "num_tokens": 8762169.0, "reward": 0.16484375298023224, "reward_std": 0.057621706277132034, "rewards/code_format_reward/mean": 0.8203125, "rewards/code_format_reward/std": 0.3854354918003082, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.3787541687488556, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "completions/max_length": 2583.0, "completions/max_terminated_length": 2583.0, "completions/mean_length": 599.609375, "completions/mean_terminated_length": 592.5079956054688, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 2.8275862068965516, "frac_reward_zero_std": 0.125, "grad_norm": 5.760759174842388, "kl": 1.2294921875, "learning_rate": 4.972865836804349e-06, "loss": 0.1275, "num_tokens": 8969991.0, "reward": 0.16953125596046448, "reward_std": 0.0626438558101654, "rewards/code_format_reward/mean": 0.8359375, "rewards/code_format_reward/std": 0.371787428855896, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3490002751350403, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1428.0, "completions/max_terminated_length": 1428.0, "completions/mean_length": 492.015625, "completions/mean_terminated_length": 492.015625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 2.896551724137931, "frac_reward_zero_std": 0.5, "grad_norm": 0.5019665707220321, "kl": 0.1136474609375, "learning_rate": 4.970562594651254e-06, "loss": 0.0647, "num_tokens": 9164041.0, "reward": 0.18203124403953552, "reward_std": 0.03968694061040878, "rewards/code_format_reward/mean": 0.90625, "rewards/code_format_reward/std": 0.29262590408325195, "rewards/format_reward/mean": 0.9140625, "rewards/format_reward/std": 0.2813730239868164, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1865.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 566.1640625, "completions/mean_terminated_length": 549.26611328125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 2.9655172413793105, "frac_reward_zero_std": 0.0, "grad_norm": 2.5630106181026244, "kl": 0.353515625, "learning_rate": 4.968166182139026e-06, "loss": 0.0204, "num_tokens": 9366678.0, "reward": 0.13671875, "reward_std": 0.09157585352659225, "rewards/code_format_reward/mean": 0.6796875, "rewards/code_format_reward/std": 0.4684300124645233, "rewards/format_reward/mean": 0.6875, "rewards/format_reward/std": 0.4653336703777313, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "completions/max_length": 2240.0, "completions/max_terminated_length": 2240.0, "completions/mean_length": 563.0, "completions/mean_terminated_length": 550.0159301757812, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 3.0689655172413794, "frac_reward_zero_std": 0.125, "grad_norm": 0.5947582776397699, "kl": 0.0906982421875, "learning_rate": 4.9656766998163306e-06, "loss": 0.071, "num_tokens": 9568206.0, "reward": 0.15625, "reward_std": 0.07245282828807831, "rewards/code_format_reward/mean": 0.765625, "rewards/code_format_reward/std": 0.42527204751968384, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40390563011169434, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 1630.0, "completions/max_terminated_length": 1630.0, "completions/mean_length": 617.5703125, "completions/mean_terminated_length": 560.99169921875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 3.1379310344827585, "frac_reward_zero_std": 0.1875, "grad_norm": 29.528488713072353, "kl": 0.284423828125, "learning_rate": 4.963094252136865e-06, "loss": 0.1132, "num_tokens": 9778327.0, "reward": 0.14531250298023224, "reward_std": 0.07226449996232986, "rewards/code_format_reward/mean": 0.71875, "rewards/code_format_reward/std": 0.4513758420944214, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44340085983276367, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "completions/max_length": 2568.0, "completions/max_terminated_length": 2568.0, "completions/mean_length": 523.90625, "completions/mean_terminated_length": 516.9448852539062, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 3.206896551724138, "frac_reward_zero_std": 0.0, "grad_norm": 0.7663643054090497, "kl": 0.1002197265625, "learning_rate": 4.960418947454958e-06, "loss": 0.167, "num_tokens": 9974411.0, "reward": 0.16171875596046448, "reward_std": 0.07921412587165833, "rewards/code_format_reward/mean": 0.796875, "rewards/code_format_reward/std": 0.40390563011169434, "rewards/format_reward/mean": 0.8203125, "rewards/format_reward/std": 0.3854354918003082, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1431.0, "completions/max_terminated_length": 1431.0, "completions/mean_length": 482.03125, "completions/mean_terminated_length": 482.03125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 3.2758620689655173, "frac_reward_zero_std": 0.25, "grad_norm": 0.6031669012487536, "kl": 0.10595703125, "learning_rate": 4.957650898021038e-06, "loss": -0.0048, "num_tokens": 10167183.0, "reward": 0.17500001192092896, "reward_std": 0.05759534612298012, "rewards/code_format_reward/mean": 0.8671875, "rewards/code_format_reward/std": 0.3407054841518402, "rewards/format_reward/mean": 0.8828125, "rewards/format_reward/std": 0.322907418012619, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "completions/max_length": 3389.0, "completions/max_terminated_length": 1360.0, "completions/mean_length": 540.1953125, "completions/mean_terminated_length": 517.7637939453125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 3.344827586206897, "frac_reward_zero_std": 0.3125, "grad_norm": 0.6402833424208365, "kl": 0.107177734375, "learning_rate": 4.954790219976915e-06, "loss": 0.1032, "num_tokens": 10367400.0, "reward": 0.17734375596046448, "reward_std": 0.05118774622678757, "rewards/code_format_reward/mean": 0.8828125, "rewards/code_format_reward/std": 0.322907418012619, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.31333550810813904, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "completions/max_length": 2281.0, "completions/max_terminated_length": 2281.0, "completions/mean_length": 535.6015625, "completions/mean_terminated_length": 530.2125854492188, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 3.413793103448276, "frac_reward_zero_std": 0.4375, "grad_norm": 0.5622681425626702, "kl": 0.0972900390625, "learning_rate": 4.95183703335091e-06, "loss": 0.0237, "num_tokens": 10566797.0, "reward": 0.18281251192092896, "reward_std": 0.04250866919755936, "rewards/code_format_reward/mean": 0.9140625, "rewards/code_format_reward/std": 0.2813730239868164, "rewards/format_reward/mean": 0.9140625, "rewards/format_reward/std": 0.2813730239868164, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "completions/max_length": 1695.0, "completions/max_terminated_length": 1695.0, "completions/mean_length": 516.953125, "completions/mean_terminated_length": 503.94403076171875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 3.4827586206896552, "frac_reward_zero_std": 0.5625, "grad_norm": 2.1308403779181986, "kl": 0.2667236328125, "learning_rate": 4.948791462052819e-06, "loss": 0.1069, "num_tokens": 10762871.0, "reward": 0.18437500298023224, "reward_std": 0.03237384930253029, "rewards/code_format_reward/mean": 0.9140625, "rewards/code_format_reward/std": 0.2813730239868164, "rewards/format_reward/mean": 0.9296875, "rewards/format_reward/std": 0.2566775679588318, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "completions/max_length": 2243.0, "completions/max_terminated_length": 1080.0, "completions/mean_length": 526.25, "completions/mean_terminated_length": 483.9837341308594, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 3.5517241379310347, "frac_reward_zero_std": 0.5625, "grad_norm": 15.23470491238641, "kl": 1.3590087890625, "learning_rate": 4.945653633868716e-06, "loss": 0.1142, "num_tokens": 10961303.0, "reward": 0.18125000596046448, "reward_std": 0.035247981548309326, "rewards/code_format_reward/mean": 0.90625, "rewards/code_format_reward/std": 0.29262590408325195, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29262590408325195, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1037.0, "completions/max_terminated_length": 1037.0, "completions/mean_length": 475.140625, "completions/mean_terminated_length": 475.140625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 3.6206896551724137, "frac_reward_zero_std": 0.6875, "grad_norm": 0.41814078259259574, "kl": 0.1163330078125, "learning_rate": 4.942423680455584e-06, "loss": 0.0046, "num_tokens": 11153193.0, "reward": 0.19218748807907104, "reward_std": 0.02057085558772087, "rewards/code_format_reward/mean": 0.953125, "rewards/code_format_reward/std": 0.21220162510871887, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1746762990951538, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "completions/max_length": 1911.0, "completions/max_terminated_length": 1911.0, "completions/mean_length": 493.4765625, "completions/mean_terminated_length": 482.3149719238281, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 3.689655172413793, "frac_reward_zero_std": 0.75, "grad_norm": 1664.6862364926953, "kl": 100.5736083984375, "learning_rate": 4.939101737335802e-06, "loss": 1.0242, "num_tokens": 11347430.0, "reward": 0.18984374403953552, "reward_std": 0.020350992679595947, "rewards/code_format_reward/mean": 0.9453125, "rewards/code_format_reward/std": 0.22826264798641205, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21220162510871887, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "completions/max_length": 1640.0, "completions/max_terminated_length": 1640.0, "completions/mean_length": 523.0625, "completions/mean_terminated_length": 516.1181030273438, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 3.7586206896551726, "frac_reward_zero_std": 0.6875, "grad_norm": 0.42621486826703325, "kl": 0.127197265625, "learning_rate": 4.935687943891447e-06, "loss": -0.0541, "num_tokens": 11545454.0, "reward": 0.19296875596046448, "reward_std": 0.016834918409585953, "rewards/code_format_reward/mean": 0.953125, "rewards/code_format_reward/std": 0.21220162510871887, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15188287198543549, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "completions/max_length": 3578.0, "completions/max_terminated_length": 1532.0, "completions/mean_length": 564.2109375, "completions/mean_terminated_length": 499.84423828125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 3.8275862068965516, "frac_reward_zero_std": 0.75, "grad_norm": 78.37140371828227, "kl": 12.0, "learning_rate": 4.932182443358458e-06, "loss": 0.2436, "num_tokens": 11747649.0, "reward": 0.18828125298023224, "reward_std": 0.01909703202545643, "rewards/code_format_reward/mean": 0.9375, "rewards/code_format_reward/std": 0.24301259219646454, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.22826264798641205, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 1486.0, "completions/max_terminated_length": 1486.0, "completions/mean_length": 518.0234375, "completions/mean_terminated_length": 503.9031982421875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 3.896551724137931, "frac_reward_zero_std": 0.625, "grad_norm": 8.921711491174781, "kl": 0.603759765625, "learning_rate": 4.928585382820616e-06, "loss": 0.0247, "num_tokens": 11945028.0, "reward": 0.18437500298023224, "reward_std": 0.031300365924835205, "rewards/code_format_reward/mean": 0.921875, "rewards/code_format_reward/std": 0.2694226801395416, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2694226801395416, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "completions/max_length": 3627.0, "completions/max_terminated_length": 1031.0, "completions/mean_length": 525.0078125, "completions/mean_terminated_length": 491.5040283203125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 3.9655172413793105, "frac_reward_zero_std": 0.8125, "grad_norm": 0.5156563431137091, "kl": 0.16015625, "learning_rate": 4.924896913203376e-06, "loss": 0.0297, "num_tokens": 12143301.0, "reward": 0.19453126192092896, "reward_std": 0.012415500357747078, "rewards/code_format_reward/mean": 0.96875, "rewards/code_format_reward/std": 0.1746762990951538, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15188287198543549, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "completions/max_length": 3213.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 519.9921875, "completions/mean_terminated_length": 462.96002197265625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 4.068965517241379, "frac_reward_zero_std": 0.6875, "grad_norm": 8.189691709912301, "kl": 4.8702392578125, "learning_rate": 4.921117189267535e-06, "loss": 0.2236, "num_tokens": 12340932.0, "reward": 0.19140625, "reward_std": 0.022327817976474762, "rewards/code_format_reward/mean": 0.953125, "rewards/code_format_reward/std": 0.21220162510871887, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.194504976272583, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "completions/max_length": 3751.0, "completions/max_terminated_length": 1646.0, "completions/mean_length": 581.3671875, "completions/mean_terminated_length": 556.409423828125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 4.137931034482759, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5406944658569534, "kl": 0.107666015625, "learning_rate": 4.917246369602742e-06, "loss": 0.0798, "num_tokens": 12546419.0, "reward": 0.18906250596046448, "reward_std": 0.02414703369140625, "rewards/code_format_reward/mean": 0.9453125, "rewards/code_format_reward/std": 0.22826264798641205, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.22826264798641205, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "completions/max_length": 1408.0, "completions/max_terminated_length": 1216.0, "completions/mean_length": 454.96875, "completions/mean_terminated_length": 447.4645690917969, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 4.206896551724138, "frac_reward_zero_std": 0.6875, "grad_norm": 0.33497177183170196, "kl": 0.1175537109375, "learning_rate": 4.9132846166208355e-06, "loss": -0.0154, "num_tokens": 12735495.0, "reward": 0.19374999403953552, "reward_std": 0.0176776684820652, "rewards/code_format_reward/mean": 0.9609375, "rewards/code_format_reward/std": 0.194504976272583, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15188287198543549, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "completions/max_length": 1474.0, "completions/max_terminated_length": 1474.0, "completions/mean_length": 507.015625, "completions/mean_terminated_length": 500.3149719238281, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 4.275862068965517, "frac_reward_zero_std": 0.6875, "grad_norm": 0.4596870009961427, "kl": 0.1395263671875, "learning_rate": 4.9092320965490365e-06, "loss": 0.032, "num_tokens": 12931465.0, "reward": 0.18984374403953552, "reward_std": 0.022621294483542442, "rewards/code_format_reward/mean": 0.9453125, "rewards/code_format_reward/std": 0.22826264798641205, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21220162510871887, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "completions/max_length": 3937.0, "completions/max_terminated_length": 1397.0, "completions/mean_length": 486.4296875, "completions/mean_terminated_length": 459.25982666015625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 4.344827586206897, "frac_reward_zero_std": 0.75, "grad_norm": 0.526302411386141, "kl": 0.135498046875, "learning_rate": 4.905088979422971e-06, "loss": 0.071, "num_tokens": 13123896.0, "reward": 0.19296875596046448, "reward_std": 0.017908399924635887, "rewards/code_format_reward/mean": 0.9609375, "rewards/code_format_reward/std": 0.194504976272583, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1746762990951538, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 3627.0, "completions/max_terminated_length": 1327.0, "completions/mean_length": 511.5625, "completions/mean_terminated_length": 442.70965576171875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 4.413793103448276, "frac_reward_zero_std": 0.625, "grad_norm": 74.57382104056599, "kl": 11.362548828125, "learning_rate": 4.900855439079536e-06, "loss": 0.222, "num_tokens": 13319280.0, "reward": 0.18906250596046448, "reward_std": 0.02590448409318924, "rewards/code_format_reward/mean": 0.9375, "rewards/code_format_reward/std": 0.24301259219646454, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21220162510871887, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 3933.0, "completions/max_terminated_length": 1508.0, "completions/mean_length": 539.40625, "completions/mean_terminated_length": 433.0322570800781, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 4.482758620689655, "frac_reward_zero_std": 0.4375, "grad_norm": 19.50917337775108, "kl": 2.5927734375, "learning_rate": 4.8965316531496055e-06, "loss": 0.2921, "num_tokens": 13519396.0, "reward": 0.18515625596046448, "reward_std": 0.03893200308084488, "rewards/code_format_reward/mean": 0.921875, "rewards/code_format_reward/std": 0.2694226801395416, "rewards/format_reward/mean": 0.9296875, "rewards/format_reward/std": 0.2566775679588318, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 2423.0, "completions/max_terminated_length": 2423.0, "completions/mean_length": 428.5859375, "completions/mean_terminated_length": 428.5859375, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 4.551724137931035, "frac_reward_zero_std": 0.625, "grad_norm": 5.412446385383582, "kl": 0.87109375, "learning_rate": 4.892117803050578e-06, "loss": -0.0089, "num_tokens": 13705327.0, "reward": 0.18906250596046448, "reward_std": 0.020641827955842018, "rewards/code_format_reward/mean": 0.90625, "rewards/code_format_reward/std": 0.29262590408325195, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "completions/max_length": 1341.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 452.921875, "completions/mean_terminated_length": 440.9762268066406, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 4.620689655172414, "frac_reward_zero_std": 0.625, "grad_norm": 0.5397669168631566, "kl": 0.194580078125, "learning_rate": 4.887614073978761e-06, "loss": -0.011, "num_tokens": 13894373.0, "reward": 0.19140625, "reward_std": 0.020348839461803436, "rewards/code_format_reward/mean": 0.9375, "rewards/code_format_reward/std": 0.24301259219646454, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15188287198543549, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 3615.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 534.53125, "completions/mean_terminated_length": 447.82257080078125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 4.689655172413794, "frac_reward_zero_std": 0.625, "grad_norm": 91.20082626936673, "kl": 36.480224609375, "learning_rate": 4.883020654901609e-06, "loss": 0.4063, "num_tokens": 14093865.0, "reward": 0.19296875596046448, "reward_std": 0.019887376576662064, "rewards/code_format_reward/mean": 0.953125, "rewards/code_format_reward/std": 0.21220162510871887, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15188287198543549, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 3565.0, "completions/max_terminated_length": 1253.0, "completions/mean_length": 522.640625, "completions/mean_terminated_length": 433.25, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 4.758620689655173, "frac_reward_zero_std": 0.5625, "grad_norm": 1158.116969084908, "kl": 202.0533447265625, "learning_rate": 4.878337738549785e-06, "loss": 2.2702, "num_tokens": 14291395.0, "reward": 0.1875, "reward_std": 0.030323900282382965, "rewards/code_format_reward/mean": 0.9296875, "rewards/code_format_reward/std": 0.2566775679588318, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.22826264798641205, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 1948.0, "completions/max_terminated_length": 1948.0, "completions/mean_length": 479.265625, "completions/mean_terminated_length": 479.265625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 4.827586206896552, "frac_reward_zero_std": 0.8125, "grad_norm": 0.8904557320426325, "kl": 0.16650390625, "learning_rate": 4.873565521409082e-06, "loss": 0.0518, "num_tokens": 14481549.0, "reward": 0.1953125, "reward_std": 0.0132582513615489, "rewards/code_format_reward/mean": 0.9765625, "rewards/code_format_reward/std": 0.15188287198543549, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15188287198543549, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 3196.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 620.3203125, "completions/mean_terminated_length": 522.8416748046875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 4.896551724137931, "frac_reward_zero_std": 0.375, "grad_norm": 4.105422981049957, "kl": 3.436767578125, "learning_rate": 4.868704203712173e-06, "loss": 0.3129, "num_tokens": 14690878.0, "reward": 0.17500001192092896, "reward_std": 0.049049004912376404, "rewards/code_format_reward/mean": 0.8671875, "rewards/code_format_reward/std": 0.3407054841518402, "rewards/format_reward/mean": 0.8828125, "rewards/format_reward/std": 0.322907418012619, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.28125, "completions/max_length": 4021.0, "completions/max_terminated_length": 3032.0, "completions/mean_length": 1178.859375, "completions/mean_terminated_length": 621.0952758789062, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 4.9655172413793105, "frac_reward_zero_std": 0.0625, "grad_norm": 213.86405401091332, "kl": 98.63232421875, "learning_rate": 4.86375398943021e-06, "loss": 1.5566, "num_tokens": 14972844.0, "reward": 0.14218750596046448, "reward_std": 0.08679269254207611, "rewards/code_format_reward/mean": 0.7109375, "rewards/code_format_reward/std": 0.45510825514793396, "rewards/format_reward/mean": 0.7109375, "rewards/format_reward/std": 0.45510825514793396, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.5, "completions/max_length": 3781.0, "completions/max_terminated_length": 2397.0, "completions/mean_length": 1726.734375, "completions/mean_terminated_length": 699.8500366210938, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 5.068965517241379, "frac_reward_zero_std": 0.0, "grad_norm": 172.5869365555856, "kl": 94.25, "learning_rate": 4.858715086264274e-06, "loss": 1.477, "num_tokens": 15324938.0, "reward": 0.09375, "reward_std": 0.10150270909070969, "rewards/code_format_reward/mean": 0.4609375, "rewards/code_format_reward/std": 0.5004304051399231, "rewards/format_reward/mean": 0.4765625, "rewards/format_reward/std": 0.5014128684997559, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3436.0, "completions/mean_length": 2458.3125, "completions/mean_terminated_length": 1035.1429443359375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 5.137931034482759, "frac_reward_zero_std": 0.0625, "grad_norm": 57.25662177745869, "kl": 40.15625, "learning_rate": 4.853587705636646e-06, "loss": 0.8415, "num_tokens": 15770234.0, "reward": 0.05624999850988388, "reward_std": 0.0847882479429245, "rewards/code_format_reward/mean": 0.2734375, "rewards/code_format_reward/std": 0.447474867105484, "rewards/format_reward/mean": 0.2890625, "rewards/format_reward/std": 0.45510825514793396, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.34375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3138.0, "completions/mean_length": 2853.59375, "completions/mean_terminated_length": 974.1162719726562, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 5.206896551724138, "frac_reward_zero_std": 0.25, "grad_norm": 13.862558890178295, "kl": 12.4921875, "learning_rate": 4.84837206268195e-06, "loss": 0.3769, "num_tokens": 16265422.0, "reward": 0.03203125298023224, "reward_std": 0.06078151986002922, "rewards/code_format_reward/mean": 0.15625, "rewards/code_format_reward/std": 0.3645188808441162, "rewards/format_reward/mean": 0.1640625, "rewards/format_reward/std": 0.371787428855896, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 4037.0, "completions/max_terminated_length": 1657.0, "completions/mean_length": 3360.890625, "completions/mean_terminated_length": 807.9199829101562, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 5.275862068965517, "frac_reward_zero_std": 0.625, "grad_norm": 1.1593085629950635, "kl": 1.6650390625, "learning_rate": 4.8430683762381195e-06, "loss": 0.1107, "num_tokens": 16826688.0, "reward": 0.012500000186264515, "reward_std": 0.02925041876733303, "rewards/code_format_reward/mean": 0.0625, "rewards/code_format_reward/std": 0.24301259219646454, "rewards/format_reward/mean": 0.0625, "rewards/format_reward/std": 0.24301259219646454, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2445.0, "completions/mean_length": 3014.8125, "completions/mean_terminated_length": 767.2142944335938, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 5.344827586206897, "frac_reward_zero_std": 0.375, "grad_norm": 3.9131053387344386, "kl": 0.572265625, "learning_rate": 4.837676868837213e-06, "loss": 0.1534, "num_tokens": 17342752.0, "reward": 0.02734375, "reward_std": 0.04960141330957413, "rewards/code_format_reward/mean": 0.1328125, "rewards/code_format_reward/std": 0.3407054841518402, "rewards/format_reward/mean": 0.140625, "rewards/format_reward/std": 0.3490002751350403, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2068.0, "completions/mean_length": 3525.0703125, "completions/mean_terminated_length": 1183.3157958984375, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 5.413793103448276, "frac_reward_zero_std": 0.8125, "grad_norm": 2.7497217743483526, "kl": 0.58984375, "learning_rate": 4.832197766696085e-06, "loss": 0.0596, "num_tokens": 17923705.0, "reward": 0.008593750186264515, "reward_std": 0.01593157649040222, "rewards/code_format_reward/mean": 0.0390625, "rewards/code_format_reward/std": 0.194504976272583, "rewards/format_reward/mean": 0.046875, "rewards/format_reward/std": 0.21220162510871887, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 3974.0, "completions/max_terminated_length": 3009.0, "completions/mean_length": 3445.8515625, "completions/mean_terminated_length": 1025.0555419921875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 5.482758620689655, "frac_reward_zero_std": 0.5625, "grad_norm": 10.680361044962455, "kl": 0.8134765625, "learning_rate": 4.826631299706887e-06, "loss": 0.1354, "num_tokens": 18495846.0, "reward": 0.01406249962747097, "reward_std": 0.0329858660697937, "rewards/code_format_reward/mean": 0.0703125, "rewards/code_format_reward/std": 0.2566775679588318, "rewards/format_reward/mean": 0.0703125, "rewards/format_reward/std": 0.2566775679588318, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 3911.0, "completions/max_terminated_length": 2769.0, "completions/mean_length": 3161.1484375, "completions/mean_terminated_length": 1222.09521484375, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 5.551724137931035, "frac_reward_zero_std": 0.6875, "grad_norm": 5.863415608971705, "kl": 1.5400390625, "learning_rate": 4.820977701427424e-06, "loss": 0.0927, "num_tokens": 19031545.0, "reward": 0.008593750186264515, "reward_std": 0.022327817976474762, "rewards/code_format_reward/mean": 0.0390625, "rewards/code_format_reward/std": 0.194504976272583, "rewards/format_reward/mean": 0.046875, "rewards/format_reward/std": 0.21220162510871887, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 3779.0, "completions/max_terminated_length": 3281.0, "completions/mean_length": 3447.84375, "completions/mean_terminated_length": 1468.666748046875, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 5.620689655172414, "frac_reward_zero_std": 0.6875, "grad_norm": 44.120715406394496, "kl": 13.8203125, "learning_rate": 4.81523720907136e-06, "loss": 0.1843, "num_tokens": 19603941.0, "reward": 0.0078125, "reward_std": 0.019044626504182816, "rewards/code_format_reward/mean": 0.03125, "rewards/code_format_reward/std": 0.1746762990951538, "rewards/format_reward/mean": 0.046875, "rewards/format_reward/std": 0.21220162510871887, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 3712.0, "completions/max_terminated_length": 1597.0, "completions/mean_length": 3321.03125, "completions/mean_terminated_length": 975.4285888671875, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 5.689655172413794, "frac_reward_zero_std": 0.8125, "grad_norm": 60.204694884732945, "kl": 21.171875, "learning_rate": 4.809410063498254e-06, "loss": 0.2614, "num_tokens": 20160105.0, "reward": 0.004687500186264515, "reward_std": 0.0132582513615489, "rewards/code_format_reward/mean": 0.0234375, "rewards/code_format_reward/std": 0.15188287198543549, "rewards/format_reward/mean": 0.0234375, "rewards/format_reward/std": 0.15188287198543549, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 4090.0, "completions/max_terminated_length": 3121.0, "completions/mean_length": 3610.2421875, "completions/mean_terminated_length": 1546.3333740234375, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 5.758620689655173, "frac_reward_zero_std": 0.9375, "grad_norm": 20.14460667732567, "kl": 7.09375, "learning_rate": 4.8034965092034656e-06, "loss": 0.0843, "num_tokens": 20753288.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2531.0, "completions/mean_length": 3683.359375, "completions/mean_terminated_length": 1158.8333740234375, "completions/min_length": 527.0, "completions/min_terminated_length": 527.0, "epoch": 5.827586206896552, "frac_reward_zero_std": 0.75, "grad_norm": 14.12037911937169, "kl": 5.85546875, "learning_rate": 4.797496794307889e-06, "loss": 0.1153, "num_tokens": 21354662.0, "reward": 0.0078125, "reward_std": 0.019044626504182816, "rewards/code_format_reward/mean": 0.0390625, "rewards/code_format_reward/std": 0.194504976272583, "rewards/format_reward/mean": 0.0390625, "rewards/format_reward/std": 0.194504976272583, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 4029.0, "completions/max_terminated_length": 2420.0, "completions/mean_length": 3650.7734375, "completions/mean_terminated_length": 1109.300048828125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 5.896551724137931, "frac_reward_zero_std": 0.8125, "grad_norm": 0.9898362940499612, "kl": 0.728515625, "learning_rate": 4.791411170547545e-06, "loss": 0.0412, "num_tokens": 21953033.0, "reward": 0.007031249813735485, "reward_std": 0.013782460242509842, "rewards/code_format_reward/mean": 0.03125, "rewards/code_format_reward/std": 0.1746762990951538, "rewards/format_reward/mean": 0.0390625, "rewards/format_reward/std": 0.194504976272583, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2635.0, "completions/mean_length": 3448.1015625, "completions/mean_terminated_length": 1277.0, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 5.9655172413793105, "frac_reward_zero_std": 0.75, "grad_norm": 2.3365281655016754, "kl": 0.44677734375, "learning_rate": 4.785239893263017e-06, "loss": 0.0226, "num_tokens": 22524294.0, "reward": 0.0062500000931322575, "reward_std": 0.0176776684820652, "rewards/code_format_reward/mean": 0.03125, "rewards/code_format_reward/std": 0.1746762990951538, "rewards/format_reward/mean": 0.03125, "rewards/format_reward/std": 0.1746762990951538, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2837.0, "completions/mean_length": 3402.4609375, "completions/mean_terminated_length": 1330.84619140625, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 6.068965517241379, "frac_reward_zero_std": 0.9375, "grad_norm": 0.9994644728808741, "kl": 0.41796875, "learning_rate": 4.778983221388742e-06, "loss": 0.0222, "num_tokens": 23089737.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 4078.0, "completions/max_terminated_length": 3937.0, "completions/mean_length": 3424.5546875, "completions/mean_terminated_length": 1120.1875, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 6.137931034482759, "frac_reward_zero_std": 0.9375, "grad_norm": 0.7994480278597837, "kl": 0.7373046875, "learning_rate": 4.77264141744214e-06, "loss": 0.0257, "num_tokens": 23659152.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 4073.0, "completions/max_terminated_length": 3255.0, "completions/mean_length": 3615.6796875, "completions/mean_terminated_length": 1483.8333740234375, "completions/min_length": 640.0, "completions/min_terminated_length": 640.0, "epoch": 6.206896551724138, "frac_reward_zero_std": 1.0, "grad_norm": 0.2463644130168377, "kl": 0.3974609375, "learning_rate": 4.766214747512603e-06, "loss": 0.004, "num_tokens": 24253031.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3220.0, "completions/mean_length": 3160.4765625, "completions/mean_terminated_length": 1251.0526123046875, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 6.275862068965517, "frac_reward_zero_std": 0.8125, "grad_norm": 0.7416070034458621, "kl": 0.56201171875, "learning_rate": 4.759703481250331e-06, "loss": 0.0459, "num_tokens": 24788412.0, "reward": 0.004687500186264515, "reward_std": 0.0132582513615489, "rewards/code_format_reward/mean": 0.0234375, "rewards/code_format_reward/std": 0.15188287198543549, "rewards/format_reward/mean": 0.0234375, "rewards/format_reward/std": 0.15188287198543549, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 4077.0, "completions/max_terminated_length": 3619.0, "completions/mean_length": 3303.9453125, "completions/mean_terminated_length": 1231.5789794921875, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 6.344827586206897, "frac_reward_zero_std": 0.875, "grad_norm": 0.4067648831787075, "kl": 1.296875, "learning_rate": 4.753107891855015e-06, "loss": 0.0523, "num_tokens": 25342389.0, "reward": 0.004687500186264515, "reward_std": 0.010205792263150215, "rewards/code_format_reward/mean": 0.0234375, "rewards/code_format_reward/std": 0.15188287198543549, "rewards/format_reward/mean": 0.0234375, "rewards/format_reward/std": 0.15188287198543549, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.15625, "completions/max_length": 4071.0, "completions/max_terminated_length": 3598.0, "completions/mean_length": 3240.4765625, "completions/mean_terminated_length": 1464.7838134765625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 6.413793103448276, "frac_reward_zero_std": 0.9375, "grad_norm": 1.5773830436721306, "kl": 0.8857421875, "learning_rate": 4.746428256064375e-06, "loss": 0.0271, "num_tokens": 25888242.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3607.0, "completions/mean_length": 3247.8046875, "completions/mean_terminated_length": 1320.7059326171875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 6.482758620689655, "frac_reward_zero_std": 1.0, "grad_norm": 0.47019129510941726, "kl": 0.45703125, "learning_rate": 4.7396648541425534e-06, "loss": 0.0046, "num_tokens": 26433865.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2806.0, "completions/mean_length": 3004.3828125, "completions/mean_terminated_length": 1139.6591796875, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 6.551724137931035, "frac_reward_zero_std": 1.0, "grad_norm": 0.12969886319345608, "kl": 0.681640625, "learning_rate": 4.732817969868348e-06, "loss": 0.0068, "num_tokens": 26948594.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 3961.0, "completions/max_terminated_length": 2790.0, "completions/mean_length": 2860.5859375, "completions/mean_terminated_length": 1184.319091796875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 6.620689655172414, "frac_reward_zero_std": 1.0, "grad_norm": 0.572939086350903, "kl": 0.71875, "learning_rate": 4.7258878905233095e-06, "loss": 0.0072, "num_tokens": 27445821.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4015.0, "completions/mean_length": 2526.8984375, "completions/mean_terminated_length": 1347.7257080078125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 6.689655172413794, "frac_reward_zero_std": 0.9375, "grad_norm": 0.6818093659965091, "kl": 0.4169921875, "learning_rate": 4.718874906879688e-06, "loss": 0.023, "num_tokens": 27899240.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3497.0, "completions/mean_length": 1971.0546875, "completions/mean_terminated_length": 1234.521728515625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 6.758620689655173, "frac_reward_zero_std": 0.875, "grad_norm": 0.5461191185776753, "kl": 0.300048828125, "learning_rate": 4.711779313188231e-06, "loss": 0.0228, "num_tokens": 28282167.0, "reward": 0.0031250000465661287, "reward_std": 0.0088388342410326, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 3684.0, "completions/max_terminated_length": 3134.0, "completions/mean_length": 2307.0703125, "completions/mean_terminated_length": 1120.921875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 6.827586206896552, "frac_reward_zero_std": 0.875, "grad_norm": 0.8120148624300454, "kl": 0.53271484375, "learning_rate": 4.70460140716584e-06, "loss": 0.0365, "num_tokens": 28708544.0, "reward": 0.0031250000465661287, "reward_std": 0.0088388342410326, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.90625, "completions/max_length": 4037.0, "completions/max_terminated_length": 3661.0, "completions/mean_length": 2056.6484375, "completions/mean_terminated_length": 1348.0107421875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 6.896551724137931, "frac_reward_zero_std": 0.75, "grad_norm": 1.1307146547923295, "kl": 0.3037109375, "learning_rate": 4.697341489983076e-06, "loss": 0.0048, "num_tokens": 29102867.0, "reward": 0.0054687499068677425, "reward_std": 0.015467959456145763, "rewards/code_format_reward/mean": 0.0234375, "rewards/code_format_reward/std": 0.15188287198543549, "rewards/format_reward/mean": 0.03125, "rewards/format_reward/std": 0.1746762990951538, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.59375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3552.0, "completions/mean_length": 2191.359375, "completions/mean_terminated_length": 1324.2047119140625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 6.9655172413793105, "frac_reward_zero_std": 0.9375, "grad_norm": 0.28935824632623103, "kl": 0.521484375, "learning_rate": 4.6899998662515215e-06, "loss": 0.0193, "num_tokens": 29513265.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.875, "completions/max_length": 3855.0, "completions/max_terminated_length": 2728.0, "completions/mean_length": 1871.7890625, "completions/mean_terminated_length": 1163.478271484375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 7.068965517241379, "frac_reward_zero_std": 1.0, "grad_norm": 3.2101373691948605, "kl": 1.4599609375, "learning_rate": 4.682576844011007e-06, "loss": 0.0146, "num_tokens": 29883926.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.0, "completions/max_length": 4096.0, "completions/max_terminated_length": 3237.0, "completions/mean_length": 1758.7578125, "completions/mean_terminated_length": 1119.5208740234375, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 7.137931034482759, "frac_reward_zero_std": 0.9375, "grad_norm": 0.913814548832671, "kl": 0.5185546875, "learning_rate": 4.675072734716678e-06, "loss": 0.0172, "num_tokens": 30238047.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.21875, "completions/max_length": 4077.0, "completions/max_terminated_length": 3417.0, "completions/mean_length": 1792.265625, "completions/mean_terminated_length": 1326.1748046875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.206896551724138, "frac_reward_zero_std": 0.9375, "grad_norm": 5.294271080104991, "kl": 2.69140625, "learning_rate": 4.667487853225931e-06, "loss": 0.0362, "num_tokens": 30598529.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.34375, "completions/max_length": 3974.0, "completions/max_terminated_length": 3415.0, "completions/mean_length": 1649.203125, "completions/mean_terminated_length": 1282.2523193359375, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 7.275862068965517, "frac_reward_zero_std": 0.8125, "grad_norm": 1.3488144201379413, "kl": 0.72216796875, "learning_rate": 4.659822517785203e-06, "loss": 0.0245, "num_tokens": 30940699.0, "reward": 0.00390625, "reward_std": 0.011048542335629463, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.0234375, "rewards/format_reward/std": 0.15188287198543549, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.3125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3750.0, "completions/mean_length": 1611.8125, "completions/mean_terminated_length": 1136.2735595703125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 7.344827586206897, "frac_reward_zero_std": 0.875, "grad_norm": 0.36195644471317184, "kl": 0.6396484375, "learning_rate": 4.6520770500166165e-06, "loss": -0.0007, "num_tokens": 31276915.0, "reward": 0.0023437500931322575, "reward_std": 0.00662912568077445, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2910.0, "completions/mean_length": 1516.9765625, "completions/mean_terminated_length": 1113.4537353515625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 7.413793103448276, "frac_reward_zero_std": 0.9375, "grad_norm": 2.006734292231092, "kl": 0.8564453125, "learning_rate": 4.644251774904487e-06, "loss": 0.014, "num_tokens": 31601720.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "completions/max_length": 4029.0, "completions/max_terminated_length": 4004.0, "completions/mean_length": 1549.9921875, "completions/mean_terminated_length": 1126.2037353515625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 7.482758620689655, "frac_reward_zero_std": 0.9375, "grad_norm": 3.402705549923824, "kl": 0.48681640625, "learning_rate": 4.636347020781684e-06, "loss": -0.0058, "num_tokens": 31930095.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.34375, "completions/max_length": 3865.0, "completions/max_terminated_length": 3740.0, "completions/mean_length": 1463.078125, "completions/mean_terminated_length": 1016.1494750976562, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 7.551724137931035, "frac_reward_zero_std": 1.0, "grad_norm": 0.04885965524357822, "kl": 0.261962890625, "learning_rate": 4.6283631193158605e-06, "loss": 0.0026, "num_tokens": 32248441.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.59375, "completions/max_length": 4071.0, "completions/max_terminated_length": 3005.0, "completions/mean_length": 1335.765625, "completions/mean_terminated_length": 1070.67822265625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 7.620689655172414, "frac_reward_zero_std": 0.9375, "grad_norm": 0.10929604136729178, "kl": 0.15478515625, "learning_rate": 4.620300405495532e-06, "loss": 0.0117, "num_tokens": 32550491.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.59375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3570.0, "completions/mean_length": 1237.7109375, "completions/mean_terminated_length": 1018.67822265625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 7.689655172413794, "frac_reward_zero_std": 1.0, "grad_norm": 0.034299438449552, "kl": 0.1138916015625, "learning_rate": 4.612159217616022e-06, "loss": 0.0011, "num_tokens": 32838846.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.3125, "completions/max_length": 3574.0, "completions/max_terminated_length": 3104.0, "completions/mean_length": 1457.6875, "completions/mean_terminated_length": 1041.4716796875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 7.758620689655173, "frac_reward_zero_std": 0.75, "grad_norm": 0.3699601876805936, "kl": 0.6552734375, "learning_rate": 4.603939897265268e-06, "loss": 0.0401, "num_tokens": 33156502.0, "reward": 0.0062500000931322575, "reward_std": 0.0176776684820652, "rewards/code_format_reward/mean": 0.03125, "rewards/code_format_reward/std": 0.1746762990951538, "rewards/format_reward/mean": 0.03125, "rewards/format_reward/std": 0.1746762990951538, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 3954.0, "completions/max_terminated_length": 3755.0, "completions/mean_length": 1153.484375, "completions/mean_terminated_length": 949.0689697265625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 7.827586206896552, "frac_reward_zero_std": 1.0, "grad_norm": 0.08969434421826968, "kl": 0.30517578125, "learning_rate": 4.595642789309492e-06, "loss": 0.0031, "num_tokens": 33435220.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 3844.0, "completions/max_terminated_length": 3298.0, "completions/mean_length": 1188.6953125, "completions/mean_terminated_length": 964.1206665039062, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 7.896551724137931, "frac_reward_zero_std": 0.9375, "grad_norm": 0.25288165833131593, "kl": 0.2178955078125, "learning_rate": 4.587268241878724e-06, "loss": 0.0172, "num_tokens": 33718213.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 4090.0, "completions/max_terminated_length": 2833.0, "completions/mean_length": 1103.6796875, "completions/mean_terminated_length": 940.9827270507812, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.9655172413793105, "frac_reward_zero_std": 0.625, "grad_norm": 1.3612955105647937, "kl": 0.3759765625, "learning_rate": 4.578816606352205e-06, "loss": 0.0199, "num_tokens": 33990556.0, "reward": 0.0078125, "reward_std": 0.022097084671258926, "rewards/code_format_reward/mean": 0.03125, "rewards/code_format_reward/std": 0.1746762990951538, "rewards/format_reward/mean": 0.046875, "rewards/format_reward/std": 0.21220162510871887, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 3685.0, "completions/max_terminated_length": 3685.0, "completions/mean_length": 1020.4765625, "completions/mean_terminated_length": 969.6209106445312, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.068965517241379, "frac_reward_zero_std": 0.9375, "grad_norm": 0.17683694538045752, "kl": 0.17919921875, "learning_rate": 4.570288237343632e-06, "loss": 0.003, "num_tokens": 34252249.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 3087.0, "completions/max_terminated_length": 2594.0, "completions/mean_length": 1116.2421875, "completions/mean_terminated_length": 948.6724243164062, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 8.137931034482758, "frac_reward_zero_std": 0.75, "grad_norm": 0.3409942466350483, "kl": 0.39453125, "learning_rate": 4.561683492686289e-06, "loss": -0.0195, "num_tokens": 34525056.0, "reward": 0.0062500000931322575, "reward_std": 0.015698691830039024, "rewards/code_format_reward/mean": 0.0234375, "rewards/code_format_reward/std": 0.15188287198543549, "rewards/format_reward/mean": 0.0390625, "rewards/format_reward/std": 0.194504976272583, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 3675.0, "completions/max_terminated_length": 2709.0, "completions/mean_length": 1004.4140625, "completions/mean_terminated_length": 899.4417114257812, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 8.206896551724139, "frac_reward_zero_std": 0.875, "grad_norm": 0.6946015343722765, "kl": 0.14532470703125, "learning_rate": 4.5530027334180285e-06, "loss": 0.01, "num_tokens": 34783789.0, "reward": 0.0031250000465661287, "reward_std": 0.0088388342410326, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "completions/max_length": 3457.0, "completions/max_terminated_length": 2343.0, "completions/mean_length": 985.4453125, "completions/mean_terminated_length": 855.6693725585938, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 8.275862068965518, "frac_reward_zero_std": 0.875, "grad_norm": 1.0595839352292757, "kl": 0.18084716796875, "learning_rate": 4.544246323766122e-06, "loss": 0.0144, "num_tokens": 35040998.0, "reward": 0.0031250000465661287, "reward_std": 0.0088388342410326, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.28125, "completions/max_length": 3627.0, "completions/max_terminated_length": 3515.0, "completions/mean_length": 1370.171875, "completions/mean_terminated_length": 955.2857666015625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 8.344827586206897, "frac_reward_zero_std": 0.9375, "grad_norm": 4.254526071210821, "kl": 0.466796875, "learning_rate": 4.535414631131983e-06, "loss": -0.0032, "num_tokens": 35347452.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "completions/max_length": 3075.0, "completions/max_terminated_length": 3075.0, "completions/mean_length": 1082.421875, "completions/mean_terminated_length": 972.6387329101562, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 8.413793103448276, "frac_reward_zero_std": 0.8125, "grad_norm": 0.3143490105876032, "kl": 0.19195556640625, "learning_rate": 4.526508026075746e-06, "loss": -0.0017, "num_tokens": 35617074.0, "reward": 0.00390625, "reward_std": 0.011048542335629463, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.0234375, "rewards/format_reward/std": 0.15188287198543549, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.59375, "completions/max_length": 3420.0, "completions/max_terminated_length": 3024.0, "completions/mean_length": 1132.328125, "completions/mean_terminated_length": 905.4086303710938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 8.482758620689655, "frac_reward_zero_std": 1.0, "grad_norm": 0.1851143243039512, "kl": 0.273681640625, "learning_rate": 4.517526882300721e-06, "loss": 0.0027, "num_tokens": 35891916.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 3865.0, "completions/max_terminated_length": 2632.0, "completions/mean_length": 1070.8515625, "completions/mean_terminated_length": 909.5000610351562, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 8.551724137931034, "frac_reward_zero_std": 1.0, "grad_norm": 0.0980935432999941, "kl": 0.21307373046875, "learning_rate": 4.508471576637713e-06, "loss": 0.0021, "num_tokens": 36159617.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 3966.0, "completions/max_terminated_length": 3966.0, "completions/mean_length": 1109.5859375, "completions/mean_terminated_length": 994.86669921875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 8.620689655172415, "frac_reward_zero_std": 0.875, "grad_norm": 0.23365837176809, "kl": 0.13690185546875, "learning_rate": 4.499342489029211e-06, "loss": 0.0169, "num_tokens": 36432716.0, "reward": 0.0031250000465661287, "reward_std": 0.0088388342410326, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "completions/max_length": 3889.0, "completions/max_terminated_length": 3384.0, "completions/mean_length": 1092.75, "completions/mean_terminated_length": 981.0812377929688, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 8.689655172413794, "frac_reward_zero_std": 0.875, "grad_norm": 0.16032757011050167, "kl": 0.10394287109375, "learning_rate": 4.490140002513449e-06, "loss": 0.0146, "num_tokens": 36701396.0, "reward": 0.0031250000465661287, "reward_std": 0.0088388342410326, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "completions/max_length": 2425.0, "completions/max_terminated_length": 2425.0, "completions/mean_length": 1034.046875, "completions/mean_terminated_length": 973.3114013671875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 8.758620689655173, "frac_reward_zero_std": 0.9375, "grad_norm": 0.10031944462051583, "kl": 0.1348876953125, "learning_rate": 4.48086450320833e-06, "loss": 0.0099, "num_tokens": 36964594.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "completions/max_length": 3574.0, "completions/max_terminated_length": 3378.0, "completions/mean_length": 1042.8125, "completions/mean_terminated_length": 974.8524169921875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 8.827586206896552, "frac_reward_zero_std": 0.875, "grad_norm": 0.17832977194904587, "kl": 0.1573486328125, "learning_rate": 4.4715163802952266e-06, "loss": -0.0023, "num_tokens": 37229146.0, "reward": 0.0031250000465661287, "reward_std": 0.0088388342410326, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 3886.0, "completions/max_terminated_length": 3886.0, "completions/mean_length": 1122.5859375, "completions/mean_terminated_length": 1023.791748046875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 8.89655172413793, "frac_reward_zero_std": 0.9375, "grad_norm": 0.159287714280421, "kl": 0.15380859375, "learning_rate": 4.462096026002655e-06, "loss": 0.0082, "num_tokens": 37503909.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "completions/max_length": 3549.0, "completions/max_terminated_length": 3506.0, "completions/mean_length": 1073.1015625, "completions/mean_terminated_length": 993.8524169921875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 8.96551724137931, "frac_reward_zero_std": 0.9375, "grad_norm": 0.1435096426454022, "kl": 0.1278076171875, "learning_rate": 4.4526038355898144e-06, "loss": -0.0049, "num_tokens": 37772338.0, "reward": 0.0023437500931322575, "reward_std": 0.004650149028748274, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "completions/max_length": 2773.0, "completions/max_terminated_length": 2773.0, "completions/mean_length": 1065.578125, "completions/mean_terminated_length": 992.0743408203125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 9.068965517241379, "frac_reward_zero_std": 0.9375, "grad_norm": 0.157817499585136, "kl": 0.17333984375, "learning_rate": 4.4430402073300035e-06, "loss": -0.0093, "num_tokens": 38037804.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "completions/max_length": 3758.0, "completions/max_terminated_length": 3386.0, "completions/mean_length": 1141.515625, "completions/mean_terminated_length": 1105.7381591796875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 9.137931034482758, "frac_reward_zero_std": 0.875, "grad_norm": 0.18359050511600664, "kl": 0.10345458984375, "learning_rate": 4.433405542493909e-06, "loss": 0.0256, "num_tokens": 38314990.0, "reward": 0.0031250000465661287, "reward_std": 0.0088388342410326, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.59375, "completions/max_length": 3627.0, "completions/max_terminated_length": 3082.0, "completions/mean_length": 1154.953125, "completions/mean_terminated_length": 992.8782348632812, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 9.206896551724139, "frac_reward_zero_std": 0.9375, "grad_norm": 0.11399074662146205, "kl": 0.1798095703125, "learning_rate": 4.4237002453327734e-06, "loss": 0.0039, "num_tokens": 38593896.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "completions/max_length": 3829.0, "completions/max_terminated_length": 3127.0, "completions/mean_length": 1100.09375, "completions/mean_terminated_length": 971.3933715820312, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 9.275862068965518, "frac_reward_zero_std": 0.875, "grad_norm": 0.1709097015039483, "kl": 0.15692138671875, "learning_rate": 4.4139247230614245e-06, "loss": -0.0009, "num_tokens": 38865780.0, "reward": 0.0031250000465661287, "reward_std": 0.0088388342410326, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "completions/max_length": 3712.0, "completions/max_terminated_length": 3399.0, "completions/mean_length": 1115.109375, "completions/mean_terminated_length": 1002.6966552734375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 9.344827586206897, "frac_reward_zero_std": 0.875, "grad_norm": 0.2622407919256445, "kl": 0.145751953125, "learning_rate": 4.404079385841201e-06, "loss": 0.0066, "num_tokens": 39138442.0, "reward": 0.0031250000465661287, "reward_std": 0.0088388342410326, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 3245.0, "completions/max_terminated_length": 3245.0, "completions/mean_length": 1110.4375, "completions/mean_terminated_length": 1051.9434814453125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 9.413793103448276, "frac_reward_zero_std": 0.9375, "grad_norm": 0.13173411122101364, "kl": 0.091552734375, "learning_rate": 4.394164646762734e-06, "loss": 0.0054, "num_tokens": 39411418.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5625, "completions/max_length": 3933.0, "completions/max_terminated_length": 3087.0, "completions/mean_length": 1342.8671875, "completions/mean_terminated_length": 1057.9385986328125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 9.482758620689655, "frac_reward_zero_std": 1.0, "grad_norm": 0.08021227471324485, "kl": 0.158203125, "learning_rate": 4.384180921828618e-06, "loss": 0.0016, "num_tokens": 39714377.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.40625, "completions/max_length": 3816.0, "completions/max_terminated_length": 2590.0, "completions/mean_length": 1384.328125, "completions/mean_terminated_length": 1038.9815673828125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 9.551724137931034, "frac_reward_zero_std": 0.875, "grad_norm": 0.41718872220840225, "kl": 0.433349609375, "learning_rate": 4.374128629935955e-06, "loss": 0.0163, "num_tokens": 40022643.0, "reward": 0.0031250000465661287, "reward_std": 0.0088388342410326, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "completions/max_length": 3622.0, "completions/max_terminated_length": 3622.0, "completions/mean_length": 1137.9921875, "completions/mean_terminated_length": 1055.6990966796875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 9.620689655172415, "frac_reward_zero_std": 0.9375, "grad_norm": 0.16603761477760853, "kl": 0.1663818359375, "learning_rate": 4.364008192858781e-06, "loss": 0.0016, "num_tokens": 40298210.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "completions/max_length": 3639.0, "completions/max_terminated_length": 2873.0, "completions/mean_length": 1100.7578125, "completions/mean_terminated_length": 949.0588989257812, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 9.689655172413794, "frac_reward_zero_std": 1.0, "grad_norm": 0.12834721614969635, "kl": 0.16253662109375, "learning_rate": 4.353820035230366e-06, "loss": 0.0016, "num_tokens": 40570179.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "completions/max_length": 3675.0, "completions/max_terminated_length": 2561.0, "completions/mean_length": 1162.109375, "completions/mean_terminated_length": 982.559326171875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 9.758620689655173, "frac_reward_zero_std": 0.875, "grad_norm": 0.42904089811800383, "kl": 0.1427001953125, "learning_rate": 4.3435645845254e-06, "loss": 0.0204, "num_tokens": 40850001.0, "reward": 0.0023437500931322575, "reward_std": 0.00662912568077445, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "completions/max_length": 2396.0, "completions/max_terminated_length": 2396.0, "completions/mean_length": 1024.2890625, "completions/mean_terminated_length": 969.9425659179688, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 9.827586206896552, "frac_reward_zero_std": 1.0, "grad_norm": 0.04803927651974637, "kl": 0.13232421875, "learning_rate": 4.333242271042054e-06, "loss": 0.0013, "num_tokens": 41111014.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "completions/max_length": 3613.0, "completions/max_terminated_length": 2907.0, "completions/mean_length": 1123.8203125, "completions/mean_terminated_length": 1003.9586181640625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 9.89655172413793, "frac_reward_zero_std": 0.75, "grad_norm": 1.10662989148877, "kl": 0.20062255859375, "learning_rate": 4.32285352788393e-06, "loss": 0.0274, "num_tokens": 41385935.0, "reward": 0.0062500000931322575, "reward_std": 0.0176776684820652, "rewards/code_format_reward/mean": 0.03125, "rewards/code_format_reward/std": 0.1746762990951538, "rewards/format_reward/mean": 0.03125, "rewards/format_reward/std": 0.1746762990951538, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 3703.0, "completions/max_terminated_length": 2656.0, "completions/mean_length": 1096.5625, "completions/mean_terminated_length": 962.5000610351562, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 9.96551724137931, "frac_reward_zero_std": 0.875, "grad_norm": 0.38374628240227976, "kl": 0.775390625, "learning_rate": 4.312398790941882e-06, "loss": 0.023, "num_tokens": 41656927.0, "reward": 0.0031250000465661287, "reward_std": 0.0088388342410326, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "completions/max_length": 3610.0, "completions/max_terminated_length": 2677.0, "completions/mean_length": 1232.203125, "completions/mean_terminated_length": 1068.411865234375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 10.068965517241379, "frac_reward_zero_std": 0.875, "grad_norm": 1.1052339109689304, "kl": 0.47564697265625, "learning_rate": 4.301878498875735e-06, "loss": 0.0308, "num_tokens": 41945721.0, "reward": 0.0031250000465661287, "reward_std": 0.0088388342410326, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.46875, "completions/max_length": 3781.0, "completions/max_terminated_length": 2872.0, "completions/mean_length": 1234.3515625, "completions/mean_terminated_length": 1016.5765991210938, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 10.137931034482758, "frac_reward_zero_std": 0.9375, "grad_norm": 35.928120765270634, "kl": 14.375, "learning_rate": 4.291293093095873e-06, "loss": 0.1488, "num_tokens": 42234790.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 3320.0, "completions/max_terminated_length": 3320.0, "completions/mean_length": 1105.1953125, "completions/mean_terminated_length": 1062.725830078125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 10.206896551724139, "frac_reward_zero_std": 1.0, "grad_norm": 9.4112262946098, "kl": 3.7413330078125, "learning_rate": 4.280643017744723e-06, "loss": 0.0374, "num_tokens": 42507327.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "completions/max_length": 3702.0, "completions/max_terminated_length": 3338.0, "completions/mean_length": 996.4453125, "completions/mean_terminated_length": 975.1417236328125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 10.275862068965518, "frac_reward_zero_std": 0.8125, "grad_norm": 0.2040642472598806, "kl": 0.04669189453125, "learning_rate": 4.269928719678117e-06, "loss": 0.0234, "num_tokens": 42765504.0, "reward": 0.004687500186264515, "reward_std": 0.0132582513615489, "rewards/code_format_reward/mean": 0.0234375, "rewards/code_format_reward/std": 0.15188287198543549, "rewards/format_reward/mean": 0.0234375, "rewards/format_reward/std": 0.15188287198543549, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "completions/max_length": 3703.0, "completions/max_terminated_length": 2953.0, "completions/mean_length": 1151.609375, "completions/mean_terminated_length": 1033.728759765625, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 10.344827586206897, "frac_reward_zero_std": 0.875, "grad_norm": 14.424156189088968, "kl": 7.0703125, "learning_rate": 4.2591506484465426e-06, "loss": 0.0775, "num_tokens": 43043982.0, "reward": 0.0031250000465661287, "reward_std": 0.0088388342410326, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "completions/max_length": 3827.0, "completions/max_terminated_length": 3827.0, "completions/mean_length": 1085.0390625, "completions/mean_terminated_length": 1022.5440673828125, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 10.413793103448276, "frac_reward_zero_std": 0.875, "grad_norm": 1.597484250545503, "kl": 1.0592041015625, "learning_rate": 4.248309256276283e-06, "loss": 0.0059, "num_tokens": 43313035.0, "reward": 0.0031250000465661287, "reward_std": 0.0088388342410326, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "completions/max_length": 3439.0, "completions/max_terminated_length": 2552.0, "completions/mean_length": 1064.8359375, "completions/mean_terminated_length": 966.7294311523438, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 10.482758620689655, "frac_reward_zero_std": 0.875, "grad_norm": 0.8861787967943775, "kl": 0.5126953125, "learning_rate": 4.23740499805044e-06, "loss": 0.0101, "num_tokens": 43578070.0, "reward": 0.0031250000465661287, "reward_std": 0.0088388342410326, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.53125, "completions/max_length": 3494.0, "completions/max_terminated_length": 3494.0, "completions/mean_length": 1208.5078125, "completions/mean_terminated_length": 1023.4512939453125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 10.551724137931034, "frac_reward_zero_std": 0.9375, "grad_norm": 0.4357968611893147, "kl": 1.07177734375, "learning_rate": 4.22643833128985e-06, "loss": 0.0254, "num_tokens": 43863831.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "completions/max_length": 3829.0, "completions/max_terminated_length": 2697.0, "completions/mean_length": 1155.1328125, "completions/mean_terminated_length": 1037.631103515625, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 10.620689655172415, "frac_reward_zero_std": 0.875, "grad_norm": 0.22348726052808882, "kl": 0.13751220703125, "learning_rate": 4.215409716133885e-06, "loss": 0.004, "num_tokens": 44142760.0, "reward": 0.0031250000465661287, "reward_std": 0.0088388342410326, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "completions/max_length": 4073.0, "completions/max_terminated_length": 2652.0, "completions/mean_length": 1111.03125, "completions/mean_terminated_length": 1002.6470947265625, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 10.689655172413794, "frac_reward_zero_std": 0.8125, "grad_norm": 0.7205943652484093, "kl": 0.1025390625, "learning_rate": 4.204319615321151e-06, "loss": 0.0214, "num_tokens": 44416044.0, "reward": 0.00390625, "reward_std": 0.011048542335629463, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.0234375, "rewards/format_reward/std": 0.15188287198543549, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 3084.0, "completions/max_terminated_length": 2813.0, "completions/mean_length": 1121.2578125, "completions/mean_terminated_length": 1068.4031982421875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 10.758620689655173, "frac_reward_zero_std": 0.8125, "grad_norm": 0.7995609561077808, "kl": 0.06732177734375, "learning_rate": 4.193168494170065e-06, "loss": 0.0238, "num_tokens": 44690637.0, "reward": 0.004687500186264515, "reward_std": 0.0132582513615489, "rewards/code_format_reward/mean": 0.0234375, "rewards/code_format_reward/std": 0.15188287198543549, "rewards/format_reward/mean": 0.0234375, "rewards/format_reward/std": 0.15188287198543549, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "completions/max_length": 3615.0, "completions/max_terminated_length": 3593.0, "completions/mean_length": 1146.625, "completions/mean_terminated_length": 1007.7227172851562, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 10.827586206896552, "frac_reward_zero_std": 0.9375, "grad_norm": 0.1171622175387835, "kl": 0.1898193359375, "learning_rate": 4.181956820559339e-06, "loss": 0.0112, "num_tokens": 44967149.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5625, "completions/max_length": 3627.0, "completions/max_terminated_length": 2590.0, "completions/mean_length": 1204.2421875, "completions/mean_terminated_length": 983.7368774414062, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 10.89655172413793, "frac_reward_zero_std": 0.875, "grad_norm": 0.1873253186072465, "kl": 0.203369140625, "learning_rate": 4.170685064908342e-06, "loss": 0.0186, "num_tokens": 45252364.0, "reward": 0.0031250000465661287, "reward_std": 0.0088388342410326, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.40625, "completions/max_length": 3251.0, "completions/max_terminated_length": 2182.0, "completions/mean_length": 1139.265625, "completions/mean_terminated_length": 929.5228881835938, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 10.96551724137931, "frac_reward_zero_std": 0.8125, "grad_norm": 3.724868026814391, "kl": 0.2445068359375, "learning_rate": 4.159353700157365e-06, "loss": 0.0321, "num_tokens": 45528118.0, "reward": 0.004687500186264515, "reward_std": 0.0132582513615489, "rewards/code_format_reward/mean": 0.0234375, "rewards/code_format_reward/std": 0.15188287198543549, "rewards/format_reward/mean": 0.0234375, "rewards/format_reward/std": 0.15188287198543549, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 3597.0, "completions/max_terminated_length": 3071.0, "completions/mean_length": 1286.359375, "completions/mean_terminated_length": 1075.72412109375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 11.068965517241379, "frac_reward_zero_std": 0.9375, "grad_norm": 0.5905977263071814, "kl": 0.44775390625, "learning_rate": 4.14796320174778e-06, "loss": 0.0148, "num_tokens": 45823844.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.53125, "completions/max_length": 3898.0, "completions/max_terminated_length": 3686.0, "completions/mean_length": 1284.21875, "completions/mean_terminated_length": 997.7344970703125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 11.137931034482758, "frac_reward_zero_std": 0.8125, "grad_norm": 0.6163712852564641, "kl": 0.421630859375, "learning_rate": 4.136514047602087e-06, "loss": 0.017, "num_tokens": 46118856.0, "reward": 0.00390625, "reward_std": 0.011048542335629463, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.0234375, "rewards/format_reward/std": 0.15188287198543549, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "completions/max_length": 4073.0, "completions/max_terminated_length": 3128.0, "completions/mean_length": 1169.3515625, "completions/mean_terminated_length": 1072.76416015625, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 11.206896551724139, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2294368361263189, "kl": 0.083740234375, "learning_rate": 4.1250067181038635e-06, "loss": 0.0414, "num_tokens": 46399605.0, "reward": 0.007031249813735485, "reward_std": 0.019887376576662064, "rewards/code_format_reward/mean": 0.03125, "rewards/code_format_reward/std": 0.1746762990951538, "rewards/format_reward/mean": 0.0390625, "rewards/format_reward/std": 0.194504976272583, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 3522.0, "completions/max_terminated_length": 2752.0, "completions/mean_length": 1179.953125, "completions/mean_terminated_length": 1068.36669921875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 11.275862068965518, "frac_reward_zero_std": 0.875, "grad_norm": 0.5748097249061757, "kl": 0.138427734375, "learning_rate": 4.113441696077608e-06, "loss": 0.0303, "num_tokens": 46681711.0, "reward": 0.0031250000465661287, "reward_std": 0.0088388342410326, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "completions/max_length": 3321.0, "completions/max_terminated_length": 3321.0, "completions/mean_length": 1342.6328125, "completions/mean_terminated_length": 1189.296630859375, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 11.344827586206897, "frac_reward_zero_std": 0.875, "grad_norm": 0.8058614856067299, "kl": 0.1512451171875, "learning_rate": 4.101819466768484e-06, "loss": 0.0295, "num_tokens": 46983472.0, "reward": 0.0031250000465661287, "reward_std": 0.0088388342410326, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "completions/max_length": 3656.0, "completions/max_terminated_length": 2724.0, "completions/mean_length": 1227.3671875, "completions/mean_terminated_length": 1120.6474609375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 11.413793103448276, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2566916575044885, "kl": 0.13800048828125, "learning_rate": 4.0901405178219535e-06, "loss": 0.0504, "num_tokens": 47271647.0, "reward": 0.00937500037252903, "reward_std": 0.02346404455602169, "rewards/code_format_reward/mean": 0.046875, "rewards/code_format_reward/std": 0.21220162510871887, "rewards/format_reward/mean": 0.046875, "rewards/format_reward/std": 0.21220162510871887, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "completions/max_length": 3262.0, "completions/max_terminated_length": 3262.0, "completions/mean_length": 1120.6875, "completions/mean_terminated_length": 1103.9920654296875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 11.482758620689655, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0736888172198473, "kl": 0.048828125, "learning_rate": 4.078405339263326e-06, "loss": 0.0137, "num_tokens": 47546167.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.1875, "completions/max_length": 3549.0, "completions/max_terminated_length": 2779.0, "completions/mean_length": 1352.96875, "completions/mean_terminated_length": 1016.921630859375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 11.551724137931034, "frac_reward_zero_std": 0.875, "grad_norm": 1.6005923512486049, "kl": 0.984375, "learning_rate": 4.06661442347719e-06, "loss": 0.0458, "num_tokens": 47850419.0, "reward": 0.0023437500931322575, "reward_std": 0.00662912568077445, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "completions/max_length": 3155.0, "completions/max_terminated_length": 3155.0, "completions/mean_length": 1171.25, "completions/mean_terminated_length": 1157.6220703125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 11.620689655172415, "frac_reward_zero_std": 0.8125, "grad_norm": 0.14078818110253288, "kl": 0.04827880859375, "learning_rate": 4.054768265186758e-06, "loss": 0.0359, "num_tokens": 48131411.0, "reward": 0.00390625, "reward_std": 0.011048542335629463, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.0234375, "rewards/format_reward/std": 0.15188287198543549, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 3781.0, "completions/max_terminated_length": 2452.0, "completions/mean_length": 1255.859375, "completions/mean_terminated_length": 1105.821533203125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 11.689655172413794, "frac_reward_zero_std": 0.8125, "grad_norm": 1.2381277561820383, "kl": 0.73291015625, "learning_rate": 4.0428673614331036e-06, "loss": 0.023, "num_tokens": 48423233.0, "reward": 0.0062500000931322575, "reward_std": 0.014625209383666515, "rewards/code_format_reward/mean": 0.03125, "rewards/code_format_reward/std": 0.1746762990951538, "rewards/format_reward/mean": 0.03125, "rewards/format_reward/std": 0.1746762990951538, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.46875, "completions/max_length": 3201.0, "completions/max_terminated_length": 3201.0, "completions/mean_length": 1275.1796875, "completions/mean_terminated_length": 1040.189208984375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 11.758620689655173, "frac_reward_zero_std": 0.9375, "grad_norm": 7.280137346077164, "kl": 4.1153564453125, "learning_rate": 4.030912211554316e-06, "loss": 0.0174, "num_tokens": 48717528.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 4037.0, "completions/max_terminated_length": 3781.0, "completions/mean_length": 1331.5859375, "completions/mean_terminated_length": 1059.8304443359375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 11.827586206896552, "frac_reward_zero_std": 0.875, "grad_norm": 1.9744599773603109, "kl": 2.4501953125, "learning_rate": 4.018903317164539e-06, "loss": 0.0407, "num_tokens": 49019043.0, "reward": 0.0031250000465661287, "reward_std": 0.0088388342410326, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2779.0, "completions/mean_length": 1198.59375, "completions/mean_terminated_length": 1045.2689208984375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 11.89655172413793, "frac_reward_zero_std": 0.875, "grad_norm": 4.653929117412743, "kl": 2.8310546875, "learning_rate": 4.006841182132932e-06, "loss": 0.0342, "num_tokens": 49300087.0, "reward": 0.0031250000465661287, "reward_std": 0.0088388342410326, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "completions/max_length": 3651.0, "completions/max_terminated_length": 2915.0, "completions/mean_length": 1265.8515625, "completions/mean_terminated_length": 1117.411865234375, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 11.96551724137931, "frac_reward_zero_std": 0.875, "grad_norm": 0.6234316034794274, "kl": 0.6668701171875, "learning_rate": 3.9947263125625195e-06, "loss": 0.0188, "num_tokens": 49592092.0, "reward": 0.0031250000465661287, "reward_std": 0.0088388342410326, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "completions/max_length": 3644.0, "completions/max_terminated_length": 3259.0, "completions/mean_length": 1238.4140625, "completions/mean_terminated_length": 1073.966064453125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 12.068965517241379, "frac_reward_zero_std": 0.875, "grad_norm": 0.17122087218075646, "kl": 0.2076416015625, "learning_rate": 3.982559216768967e-06, "loss": 0.0231, "num_tokens": 49881681.0, "reward": 0.00390625, "reward_std": 0.009069565683603287, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.0234375, "rewards/format_reward/std": 0.15188287198543549, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2939.0, "completions/mean_length": 1182.0234375, "completions/mean_terminated_length": 1082.7850341796875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 12.137931034482758, "frac_reward_zero_std": 0.75, "grad_norm": 0.7991821557743215, "kl": 0.10546875, "learning_rate": 3.970340405259245e-06, "loss": 0.0297, "num_tokens": 50162444.0, "reward": 0.0062500000931322575, "reward_std": 0.0176776684820652, "rewards/code_format_reward/mean": 0.03125, "rewards/code_format_reward/std": 0.1746762990951538, "rewards/format_reward/mean": 0.03125, "rewards/format_reward/std": 0.1746762990951538, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "completions/max_length": 3125.0, "completions/max_terminated_length": 2901.0, "completions/mean_length": 1098.0625, "completions/mean_terminated_length": 997.43798828125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 12.206896551724139, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2841276215892951, "kl": 0.1004638671875, "learning_rate": 3.958070390710214e-06, "loss": 0.0165, "num_tokens": 50434068.0, "reward": 0.0078125, "reward_std": 0.022097084671258926, "rewards/code_format_reward/mean": 0.0390625, "rewards/code_format_reward/std": 0.194504976272583, "rewards/format_reward/mean": 0.0390625, "rewards/format_reward/std": 0.194504976272583, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5625, "completions/max_length": 3582.0, "completions/max_terminated_length": 3582.0, "completions/mean_length": 1317.7734375, "completions/mean_terminated_length": 1081.73681640625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 12.275862068965518, "frac_reward_zero_std": 0.875, "grad_norm": 0.16105398069309368, "kl": 0.3125, "learning_rate": 3.945749687947109e-06, "loss": 0.0182, "num_tokens": 50732911.0, "reward": 0.0031250000465661287, "reward_std": 0.0088388342410326, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.21875, "completions/max_length": 3694.0, "completions/max_terminated_length": 3050.0, "completions/mean_length": 1322.953125, "completions/mean_terminated_length": 1041.291259765625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 12.344827586206897, "frac_reward_zero_std": 0.875, "grad_norm": 0.2090540532526456, "kl": 0.37744140625, "learning_rate": 3.933378813921942e-06, "loss": 0.0267, "num_tokens": 51033321.0, "reward": 0.0031250000465661287, "reward_std": 0.0088388342410326, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.4375, "completions/max_length": 3779.0, "completions/max_terminated_length": 2966.0, "completions/mean_length": 1344.234375, "completions/mean_terminated_length": 1135.8726806640625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 12.413793103448276, "frac_reward_zero_std": 0.8125, "grad_norm": 0.4311077408988136, "kl": 0.284423828125, "learning_rate": 3.920958287691811e-06, "loss": 0.0218, "num_tokens": 51336455.0, "reward": 0.004687500186264515, "reward_std": 0.0132582513615489, "rewards/code_format_reward/mean": 0.0234375, "rewards/code_format_reward/std": 0.15188287198543549, "rewards/format_reward/mean": 0.0234375, "rewards/format_reward/std": 0.15188287198543549, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "completions/max_length": 3701.0, "completions/max_terminated_length": 2957.0, "completions/mean_length": 1109.2421875, "completions/mean_terminated_length": 1054.152099609375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 12.482758620689655, "frac_reward_zero_std": 0.9375, "grad_norm": 0.12068220717626289, "kl": 0.05767822265625, "learning_rate": 3.908488630397121e-06, "loss": 0.0053, "num_tokens": 51608414.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "completions/max_length": 3180.0, "completions/max_terminated_length": 2710.0, "completions/mean_length": 1074.359375, "completions/mean_terminated_length": 977.718994140625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 12.551724137931034, "frac_reward_zero_std": 0.5625, "grad_norm": 3.1746304474076985, "kl": 0.1571044921875, "learning_rate": 3.8959703652397175e-06, "loss": 0.0611, "num_tokens": 51875836.0, "reward": 0.01171875, "reward_std": 0.031166650354862213, "rewards/code_format_reward/mean": 0.0546875, "rewards/code_format_reward/std": 0.22826264798641205, "rewards/format_reward/mean": 0.0625, "rewards/format_reward/std": 0.24301259219646454, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 3597.0, "completions/max_terminated_length": 2893.0, "completions/mean_length": 1266.734375, "completions/mean_terminated_length": 999.9464721679688, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 12.620689655172415, "frac_reward_zero_std": 0.75, "grad_norm": 3.207418864003099, "kl": 0.321044921875, "learning_rate": 3.883404017460935e-06, "loss": 0.0392, "num_tokens": 52167906.0, "reward": 0.007031250279396772, "reward_std": 0.016834918409585953, "rewards/code_format_reward/mean": 0.03125, "rewards/code_format_reward/std": 0.1746762990951538, "rewards/format_reward/mean": 0.0390625, "rewards/format_reward/std": 0.194504976272583, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "completions/max_length": 4021.0, "completions/max_terminated_length": 3471.0, "completions/mean_length": 1417.796875, "completions/mean_terminated_length": 1209.271240234375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 12.689655172413794, "frac_reward_zero_std": 0.9375, "grad_norm": 0.09955903326233259, "kl": 0.176025390625, "learning_rate": 3.870790114319559e-06, "loss": 0.0072, "num_tokens": 52480456.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "completions/max_length": 3968.0, "completions/max_terminated_length": 3248.0, "completions/mean_length": 1108.2109375, "completions/mean_terminated_length": 1078.6270751953125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 12.758620689655173, "frac_reward_zero_std": 0.75, "grad_norm": 0.24560735641594267, "kl": 0.0670166015625, "learning_rate": 3.858129185069701e-06, "loss": 0.0139, "num_tokens": 52753379.0, "reward": 0.0062500000931322575, "reward_std": 0.0176776684820652, "rewards/code_format_reward/mean": 0.03125, "rewards/code_format_reward/std": 0.1746762990951538, "rewards/format_reward/mean": 0.03125, "rewards/format_reward/std": 0.1746762990951538, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.53125, "completions/max_length": 3961.0, "completions/max_terminated_length": 3153.0, "completions/mean_length": 1337.1640625, "completions/mean_terminated_length": 1102.4866943359375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 12.827586206896552, "frac_reward_zero_std": 0.75, "grad_norm": 2.6007499005127412, "kl": 0.50238037109375, "learning_rate": 3.845421760938597e-06, "loss": 0.0465, "num_tokens": 53055376.0, "reward": 0.007031249813735485, "reward_std": 0.017908399924635887, "rewards/code_format_reward/mean": 0.03125, "rewards/code_format_reward/std": 0.1746762990951538, "rewards/format_reward/mean": 0.0390625, "rewards/format_reward/std": 0.194504976272583, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 3639.0, "completions/max_terminated_length": 3604.0, "completions/mean_length": 1307.8125, "completions/mean_terminated_length": 1093.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 12.89655172413793, "frac_reward_zero_std": 0.75, "grad_norm": 0.2714273921718127, "kl": 0.23095703125, "learning_rate": 3.832668375104312e-06, "loss": 0.0268, "num_tokens": 53353848.0, "reward": 0.00546875037252903, "reward_std": 0.015467960387468338, "rewards/code_format_reward/mean": 0.0234375, "rewards/code_format_reward/std": 0.15188287198543549, "rewards/format_reward/mean": 0.03125, "rewards/format_reward/std": 0.1746762990951538, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.59375, "completions/max_length": 4073.0, "completions/max_terminated_length": 3364.0, "completions/mean_length": 1279.1484375, "completions/mean_terminated_length": 1054.80859375, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 12.96551724137931, "frac_reward_zero_std": 0.8125, "grad_norm": 0.36704943028353426, "kl": 0.5118408203125, "learning_rate": 3.8198695626733725e-06, "loss": 0.0337, "num_tokens": 53648651.0, "reward": 0.004687500186264515, "reward_std": 0.010205792263150215, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.03125, "rewards/format_reward/std": 0.1746762990951538, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "completions/max_length": 3973.0, "completions/max_terminated_length": 3010.0, "completions/mean_length": 1346.34375, "completions/mean_terminated_length": 1185.033935546875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 13.068965517241379, "frac_reward_zero_std": 0.8125, "grad_norm": 1.1481876574798495, "kl": 0.763671875, "learning_rate": 3.8070258606583156e-06, "loss": 0.0321, "num_tokens": 53952055.0, "reward": 0.007031250279396772, "reward_std": 0.013782460242509842, "rewards/code_format_reward/mean": 0.03125, "rewards/code_format_reward/std": 0.1746762990951538, "rewards/format_reward/mean": 0.0390625, "rewards/format_reward/std": 0.194504976272583, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.40625, "completions/max_length": 3703.0, "completions/max_terminated_length": 3643.0, "completions/mean_length": 1286.375, "completions/mean_terminated_length": 1048.99072265625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 13.137931034482758, "frac_reward_zero_std": 0.625, "grad_norm": 2.673552770064533, "kl": 1.23779296875, "learning_rate": 3.7941378079551544e-06, "loss": 0.0481, "num_tokens": 54247783.0, "reward": 0.01328125037252903, "reward_std": 0.027723699808120728, "rewards/code_format_reward/mean": 0.0625, "rewards/code_format_reward/std": 0.24301259219646454, "rewards/format_reward/mean": 0.0703125, "rewards/format_reward/std": 0.2566775679588318, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.53125, "completions/max_length": 4090.0, "completions/max_terminated_length": 2962.0, "completions/mean_length": 1398.671875, "completions/mean_terminated_length": 1144.3450927734375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 13.206896551724139, "frac_reward_zero_std": 0.75, "grad_norm": 1.5552309526375896, "kl": 0.97216796875, "learning_rate": 3.7812059453207677e-06, "loss": 0.0689, "num_tokens": 54557885.0, "reward": 0.0062500000931322575, "reward_std": 0.0176776684820652, "rewards/code_format_reward/mean": 0.03125, "rewards/code_format_reward/std": 0.1746762990951538, "rewards/format_reward/mean": 0.03125, "rewards/format_reward/std": 0.1746762990951538, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.40625, "completions/max_length": 3720.0, "completions/max_terminated_length": 3228.0, "completions/mean_length": 1557.703125, "completions/mean_terminated_length": 1227.1099853515625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 13.275862068965518, "frac_reward_zero_std": 0.875, "grad_norm": 9.1078368022765, "kl": 5.70703125, "learning_rate": 3.768230815350213e-06, "loss": 0.0633, "num_tokens": 54888343.0, "reward": 0.0031250000465661287, "reward_std": 0.0088388342410326, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.4375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3875.0, "completions/mean_length": 1598.5390625, "completions/mean_terminated_length": 1207.463623046875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 13.344827586206897, "frac_reward_zero_std": 0.9375, "grad_norm": 31.899363924236727, "kl": 20.453125, "learning_rate": 3.7552129624539557e-06, "loss": 0.2114, "num_tokens": 55221692.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 4077.0, "completions/max_terminated_length": 3001.0, "completions/mean_length": 1382.3046875, "completions/mean_terminated_length": 1149.27587890625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 13.413793103448276, "frac_reward_zero_std": 0.75, "grad_norm": 8.608620637191756, "kl": 5.47119140625, "learning_rate": 3.7421529328350316e-06, "loss": 0.0755, "num_tokens": 55529259.0, "reward": 0.00546875037252903, "reward_std": 0.015467960387468338, "rewards/code_format_reward/mean": 0.0234375, "rewards/code_format_reward/std": 0.15188287198543549, "rewards/format_reward/mean": 0.03125, "rewards/format_reward/std": 0.1746762990951538, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.09375, "completions/max_length": 3634.0, "completions/max_terminated_length": 3476.0, "completions/mean_length": 1699.5859375, "completions/mean_terminated_length": 1204.757568359375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 13.482758620689655, "frac_reward_zero_std": 0.6875, "grad_norm": 21.27014366022027, "kl": 14.7578125, "learning_rate": 3.7290512744661274e-06, "loss": 0.1867, "num_tokens": 55877878.0, "reward": 0.008593750186264515, "reward_std": 0.02125433459877968, "rewards/code_format_reward/mean": 0.0390625, "rewards/code_format_reward/std": 0.194504976272583, "rewards/format_reward/mean": 0.046875, "rewards/format_reward/std": 0.21220162510871887, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "completions/max_length": 3644.0, "completions/max_terminated_length": 3129.0, "completions/mean_length": 1274.1328125, "completions/mean_terminated_length": 1156.4627685546875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 13.551724137931034, "frac_reward_zero_std": 0.75, "grad_norm": 0.6348817319536253, "kl": 0.481689453125, "learning_rate": 3.715908537066589e-06, "loss": 0.0188, "num_tokens": 56170895.0, "reward": 0.0062500000931322575, "reward_std": 0.0176776684820652, "rewards/code_format_reward/mean": 0.03125, "rewards/code_format_reward/std": 0.1746762990951538, "rewards/format_reward/mean": 0.03125, "rewards/format_reward/std": 0.1746762990951538, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.59375, "completions/max_length": 3876.0, "completions/max_terminated_length": 3179.0, "completions/mean_length": 1417.875, "completions/mean_terminated_length": 1151.973876953125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 13.620689655172415, "frac_reward_zero_std": 0.625, "grad_norm": 1.8195574686928588, "kl": 3.0634765625, "learning_rate": 3.7027252720793538e-06, "loss": 0.0722, "num_tokens": 56483455.0, "reward": 0.00937500037252903, "reward_std": 0.0265165027230978, "rewards/code_format_reward/mean": 0.046875, "rewards/code_format_reward/std": 0.21220162510871887, "rewards/format_reward/mean": 0.046875, "rewards/format_reward/std": 0.21220162510871887, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3445.0, "completions/mean_length": 1423.90625, "completions/mean_terminated_length": 1207.26318359375, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 13.689655172413794, "frac_reward_zero_std": 0.5625, "grad_norm": 1.4766881349959924, "kl": 1.093017578125, "learning_rate": 3.689502032647817e-06, "loss": 0.0662, "num_tokens": 56795883.0, "reward": 0.01015624962747097, "reward_std": 0.028726210817694664, "rewards/code_format_reward/mean": 0.046875, "rewards/code_format_reward/std": 0.21220162510871887, "rewards/format_reward/mean": 0.0546875, "rewards/format_reward/std": 0.22826264798641205, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.4375, "completions/max_length": 3701.0, "completions/max_terminated_length": 3503.0, "completions/mean_length": 1477.015625, "completions/mean_terminated_length": 1120.2908935546875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 13.758620689655173, "frac_reward_zero_std": 0.8125, "grad_norm": 0.7111623568121386, "kl": 0.60986328125, "learning_rate": 3.6762393735926245e-06, "loss": 0.0385, "num_tokens": 57116013.0, "reward": 0.004687500186264515, "reward_std": 0.0132582513615489, "rewards/code_format_reward/mean": 0.0234375, "rewards/code_format_reward/std": 0.15188287198543549, "rewards/format_reward/mean": 0.0234375, "rewards/format_reward/std": 0.15188287198543549, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "completions/max_length": 4078.0, "completions/max_terminated_length": 2991.0, "completions/mean_length": 1320.6796875, "completions/mean_terminated_length": 1133.3504638671875, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 13.827586206896552, "frac_reward_zero_std": 0.75, "grad_norm": 1.4078028037583223, "kl": 0.148681640625, "learning_rate": 3.6629378513883852e-06, "loss": -0.002, "num_tokens": 57415036.0, "reward": 0.0078125, "reward_std": 0.019044626504182816, "rewards/code_format_reward/mean": 0.0390625, "rewards/code_format_reward/std": 0.194504976272583, "rewards/format_reward/mean": 0.0390625, "rewards/format_reward/std": 0.194504976272583, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.21875, "completions/max_length": 3966.0, "completions/max_terminated_length": 2967.0, "completions/mean_length": 1565.1640625, "completions/mean_terminated_length": 1047.116455078125, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 13.89655172413793, "frac_reward_zero_std": 0.75, "grad_norm": 0.23178510046600032, "kl": 0.50244140625, "learning_rate": 3.6495980241403307e-06, "loss": 0.0283, "num_tokens": 57746449.0, "reward": 0.0054687499068677425, "reward_std": 0.015467959456145763, "rewards/code_format_reward/mean": 0.0234375, "rewards/code_format_reward/std": 0.15188287198543549, "rewards/format_reward/mean": 0.03125, "rewards/format_reward/std": 0.1746762990951538, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3301.0, "completions/mean_length": 1459.15625, "completions/mean_terminated_length": 1147.72802734375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 13.96551724137931, "frac_reward_zero_std": 0.625, "grad_norm": 1.2444494445748395, "kl": 0.1614990234375, "learning_rate": 3.636220451560896e-06, "loss": 0.0406, "num_tokens": 58064061.0, "reward": 0.00937500037252903, "reward_std": 0.0265165027230978, "rewards/code_format_reward/mean": 0.046875, "rewards/code_format_reward/std": 0.21220162510871887, "rewards/format_reward/mean": 0.046875, "rewards/format_reward/std": 0.21220162510871887, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.3125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2865.0, "completions/mean_length": 1537.328125, "completions/mean_terminated_length": 1081.9056396484375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 14.068965517241379, "frac_reward_zero_std": 0.625, "grad_norm": 4.202574842240394, "kl": 0.343017578125, "learning_rate": 3.622805694946235e-06, "loss": 0.0732, "num_tokens": 58391007.0, "reward": 0.008593750186264515, "reward_std": 0.02430679462850094, "rewards/code_format_reward/mean": 0.0390625, "rewards/code_format_reward/std": 0.194504976272583, "rewards/format_reward/mean": 0.046875, "rewards/format_reward/std": 0.21220162510871887, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.15625, "completions/max_length": 3937.0, "completions/max_terminated_length": 3439.0, "completions/mean_length": 1681.4765625, "completions/mean_terminated_length": 1135.1881103515625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 14.137931034482758, "frac_reward_zero_std": 0.75, "grad_norm": 2.849201175527445, "kl": 0.3662109375, "learning_rate": 3.609354317152667e-06, "loss": 0.0143, "num_tokens": 58737308.0, "reward": 0.007031249813735485, "reward_std": 0.016834918409585953, "rewards/code_format_reward/mean": 0.03125, "rewards/code_format_reward/std": 0.1746762990951538, "rewards/format_reward/mean": 0.0390625, "rewards/format_reward/std": 0.194504976272583, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "completions/max_length": 3639.0, "completions/max_terminated_length": 3445.0, "completions/mean_length": 1557.609375, "completions/mean_terminated_length": 1218.6851806640625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 14.206896551724139, "frac_reward_zero_std": 0.6875, "grad_norm": 0.25750281797892505, "kl": 0.397705078125, "learning_rate": 3.595866882573063e-06, "loss": 0.0404, "num_tokens": 59067754.0, "reward": 0.0078125, "reward_std": 0.022097084671258926, "rewards/code_format_reward/mean": 0.0390625, "rewards/code_format_reward/std": 0.194504976272583, "rewards/format_reward/mean": 0.0390625, "rewards/format_reward/std": 0.194504976272583, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.4375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3679.0, "completions/mean_length": 1568.7890625, "completions/mean_terminated_length": 1195.89990234375, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 14.275862068965518, "frac_reward_zero_std": 0.625, "grad_norm": 0.5981729275912904, "kl": 0.2265625, "learning_rate": 3.5823439571131675e-06, "loss": 0.0397, "num_tokens": 59395991.0, "reward": 0.00937500037252903, "reward_std": 0.0265165027230978, "rewards/code_format_reward/mean": 0.046875, "rewards/code_format_reward/std": 0.21220162510871887, "rewards/format_reward/mean": 0.046875, "rewards/format_reward/std": 0.21220162510871887, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.0625, "completions/max_length": 3974.0, "completions/max_terminated_length": 3035.0, "completions/mean_length": 1803.53125, "completions/mean_terminated_length": 1195.30615234375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 14.344827586206897, "frac_reward_zero_std": 0.6875, "grad_norm": 1.135149590775422, "kl": 0.8017578125, "learning_rate": 3.5687861081678477e-06, "loss": 0.0678, "num_tokens": 59757915.0, "reward": 0.0078125, "reward_std": 0.022097084671258926, "rewards/code_format_reward/mean": 0.0390625, "rewards/code_format_reward/std": 0.194504976272583, "rewards/format_reward/mean": 0.0390625, "rewards/format_reward/std": 0.194504976272583, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.96875, "completions/max_length": 3898.0, "completions/max_terminated_length": 3719.0, "completions/mean_length": 1811.453125, "completions/mean_terminated_length": 1174.9473876953125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 14.413793103448276, "frac_reward_zero_std": 0.8125, "grad_norm": 1.040721248461896, "kl": 0.904052734375, "learning_rate": 3.555193904597291e-06, "loss": 0.0588, "num_tokens": 60120853.0, "reward": 0.0078125, "reward_std": 0.015992168337106705, "rewards/code_format_reward/mean": 0.0390625, "rewards/code_format_reward/std": 0.194504976272583, "rewards/format_reward/mean": 0.0390625, "rewards/format_reward/std": 0.194504976272583, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.09375, "completions/max_length": 4090.0, "completions/max_terminated_length": 3369.0, "completions/mean_length": 1775.4609375, "completions/mean_terminated_length": 1164.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 14.482758620689655, "frac_reward_zero_std": 0.8125, "grad_norm": 0.3439397685636572, "kl": 0.317626953125, "learning_rate": 3.541567916703138e-06, "loss": 0.0157, "num_tokens": 60479184.0, "reward": 0.00390625, "reward_std": 0.011048542335629463, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.0234375, "rewards/format_reward/std": 0.15188287198543549, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.0, "completions/max_length": 3968.0, "completions/max_terminated_length": 3484.0, "completions/mean_length": 1820.671875, "completions/mean_terminated_length": 1123.135498046875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 14.551724137931034, "frac_reward_zero_std": 0.75, "grad_norm": 3.2633427938950654, "kl": 0.63671875, "learning_rate": 3.5279087162045517e-06, "loss": 0.0301, "num_tokens": 60843302.0, "reward": 0.0078125, "reward_std": 0.019044626504182816, "rewards/code_format_reward/mean": 0.0390625, "rewards/code_format_reward/std": 0.194504976272583, "rewards/format_reward/mean": 0.0390625, "rewards/format_reward/std": 0.194504976272583, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.09375, "completions/max_length": 3779.0, "completions/max_terminated_length": 3535.0, "completions/mean_length": 1807.640625, "completions/mean_terminated_length": 1250.888916015625, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 14.620689655172415, "frac_reward_zero_std": 0.875, "grad_norm": 0.5078428161832284, "kl": 0.7353515625, "learning_rate": 3.5142168762142265e-06, "loss": 0.0156, "num_tokens": 61205752.0, "reward": 0.004687500186264515, "reward_std": 0.010205792263150215, "rewards/code_format_reward/mean": 0.0234375, "rewards/code_format_reward/std": 0.15188287198543549, "rewards/format_reward/mean": 0.0234375, "rewards/format_reward/std": 0.15188287198543549, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.78125, "completions/max_length": 4077.0, "completions/max_terminated_length": 3727.0, "completions/mean_length": 2058.3828125, "completions/mean_terminated_length": 1210.6292724609375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 14.689655172413794, "frac_reward_zero_std": 0.6875, "grad_norm": 1.5381994511295285, "kl": 1.01220703125, "learning_rate": 3.500492971214347e-06, "loss": 0.0944, "num_tokens": 61600297.0, "reward": 0.010937499813735485, "reward_std": 0.024831002578139305, "rewards/code_format_reward/mean": 0.0546875, "rewards/code_format_reward/std": 0.22826264798641205, "rewards/format_reward/mean": 0.0546875, "rewards/format_reward/std": 0.22826264798641205, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.6875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3662.0, "completions/mean_length": 2038.375, "completions/mean_terminated_length": 1164.9766845703125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 14.758620689655173, "frac_reward_zero_std": 0.5, "grad_norm": 3.2511400903390695, "kl": 1.201171875, "learning_rate": 3.48673757703248e-06, "loss": 0.1018, "num_tokens": 61991113.0, "reward": 0.014062500558793545, "reward_std": 0.036722294986248016, "rewards/code_format_reward/mean": 0.0703125, "rewards/code_format_reward/std": 0.2566775679588318, "rewards/format_reward/mean": 0.0703125, "rewards/format_reward/std": 0.2566775679588318, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.6875, "completions/max_length": 3760.0, "completions/max_terminated_length": 3302.0, "completions/mean_length": 1885.109375, "completions/mean_terminated_length": 1156.837158203125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 14.827586206896552, "frac_reward_zero_std": 0.5, "grad_norm": 4.492459393954852, "kl": 0.744140625, "learning_rate": 3.472951270817418e-06, "loss": 0.0914, "num_tokens": 62363479.0, "reward": 0.01640624925494194, "reward_std": 0.03724650293588638, "rewards/code_format_reward/mean": 0.078125, "rewards/code_format_reward/std": 0.2694226801395416, "rewards/format_reward/mean": 0.0859375, "rewards/format_reward/std": 0.2813730239868164, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.65625, "completions/max_length": 3964.0, "completions/max_terminated_length": 3438.0, "completions/mean_length": 1984.21875, "completions/mean_terminated_length": 1081.0118408203125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 14.89655172413793, "frac_reward_zero_std": 0.625, "grad_norm": 4.7008931014840245, "kl": 1.0595703125, "learning_rate": 3.4591346310149578e-06, "loss": 0.1139, "num_tokens": 62748531.0, "reward": 0.010937499813735485, "reward_std": 0.026977965608239174, "rewards/code_format_reward/mean": 0.046875, "rewards/code_format_reward/std": 0.21220162510871887, "rewards/format_reward/mean": 0.0625, "rewards/format_reward/std": 0.24301259219646454, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.75, "completions/max_length": 4071.0, "completions/max_terminated_length": 3370.0, "completions/mean_length": 2012.0390625, "completions/mean_terminated_length": 1207.4659423828125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 14.96551724137931, "frac_reward_zero_std": 0.4375, "grad_norm": 2.9389612842832147, "kl": 4.7158203125, "learning_rate": 3.445288237343632e-06, "loss": 0.206, "num_tokens": 63137144.0, "reward": 0.02734375, "reward_std": 0.048597924411296844, "rewards/code_format_reward/mean": 0.1328125, "rewards/code_format_reward/std": 0.3407054841518402, "rewards/format_reward/mean": 0.140625, "rewards/format_reward/std": 0.3490002751350403, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.09375, "completions/max_length": 3966.0, "completions/max_terminated_length": 2764.0, "completions/mean_length": 2428.8046875, "completions/mean_terminated_length": 1306.1641845703125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 15.068965517241379, "frac_reward_zero_std": 0.625, "grad_norm": 4.45397792495567, "kl": 4.40234375, "learning_rate": 3.4314126707703895e-06, "loss": 0.1195, "num_tokens": 63579103.0, "reward": 0.008593750186264515, "reward_std": 0.02430679276585579, "rewards/code_format_reward/mean": 0.0390625, "rewards/code_format_reward/std": 0.194504976272583, "rewards/format_reward/mean": 0.046875, "rewards/format_reward/std": 0.21220162510871887, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.78125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3495.0, "completions/mean_length": 1989.8828125, "completions/mean_terminated_length": 1206.3145751953125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 15.137931034482758, "frac_reward_zero_std": 0.625, "grad_norm": 1.5710707568898774, "kl": 2.033203125, "learning_rate": 3.4175085134862128e-06, "loss": 0.0981, "num_tokens": 63962616.0, "reward": 0.01328125037252903, "reward_std": 0.029189826920628548, "rewards/code_format_reward/mean": 0.0625, "rewards/code_format_reward/std": 0.24301259219646454, "rewards/format_reward/mean": 0.0703125, "rewards/format_reward/std": 0.2566775679588318, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.65625, "completions/max_length": 4078.0, "completions/max_terminated_length": 3478.0, "completions/mean_length": 1967.8125, "completions/mean_terminated_length": 1057.5765380859375, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 15.206896551724139, "frac_reward_zero_std": 0.6875, "grad_norm": 0.31964770163417183, "kl": 1.7822265625, "learning_rate": 3.4035763488816953e-06, "loss": 0.0976, "num_tokens": 64345568.0, "reward": 0.0062500000931322575, "reward_std": 0.0176776684820652, "rewards/code_format_reward/mean": 0.0234375, "rewards/code_format_reward/std": 0.15188287198543549, "rewards/format_reward/mean": 0.0390625, "rewards/format_reward/std": 0.194504976272583, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.34375, "completions/max_length": 4090.0, "completions/max_terminated_length": 3514.0, "completions/mean_length": 2182.1640625, "completions/mean_terminated_length": 1087.0933837890625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 15.275862068965518, "frac_reward_zero_std": 0.5, "grad_norm": 1.0092039228909133, "kl": 4.09375, "learning_rate": 3.3896167615225594e-06, "loss": 0.1554, "num_tokens": 64755957.0, "reward": 0.015625, "reward_std": 0.03808925300836563, "rewards/code_format_reward/mean": 0.078125, "rewards/code_format_reward/std": 0.2694226801395416, "rewards/format_reward/mean": 0.078125, "rewards/format_reward/std": 0.2694226801395416, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.59375, "completions/max_length": 4029.0, "completions/max_terminated_length": 3093.0, "completions/mean_length": 1987.40625, "completions/mean_terminated_length": 1125.795166015625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 15.344827586206897, "frac_reward_zero_std": 0.375, "grad_norm": 1.9154515107257004, "kl": 1.12060546875, "learning_rate": 3.375630337125133e-06, "loss": 0.1725, "num_tokens": 65141417.0, "reward": 0.02656250074505806, "reward_std": 0.052394941449165344, "rewards/code_format_reward/mean": 0.1328125, "rewards/code_format_reward/std": 0.3407054841518402, "rewards/format_reward/mean": 0.1328125, "rewards/format_reward/std": 0.3407054841518402, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.59375, "completions/max_length": 4071.0, "completions/max_terminated_length": 3141.0, "completions/mean_length": 2030.7265625, "completions/mean_terminated_length": 1072.77099609375, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 15.413793103448276, "frac_reward_zero_std": 0.5, "grad_norm": 0.8370610521795743, "kl": 2.37890625, "learning_rate": 3.361617662531772e-06, "loss": 0.1042, "num_tokens": 65532422.0, "reward": 0.01875000074505806, "reward_std": 0.04082317277789116, "rewards/code_format_reward/mean": 0.09375, "rewards/code_format_reward/std": 0.29262590408325195, "rewards/format_reward/mean": 0.09375, "rewards/format_reward/std": 0.29262590408325195, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.46875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3118.0, "completions/mean_length": 2137.3359375, "completions/mean_terminated_length": 1143.5316162109375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 15.482758620689655, "frac_reward_zero_std": 0.5, "grad_norm": 1.25557061574182, "kl": 1.6552734375, "learning_rate": 3.347579325686237e-06, "loss": 0.1034, "num_tokens": 65936169.0, "reward": 0.012500000186264515, "reward_std": 0.031241057440638542, "rewards/code_format_reward/mean": 0.046875, "rewards/code_format_reward/std": 0.21220162510871887, "rewards/format_reward/mean": 0.078125, "rewards/format_reward/std": 0.2694226801395416, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.46875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3282.0, "completions/mean_length": 2148.4140625, "completions/mean_terminated_length": 1096.0506591796875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 15.551724137931034, "frac_reward_zero_std": 0.5625, "grad_norm": 1.8263881956490176, "kl": 2.603515625, "learning_rate": 3.333515915609027e-06, "loss": 0.1221, "num_tokens": 66341094.0, "reward": 0.015625, "reward_std": 0.03413129970431328, "rewards/code_format_reward/mean": 0.0703125, "rewards/code_format_reward/std": 0.2566775679588318, "rewards/format_reward/mean": 0.0859375, "rewards/format_reward/std": 0.2813730239868164, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.78125, "completions/max_length": 4073.0, "completions/max_terminated_length": 3126.0, "completions/mean_length": 1835.375, "completions/mean_terminated_length": 924.03369140625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 15.620689655172415, "frac_reward_zero_std": 0.4375, "grad_norm": 0.5981599420132299, "kl": 2.189453125, "learning_rate": 3.3194280223726616e-06, "loss": 0.1519, "num_tokens": 66707094.0, "reward": 0.01953125, "reward_std": 0.0434223935008049, "rewards/code_format_reward/mean": 0.09375, "rewards/code_format_reward/std": 0.29262590408325195, "rewards/format_reward/mean": 0.1015625, "rewards/format_reward/std": 0.3032590448856354, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.75, "completions/max_length": 4096.0, "completions/max_terminated_length": 3533.0, "completions/mean_length": 1926.1171875, "completions/mean_terminated_length": 1125.227294921875, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 15.689655172413794, "frac_reward_zero_std": 0.375, "grad_norm": 2.4515527014509053, "kl": 1.6328125, "learning_rate": 3.305316237076927e-06, "loss": 0.1069, "num_tokens": 67083541.0, "reward": 0.015625, "reward_std": 0.04114171117544174, "rewards/code_format_reward/mean": 0.0703125, "rewards/code_format_reward/std": 0.2566775679588318, "rewards/format_reward/mean": 0.0859375, "rewards/format_reward/std": 0.2813730239868164, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.53125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3149.0, "completions/mean_length": 2122.3125, "completions/mean_terminated_length": 1097.654296875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 15.758620689655173, "frac_reward_zero_std": 0.4375, "grad_norm": 1.171755009564972, "kl": 2.701171875, "learning_rate": 3.291181151824071e-06, "loss": 0.12, "num_tokens": 67485829.0, "reward": 0.01250000111758709, "reward_std": 0.033376358449459076, "rewards/code_format_reward/mean": 0.046875, "rewards/code_format_reward/std": 0.21220162510871887, "rewards/format_reward/mean": 0.078125, "rewards/format_reward/std": 0.2694226801395416, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.96875, "completions/max_length": 3989.0, "completions/max_terminated_length": 3089.0, "completions/mean_length": 1813.6875, "completions/mean_terminated_length": 1124.4947509765625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 15.827586206896552, "frac_reward_zero_std": 0.4375, "grad_norm": 0.5046649652436593, "kl": 2.0888671875, "learning_rate": 3.27702335969396e-06, "loss": 0.11, "num_tokens": 67849053.0, "reward": 0.02421875111758709, "reward_std": 0.045765817165374756, "rewards/code_format_reward/mean": 0.1171875, "rewards/code_format_reward/std": 0.322907418012619, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3320184051990509, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.6875, "completions/max_length": 3937.0, "completions/max_terminated_length": 3391.0, "completions/mean_length": 1980.9765625, "completions/mean_terminated_length": 1105.8836669921875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 15.89655172413793, "frac_reward_zero_std": 0.375, "grad_norm": 0.7928131225427445, "kl": 2.44140625, "learning_rate": 3.2628434547191985e-06, "loss": 0.1144, "num_tokens": 68233690.0, "reward": 0.01718750037252903, "reward_std": 0.042055923491716385, "rewards/code_format_reward/mean": 0.0703125, "rewards/code_format_reward/std": 0.2566775679588318, "rewards/format_reward/mean": 0.1015625, "rewards/format_reward/std": 0.3032590448856354, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.4375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3099.0, "completions/mean_length": 2097.0, "completions/mean_terminated_length": 961.4871826171875, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 15.96551724137931, "frac_reward_zero_std": 0.5, "grad_norm": 1.0489042713154726, "kl": 2.595703125, "learning_rate": 3.2486420318601973e-06, "loss": 0.148, "num_tokens": 68632946.0, "reward": 0.02109375223517418, "reward_std": 0.03914884477853775, "rewards/code_format_reward/mean": 0.0859375, "rewards/code_format_reward/std": 0.2813730239868164, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3320184051990509, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.8125, "completions/max_length": 4073.0, "completions/max_terminated_length": 3331.0, "completions/mean_length": 1801.9296875, "completions/mean_terminated_length": 960.8778076171875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 16.06896551724138, "frac_reward_zero_std": 0.4375, "grad_norm": 4.154592071814475, "kl": 1.04931640625, "learning_rate": 3.2344196869802187e-06, "loss": 0.1281, "num_tokens": 68994665.0, "reward": 0.01875000074505806, "reward_std": 0.04007909819483757, "rewards/code_format_reward/mean": 0.078125, "rewards/code_format_reward/std": 0.2694226801395416, "rewards/format_reward/mean": 0.109375, "rewards/format_reward/std": 0.31333550810813904, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.875, "completions/max_length": 3865.0, "completions/max_terminated_length": 3621.0, "completions/mean_length": 1865.6015625, "completions/mean_terminated_length": 1114.8370361328125, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 16.137931034482758, "frac_reward_zero_std": 0.25, "grad_norm": 1.3591517303477938, "kl": 1.84765625, "learning_rate": 3.2201770168203694e-06, "loss": 0.1793, "num_tokens": 69364534.0, "reward": 0.02656250074505806, "reward_std": 0.05918382853269577, "rewards/code_format_reward/mean": 0.1328125, "rewards/code_format_reward/std": 0.3407054841518402, "rewards/format_reward/mean": 0.1328125, "rewards/format_reward/std": 0.3407054841518402, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.4375, "completions/max_length": 4077.0, "completions/max_terminated_length": 3580.0, "completions/mean_length": 2076.515625, "completions/mean_terminated_length": 942.3846435546875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 16.20689655172414, "frac_reward_zero_std": 0.1875, "grad_norm": 0.806887397746818, "kl": 5.0390625, "learning_rate": 3.205914618974563e-06, "loss": 0.25, "num_tokens": 69761400.0, "reward": 0.02812499925494194, "reward_std": 0.059036217629909515, "rewards/code_format_reward/mean": 0.1171875, "rewards/code_format_reward/std": 0.322907418012619, "rewards/format_reward/mean": 0.1640625, "rewards/format_reward/std": 0.371787428855896, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.3125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3545.0, "completions/mean_length": 2223.921875, "completions/mean_terminated_length": 1006.7838134765625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 16.275862068965516, "frac_reward_zero_std": 0.5, "grad_norm": 35.47745775038266, "kl": 26.03125, "learning_rate": 3.1916330918644496e-06, "loss": 0.4003, "num_tokens": 70176038.0, "reward": 0.01953125, "reward_std": 0.038390956819057465, "rewards/code_format_reward/mean": 0.0859375, "rewards/code_format_reward/std": 0.2813730239868164, "rewards/format_reward/mean": 0.109375, "rewards/format_reward/std": 0.31333550810813904, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.34375, "completions/max_length": 3898.0, "completions/max_terminated_length": 2502.0, "completions/mean_length": 2152.0859375, "completions/mean_terminated_length": 1088.0267333984375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 16.344827586206897, "frac_reward_zero_std": 0.375, "grad_norm": 1.2503280834421062, "kl": 2.466796875, "learning_rate": 3.177333034714303e-06, "loss": 0.1744, "num_tokens": 70582577.0, "reward": 0.01796874962747097, "reward_std": 0.042448077350854874, "rewards/code_format_reward/mean": 0.078125, "rewards/code_format_reward/std": 0.2694226801395416, "rewards/format_reward/mean": 0.1015625, "rewards/format_reward/std": 0.3032590448856354, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.21875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3593.0, "completions/mean_length": 2271.9375, "completions/mean_terminated_length": 950.7464599609375, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 16.413793103448278, "frac_reward_zero_std": 0.25, "grad_norm": 3.602351826361304, "kl": 6.76953125, "learning_rate": 3.1630150475258813e-06, "loss": 0.2266, "num_tokens": 71003289.0, "reward": 0.02265625074505806, "reward_std": 0.05599765479564667, "rewards/code_format_reward/mean": 0.109375, "rewards/code_format_reward/std": 0.31333550810813904, "rewards/format_reward/mean": 0.1171875, "rewards/format_reward/std": 0.322907418012619, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3756.0, "completions/mean_length": 2153.6953125, "completions/mean_terminated_length": 891.8289794921875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 16.482758620689655, "frac_reward_zero_std": 0.25, "grad_norm": 1.070332411448895, "kl": 3.322265625, "learning_rate": 3.148679731053252e-06, "loss": 0.2088, "num_tokens": 71409802.0, "reward": 0.02031249925494194, "reward_std": 0.046327732503414154, "rewards/code_format_reward/mean": 0.0703125, "rewards/code_format_reward/std": 0.2566775679588318, "rewards/format_reward/mean": 0.1328125, "rewards/format_reward/std": 0.3407054841518402, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2868.0, "completions/mean_length": 2141.875, "completions/mean_terminated_length": 917.4473876953125, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 16.551724137931036, "frac_reward_zero_std": 0.4375, "grad_norm": 5.424674912817685, "kl": 6.75, "learning_rate": 3.1343276867775805e-06, "loss": 0.1729, "num_tokens": 71814130.0, "reward": 0.01953125186264515, "reward_std": 0.03921514004468918, "rewards/code_format_reward/mean": 0.0859375, "rewards/code_format_reward/std": 0.2813730239868164, "rewards/format_reward/mean": 0.109375, "rewards/format_reward/std": 0.31333550810813904, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 4096.0, "completions/max_terminated_length": 2442.0, "completions/mean_length": 2093.890625, "completions/mean_terminated_length": 907.3125, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 16.620689655172413, "frac_reward_zero_std": 0.25, "grad_norm": 3.374418133895407, "kl": 2.6484375, "learning_rate": 3.1199595168819043e-06, "loss": 0.1271, "num_tokens": 72212076.0, "reward": 0.01718750037252903, "reward_std": 0.04312940686941147, "rewards/code_format_reward/mean": 0.0546875, "rewards/code_format_reward/std": 0.22826264798641205, "rewards/format_reward/mean": 0.1171875, "rewards/format_reward/std": 0.322907418012619, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.40625, "completions/max_length": 4090.0, "completions/max_terminated_length": 2976.0, "completions/mean_length": 2016.7421875, "completions/mean_terminated_length": 873.5584106445312, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 16.689655172413794, "frac_reward_zero_std": 0.25, "grad_norm": 3.8924129745454485, "kl": 1.4404296875, "learning_rate": 3.105575824225852e-06, "loss": 0.1844, "num_tokens": 72601291.0, "reward": 0.02500000037252903, "reward_std": 0.05301665514707565, "rewards/code_format_reward/mean": 0.1015625, "rewards/code_format_reward/std": 0.3032590448856354, "rewards/format_reward/mean": 0.1484375, "rewards/format_reward/std": 0.356930136680603, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.53125, "completions/max_length": 4037.0, "completions/max_terminated_length": 2321.0, "completions/mean_length": 1980.5859375, "completions/mean_terminated_length": 908.4815063476562, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 16.75862068965517, "frac_reward_zero_std": 0.375, "grad_norm": 1.4340625784111272, "kl": 1.90234375, "learning_rate": 3.091177212320363e-06, "loss": 0.1097, "num_tokens": 72985878.0, "reward": 0.01640624925494194, "reward_std": 0.03679375723004341, "rewards/code_format_reward/mean": 0.0546875, "rewards/code_format_reward/std": 0.22826264798641205, "rewards/format_reward/mean": 0.109375, "rewards/format_reward/std": 0.31333550810813904, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.28125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3397.0, "completions/mean_length": 2147.96875, "completions/mean_terminated_length": 995.643798828125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 16.82758620689655, "frac_reward_zero_std": 0.5, "grad_norm": 2.7268618682978207, "kl": 1.09375, "learning_rate": 3.0767642853023538e-06, "loss": 0.1017, "num_tokens": 73390282.0, "reward": 0.01953125, "reward_std": 0.035612352192401886, "rewards/code_format_reward/mean": 0.078125, "rewards/code_format_reward/std": 0.2694226801395416, "rewards/format_reward/mean": 0.1171875, "rewards/format_reward/std": 0.322907418012619, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.21875, "completions/max_length": 4071.0, "completions/max_terminated_length": 2508.0, "completions/mean_length": 2277.1484375, "completions/mean_terminated_length": 1013.3943481445312, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 16.896551724137932, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5995736918071287, "kl": 1.7578125, "learning_rate": 3.062337647909376e-06, "loss": 0.1374, "num_tokens": 73812829.0, "reward": 0.01093750074505806, "reward_std": 0.026821641251444817, "rewards/code_format_reward/mean": 0.0390625, "rewards/code_format_reward/std": 0.194504976272583, "rewards/format_reward/mean": 0.0703125, "rewards/format_reward/std": 0.2566775679588318, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.40625, "completions/max_length": 4029.0, "completions/max_terminated_length": 3600.0, "completions/mean_length": 2105.015625, "completions/mean_terminated_length": 944.467529296875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 16.96551724137931, "frac_reward_zero_std": 0.4375, "grad_norm": 1.857143231853844, "kl": 5.6171875, "learning_rate": 3.04789790545424e-06, "loss": 0.1826, "num_tokens": 74213343.0, "reward": 0.01171875, "reward_std": 0.02766144648194313, "rewards/code_format_reward/mean": 0.0234375, "rewards/code_format_reward/std": 0.15188287198543549, "rewards/format_reward/mean": 0.09375, "rewards/format_reward/std": 0.29262590408325195, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.4375, "completions/max_length": 4077.0, "completions/max_terminated_length": 2771.0, "completions/mean_length": 1964.4296875, "completions/mean_terminated_length": 838.923095703125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 17.06896551724138, "frac_reward_zero_std": 0.375, "grad_norm": 4.017797181216476, "kl": 6.318359375, "learning_rate": 3.033445663799621e-06, "loss": 0.2199, "num_tokens": 74595862.0, "reward": 0.02031249925494194, "reward_std": 0.044045768678188324, "rewards/code_format_reward/mean": 0.0859375, "rewards/code_format_reward/std": 0.2813730239868164, "rewards/format_reward/mean": 0.1171875, "rewards/format_reward/std": 0.322907418012619, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.25, "completions/max_length": 3966.0, "completions/max_terminated_length": 3177.0, "completions/mean_length": 2182.3359375, "completions/mean_terminated_length": 938.0972290039062, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 17.137931034482758, "frac_reward_zero_std": 0.4375, "grad_norm": 3.343975298515327, "kl": 4.91796875, "learning_rate": 3.018981529332633e-06, "loss": 0.1409, "num_tokens": 75006273.0, "reward": 0.015625, "reward_std": 0.0354263111948967, "rewards/code_format_reward/mean": 0.0625, "rewards/code_format_reward/std": 0.24301259219646454, "rewards/format_reward/mean": 0.09375, "rewards/format_reward/std": 0.29262590408325195, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.40625, "completions/max_length": 3984.0, "completions/max_terminated_length": 2902.0, "completions/mean_length": 2097.34375, "completions/mean_terminated_length": 961.1428833007812, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 17.20689655172414, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6967963330727057, "kl": 2.060546875, "learning_rate": 3.00450610893939e-06, "loss": 0.1658, "num_tokens": 75405805.0, "reward": 0.01640625111758709, "reward_std": 0.037123169749975204, "rewards/code_format_reward/mean": 0.0625, "rewards/code_format_reward/std": 0.24301259219646454, "rewards/format_reward/mean": 0.1015625, "rewards/format_reward/std": 0.3032590448856354, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.375, "completions/max_length": 4090.0, "completions/max_terminated_length": 3535.0, "completions/mean_length": 2127.8203125, "completions/mean_terminated_length": 835.5394897460938, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 17.275862068965516, "frac_reward_zero_std": 0.625, "grad_norm": 0.7450705734420142, "kl": 2.42041015625, "learning_rate": 2.9900200099795396e-06, "loss": 0.1491, "num_tokens": 75809238.0, "reward": 0.01484375074505806, "reward_std": 0.02863791026175022, "rewards/code_format_reward/mean": 0.0625, "rewards/code_format_reward/std": 0.24301259219646454, "rewards/format_reward/mean": 0.0859375, "rewards/format_reward/std": 0.2813730239868164, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.46875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3596.0, "completions/mean_length": 2129.953125, "completions/mean_terminated_length": 1060.5570068359375, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 17.344827586206897, "frac_reward_zero_std": 0.4375, "grad_norm": 1.2848423270886091, "kl": 1.453125, "learning_rate": 2.9755238402607826e-06, "loss": 0.1156, "num_tokens": 76210872.0, "reward": 0.01484375074505806, "reward_std": 0.0329950749874115, "rewards/code_format_reward/mean": 0.0390625, "rewards/code_format_reward/std": 0.194504976272583, "rewards/format_reward/mean": 0.109375, "rewards/format_reward/std": 0.31333550810813904, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.3125, "completions/max_length": 3848.0, "completions/max_terminated_length": 3738.0, "completions/mean_length": 1911.796875, "completions/mean_terminated_length": 846.108154296875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 17.413793103448278, "frac_reward_zero_std": 0.5, "grad_norm": 4.043571004615006, "kl": 4.5546875, "learning_rate": 2.961018208013367e-06, "loss": 0.1418, "num_tokens": 76586654.0, "reward": 0.015625, "reward_std": 0.032934483140707016, "rewards/code_format_reward/mean": 0.0546875, "rewards/code_format_reward/std": 0.22826264798641205, "rewards/format_reward/mean": 0.1015625, "rewards/format_reward/std": 0.3032590448856354, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.78125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3070.0, "completions/mean_length": 1698.1171875, "completions/mean_terminated_length": 861.528076171875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 17.482758620689655, "frac_reward_zero_std": 0.25, "grad_norm": 3.2520289626867145, "kl": 2.818359375, "learning_rate": 2.9465037218645694e-06, "loss": 0.1809, "num_tokens": 76933941.0, "reward": 0.014843749813735485, "reward_std": 0.03893200308084488, "rewards/code_format_reward/mean": 0.0390625, "rewards/code_format_reward/std": 0.194504976272583, "rewards/format_reward/mean": 0.109375, "rewards/format_reward/std": 0.31333550810813904, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.6875, "completions/max_length": 3886.0, "completions/max_terminated_length": 3092.0, "completions/mean_length": 1711.015625, "completions/mean_terminated_length": 722.279052734375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 17.551724137931036, "frac_reward_zero_std": 0.1875, "grad_norm": 1.386808868017716, "kl": 4.33203125, "learning_rate": 2.9319809908131604e-06, "loss": 0.2472, "num_tokens": 77284023.0, "reward": 0.02031250111758709, "reward_std": 0.04664548113942146, "rewards/code_format_reward/mean": 0.0625, "rewards/code_format_reward/std": 0.24301259219646454, "rewards/format_reward/mean": 0.140625, "rewards/format_reward/std": 0.3490002751350403, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.59375, "completions/max_length": 3933.0, "completions/max_terminated_length": 2892.0, "completions/mean_length": 1690.3125, "completions/mean_terminated_length": 672.7830810546875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 17.620689655172413, "frac_reward_zero_std": 0.1875, "grad_norm": 5.6917441385024565, "kl": 4.3447265625, "learning_rate": 2.917450624203847e-06, "loss": 0.1689, "num_tokens": 77631455.0, "reward": 0.01953125, "reward_std": 0.04777955263853073, "rewards/code_format_reward/mean": 0.0625, "rewards/code_format_reward/std": 0.24301259219646454, "rewards/format_reward/mean": 0.1328125, "rewards/format_reward/std": 0.3407054841518402, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.59375, "completions/max_length": 4021.0, "completions/max_terminated_length": 3440.0, "completions/mean_length": 1811.625, "completions/mean_terminated_length": 722.795166015625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 17.689655172413794, "frac_reward_zero_std": 0.5, "grad_norm": 5.136435933319982, "kl": 2.98046875, "learning_rate": 2.9029132317017118e-06, "loss": 0.1391, "num_tokens": 77994415.0, "reward": 0.01328125037252903, "reward_std": 0.03055463545024395, "rewards/code_format_reward/mean": 0.046875, "rewards/code_format_reward/std": 0.21220162510871887, "rewards/format_reward/mean": 0.0859375, "rewards/format_reward/std": 0.2813730239868164, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.6875, "completions/max_length": 4037.0, "completions/max_terminated_length": 2347.0, "completions/mean_length": 1600.0390625, "completions/mean_terminated_length": 605.1046752929688, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 17.75862068965517, "frac_reward_zero_std": 0.3125, "grad_norm": 1.9956171932110272, "kl": 6.34375, "learning_rate": 2.888369423266629e-06, "loss": 0.254, "num_tokens": 78330292.0, "reward": 0.02734375, "reward_std": 0.047707222402095795, "rewards/code_format_reward/mean": 0.09375, "rewards/code_format_reward/std": 0.29262590408325195, "rewards/format_reward/mean": 0.1796875, "rewards/format_reward/std": 0.3854354918003082, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2239.0, "completions/mean_length": 1378.984375, "completions/mean_terminated_length": 578.1224365234375, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 17.82758620689655, "frac_reward_zero_std": 0.25, "grad_norm": 5.423762267243065, "kl": 3.732421875, "learning_rate": 2.8738198091276712e-06, "loss": 0.1456, "num_tokens": 78637642.0, "reward": 0.02578125149011612, "reward_std": 0.0503658652305603, "rewards/code_format_reward/mean": 0.0859375, "rewards/code_format_reward/std": 0.2813730239868164, "rewards/format_reward/mean": 0.171875, "rewards/format_reward/std": 0.3787541687488556, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3603.0, "completions/mean_length": 1465.65625, "completions/mean_terminated_length": 757.0425415039062, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 17.896551724137932, "frac_reward_zero_std": 0.375, "grad_norm": 0.5512289745895489, "kl": 1.080078125, "learning_rate": 2.859264999757509e-06, "loss": 0.1129, "num_tokens": 78954782.0, "reward": 0.01875000074505806, "reward_std": 0.040201522409915924, "rewards/code_format_reward/mean": 0.0625, "rewards/code_format_reward/std": 0.24301259219646454, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3320184051990509, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3106.0, "completions/mean_length": 1494.125, "completions/mean_terminated_length": 739.6382446289062, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 17.96551724137931, "frac_reward_zero_std": 0.4375, "grad_norm": 4.597060014349811, "kl": 1.115234375, "learning_rate": 2.8447056058467928e-06, "loss": 0.1514, "num_tokens": 79275934.0, "reward": 0.01328125037252903, "reward_std": 0.028737083077430725, "rewards/code_format_reward/mean": 0.03125, "rewards/code_format_reward/std": 0.1746762990951538, "rewards/format_reward/mean": 0.1015625, "rewards/format_reward/std": 0.3032590448856354, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.21875, "completions/max_length": 3973.0, "completions/max_terminated_length": 2990.0, "completions/mean_length": 1130.59375, "completions/mean_terminated_length": 561.3689575195312, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 18.06896551724138, "frac_reward_zero_std": 0.25, "grad_norm": 5.182421375878051, "kl": 0.8466796875, "learning_rate": 2.830142238278531e-06, "loss": 0.1517, "num_tokens": 79551722.0, "reward": 0.02734375186264515, "reward_std": 0.04687388986349106, "rewards/code_format_reward/mean": 0.0625, "rewards/code_format_reward/std": 0.24301259219646454, "rewards/format_reward/mean": 0.2109375, "rewards/format_reward/std": 0.4095771610736847, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.34375, "completions/max_length": 3974.0, "completions/max_terminated_length": 2633.0, "completions/mean_length": 1148.2265625, "completions/mean_terminated_length": 685.1307983398438, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 18.137931034482758, "frac_reward_zero_std": 0.125, "grad_norm": 32.76938325038699, "kl": 2.17578125, "learning_rate": 2.81557550810246e-06, "loss": 0.1685, "num_tokens": 79829767.0, "reward": 0.03125, "reward_std": 0.058743592351675034, "rewards/code_format_reward/mean": 0.109375, "rewards/code_format_reward/std": 0.31333550810813904, "rewards/format_reward/mean": 0.203125, "rewards/format_reward/std": 0.40390563011169434, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.21875, "completions/max_length": 3989.0, "completions/max_terminated_length": 2692.0, "completions/mean_length": 1138.234375, "completions/mean_terminated_length": 537.9514770507812, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 18.20689655172414, "frac_reward_zero_std": 0.4375, "grad_norm": 1.481458883574915, "kl": 2.66650390625, "learning_rate": 2.8010060265094026e-06, "loss": 0.0941, "num_tokens": 80106533.0, "reward": 0.01953125, "reward_std": 0.03758678585290909, "rewards/code_format_reward/mean": 0.0703125, "rewards/code_format_reward/std": 0.2566775679588318, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3320184051990509, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 3841.0, "completions/max_terminated_length": 3841.0, "completions/mean_length": 1103.625, "completions/mean_terminated_length": 743.2232666015625, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 18.275862068965516, "frac_reward_zero_std": 0.25, "grad_norm": 3.1199833226592513, "kl": 0.8916015625, "learning_rate": 2.786434404805629e-06, "loss": 0.1961, "num_tokens": 80378637.0, "reward": 0.02031249925494194, "reward_std": 0.04190831631422043, "rewards/code_format_reward/mean": 0.0546875, "rewards/code_format_reward/std": 0.22826264798641205, "rewards/format_reward/mean": 0.1484375, "rewards/format_reward/std": 0.356930136680603, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2547.0, "completions/mean_length": 1401.6484375, "completions/mean_terminated_length": 601.9381103515625, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 18.344827586206897, "frac_reward_zero_std": 0.1875, "grad_norm": 7.691397550402843, "kl": 4.43359375, "learning_rate": 2.771861254387199e-06, "loss": 0.3108, "num_tokens": 80688680.0, "reward": 0.02421875111758709, "reward_std": 0.044612735509872437, "rewards/code_format_reward/mean": 0.0546875, "rewards/code_format_reward/std": 0.22826264798641205, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.39184603095054626, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 3766.0, "completions/max_terminated_length": 3421.0, "completions/mean_length": 1048.765625, "completions/mean_terminated_length": 709.5000610351562, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 18.413793103448278, "frac_reward_zero_std": 0.125, "grad_norm": 2.0199364421525243, "kl": 5.18212890625, "learning_rate": 2.7572871867143204e-06, "loss": 0.1734, "num_tokens": 80951730.0, "reward": 0.02421874925494194, "reward_std": 0.05020953714847565, "rewards/code_format_reward/mean": 0.0625, "rewards/code_format_reward/std": 0.24301259219646454, "rewards/format_reward/mean": 0.1796875, "rewards/format_reward/std": 0.3854354918003082, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "completions/max_length": 3779.0, "completions/max_terminated_length": 2666.0, "completions/mean_length": 837.921875, "completions/mean_terminated_length": 614.5641479492188, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 18.482758620689655, "frac_reward_zero_std": 0.125, "grad_norm": 2.2037671170022564, "kl": 1.66796875, "learning_rate": 2.742712813285681e-06, "loss": 0.1824, "num_tokens": 81190056.0, "reward": 0.02656250074505806, "reward_std": 0.051087357103824615, "rewards/code_format_reward/mean": 0.0703125, "rewards/code_format_reward/std": 0.2566775679588318, "rewards/format_reward/mean": 0.1953125, "rewards/format_reward/std": 0.3979988098144531, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "completions/max_length": 3981.0, "completions/max_terminated_length": 3981.0, "completions/mean_length": 611.421875, "completions/mean_terminated_length": 415.0254211425781, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 18.551724137931036, "frac_reward_zero_std": 0.125, "grad_norm": 6.316499749233628, "kl": 5.046875, "learning_rate": 2.7281387456128017e-06, "loss": 0.1215, "num_tokens": 81398222.0, "reward": 0.02734375, "reward_std": 0.051739681512117386, "rewards/code_format_reward/mean": 0.0625, "rewards/code_format_reward/std": 0.24301259219646454, "rewards/format_reward/mean": 0.2109375, "rewards/format_reward/std": 0.4095771610736847, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 3816.0, "completions/max_terminated_length": 3276.0, "completions/mean_length": 763.0234375, "completions/mean_terminated_length": 485.97412109375, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 18.620689655172413, "frac_reward_zero_std": 0.25, "grad_norm": 3.686627518077638, "kl": 4.93212890625, "learning_rate": 2.7135655951943716e-06, "loss": 0.174, "num_tokens": 81626961.0, "reward": 0.0234375, "reward_std": 0.03926009684801102, "rewards/code_format_reward/mean": 0.03125, "rewards/code_format_reward/std": 0.1746762990951538, "rewards/format_reward/mean": 0.203125, "rewards/format_reward/std": 0.40390563011169434, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5625, "completions/max_length": 4021.0, "completions/max_terminated_length": 2366.0, "completions/mean_length": 721.484375, "completions/mean_terminated_length": 390.7719421386719, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 18.689655172413794, "frac_reward_zero_std": 0.0, "grad_norm": 10.577008880883428, "kl": 9.1611328125, "learning_rate": 2.698993973490598e-06, "loss": 0.257, "num_tokens": 81850383.0, "reward": 0.02890624850988388, "reward_std": 0.05324123799800873, "rewards/code_format_reward/mean": 0.0390625, "rewards/code_format_reward/std": 0.194504976272583, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.434714138507843, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "completions/max_length": 2860.0, "completions/max_terminated_length": 1720.0, "completions/mean_length": 449.2578125, "completions/mean_terminated_length": 372.7131042480469, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 18.75862068965517, "frac_reward_zero_std": 0.25, "grad_norm": 6.837143928180438, "kl": 1.724609375, "learning_rate": 2.6844244918975416e-06, "loss": 0.1715, "num_tokens": 82038056.0, "reward": 0.02265625074505806, "reward_std": 0.042552001774311066, "rewards/code_format_reward/mean": 0.0390625, "rewards/code_format_reward/std": 0.194504976272583, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.39184603095054626, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "completions/max_length": 2975.0, "completions/max_terminated_length": 1633.0, "completions/mean_length": 416.8125, "completions/mean_terminated_length": 328.70489501953125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 18.82758620689655, "frac_reward_zero_std": 0.125, "grad_norm": 53.29809766701044, "kl": 9.4677734375, "learning_rate": 2.66985776172147e-06, "loss": 0.3514, "num_tokens": 82222480.0, "reward": 0.01875000074505806, "reward_std": 0.038377560675144196, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.1796875, "rewards/format_reward/std": 0.3854354918003082, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "completions/max_length": 3628.0, "completions/max_terminated_length": 3545.0, "completions/mean_length": 472.5625, "completions/mean_terminated_length": 401.1788330078125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 18.896551724137932, "frac_reward_zero_std": 0.25, "grad_norm": 2.5521363288691994, "kl": 0.8603515625, "learning_rate": 2.6552943941532088e-06, "loss": 0.2253, "num_tokens": 82412896.0, "reward": 0.02109375037252903, "reward_std": 0.03715597838163376, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.1953125, "rewards/format_reward/std": 0.3979988098144531, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "completions/max_length": 1946.0, "completions/max_terminated_length": 1946.0, "completions/mean_length": 361.4375, "completions/mean_terminated_length": 295.4098205566406, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 18.96551724137931, "frac_reward_zero_std": 0.1875, "grad_norm": 1.9363820641920646, "kl": 1.884765625, "learning_rate": 2.6407350002424927e-06, "loss": 0.1635, "num_tokens": 82590232.0, "reward": 0.02578125149011612, "reward_std": 0.03964492678642273, "rewards/code_format_reward/mean": 0.0234375, "rewards/code_format_reward/std": 0.15188287198543549, "rewards/format_reward/mean": 0.234375, "rewards/format_reward/std": 0.42527204751968384, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "completions/max_length": 3701.0, "completions/max_terminated_length": 2376.0, "completions/mean_length": 302.90625, "completions/mean_terminated_length": 252.00001525878906, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 19.06896551724138, "frac_reward_zero_std": 0.0625, "grad_norm": 2.944937731689937, "kl": 2.4765625, "learning_rate": 2.626180190872329e-06, "loss": 0.339, "num_tokens": 82760076.0, "reward": 0.02734375, "reward_std": 0.046176549047231674, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.2578125, "rewards/format_reward/std": 0.43914902210235596, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "completions/max_length": 1493.0, "completions/max_terminated_length": 1493.0, "completions/mean_length": 242.21875, "completions/mean_terminated_length": 230.05557250976562, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 19.137931034482758, "frac_reward_zero_std": 0.0, "grad_norm": 15.57910661584801, "kl": 11.6650390625, "learning_rate": 2.611630576733372e-06, "loss": 0.4444, "num_tokens": 82921008.0, "reward": 0.03046875074505806, "reward_std": 0.051431089639663696, "rewards/code_format_reward/mean": 0.0234375, "rewards/code_format_reward/std": 0.15188287198543549, "rewards/format_reward/mean": 0.28125, "rewards/format_reward/std": 0.4513758420944214, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "completions/max_length": 2553.0, "completions/max_terminated_length": 2553.0, "completions/mean_length": 179.9765625, "completions/mean_terminated_length": 150.3064422607422, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 19.20689655172414, "frac_reward_zero_std": 0.0, "grad_norm": 7.323966437247427, "kl": 12.451171875, "learning_rate": 2.5970867682982885e-06, "loss": 0.7394, "num_tokens": 83074885.0, "reward": 0.04296875, "reward_std": 0.05002420395612717, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.4296875, "rewards/format_reward/std": 0.4969765841960907, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 82.8984375, "completions/mean_terminated_length": 79.51181030273438, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 19.275862068965516, "frac_reward_zero_std": 0.0, "grad_norm": 8.931857110772658, "kl": 3.00390625, "learning_rate": 2.582549375796154e-06, "loss": 0.4891, "num_tokens": 83216568.0, "reward": 0.04843749850988388, "reward_std": 0.05170843005180359, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.484375, "rewards/format_reward/std": 0.5017194747924805, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 85.921875, "completions/mean_terminated_length": 77.81600189208984, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 19.344827586206897, "frac_reward_zero_std": 0.0, "grad_norm": 19.11515653931392, "kl": 9.69921875, "learning_rate": 2.568019009186841e-06, "loss": 0.5259, "num_tokens": 83356638.0, "reward": 0.04140625149011612, "reward_std": 0.05091936141252518, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.4140625, "rewards/format_reward/std": 0.49449479579925537, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 67.5078125, "completions/mean_terminated_length": 66.23622131347656, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 19.413793103448278, "frac_reward_zero_std": 0.0, "grad_norm": 17.639456329459385, "kl": 6.4296875, "learning_rate": 2.5534962781354317e-06, "loss": 0.4702, "num_tokens": 83496351.0, "reward": 0.04296875, "reward_std": 0.049871139228343964, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.4296875, "rewards/format_reward/std": 0.4969765841960907, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 3172.0, "completions/max_terminated_length": 1793.0, "completions/mean_length": 215.1796875, "completions/mean_terminated_length": 137.94168090820312, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 19.482758620689655, "frac_reward_zero_std": 0.125, "grad_norm": 8.866046319285678, "kl": 9.9765625, "learning_rate": 2.538981791986634e-06, "loss": 0.4059, "num_tokens": 83654966.0, "reward": 0.0234375, "reward_std": 0.03877146169543266, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.234375, "rewards/format_reward/std": 0.42527204751968384, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.40625, "completions/max_length": 3662.0, "completions/max_terminated_length": 2852.0, "completions/mean_length": 861.859375, "completions/mean_terminated_length": 471.21099853515625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 19.551724137931036, "frac_reward_zero_std": 0.3125, "grad_norm": 78.11921241015132, "kl": 51.28125, "learning_rate": 2.524476159739218e-06, "loss": 0.5243, "num_tokens": 83895188.0, "reward": 0.012500000186264515, "reward_std": 0.027382206171751022, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3320184051990509, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.59375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3298.0, "completions/mean_length": 1974.359375, "completions/mean_terminated_length": 870.5783081054688, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 19.620689655172413, "frac_reward_zero_std": 0.5, "grad_norm": 147.7465652644851, "kl": 100.5, "learning_rate": 2.5099799900204607e-06, "loss": 1.0181, "num_tokens": 84277810.0, "reward": 0.007031249813735485, "reward_std": 0.018361147493124008, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0703125, "rewards/format_reward/std": 0.2566775679588318, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.21875, "completions/max_length": 3954.0, "completions/max_terminated_length": 3934.0, "completions/mean_length": 2908.65625, "completions/mean_terminated_length": 965.1282348632812, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 19.689655172413794, "frac_reward_zero_std": 0.8125, "grad_norm": 65.90059415462332, "kl": 47.6875, "learning_rate": 2.4954938910606108e-06, "loss": 0.4934, "num_tokens": 84781190.0, "reward": 0.0023437500931322575, "reward_std": 0.00662912568077445, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0234375, "rewards/format_reward/std": 0.15188287198543549, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3964.0, "completions/max_terminated_length": 3425.0, "completions/mean_length": 3164.609375, "completions/mean_terminated_length": 1118.7999267578125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 19.75862068965517, "frac_reward_zero_std": 0.9375, "grad_norm": 44.797485068125546, "kl": 32.0625, "learning_rate": 2.481018470667368e-06, "loss": 0.3168, "num_tokens": 85317332.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 3937.0, "completions/max_terminated_length": 2285.0, "completions/mean_length": 3499.0546875, "completions/mean_terminated_length": 694.1666870117188, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 19.82758620689655, "frac_reward_zero_std": 0.6875, "grad_norm": 14.746233541391186, "kl": 10.765625, "learning_rate": 2.4665543362003802e-06, "loss": 0.1122, "num_tokens": 85896283.0, "reward": 0.004687500186264515, "reward_std": 0.01173202134668827, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.046875, "rewards/format_reward/std": 0.21220162510871887, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.84375, "completions/max_length": 4096.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 3891.953125, "completions/mean_terminated_length": 917.2000122070312, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 19.896551724137932, "frac_reward_zero_std": 0.75, "grad_norm": 14.791600919357032, "kl": 8.171875, "learning_rate": 2.4521020945457615e-06, "loss": 0.079, "num_tokens": 86525085.0, "reward": 0.0031250000465661287, "reward_std": 0.0088388342410326, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.03125, "rewards/format_reward/std": 0.1746762990951538, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 4073.0, "completions/max_terminated_length": 1048.0, "completions/mean_length": 3757.9296875, "completions/mean_terminated_length": 701.5, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 19.96551724137931, "frac_reward_zero_std": 0.75, "grad_norm": 7.958211027904403, "kl": 6.0234375, "learning_rate": 2.4376623520906255e-06, "loss": 0.0593, "num_tokens": 87137172.0, "reward": 0.00390625, "reward_std": 0.009522313252091408, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0390625, "rewards/format_reward/std": 0.194504976272583, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 4078.0, "completions/max_terminated_length": 1047.0, "completions/mean_length": 3547.34375, "completions/mean_terminated_length": 1047.0, "completions/min_length": 1047.0, "completions/min_terminated_length": 1047.0, "epoch": 20.06896551724138, "frac_reward_zero_std": 0.6875, "grad_norm": 5.354544835330795, "kl": 2.328125, "learning_rate": 2.4232357146976478e-06, "loss": 0.0242, "num_tokens": 87722304.0, "reward": 0.007031249813735485, "reward_std": 0.012863079085946083, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0703125, "rewards/format_reward/std": 0.2566775679588318, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 4021.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 3712.1796875, "completions/mean_terminated_length": 644.6666870117188, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 20.137931034482758, "frac_reward_zero_std": 0.625, "grad_norm": 11.135894730059231, "kl": 1.595703125, "learning_rate": 2.408822787679637e-06, "loss": 0.0132, "num_tokens": 88328535.0, "reward": 0.0078125, "reward_std": 0.01507278811186552, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.078125, "rewards/format_reward/std": 0.2694226801395416, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 3968.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 3798.5, "completions/mean_terminated_length": 393.0, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 20.20689655172414, "frac_reward_zero_std": 0.375, "grad_norm": 13.397806501206668, "kl": 0.9453125, "learning_rate": 2.3944241757741475e-06, "loss": 0.014, "num_tokens": 88945815.0, "reward": 0.014062500558793545, "reward_std": 0.02688095159828663, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.140625, "rewards/format_reward/std": 0.3490002751350403, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3989.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 3693.9921875, "completions/mean_terminated_length": 788.0, "completions/min_length": 788.0, "completions/min_terminated_length": 788.0, "epoch": 20.275862068965516, "frac_reward_zero_std": 0.3125, "grad_norm": 16.532779819726088, "kl": 1.384765625, "learning_rate": 2.380040483118097e-06, "loss": 0.0148, "num_tokens": 89549718.0, "reward": 0.01250000111758709, "reward_std": 0.027382206171751022, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3320184051990509, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4096.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3993.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 3865.0, "completions/min_terminated_length": 0.0, "epoch": 20.344827586206897, "frac_reward_zero_std": 0.125, "grad_norm": 31.110583034199667, "kl": 2.390625, "learning_rate": 2.365672313222419e-06, "loss": 0.0239, "num_tokens": 90191662.0, "reward": 0.01796875149011612, "reward_std": 0.03640326112508774, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.1796875, "rewards/format_reward/std": 0.3854354918003082, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4073.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3378.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2581.0, "completions/min_terminated_length": 0.0, "epoch": 20.413793103448278, "frac_reward_zero_std": 0.125, "grad_norm": 26.367316082400894, "kl": 3.13671875, "learning_rate": 2.351320268946749e-06, "loss": 0.0314, "num_tokens": 90755118.0, "reward": 0.02265625074505806, "reward_std": 0.03753383085131645, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.2265625, "rewards/format_reward/std": 0.4202519655227661, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3311.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 2773.25, "completions/mean_terminated_length": 90.0, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 20.482758620689655, "frac_reward_zero_std": 0.1875, "grad_norm": 102.269291439468, "kl": 2.6015625, "learning_rate": 2.336984952474119e-06, "loss": 0.0289, "num_tokens": 91241166.0, "reward": 0.0234375, "reward_std": 0.035088710486888885, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.234375, "rewards/format_reward/std": 0.42527204751968384, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3876.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3458.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 3164.0, "completions/min_terminated_length": 0.0, "epoch": 20.551724137931036, "frac_reward_zero_std": 0.0, "grad_norm": 72.51493009270023, "kl": 2.42578125, "learning_rate": 2.322666965285697e-06, "loss": 0.0243, "num_tokens": 91814894.0, "reward": 0.02421875111758709, "reward_std": 0.043605104088783264, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.234375, "rewards/format_reward/std": 0.42527204751968384, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3092.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2445.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 1085.0, "completions/min_terminated_length": 0.0, "epoch": 20.620689655172413, "frac_reward_zero_std": 0.25, "grad_norm": 78.00543847542036, "kl": 2.6484375, "learning_rate": 2.3083669081355507e-06, "loss": 0.0265, "num_tokens": 92258958.0, "reward": 0.015625, "reward_std": 0.03164235129952431, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.15625, "rewards/format_reward/std": 0.3645188808441162, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3562.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2766.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 1714.0, "completions/min_terminated_length": 0.0, "epoch": 20.689655172413794, "frac_reward_zero_std": 0.5, "grad_norm": 26.563040880287073, "kl": 3.9765625, "learning_rate": 2.2940853810254377e-06, "loss": 0.0398, "num_tokens": 92744110.0, "reward": 0.00937500037252903, "reward_std": 0.020875994116067886, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0859375, "rewards/format_reward/std": 0.2813730239868164, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4096.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3666.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 3145.0, "completions/min_terminated_length": 0.0, "epoch": 20.75862068965517, "frac_reward_zero_std": 0.5, "grad_norm": 53.7635867052342, "kl": 18.375, "learning_rate": 2.2798229831796313e-06, "loss": 0.184, "num_tokens": 93342214.0, "reward": 0.0078125, "reward_std": 0.019044626504182816, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.078125, "rewards/format_reward/std": 0.2694226801395416, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4096.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3069.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1434.0, "completions/min_terminated_length": 0.0, "epoch": 20.82758620689655, "frac_reward_zero_std": 0.9375, "grad_norm": 17.9323099281784, "kl": 7.25, "learning_rate": 2.2655803130197816e-06, "loss": 0.0726, "num_tokens": 93863854.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4090.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2860.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 855.0, "completions/min_terminated_length": 0.0, "epoch": 20.896551724137932, "frac_reward_zero_std": 0.875, "grad_norm": 7.2071875811335255, "kl": 4.671875, "learning_rate": 2.2513579681398034e-06, "loss": 0.0468, "num_tokens": 94361038.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 1379.0, "completions/mean_length": 2156.0390625, "completions/mean_terminated_length": 695.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 20.96551724137931, "frac_reward_zero_std": 0.8125, "grad_norm": 6.4429732989107045, "kl": 3.7734375, "learning_rate": 2.237156545280803e-06, "loss": 0.0392, "num_tokens": 94766739.0, "reward": 0.004687500186264515, "reward_std": 0.0077601829543709755, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.046875, "rewards/format_reward/std": 0.21220162510871887, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3844.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3617.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 3213.0, "completions/min_terminated_length": 0.0, "epoch": 21.06896551724138, "frac_reward_zero_std": 0.4375, "grad_norm": 40.14795749530242, "kl": 2.421875, "learning_rate": 2.2229766403060403e-06, "loss": 0.0242, "num_tokens": 95360883.0, "reward": 0.00937500037252903, "reward_std": 0.021018434315919876, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.09375, "rewards/format_reward/std": 0.29262590408325195, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3431.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2625.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 1581.0, "completions/min_terminated_length": 0.0, "epoch": 21.137931034482758, "frac_reward_zero_std": 0.6875, "grad_norm": 21.982058945015865, "kl": 3.59765625, "learning_rate": 2.2088188481759305e-06, "loss": 0.036, "num_tokens": 95826851.0, "reward": 0.004687500186264515, "reward_std": 0.01173202134668827, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.046875, "rewards/format_reward/std": 0.21220162510871887, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4090.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3675.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 3390.0, "completions/min_terminated_length": 0.0, "epoch": 21.20689655172414, "frac_reward_zero_std": 0.3125, "grad_norm": 32.96333597644167, "kl": 3.59375, "learning_rate": 2.194683762923073e-06, "loss": 0.036, "num_tokens": 96428323.0, "reward": 0.01171875, "reward_std": 0.026698727160692215, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.1171875, "rewards/format_reward/std": 0.322907418012619, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3826.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3617.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 3172.0, "completions/min_terminated_length": 0.0, "epoch": 21.275862068965516, "frac_reward_zero_std": 0.4375, "grad_norm": 36.29651807443635, "kl": 2.99609375, "learning_rate": 2.1805719776273387e-06, "loss": 0.03, "num_tokens": 97022371.0, "reward": 0.00937500037252903, "reward_std": 0.021937813609838486, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.09375, "rewards/format_reward/std": 0.29262590408325195, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4021.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3392.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 2620.0, "completions/min_terminated_length": 0.0, "epoch": 21.344827586206897, "frac_reward_zero_std": 0.3125, "grad_norm": 108.61061705256071, "kl": 7.4140625, "learning_rate": 2.166484084390974e-06, "loss": 0.0743, "num_tokens": 97586515.0, "reward": 0.01249999925494194, "reward_std": 0.027382206171751022, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3320184051990509, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 3954.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 3304.625, "completions/mean_terminated_length": 142.0, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 21.413793103448278, "frac_reward_zero_std": 0.6875, "grad_norm": 77.59864154798734, "kl": 20.03125, "learning_rate": 2.1524206743137636e-06, "loss": 0.1997, "num_tokens": 98140579.0, "reward": 0.004687500186264515, "reward_std": 0.011732022278010845, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.046875, "rewards/format_reward/std": 0.21220162510871887, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4096.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 4029.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 3961.0, "completions/min_terminated_length": 0.0, "epoch": 21.482758620689655, "frac_reward_zero_std": 0.6875, "grad_norm": 49.341691513169614, "kl": 7.4609375, "learning_rate": 2.1383823374682287e-06, "loss": 0.0746, "num_tokens": 98785411.0, "reward": 0.00390625, "reward_std": 0.011048542335629463, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0390625, "rewards/format_reward/std": 0.194504976272583, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3212.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2480.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 1502.0, "completions/min_terminated_length": 0.0, "epoch": 21.551724137931036, "frac_reward_zero_std": 1.0, "grad_norm": 9.020518030640764, "kl": 4.37890625, "learning_rate": 2.124369662874868e-06, "loss": 0.0438, "num_tokens": 99232859.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3966.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2278.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 683.0, "completions/min_terminated_length": 0.0, "epoch": 21.620689655172413, "frac_reward_zero_std": 1.0, "grad_norm": 6.851405408286888, "kl": 3.51953125, "learning_rate": 2.110383238477441e-06, "loss": 0.0352, "num_tokens": 99655379.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3984.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1785.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 710.0, "completions/min_terminated_length": 0.0, "epoch": 21.689655172413794, "frac_reward_zero_std": 1.0, "grad_norm": 3.5568597923301497, "kl": 3.7109375, "learning_rate": 2.096423651118305e-06, "loss": 0.0371, "num_tokens": 100014995.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3675.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2987.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 2231.0, "completions/min_terminated_length": 0.0, "epoch": 21.75862068965517, "frac_reward_zero_std": 1.0, "grad_norm": 2.4105407866695843, "kl": 2.31640625, "learning_rate": 2.082491486513788e-06, "loss": 0.0232, "num_tokens": 100528499.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2473.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1712.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 791.0, "completions/min_terminated_length": 0.0, "epoch": 21.82758620689655, "frac_reward_zero_std": 1.0, "grad_norm": 1.6596995368025504, "kl": 2.14453125, "learning_rate": 2.0685873292296116e-06, "loss": 0.0214, "num_tokens": 100878739.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3310.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 2706.03125, "completions/mean_terminated_length": 70.0, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 21.896551724137932, "frac_reward_zero_std": 1.0, "grad_norm": 1.7381919799536105, "kl": 1.734375, "learning_rate": 2.054711762656369e-06, "loss": 0.0173, "num_tokens": 101356183.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3623.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3105.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 1846.0, "completions/min_terminated_length": 0.0, "epoch": 21.96551724137931, "frac_reward_zero_std": 1.0, "grad_norm": 1.3083439279190165, "kl": 1.814453125, "learning_rate": 2.040865368985044e-06, "loss": 0.0181, "num_tokens": 101884287.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4071.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3723.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 3478.0, "completions/min_terminated_length": 0.0, "epoch": 22.06896551724138, "frac_reward_zero_std": 1.0, "grad_norm": 0.43559275741805176, "kl": 1.3828125, "learning_rate": 2.027048729182583e-06, "loss": 0.0138, "num_tokens": 102491967.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 3766.6015625, "completions/mean_terminated_length": 141.0, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 22.137931034482758, "frac_reward_zero_std": 1.0, "grad_norm": 1.4659577239042336, "kl": 1.4140625, "learning_rate": 2.0132624229675205e-06, "loss": 0.0141, "num_tokens": 103103396.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3889.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3629.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 3302.0, "completions/min_terminated_length": 0.0, "epoch": 22.20689655172414, "frac_reward_zero_std": 1.0, "grad_norm": 1.088985103033209, "kl": 1.3203125, "learning_rate": 1.9995070287856546e-06, "loss": 0.0132, "num_tokens": 103697844.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4073.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2738.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 1453.0, "completions/min_terminated_length": 0.0, "epoch": 22.275862068965516, "frac_reward_zero_std": 1.0, "grad_norm": 0.3899575907826617, "kl": 1.3359375, "learning_rate": 1.985783123785774e-06, "loss": 0.0134, "num_tokens": 104179444.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4037.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3551.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 2919.0, "completions/min_terminated_length": 0.0, "epoch": 22.344827586206897, "frac_reward_zero_std": 1.0, "grad_norm": 0.9200105360180978, "kl": 1.458984375, "learning_rate": 1.9720912837954486e-06, "loss": 0.0145, "num_tokens": 104764172.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3989.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3143.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 2597.0, "completions/min_terminated_length": 0.0, "epoch": 22.413793103448278, "frac_reward_zero_std": 1.0, "grad_norm": 0.4396045362414407, "kl": 1.22265625, "learning_rate": 1.958432083296862e-06, "loss": 0.0122, "num_tokens": 105296444.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3844.0, "completions/max_terminated_length": 1613.0, "completions/mean_length": 3476.734375, "completions/mean_terminated_length": 1613.0, "completions/min_length": 1613.0, "completions/min_terminated_length": 1613.0, "epoch": 22.482758620689655, "frac_reward_zero_std": 1.0, "grad_norm": 0.31437757247901466, "kl": 1.14453125, "learning_rate": 1.9448060954027093e-06, "loss": 0.0114, "num_tokens": 105872538.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3627.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3438.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 3190.0, "completions/min_terminated_length": 0.0, "epoch": 22.551724137931036, "frac_reward_zero_std": 1.0, "grad_norm": 1.3801019010665978, "kl": 1.21875, "learning_rate": 1.931213891832153e-06, "loss": 0.0122, "num_tokens": 106443674.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3974.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3405.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 2850.0, "completions/min_terminated_length": 0.0, "epoch": 22.620689655172413, "frac_reward_zero_std": 1.0, "grad_norm": 0.41862411846040465, "kl": 1.1328125, "learning_rate": 1.9176560428868336e-06, "loss": 0.0113, "num_tokens": 107010618.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3961.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3497.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 3145.0, "completions/min_terminated_length": 0.0, "epoch": 22.689655172413794, "frac_reward_zero_std": 1.0, "grad_norm": 0.15447039288221498, "kl": 1.00390625, "learning_rate": 1.9041331174269373e-06, "loss": 0.01, "num_tokens": 107589402.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4029.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3716.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 3190.0, "completions/min_terminated_length": 0.0, "epoch": 22.75862068965517, "frac_reward_zero_std": 1.0, "grad_norm": 0.44569839064244576, "kl": 1.1328125, "learning_rate": 1.8906456828473341e-06, "loss": 0.0113, "num_tokens": 108196122.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4096.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3481.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 3044.0, "completions/min_terminated_length": 0.0, "epoch": 22.82758620689655, "frac_reward_zero_std": 1.0, "grad_norm": 0.1877332615017673, "kl": 1.138671875, "learning_rate": 1.8771943050537656e-06, "loss": 0.0114, "num_tokens": 108771682.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3751.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3572.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 3283.0, "completions/min_terminated_length": 0.0, "epoch": 22.896551724137932, "frac_reward_zero_std": 1.0, "grad_norm": 0.1368502872613123, "kl": 0.984375, "learning_rate": 1.8637795484391046e-06, "loss": 0.0098, "num_tokens": 109360034.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3816.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3444.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 3131.0, "completions/min_terminated_length": 0.0, "epoch": 22.96551724137931, "frac_reward_zero_std": 1.0, "grad_norm": 0.18495732660292058, "kl": 0.953125, "learning_rate": 1.8504019758596698e-06, "loss": 0.0095, "num_tokens": 109932034.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4084.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3776.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 3607.0, "completions/min_terminated_length": 0.0, "epoch": 23.06896551724138, "frac_reward_zero_std": 1.0, "grad_norm": 0.2997038986166403, "kl": 0.958984375, "learning_rate": 1.8370621486116163e-06, "loss": 0.0096, "num_tokens": 110546530.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4071.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3697.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 3431.0, "completions/min_terminated_length": 0.0, "epoch": 23.137931034482758, "frac_reward_zero_std": 1.0, "grad_norm": 0.4687330139312712, "kl": 1.041015625, "learning_rate": 1.823760626407377e-06, "loss": 0.0104, "num_tokens": 111149746.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3889.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3518.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 3123.0, "completions/min_terminated_length": 0.0, "epoch": 23.20689655172414, "frac_reward_zero_std": 0.9375, "grad_norm": 2.2686906307875616, "kl": 0.916015625, "learning_rate": 1.8104979673521838e-06, "loss": 0.0092, "num_tokens": 111731122.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3933.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3698.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 3483.0, "completions/min_terminated_length": 0.0, "epoch": 23.275862068965516, "frac_reward_zero_std": 1.0, "grad_norm": 0.4459883207937827, "kl": 0.9375, "learning_rate": 1.7972747279206482e-06, "loss": 0.0094, "num_tokens": 112335602.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3656.0, "completions/max_terminated_length": 1096.0, "completions/mean_length": 3125.5, "completions/mean_terminated_length": 1096.0, "completions/min_length": 1096.0, "completions/min_terminated_length": 1096.0, "epoch": 23.344827586206897, "frac_reward_zero_std": 1.0, "grad_norm": 0.47023729166382977, "kl": 1.03125, "learning_rate": 1.7840914629334122e-06, "loss": 0.0103, "num_tokens": 112866298.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4096.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3708.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 3310.0, "completions/min_terminated_length": 0.0, "epoch": 23.413793103448278, "frac_reward_zero_std": 1.0, "grad_norm": 0.44946285877291525, "kl": 1.0625, "learning_rate": 1.7709487255338731e-06, "loss": 0.0106, "num_tokens": 113470826.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4096.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3786.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 3092.0, "completions/min_terminated_length": 0.0, "epoch": 23.482758620689655, "frac_reward_zero_std": 0.9375, "grad_norm": 1.434112729967371, "kl": 0.98046875, "learning_rate": 1.7578470671649684e-06, "loss": 0.0098, "num_tokens": 114084330.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 4073.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 3595.0625, "completions/mean_terminated_length": 807.0, "completions/min_length": 807.0, "completions/min_terminated_length": 807.0, "epoch": 23.551724137931036, "frac_reward_zero_std": 1.0, "grad_norm": 0.24483285658126575, "kl": 1.0546875, "learning_rate": 1.744787037546045e-06, "loss": 0.0105, "num_tokens": 114675570.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3662.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3582.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 3431.0, "completions/min_terminated_length": 0.0, "epoch": 23.620689655172413, "frac_reward_zero_std": 0.9375, "grad_norm": 3.836642163015548, "kl": 1.03515625, "learning_rate": 1.731769184649788e-06, "loss": 0.0104, "num_tokens": 115265138.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 1480.0, "completions/mean_length": 3415.7265625, "completions/mean_terminated_length": 1480.0, "completions/min_length": 1480.0, "completions/min_terminated_length": 1480.0, "epoch": 23.689655172413794, "frac_reward_zero_std": 1.0, "grad_norm": 0.9387186601541986, "kl": 1.12109375, "learning_rate": 1.7187940546792325e-06, "loss": 0.0112, "num_tokens": 115833191.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3974.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3782.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 3565.0, "completions/min_terminated_length": 0.0, "epoch": 23.75862068965517, "frac_reward_zero_std": 0.875, "grad_norm": 9.46227286683567, "kl": 1.1640625, "learning_rate": 1.7058621920448465e-06, "loss": 0.0117, "num_tokens": 116448423.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3779.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3566.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 3219.0, "completions/min_terminated_length": 0.0, "epoch": 23.82758620689655, "frac_reward_zero_std": 1.0, "grad_norm": 0.46798818186427327, "kl": 1.232421875, "learning_rate": 1.6929741393416855e-06, "loss": 0.0123, "num_tokens": 117035039.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3855.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3253.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2673.0, "completions/min_terminated_length": 0.0, "epoch": 23.896551724137932, "frac_reward_zero_std": 0.875, "grad_norm": 8.866925669400183, "kl": 1.455078125, "learning_rate": 1.6801304373266286e-06, "loss": 0.0145, "num_tokens": 117582495.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 4037.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 3627.78125, "completions/mean_terminated_length": 433.0, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 23.96551724137931, "frac_reward_zero_std": 1.0, "grad_norm": 0.4121912299594654, "kl": 1.134765625, "learning_rate": 1.667331624895689e-06, "loss": 0.0113, "num_tokens": 118177923.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 4077.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 2956.609375, "completions/mean_terminated_length": 326.0, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 24.06896551724138, "frac_reward_zero_std": 0.9375, "grad_norm": 2.839985387205673, "kl": 1.29296875, "learning_rate": 1.6545782390614037e-06, "loss": 0.013, "num_tokens": 118686273.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3457.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 2946.4453125, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 24.137931034482758, "frac_reward_zero_std": 0.9375, "grad_norm": 13.455888197603404, "kl": 2.7734375, "learning_rate": 1.6418708149302992e-06, "loss": 0.0251, "num_tokens": 119194490.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3954.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3542.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 3078.0, "completions/min_terminated_length": 0.0, "epoch": 24.20689655172414, "frac_reward_zero_std": 0.9375, "grad_norm": 3.0314910559580888, "kl": 1.607421875, "learning_rate": 1.6292098856804423e-06, "loss": 0.0161, "num_tokens": 119778970.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4071.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3805.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 3634.0, "completions/min_terminated_length": 0.0, "epoch": 24.275862068965516, "frac_reward_zero_std": 0.9375, "grad_norm": 2.1375967851187987, "kl": 1.853515625, "learning_rate": 1.6165959825390661e-06, "loss": 0.0185, "num_tokens": 120397146.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 3621.578125, "completions/mean_terminated_length": 746.0, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "epoch": 24.344827586206897, "frac_reward_zero_std": 0.9375, "grad_norm": 1.9099338039446698, "kl": 1.8203125, "learning_rate": 1.604029634760284e-06, "loss": 0.0182, "num_tokens": 120991548.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 3737.7890625, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 24.413793103448278, "frac_reward_zero_std": 1.0, "grad_norm": 1.8118569722283366, "kl": 1.337890625, "learning_rate": 1.59151136960288e-06, "loss": 0.0134, "num_tokens": 121600153.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3989.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3589.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 3092.0, "completions/min_terminated_length": 0.0, "epoch": 24.482758620689655, "frac_reward_zero_std": 1.0, "grad_norm": 0.7367641169948803, "kl": 1.10546875, "learning_rate": 1.5790417123081903e-06, "loss": 0.0111, "num_tokens": 122190713.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3974.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3676.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 3458.0, "completions/min_terminated_length": 0.0, "epoch": 24.551724137931036, "frac_reward_zero_std": 1.0, "grad_norm": 1.046414781901739, "kl": 1.19921875, "learning_rate": 1.5666211860780583e-06, "loss": 0.012, "num_tokens": 122792409.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4096.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3995.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 3826.0, "completions/min_terminated_length": 0.0, "epoch": 24.620689655172413, "frac_reward_zero_std": 1.0, "grad_norm": 1.0266921966715032, "kl": 1.201171875, "learning_rate": 1.5542503120528918e-06, "loss": 0.012, "num_tokens": 123433305.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3639.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 2935.0078125, "completions/mean_terminated_length": 1978.0, "completions/min_length": 1691.0, "completions/min_terminated_length": 1978.0, "epoch": 24.689655172413794, "frac_reward_zero_std": 1.0, "grad_norm": 0.4641877307231566, "kl": 1.13671875, "learning_rate": 1.5419296092897866e-06, "loss": 0.0114, "num_tokens": 123940058.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 4029.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 3732.40625, "completions/mean_terminated_length": 475.0, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 24.75862068965517, "frac_reward_zero_std": 1.0, "grad_norm": 1.3849148754140799, "kl": 1.318359375, "learning_rate": 1.529659594740755e-06, "loss": 0.0132, "num_tokens": 124548878.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 3602.09375, "completions/mean_terminated_length": 474.5, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 24.82758620689655, "frac_reward_zero_std": 1.0, "grad_norm": 0.7183176763768305, "kl": 0.982421875, "learning_rate": 1.5174407832310338e-06, "loss": 0.0098, "num_tokens": 125138706.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3852.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 3626.9453125, "completions/mean_terminated_length": 184.0, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 24.896551724137932, "frac_reward_zero_std": 0.9375, "grad_norm": 76.28656625387477, "kl": 2.8603515625, "learning_rate": 1.5052736874374815e-06, "loss": 0.0286, "num_tokens": 125734027.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3984.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3629.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 3053.0, "completions/min_terminated_length": 0.0, "epoch": 24.96551724137931, "frac_reward_zero_std": 1.0, "grad_norm": 1.3820884009065382, "kl": 0.9228515625, "learning_rate": 1.4931588178670695e-06, "loss": 0.0092, "num_tokens": 126329675.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3876.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 3551.9140625, "completions/mean_terminated_length": 406.0, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 25.06896551724138, "frac_reward_zero_std": 1.0, "grad_norm": 0.2964534891394953, "kl": 0.9296875, "learning_rate": 1.4810966828354605e-06, "loss": 0.0093, "num_tokens": 126915392.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3933.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3479.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2988.0, "completions/min_terminated_length": 0.0, "epoch": 25.137931034482758, "frac_reward_zero_std": 1.0, "grad_norm": 0.32995971567480264, "kl": 0.98828125, "learning_rate": 1.469087788445684e-06, "loss": 0.0099, "num_tokens": 127491776.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3937.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3658.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 3212.0, "completions/min_terminated_length": 0.0, "epoch": 25.20689655172414, "frac_reward_zero_std": 1.0, "grad_norm": 0.3010622006281474, "kl": 0.814453125, "learning_rate": 1.4571326385668965e-06, "loss": 0.0081, "num_tokens": 128091072.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4073.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3543.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 2844.0, "completions/min_terminated_length": 0.0, "epoch": 25.275862068965516, "frac_reward_zero_std": 1.0, "grad_norm": 0.21401397666059846, "kl": 0.90625, "learning_rate": 1.4452317348132434e-06, "loss": 0.0091, "num_tokens": 128675712.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4096.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3953.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 3656.0, "completions/min_terminated_length": 0.0, "epoch": 25.344827586206897, "frac_reward_zero_std": 1.0, "grad_norm": 0.21870819130395527, "kl": 0.8876953125, "learning_rate": 1.4333855765228104e-06, "loss": 0.0089, "num_tokens": 129310592.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3889.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3777.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 3701.0, "completions/min_terminated_length": 0.0, "epoch": 25.413793103448278, "frac_reward_zero_std": 0.9375, "grad_norm": 1.9456913420889015, "kl": 0.796875, "learning_rate": 1.421594660736675e-06, "loss": 0.008, "num_tokens": 129925152.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 4090.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 3815.8125, "completions/mean_terminated_length": 133.0, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 25.482758620689655, "frac_reward_zero_std": 0.9375, "grad_norm": 4.2622280654233435, "kl": 0.91796875, "learning_rate": 1.4098594821780476e-06, "loss": 0.0069, "num_tokens": 130543480.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 3614.1484375, "completions/mean_terminated_length": 383.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 25.551724137931036, "frac_reward_zero_std": 1.0, "grad_norm": 0.20859600720715726, "kl": 0.8916015625, "learning_rate": 1.3981805332315174e-06, "loss": 0.0089, "num_tokens": 131136723.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 3695.96875, "completions/mean_terminated_length": 57.0, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 25.620689655172413, "frac_reward_zero_std": 1.0, "grad_norm": 0.3710955385744262, "kl": 0.9169921875, "learning_rate": 1.3865583039223929e-06, "loss": 0.0092, "num_tokens": 131739975.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3974.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 3649.3359375, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 25.689655172413794, "frac_reward_zero_std": 1.0, "grad_norm": 0.3673494528452197, "kl": 0.8984375, "learning_rate": 1.374993281896137e-06, "loss": 0.009, "num_tokens": 132337930.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3954.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 3649.5078125, "completions/mean_terminated_length": 759.0, "completions/min_length": 759.0, "completions/min_terminated_length": 759.0, "epoch": 25.75862068965517, "frac_reward_zero_std": 1.0, "grad_norm": 0.3644650808148341, "kl": 0.943359375, "learning_rate": 1.3634859523979134e-06, "loss": 0.0094, "num_tokens": 132936139.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 3989.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 3507.390625, "completions/mean_terminated_length": 298.66668701171875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 25.82758620689655, "frac_reward_zero_std": 1.0, "grad_norm": 0.2894840110413353, "kl": 0.931640625, "learning_rate": 1.3520367982522208e-06, "loss": 0.0093, "num_tokens": 133516157.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4096.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3878.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 3626.0, "completions/min_terminated_length": 0.0, "epoch": 25.896551724137932, "frac_reward_zero_std": 0.9375, "grad_norm": 2.3360352116952874, "kl": 0.966796875, "learning_rate": 1.3406462998426358e-06, "loss": 0.0097, "num_tokens": 134142445.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4021.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3532.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 3044.0, "completions/min_terminated_length": 0.0, "epoch": 25.96551724137931, "frac_reward_zero_std": 1.0, "grad_norm": 0.870537760228744, "kl": 1.0146484375, "learning_rate": 1.3293149350916595e-06, "loss": 0.0101, "num_tokens": 134725677.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3691.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 3347.8203125, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 26.06896551724138, "frac_reward_zero_std": 1.0, "grad_norm": 0.36940153213331306, "kl": 0.8701171875, "learning_rate": 1.3180431794406623e-06, "loss": 0.0087, "num_tokens": 135285270.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3968.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 3864.796875, "completions/mean_terminated_length": 203.0, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 26.137931034482758, "frac_reward_zero_std": 1.0, "grad_norm": 0.4201543122757977, "kl": 0.97265625, "learning_rate": 1.3068315058299358e-06, "loss": 0.0097, "num_tokens": 135911036.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 3974.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 3575.2890625, "completions/mean_terminated_length": 143.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 26.20689655172414, "frac_reward_zero_std": 1.0, "grad_norm": 0.3986036615007821, "kl": 1.0341796875, "learning_rate": 1.2956803846788503e-06, "loss": 0.0103, "num_tokens": 136499745.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3591.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3419.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 3190.0, "completions/min_terminated_length": 0.0, "epoch": 26.275862068965516, "frac_reward_zero_std": 1.0, "grad_norm": 0.6253148469350631, "kl": 1.0625, "learning_rate": 1.284590283866116e-06, "loss": 0.0106, "num_tokens": 137068449.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4096.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3729.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 3511.0, "completions/min_terminated_length": 0.0, "epoch": 26.344827586206897, "frac_reward_zero_std": 1.0, "grad_norm": 0.4167536859641112, "kl": 0.9130859375, "learning_rate": 1.2735616687101518e-06, "loss": 0.0091, "num_tokens": 137676697.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3898.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 3448.6796875, "completions/mean_terminated_length": 81.0, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 26.413793103448278, "frac_reward_zero_std": 0.9375, "grad_norm": 8.432101521755621, "kl": 0.8310546875, "learning_rate": 1.2625950019495614e-06, "loss": 0.0083, "num_tokens": 138248296.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3964.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 3306.96875, "completions/mean_terminated_length": 316.0, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 26.482758620689655, "frac_reward_zero_std": 1.0, "grad_norm": 0.279100034883122, "kl": 0.912109375, "learning_rate": 1.251690743723718e-06, "loss": 0.0091, "num_tokens": 138802660.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 3500.984375, "completions/mean_terminated_length": 191.5, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 26.551724137931036, "frac_reward_zero_std": 1.0, "grad_norm": 0.48801033618279654, "kl": 0.8671875, "learning_rate": 1.2408493515534581e-06, "loss": 0.0087, "num_tokens": 139380690.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 4071.0, "completions/max_terminated_length": 92.0, "completions/mean_length": 3878.71875, "completions/mean_terminated_length": 50.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 26.620689655172413, "frac_reward_zero_std": 1.0, "grad_norm": 0.4693294450937292, "kl": 0.9873046875, "learning_rate": 1.2300712803218834e-06, "loss": 0.0099, "num_tokens": 140008238.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 3580.1796875, "completions/mean_terminated_length": 273.66668701171875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 26.689655172413794, "frac_reward_zero_std": 1.0, "grad_norm": 0.3674590278310469, "kl": 0.79296875, "learning_rate": 1.2193569822552772e-06, "loss": 0.0079, "num_tokens": 140597133.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3468.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3090.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 2554.0, "completions/min_terminated_length": 0.0, "epoch": 26.75862068965517, "frac_reward_zero_std": 1.0, "grad_norm": 0.3323923370329007, "kl": 0.7861328125, "learning_rate": 1.2087069069041268e-06, "loss": 0.0079, "num_tokens": 141123757.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3712.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3443.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 2860.0, "completions/min_terminated_length": 0.0, "epoch": 26.82758620689655, "frac_reward_zero_std": 0.9375, "grad_norm": 2.5532572186614733, "kl": 0.875, "learning_rate": 1.1981215011242654e-06, "loss": 0.0088, "num_tokens": 141695629.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4096.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3749.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 3369.0, "completions/min_terminated_length": 0.0, "epoch": 26.896551724137932, "frac_reward_zero_std": 1.0, "grad_norm": 0.6855015361222366, "kl": 0.9169921875, "learning_rate": 1.1876012090581184e-06, "loss": 0.0092, "num_tokens": 142304309.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 3855.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 3353.9921875, "completions/mean_terminated_length": 28.666667938232422, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 26.96551724137931, "frac_reward_zero_std": 1.0, "grad_norm": 0.7452537798528475, "kl": 0.845703125, "learning_rate": 1.177146472116071e-06, "loss": 0.0085, "num_tokens": 142863548.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3712.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3230.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 2655.0, "completions/min_terminated_length": 0.0, "epoch": 27.06896551724138, "frac_reward_zero_std": 1.0, "grad_norm": 0.15321991613545333, "kl": 0.88671875, "learning_rate": 1.1667577289579462e-06, "loss": 0.0089, "num_tokens": 143408092.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 4073.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 3606.203125, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 27.137931034482758, "frac_reward_zero_std": 1.0, "grad_norm": 0.3216790824586849, "kl": 0.787109375, "learning_rate": 1.1564354154746007e-06, "loss": 0.0079, "num_tokens": 143999854.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 4077.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 3568.9609375, "completions/mean_terminated_length": 584.0, "completions/min_length": 584.0, "completions/min_terminated_length": 584.0, "epoch": 27.20689655172414, "frac_reward_zero_std": 1.0, "grad_norm": 0.2293355418323331, "kl": 0.896484375, "learning_rate": 1.146179964769635e-06, "loss": 0.009, "num_tokens": 144587753.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3702.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 3501.671875, "completions/mean_terminated_length": 521.0, "completions/min_length": 521.0, "completions/min_terminated_length": 521.0, "epoch": 27.275862068965516, "frac_reward_zero_std": 1.0, "grad_norm": 0.24073059200385705, "kl": 0.8017578125, "learning_rate": 1.1359918071412195e-06, "loss": 0.008, "num_tokens": 145167039.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3848.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3510.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2987.0, "completions/min_terminated_length": 0.0, "epoch": 27.344827586206897, "frac_reward_zero_std": 1.0, "grad_norm": 0.3178637455204856, "kl": 0.86328125, "learning_rate": 1.1258713700640456e-06, "loss": 0.0087, "num_tokens": 145746951.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 4029.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 3476.796875, "completions/mean_terminated_length": 547.0, "completions/min_length": 547.0, "completions/min_terminated_length": 547.0, "epoch": 27.413793103448278, "frac_reward_zero_std": 1.0, "grad_norm": 0.12214403358860393, "kl": 0.912109375, "learning_rate": 1.115819078171383e-06, "loss": 0.0091, "num_tokens": 146323053.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3989.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 3682.125, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 27.482758620689655, "frac_reward_zero_std": 1.0, "grad_norm": 0.12897945574009914, "kl": 0.7890625, "learning_rate": 1.1058353532372667e-06, "loss": 0.0079, "num_tokens": 146925437.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3627.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3040.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2673.0, "completions/min_terminated_length": 0.0, "epoch": 27.551724137931036, "frac_reward_zero_std": 1.0, "grad_norm": 0.16601461816076052, "kl": 0.8662109375, "learning_rate": 1.0959206141587998e-06, "loss": 0.0087, "num_tokens": 147444461.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3829.0, "completions/max_terminated_length": 114.0, "completions/mean_length": 2987.171875, "completions/mean_terminated_length": 114.0, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 27.620689655172413, "frac_reward_zero_std": 1.0, "grad_norm": 0.6719414486342502, "kl": 0.9482421875, "learning_rate": 1.0860752769385766e-06, "loss": 0.0095, "num_tokens": 147957891.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3973.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 3645.078125, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 27.689655172413794, "frac_reward_zero_std": 0.9375, "grad_norm": 2.6787758046147854, "kl": 0.7978515625, "learning_rate": 1.0762997546672279e-06, "loss": 0.008, "num_tokens": 148554365.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3876.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3511.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 3180.0, "completions/min_terminated_length": 0.0, "epoch": 27.75862068965517, "frac_reward_zero_std": 1.0, "grad_norm": 0.8095492451199964, "kl": 0.876953125, "learning_rate": 1.0665944575060914e-06, "loss": 0.0088, "num_tokens": 149134941.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 3694.421875, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 27.82758620689655, "frac_reward_zero_std": 1.0, "grad_norm": 1.0715340671172988, "kl": 0.9248046875, "learning_rate": 1.056959792669997e-06, "loss": 0.0093, "num_tokens": 149737755.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4096.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3441.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 3087.0, "completions/min_terminated_length": 0.0, "epoch": 27.896551724137932, "frac_reward_zero_std": 1.0, "grad_norm": 0.4141909984905106, "kl": 0.892578125, "learning_rate": 1.0473961644101856e-06, "loss": 0.0089, "num_tokens": 150308179.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3937.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3622.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 3147.0, "completions/min_terminated_length": 0.0, "epoch": 27.96551724137931, "frac_reward_zero_std": 1.0, "grad_norm": 0.2255309268311694, "kl": 0.7841796875, "learning_rate": 1.037903973997345e-06, "loss": 0.0078, "num_tokens": 150902731.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4096.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3541.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 2683.0, "completions/min_terminated_length": 0.0, "epoch": 28.06896551724138, "frac_reward_zero_std": 1.0, "grad_norm": 0.3262628560823647, "kl": 0.7548828125, "learning_rate": 1.0284836197047737e-06, "loss": 0.0076, "num_tokens": 151485939.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4096.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3719.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 3148.0, "completions/min_terminated_length": 0.0, "epoch": 28.137931034482758, "frac_reward_zero_std": 1.0, "grad_norm": 1.3785603547942102, "kl": 0.8994140625, "learning_rate": 1.0191354967916712e-06, "loss": 0.009, "num_tokens": 152092171.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 1065.0, "completions/mean_length": 3582.390625, "completions/mean_terminated_length": 1065.0, "completions/min_length": 1065.0, "completions/min_terminated_length": 1065.0, "epoch": 28.20689655172414, "frac_reward_zero_std": 1.0, "grad_norm": 0.6413165465435274, "kl": 0.9697265625, "learning_rate": 1.0098599974865515e-06, "loss": 0.0097, "num_tokens": 152680621.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3973.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3708.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 3246.0, "completions/min_terminated_length": 0.0, "epoch": 28.275862068965516, "frac_reward_zero_std": 1.0, "grad_norm": 0.38974890306156273, "kl": 0.7998046875, "learning_rate": 1.0006575109707898e-06, "loss": 0.008, "num_tokens": 153285253.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 3889.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 3463.140625, "completions/mean_terminated_length": 34.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 28.344827586206897, "frac_reward_zero_std": 1.0, "grad_norm": 0.2765363629705189, "kl": 0.7646484375, "learning_rate": 9.915284233622877e-07, "loss": 0.0076, "num_tokens": 153858439.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3964.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 3335.6875, "completions/mean_terminated_length": 180.0, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 28.413793103448278, "frac_reward_zero_std": 1.0, "grad_norm": 0.14277807820182337, "kl": 0.7998046875, "learning_rate": 9.824731176992796e-07, "loss": 0.008, "num_tokens": 154416479.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 3937.0, "completions/max_terminated_length": 1262.0, "completions/mean_length": 3363.4921875, "completions/mean_terminated_length": 635.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 28.482758620689655, "frac_reward_zero_std": 1.0, "grad_norm": 0.529958856068822, "kl": 0.9189453125, "learning_rate": 9.734919739242543e-07, "loss": 0.0092, "num_tokens": 154977846.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3562.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3284.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 2629.0, "completions/min_terminated_length": 0.0, "epoch": 28.551724137931036, "frac_reward_zero_std": 0.9375, "grad_norm": 2.695894262108347, "kl": 0.822265625, "learning_rate": 9.645853688680177e-07, "loss": 0.0082, "num_tokens": 155529302.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3855.0, "completions/max_terminated_length": 3117.0, "completions/mean_length": 3398.984375, "completions/mean_terminated_length": 3117.0, "completions/min_length": 2988.0, "completions/min_terminated_length": 3117.0, "epoch": 28.620689655172413, "frac_reward_zero_std": 1.0, "grad_norm": 1.9989409896774373, "kl": 1.080078125, "learning_rate": 9.557536762338786e-07, "loss": 0.0108, "num_tokens": 156095444.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4071.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2942.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 1817.0, "completions/min_terminated_length": 0.0, "epoch": 28.689655172413794, "frac_reward_zero_std": 1.0, "grad_norm": 1.2000430461736387, "kl": 1.0009765625, "learning_rate": 9.46997266581973e-07, "loss": 0.01, "num_tokens": 156603124.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3984.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3755.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 3439.0, "completions/min_terminated_length": 0.0, "epoch": 28.75862068965517, "frac_reward_zero_std": 1.0, "grad_norm": 0.5851798965154009, "kl": 0.7685546875, "learning_rate": 9.383165073137115e-07, "loss": 0.0077, "num_tokens": 157214836.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 3886.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 3041.4375, "completions/mean_terminated_length": 299.3333435058594, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 28.82758620689655, "frac_reward_zero_std": 1.0, "grad_norm": 0.28855791960820826, "kl": 0.796875, "learning_rate": 9.297117626563687e-07, "loss": 0.008, "num_tokens": 157735212.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.84375, "completions/max_length": 3989.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 3385.515625, "completions/mean_terminated_length": 170.40000915527344, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 28.896551724137932, "frac_reward_zero_std": 0.9375, "grad_norm": 4.115015847542139, "kl": 0.818359375, "learning_rate": 9.211833936477957e-07, "loss": 0.0082, "num_tokens": 158299630.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 4090.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 3138.6328125, "completions/mean_terminated_length": 680.0, "completions/min_length": 680.0, "completions/min_terminated_length": 680.0, "epoch": 28.96551724137931, "frac_reward_zero_std": 1.0, "grad_norm": 0.36112828901574734, "kl": 0.8056640625, "learning_rate": 9.127317581212753e-07, "loss": 0.0081, "num_tokens": 158832007.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 3644.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 2845.8359375, "completions/mean_terminated_length": 131.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 29.06896551724138, "frac_reward_zero_std": 1.0, "grad_norm": 0.2460400784894155, "kl": 0.8798828125, "learning_rate": 9.043572106905084e-07, "loss": 0.0088, "num_tokens": 159326906.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3751.0, "completions/max_terminated_length": 1032.0, "completions/mean_length": 2984.140625, "completions/mean_terminated_length": 1032.0, "completions/min_length": 1032.0, "completions/min_terminated_length": 1032.0, "epoch": 29.137931034482758, "frac_reward_zero_std": 1.0, "grad_norm": 0.19754937012199905, "kl": 0.814453125, "learning_rate": 8.960601027347321e-07, "loss": 0.0082, "num_tokens": 159839948.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3691.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3525.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 3360.0, "completions/min_terminated_length": 0.0, "epoch": 29.20689655172414, "frac_reward_zero_std": 1.0, "grad_norm": 0.13239906069104662, "kl": 0.677734375, "learning_rate": 8.878407823839788e-07, "loss": 0.0068, "num_tokens": 160422284.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4037.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3713.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 3478.0, "completions/min_terminated_length": 0.0, "epoch": 29.275862068965516, "frac_reward_zero_std": 1.0, "grad_norm": 0.36718771412162815, "kl": 0.796875, "learning_rate": 8.796995945044689e-07, "loss": 0.008, "num_tokens": 161026644.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 3781.0, "completions/max_terminated_length": 2483.0, "completions/mean_length": 3147.2578125, "completions/mean_terminated_length": 1818.5, "completions/min_length": 1154.0, "completions/min_terminated_length": 1154.0, "epoch": 29.344827586206897, "frac_reward_zero_std": 0.9375, "grad_norm": 2.169416419631883, "kl": 0.908203125, "learning_rate": 8.716368806841405e-07, "loss": 0.0091, "num_tokens": 161560565.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3779.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3533.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 3309.0, "completions/min_terminated_length": 0.0, "epoch": 29.413793103448278, "frac_reward_zero_std": 1.0, "grad_norm": 1.3795953881794978, "kl": 0.9765625, "learning_rate": 8.636529792183171e-07, "loss": 0.0098, "num_tokens": 162143925.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 3961.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 3567.1953125, "completions/mean_terminated_length": 302.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 29.482758620689655, "frac_reward_zero_std": 1.0, "grad_norm": 1.0665685823452116, "kl": 0.8798828125, "learning_rate": 8.557482250955144e-07, "loss": 0.0088, "num_tokens": 162731598.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3968.0, "completions/max_terminated_length": 99.0, "completions/mean_length": 3249.34375, "completions/mean_terminated_length": 99.0, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 29.551724137931036, "frac_reward_zero_std": 1.0, "grad_norm": 0.3814851298081842, "kl": 0.8134765625, "learning_rate": 8.479229499833844e-07, "loss": 0.0081, "num_tokens": 163278586.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 4078.0, "completions/max_terminated_length": 1831.0, "completions/mean_length": 3718.6328125, "completions/mean_terminated_length": 1340.5, "completions/min_length": 850.0, "completions/min_terminated_length": 850.0, "epoch": 29.620689655172413, "frac_reward_zero_std": 1.0, "grad_norm": 0.15176365945131062, "kl": 0.7451171875, "learning_rate": 8.401774822147976e-07, "loss": 0.0074, "num_tokens": 163885643.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3175.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2948.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2718.0, "completions/min_terminated_length": 0.0, "epoch": 29.689655172413794, "frac_reward_zero_std": 0.9375, "grad_norm": 3.058306765611725, "kl": 0.8154296875, "learning_rate": 8.325121467740695e-07, "loss": 0.0082, "num_tokens": 164394059.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 3462.890625, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 29.75862068965517, "frac_reward_zero_std": 1.0, "grad_norm": 0.258682687819089, "kl": 0.7685546875, "learning_rate": 8.249272652833226e-07, "loss": 0.0077, "num_tokens": 164966981.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3675.0, "completions/max_terminated_length": 111.0, "completions/mean_length": 3020.4453125, "completions/mean_terminated_length": 111.0, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 29.82758620689655, "frac_reward_zero_std": 1.0, "grad_norm": 0.3645728533747666, "kl": 0.7998046875, "learning_rate": 8.174231559889931e-07, "loss": 0.008, "num_tokens": 165484670.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 4073.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 3034.109375, "completions/mean_terminated_length": 96.0, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 29.896551724137932, "frac_reward_zero_std": 1.0, "grad_norm": 0.2787119228400319, "kl": 0.8466796875, "learning_rate": 8.100001337484787e-07, "loss": 0.0085, "num_tokens": 166004108.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3826.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3197.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 2383.0, "completions/min_terminated_length": 0.0, "epoch": 29.96551724137931, "frac_reward_zero_std": 0.875, "grad_norm": 6.186121979015489, "kl": 0.9208984375, "learning_rate": 8.026585100169251e-07, "loss": 0.0092, "num_tokens": 166543332.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4021.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3274.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2390.0, "completions/min_terminated_length": 0.0, "epoch": 30.06896551724138, "frac_reward_zero_std": 1.0, "grad_norm": 0.7446233999958237, "kl": 0.923828125, "learning_rate": 7.953985928341601e-07, "loss": 0.0092, "num_tokens": 167093036.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3701.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 3090.1484375, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 30.137931034482758, "frac_reward_zero_std": 1.0, "grad_norm": 0.8530934216927875, "kl": 0.8564453125, "learning_rate": 7.882206868117693e-07, "loss": 0.0086, "num_tokens": 167619647.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 4077.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 3716.3203125, "completions/mean_terminated_length": 79.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 30.20689655172414, "frac_reward_zero_std": 1.0, "grad_norm": 1.0550851964159542, "kl": 0.939453125, "learning_rate": 7.81125093120313e-07, "loss": 0.0094, "num_tokens": 168226408.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3241.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 3151.453125, "completions/mean_terminated_length": 406.0, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 30.275862068965516, "frac_reward_zero_std": 0.9375, "grad_norm": 3.987086291107972, "kl": 0.767578125, "learning_rate": 7.741121094766916e-07, "loss": 0.0077, "num_tokens": 168759962.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3615.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 3196.8203125, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 30.344827586206897, "frac_reward_zero_std": 1.0, "grad_norm": 0.20934456348471733, "kl": 0.673828125, "learning_rate": 7.671820301316532e-07, "loss": 0.0067, "num_tokens": 169299059.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 3954.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 3340.90625, "completions/mean_terminated_length": 193.0, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 30.413793103448278, "frac_reward_zero_std": 1.0, "grad_norm": 0.26690197328437076, "kl": 0.869140625, "learning_rate": 7.603351458574474e-07, "loss": 0.0087, "num_tokens": 169857767.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4096.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3358.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2638.0, "completions/min_terminated_length": 0.0, "epoch": 30.482758620689655, "frac_reward_zero_std": 1.0, "grad_norm": 0.15328210113309632, "kl": 0.771484375, "learning_rate": 7.535717439356255e-07, "loss": 0.0077, "num_tokens": 170416399.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3591.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 2907.390625, "completions/mean_terminated_length": 217.0, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 30.551724137931036, "frac_reward_zero_std": 1.0, "grad_norm": 0.21811897481608625, "kl": 0.7958984375, "learning_rate": 7.46892108144986e-07, "loss": 0.008, "num_tokens": 170919617.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 4029.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 3019.265625, "completions/mean_terminated_length": 33.333335876464844, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 30.620689655172413, "frac_reward_zero_std": 1.0, "grad_norm": 0.333253834493737, "kl": 0.8134765625, "learning_rate": 7.402965187496697e-07, "loss": 0.0081, "num_tokens": 171437155.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3855.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3307.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 2262.0, "completions/min_terminated_length": 0.0, "epoch": 30.689655172413794, "frac_reward_zero_std": 1.0, "grad_norm": 0.223342945786469, "kl": 0.7802734375, "learning_rate": 7.337852524873974e-07, "loss": 0.0078, "num_tokens": 171991555.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4073.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3631.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 3297.0, "completions/min_terminated_length": 0.0, "epoch": 30.75862068965517, "frac_reward_zero_std": 1.0, "grad_norm": 0.29323377756635877, "kl": 0.9619140625, "learning_rate": 7.273585825578608e-07, "loss": 0.0096, "num_tokens": 172586251.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4096.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3222.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 2151.0, "completions/min_terminated_length": 0.0, "epoch": 30.82758620689655, "frac_reward_zero_std": 1.0, "grad_norm": 0.35038252371357637, "kl": 0.8544921875, "learning_rate": 7.21016778611259e-07, "loss": 0.0085, "num_tokens": 173129571.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3826.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 3255.6328125, "completions/mean_terminated_length": 1743.0, "completions/min_length": 1743.0, "completions/min_terminated_length": 1743.0, "epoch": 30.896551724137932, "frac_reward_zero_std": 1.0, "grad_norm": 0.18108887766051743, "kl": 0.69921875, "learning_rate": 7.147601067369835e-07, "loss": 0.007, "num_tokens": 173677364.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3974.0, "completions/max_terminated_length": 1617.0, "completions/mean_length": 2668.109375, "completions/mean_terminated_length": 1617.0, "completions/min_length": 1617.0, "completions/min_terminated_length": 1617.0, "epoch": 30.96551724137931, "frac_reward_zero_std": 1.0, "grad_norm": 0.24837273393300843, "kl": 0.8681640625, "learning_rate": 7.085888294524561e-07, "loss": 0.0087, "num_tokens": 174149954.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3968.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3724.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 3568.0, "completions/min_terminated_length": 0.0, "epoch": 31.06896551724138, "frac_reward_zero_std": 1.0, "grad_norm": 0.20597077289203472, "kl": 0.71484375, "learning_rate": 7.025032056921117e-07, "loss": 0.0071, "num_tokens": 174756650.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 3816.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 2984.3359375, "completions/mean_terminated_length": 69.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 31.137931034482758, "frac_reward_zero_std": 1.0, "grad_norm": 0.2017257131941572, "kl": 0.7939453125, "learning_rate": 6.965034907965349e-07, "loss": 0.0079, "num_tokens": 175269717.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 4021.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 3485.9296875, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 31.20689655172414, "frac_reward_zero_std": 1.0, "grad_norm": 0.5546489851899966, "kl": 0.865234375, "learning_rate": 6.905899365017462e-07, "loss": 0.0086, "num_tokens": 175846988.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3644.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2905.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1389.0, "completions/min_terminated_length": 0.0, "epoch": 31.275862068965516, "frac_reward_zero_std": 0.875, "grad_norm": 12.66676442762852, "kl": 0.802734375, "learning_rate": 6.847627909286409e-07, "loss": 0.008, "num_tokens": 176349900.0, "reward": 0.0031250000465661287, "reward_std": 0.0088388342410326, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12450689822435379, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3878.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2902.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 1789.0, "completions/min_terminated_length": 0.0, "epoch": 31.344827586206897, "frac_reward_zero_std": 1.0, "grad_norm": 0.22258308896262888, "kl": 0.9619140625, "learning_rate": 6.790222985725761e-07, "loss": 0.0096, "num_tokens": 176852260.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 3192.0, "completions/max_terminated_length": 111.0, "completions/mean_length": 2895.75, "completions/mean_terminated_length": 63.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 31.413793103448278, "frac_reward_zero_std": 1.0, "grad_norm": 0.20057791811079195, "kl": 0.876953125, "learning_rate": 6.733687002931141e-07, "loss": 0.0088, "num_tokens": 177353988.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 4071.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 3624.9609375, "completions/mean_terminated_length": 546.0, "completions/min_length": 546.0, "completions/min_terminated_length": 546.0, "epoch": 31.482758620689655, "frac_reward_zero_std": 1.0, "grad_norm": 0.16691217765560545, "kl": 0.8330078125, "learning_rate": 6.678022333039158e-07, "loss": 0.0083, "num_tokens": 177949055.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3855.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3600.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2943.0, "completions/min_terminated_length": 0.0, "epoch": 31.551724137931036, "frac_reward_zero_std": 1.0, "grad_norm": 0.18536855255083134, "kl": 0.7431640625, "learning_rate": 6.623231311627876e-07, "loss": 0.0074, "num_tokens": 178539759.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4077.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3307.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2233.0, "completions/min_terminated_length": 0.0, "epoch": 31.620689655172413, "frac_reward_zero_std": 1.0, "grad_norm": 0.3504897136928905, "kl": 0.7724609375, "learning_rate": 6.569316237618811e-07, "loss": 0.0077, "num_tokens": 179094127.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3984.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3221.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 2751.0, "completions/min_terminated_length": 0.0, "epoch": 31.689655172413794, "frac_reward_zero_std": 0.9375, "grad_norm": 2.6973466278935714, "kl": 0.8564453125, "learning_rate": 6.516279373180499e-07, "loss": 0.0086, "num_tokens": 179635983.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3974.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3638.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 3184.0, "completions/min_terminated_length": 0.0, "epoch": 31.75862068965517, "frac_reward_zero_std": 1.0, "grad_norm": 0.18025035415172505, "kl": 0.751953125, "learning_rate": 6.464122943633543e-07, "loss": 0.0075, "num_tokens": 180232783.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3966.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3818.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 3522.0, "completions/min_terminated_length": 0.0, "epoch": 31.82758620689655, "frac_reward_zero_std": 0.9375, "grad_norm": 3.1358050811645803, "kl": 0.833984375, "learning_rate": 6.412849137357271e-07, "loss": 0.0083, "num_tokens": 180852591.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 3961.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 2886.921875, "completions/mean_terminated_length": 100.0, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 31.896551724137932, "frac_reward_zero_std": 1.0, "grad_norm": 1.2948274001749933, "kl": 0.923828125, "learning_rate": 6.3624601056979e-07, "loss": 0.0093, "num_tokens": 181352021.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3620.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 3160.2421875, "completions/mean_terminated_length": 209.0, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 31.96551724137931, "frac_reward_zero_std": 0.875, "grad_norm": 3.7274024380491744, "kl": 1.32421875, "learning_rate": 6.312957962878278e-07, "loss": 0.0132, "num_tokens": 181887604.0, "reward": 0.0023437500931322575, "reward_std": 0.00662912568077445, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3098.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2582.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 2358.0, "completions/min_terminated_length": 0.0, "epoch": 32.06896551724138, "frac_reward_zero_std": 1.0, "grad_norm": 2.136426703722538, "kl": 1.2822265625, "learning_rate": 6.264344785909181e-07, "loss": 0.0128, "num_tokens": 182349268.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3458.0, "completions/max_terminated_length": 1037.0, "completions/mean_length": 3162.3046875, "completions/mean_terminated_length": 1037.0, "completions/min_length": 1037.0, "completions/min_terminated_length": 1037.0, "epoch": 32.13793103448276, "frac_reward_zero_std": 1.0, "grad_norm": 1.2520983576156235, "kl": 0.9404296875, "learning_rate": 6.216622614502149e-07, "loss": 0.0094, "num_tokens": 182885115.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 3262.0, "completions/max_terminated_length": 2581.0, "completions/mean_length": 2184.6640625, "completions/mean_terminated_length": 1304.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 32.206896551724135, "frac_reward_zero_std": 1.0, "grad_norm": 0.366011224816465, "kl": 0.95703125, "learning_rate": 6.169793450983916e-07, "loss": 0.0096, "num_tokens": 183295824.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3567.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3170.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 2860.0, "completions/min_terminated_length": 0.0, "epoch": 32.275862068965516, "frac_reward_zero_std": 1.0, "grad_norm": 0.8140210391828681, "kl": 0.822265625, "learning_rate": 6.123859260212393e-07, "loss": 0.0082, "num_tokens": 183832752.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3251.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 2810.3203125, "completions/mean_terminated_length": 59.0, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 32.3448275862069, "frac_reward_zero_std": 1.0, "grad_norm": 0.332293413639309, "kl": 0.76171875, "learning_rate": 6.07882196949423e-07, "loss": 0.0076, "num_tokens": 184323105.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3231.0, "completions/max_terminated_length": 111.0, "completions/mean_length": 3132.3125, "completions/mean_terminated_length": 111.0, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 32.41379310344828, "frac_reward_zero_std": 1.0, "grad_norm": 0.5224894407491708, "kl": 0.8525390625, "learning_rate": 6.034683468503948e-07, "loss": 0.0085, "num_tokens": 184855113.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 3451.0, "completions/max_terminated_length": 114.0, "completions/mean_length": 2282.9375, "completions/mean_terminated_length": 63.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 32.48275862068966, "frac_reward_zero_std": 0.9375, "grad_norm": 11.166972360690067, "kl": 1.13671875, "learning_rate": 5.991445609204641e-07, "loss": 0.0092, "num_tokens": 185277305.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 3301.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 2850.5859375, "completions/mean_terminated_length": 81.0, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 32.55172413793103, "frac_reward_zero_std": 1.0, "grad_norm": 0.7828655061364351, "kl": 0.9892578125, "learning_rate": 5.949110205770292e-07, "loss": 0.0099, "num_tokens": 185772084.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3713.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3115.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 2377.0, "completions/min_terminated_length": 0.0, "epoch": 32.62068965517241, "frac_reward_zero_std": 1.0, "grad_norm": 0.6840848825830977, "kl": 0.8837890625, "learning_rate": 5.90767903450964e-07, "loss": 0.0088, "num_tokens": 186301940.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3131.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 2390.8984375, "completions/mean_terminated_length": 828.0, "completions/min_length": 828.0, "completions/min_terminated_length": 828.0, "epoch": 32.689655172413794, "frac_reward_zero_std": 1.0, "grad_norm": 0.48156033465185893, "kl": 0.9599609375, "learning_rate": 5.867153833791652e-07, "loss": 0.0096, "num_tokens": 186737647.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3578.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3291.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 2993.0, "completions/min_terminated_length": 0.0, "epoch": 32.758620689655174, "frac_reward_zero_std": 1.0, "grad_norm": 0.5346660425103332, "kl": 0.9501953125, "learning_rate": 5.827536303972587e-07, "loss": 0.0095, "num_tokens": 187290031.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4096.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3472.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 3027.0, "completions/min_terminated_length": 0.0, "epoch": 32.827586206896555, "frac_reward_zero_std": 1.0, "grad_norm": 0.2648468875740046, "kl": 0.7978515625, "learning_rate": 5.78882810732465e-07, "loss": 0.008, "num_tokens": 187864439.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3346.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2923.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 2373.0, "completions/min_terminated_length": 0.0, "epoch": 32.89655172413793, "frac_reward_zero_std": 1.0, "grad_norm": 0.4194738188643489, "kl": 0.8076171875, "learning_rate": 5.75103086796625e-07, "loss": 0.0081, "num_tokens": 188369719.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3937.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3460.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2908.0, "completions/min_terminated_length": 0.0, "epoch": 32.96551724137931, "frac_reward_zero_std": 1.0, "grad_norm": 0.5584473553040513, "kl": 0.833984375, "learning_rate": 5.714146171793846e-07, "loss": 0.0083, "num_tokens": 188942767.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 3390.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 2825.0703125, "completions/mean_terminated_length": 28.666667938232422, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 33.06896551724138, "frac_reward_zero_std": 0.9375, "grad_norm": 2.2837544331197157, "kl": 0.876953125, "learning_rate": 5.678175566415422e-07, "loss": 0.0088, "num_tokens": 189435008.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 3617.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 2804.75, "completions/mean_terminated_length": 74.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 33.13793103448276, "frac_reward_zero_std": 1.0, "grad_norm": 0.8731926996051723, "kl": 0.958984375, "learning_rate": 5.643120561085528e-07, "loss": 0.0096, "num_tokens": 189923920.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3720.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3535.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 3373.0, "completions/min_terminated_length": 0.0, "epoch": 33.206896551724135, "frac_reward_zero_std": 1.0, "grad_norm": 1.1566863300497092, "kl": 1.0615234375, "learning_rate": 5.608982626641991e-07, "loss": 0.0106, "num_tokens": 190507536.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 4073.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 3281.875, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 33.275862068965516, "frac_reward_zero_std": 1.0, "grad_norm": 0.6367413455232471, "kl": 1.017578125, "learning_rate": 5.575763195444166e-07, "loss": 0.0102, "num_tokens": 191058688.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3826.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2855.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 1405.0, "completions/min_terminated_length": 0.0, "epoch": 33.3448275862069, "frac_reward_zero_std": 1.0, "grad_norm": 0.6884424033507358, "kl": 1.095703125, "learning_rate": 5.543463661312847e-07, "loss": 0.011, "num_tokens": 191555264.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3203.0, "completions/max_terminated_length": 105.0, "completions/mean_length": 2655.2265625, "completions/mean_terminated_length": 105.0, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 33.41379310344828, "frac_reward_zero_std": 1.0, "grad_norm": 0.11921707423536955, "kl": 0.90625, "learning_rate": 5.512085379471808e-07, "loss": 0.009, "num_tokens": 192026205.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3984.0, "completions/max_terminated_length": 1131.0, "completions/mean_length": 3307.9609375, "completions/mean_terminated_length": 1131.0, "completions/min_length": 1131.0, "completions/min_terminated_length": 1131.0, "epoch": 33.48275862068966, "frac_reward_zero_std": 1.0, "grad_norm": 0.2074465134989551, "kl": 0.8173828125, "learning_rate": 5.481629666490903e-07, "loss": 0.0082, "num_tokens": 192579552.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 3886.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 3260.5234375, "completions/mean_terminated_length": 119.5, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 33.55172413793103, "frac_reward_zero_std": 1.0, "grad_norm": 0.3890539429034864, "kl": 0.853515625, "learning_rate": 5.452097800230853e-07, "loss": 0.0085, "num_tokens": 193127739.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3829.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2894.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2358.0, "completions/min_terminated_length": 0.0, "epoch": 33.62068965517241, "frac_reward_zero_std": 0.9375, "grad_norm": 2.875283802975271, "kl": 0.9873046875, "learning_rate": 5.423491019789623e-07, "loss": 0.0098, "num_tokens": 193629243.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 3937.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 3444.3515625, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 33.689655172413794, "frac_reward_zero_std": 1.0, "grad_norm": 0.37285965772825935, "kl": 0.857421875, "learning_rate": 5.395810525450425e-07, "loss": 0.0086, "num_tokens": 194201192.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3937.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 3250.1015625, "completions/mean_terminated_length": 177.0, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 33.758620689655174, "frac_reward_zero_std": 0.9375, "grad_norm": 2.368582018342944, "kl": 0.875, "learning_rate": 5.369057478631359e-07, "loss": 0.0088, "num_tokens": 194748277.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4096.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3457.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 2987.0, "completions/min_terminated_length": 0.0, "epoch": 33.827586206896555, "frac_reward_zero_std": 0.875, "grad_norm": 5.045523147578903, "kl": 0.802734375, "learning_rate": 5.343233001836694e-07, "loss": 0.008, "num_tokens": 195320781.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 3898.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 3029.125, "completions/mean_terminated_length": 330.3333435058594, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 33.89655172413793, "frac_reward_zero_std": 1.0, "grad_norm": 0.3221564952389173, "kl": 0.912109375, "learning_rate": 5.318338178609754e-07, "loss": 0.0091, "num_tokens": 195838413.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3656.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3382.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 3204.0, "completions/min_terminated_length": 0.0, "epoch": 33.96551724137931, "frac_reward_zero_std": 1.0, "grad_norm": 0.6415043501475972, "kl": 0.7998046875, "learning_rate": 5.294374053487459e-07, "loss": 0.008, "num_tokens": 196402477.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3654.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 3087.46875, "completions/mean_terminated_length": 126.0, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 34.06896551724138, "frac_reward_zero_std": 0.9375, "grad_norm": 2.564493390981493, "kl": 0.94921875, "learning_rate": 5.271341631956511e-07, "loss": 0.0095, "num_tokens": 196928745.0, "reward": 0.0015625000232830644, "reward_std": 0.002893187804147601, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3684.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 3392.625, "completions/mean_terminated_length": 50.0, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 34.13793103448276, "frac_reward_zero_std": 1.0, "grad_norm": 0.937823822012556, "kl": 0.8486328125, "learning_rate": 5.249241880411181e-07, "loss": 0.0085, "num_tokens": 197494073.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3966.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 3509.4140625, "completions/mean_terminated_length": 50.0, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 34.206896551724135, "frac_reward_zero_std": 1.0, "grad_norm": 1.1941849080255262, "kl": 0.8662109375, "learning_rate": 5.228075726112785e-07, "loss": 0.0087, "num_tokens": 198074350.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3984.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3429.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 2261.0, "completions/min_terminated_length": 0.0, "epoch": 34.275862068965516, "frac_reward_zero_std": 1.0, "grad_norm": 1.368989622275372, "kl": 1.0263671875, "learning_rate": 5.207844057150768e-07, "loss": 0.0103, "num_tokens": 198644398.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 3260.3125, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 34.3448275862069, "frac_reward_zero_std": 0.9375, "grad_norm": 1.6426625269815358, "kl": 0.9677734375, "learning_rate": 5.188547722405437e-07, "loss": 0.0097, "num_tokens": 199191886.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3973.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 3510.6640625, "completions/mean_terminated_length": 244.0, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 34.41379310344828, "frac_reward_zero_std": 0.9375, "grad_norm": 1996.6623380906256, "kl": 63.958984375, "learning_rate": 5.170187531512351e-07, "loss": 0.6407, "num_tokens": 199772323.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 3746.34375, "completions/mean_terminated_length": 76.0, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 34.48275862068966, "frac_reward_zero_std": 0.875, "grad_norm": 5.607165443611961, "kl": 1.0458984375, "learning_rate": 5.152764254828348e-07, "loss": 0.0104, "num_tokens": 200380663.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3172.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 2518.75, "completions/mean_terminated_length": 164.0, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 34.55172413793103, "frac_reward_zero_std": 0.9375, "grad_norm": 6.292445339111056, "kl": 1.099609375, "learning_rate": 5.136278623399225e-07, "loss": 0.011, "num_tokens": 200832759.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 3886.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 3176.1015625, "completions/mean_terminated_length": 608.0, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 34.62068965517241, "frac_reward_zero_std": 0.9375, "grad_norm": 1.8352783555090613, "kl": 1.0361328125, "learning_rate": 5.120731328929058e-07, "loss": 0.0103, "num_tokens": 201369204.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3933.0, "completions/max_terminated_length": 781.0, "completions/mean_length": 3606.625, "completions/mean_terminated_length": 781.0, "completions/min_length": 781.0, "completions/min_terminated_length": 781.0, "epoch": 34.689655172413794, "frac_reward_zero_std": 1.0, "grad_norm": 1.1493563937634361, "kl": 0.912109375, "learning_rate": 5.106123023751187e-07, "loss": 0.0091, "num_tokens": 201961924.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 4001.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 3408.46875, "completions/mean_terminated_length": 49.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 34.758620689655174, "frac_reward_zero_std": 0.9375, "grad_norm": 1.5486194926247445, "kl": 1.109375, "learning_rate": 5.092454320800833e-07, "loss": 0.031, "num_tokens": 202529280.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 3826.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 3523.1484375, "completions/mean_terminated_length": 34.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 34.827586206896555, "frac_reward_zero_std": 0.9375, "grad_norm": 1.3741479681169653, "kl": 1.080078125, "learning_rate": 5.079725793589405e-07, "loss": 0.0092, "num_tokens": 203110875.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3439.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3050.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 2591.0, "completions/min_terminated_length": 0.0, "epoch": 34.89655172413793, "frac_reward_zero_std": 0.875, "grad_norm": 2.7986592185842185, "kl": 1.359375, "learning_rate": 5.067937976180407e-07, "loss": 0.0136, "num_tokens": 203632379.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3817.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 3290.6796875, "completions/mean_terminated_length": 192.0, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 34.96551724137931, "frac_reward_zero_std": 1.0, "grad_norm": 3.885159341324509, "kl": 2.0, "learning_rate": 5.057091363167046e-07, "loss": 0.02, "num_tokens": 204184658.0, "reward": 0.0, "reward_std": 0.0, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4090.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3021.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 1211.0, "completions/min_terminated_length": 0.0, "epoch": 35.06896551724138, "frac_reward_zero_std": 0.875, "grad_norm": 4.282611410016724, "kl": 1.685546875, "learning_rate": 5.047186409651489e-07, "loss": 0.0169, "num_tokens": 204701370.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 3723.5, "completions/mean_terminated_length": 11.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 35.13793103448276, "frac_reward_zero_std": 0.875, "grad_norm": 1.5373039872226522, "kl": 2.14453125, "learning_rate": 5.038223531225742e-07, "loss": 0.0234, "num_tokens": 205308146.0, "reward": 0.0015625000232830644, "reward_std": 0.0044194171205163, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12450689822435379, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 3439.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 3011.4609375, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 35.206896551724135, "frac_reward_zero_std": 0.9375, "grad_norm": 4.841716912287862, "kl": 3.078125, "learning_rate": 5.030203103954232e-07, "loss": 0.0308, "num_tokens": 205824685.0, "reward": 0.0007812500116415322, "reward_std": 0.00220970856025815, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.0883883461356163, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 4077.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 3439.9609375, "completions/mean_terminated_length": 81.33333587646484, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 35.275862068965516, "frac_reward_zero_std": 0.75, "grad_norm": 4.575034152640209, "kl": 3.66796875, "learning_rate": 5.023125464358026e-07, "loss": 0.0346, "num_tokens": 206394536.0, "reward": 0.00390625, "reward_std": 0.009522313252091408, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0390625, "rewards/format_reward/std": 0.194504976272583, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 4037.0, "completions/max_terminated_length": 2761.0, "completions/mean_length": 3622.9140625, "completions/mean_terminated_length": 1451.0, "completions/min_length": 675.0, "completions/min_terminated_length": 675.0, "epoch": 35.3448275862069, "frac_reward_zero_std": 0.75, "grad_norm": 3.7179488310500055, "kl": 3.53125, "learning_rate": 5.016990909400709e-07, "loss": 0.0366, "num_tokens": 206989341.0, "reward": 0.00390625, "reward_std": 0.009522313252091408, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0390625, "rewards/format_reward/std": 0.194504976272583, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 3968.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3635.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 3292.0, "completions/min_terminated_length": 0.0, "epoch": 35.41379310344828, "frac_reward_zero_std": 0.5625, "grad_norm": 20.553933911524943, "kl": 4.58984375, "learning_rate": 5.011799696475915e-07, "loss": 0.046, "num_tokens": 207585493.0, "reward": 0.007031249813735485, "reward_std": 0.018361147493124008, "rewards/code_format_reward/mean": 0.0078125, "rewards/code_format_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.0625, "rewards/format_reward/std": 0.24301259219646454, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 4021.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 3574.4140625, "completions/mean_terminated_length": 127.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 35.48275862068966, "frac_reward_zero_std": 0.8125, "grad_norm": 9.096320879858352, "kl": 4.7734375, "learning_rate": 5.007552043396547e-07, "loss": 0.0469, "num_tokens": 208174090.0, "reward": 0.0023437500931322575, "reward_std": 0.00662912568077445, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0234375, "rewards/format_reward/std": 0.15188287198543549, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 4029.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 3392.75, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 35.55172413793103, "frac_reward_zero_std": 0.6875, "grad_norm": 4.488695050523634, "kl": 6.3515625, "learning_rate": 5.004248128385618e-07, "loss": 0.0624, "num_tokens": 208738266.0, "reward": 0.0054687499068677425, "reward_std": 0.012415501289069653, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0546875, "rewards/format_reward/std": 0.22826264798641205, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 1039.0, "completions/mean_length": 3688.0234375, "completions/mean_terminated_length": 524.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 35.62068965517241, "frac_reward_zero_std": 0.75, "grad_norm": 4.721507511320711, "kl": 7.0859375, "learning_rate": 5.001888090068784e-07, "loss": 0.073, "num_tokens": 209340237.0, "reward": 0.00390625, "reward_std": 0.009522313252091408, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0390625, "rewards/format_reward/std": 0.194504976272583, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 3696.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 3322.875, "completions/mean_terminated_length": 278.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 35.689655172413794, "frac_reward_zero_std": 0.625, "grad_norm": 3.936276706948606, "kl": 6.3515625, "learning_rate": 5.000472027468528e-07, "loss": 0.0576, "num_tokens": 209896637.0, "reward": 0.0054687499068677425, "reward_std": 0.013941730372607708, "rewards/code_format_reward/mean": 0.0, "rewards/code_format_reward/std": 0.0, "rewards/format_reward/mean": 0.0546875, "rewards/format_reward/std": 0.22826264798641205, "rewards/ioi_code_reward/mean": 0.0, "rewards/ioi_code_reward/std": 0.0, "step": 500 }, { "epoch": 35.689655172413794, "step": 500, "total_flos": 0.0, "train_loss": 0.09227038282398825, "train_runtime": 24234.9154, "train_samples_per_second": 2.641, "train_steps_per_second": 0.021 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 209896637, "num_train_epochs": 36, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }