diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14043 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4477277815088426, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1695.0, + "completions/max_terminated_length": 1695.0, + "completions/mean_length": 590.9921875, + "completions/mean_terminated_length": 590.9921875, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.0008954555630176853, + "frac_reward_zero_std": 0.03125, + "grad_norm": 1.2949865529017888, + "kl": 0.0003485679626464844, + "learning_rate": 0.0, + "loss": 0.016, + "num_tokens": 614764.0, + "reward": 0.05976562947034836, + "reward_std": 0.0456097275018692, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.59765625, + "rewards/format_reward/std": 0.4908501207828522, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1975.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 614.201171875, + "completions/mean_terminated_length": 614.201171875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.0017909111260353706, + "frac_reward_zero_std": 0.03125, + "grad_norm": 1.163409985621042, + "kl": 0.0003452301025390625, + "learning_rate": 3.3333333333333335e-07, + "loss": 0.0213, + "num_tokens": 1251315.0, + "reward": 0.06015624850988388, + "reward_std": 0.04386558383703232, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.6015625, + "rewards/format_reward/std": 0.4900552034378052, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1578.0, + "completions/mean_length": 624.142578125, + "completions/mean_terminated_length": 621.3561401367188, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.002686366689053056, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1790462948029907, + "kl": 0.0003814697265625, + "learning_rate": 6.666666666666667e-07, + "loss": 0.0218, + "num_tokens": 1898076.0, + "reward": 0.05917969346046448, + "reward_std": 0.04427944868803024, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.591796875, + "rewards/format_reward/std": 0.49198177456855774, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 640.91015625, + "completions/mean_terminated_length": 632.616943359375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.003581822252070741, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.9219838132930147, + "kl": 0.00046539306640625, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.0346, + "num_tokens": 2551550.0, + "reward": 0.07011719048023224, + "reward_std": 0.04218312352895737, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.701171875, + "rewards/format_reward/std": 0.45819199085235596, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1910.0, + "completions/max_terminated_length": 1910.0, + "completions/mean_length": 592.734375, + "completions/mean_terminated_length": 592.734375, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.0044772778150884264, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.5536853080406829, + "kl": 0.0011653900146484375, + "learning_rate": 1.3333333333333334e-06, + "loss": 0.0238, + "num_tokens": 3137542.0, + "reward": 0.08632812649011612, + "reward_std": 0.030137624591588974, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.86328125, + "rewards/format_reward/std": 0.3438861668109894, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1852.0, + "completions/max_terminated_length": 1852.0, + "completions/mean_length": 602.6796875, + "completions/mean_terminated_length": 601.4500732421875, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.005372733378106112, + "frac_reward_zero_std": 0.125, + "grad_norm": 1.927833201852441, + "kl": 0.0010623931884765625, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.0328, + "num_tokens": 3761474.0, + "reward": 0.08613281697034836, + "reward_std": 0.03160237520933151, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.861328125, + "rewards/format_reward/std": 0.34594178199768066, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1836.0, + "completions/mean_length": 617.1640625, + "completions/mean_terminated_length": 614.364013671875, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.006268188941123797, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.36962881149557436, + "kl": 0.0016422271728515625, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0382, + "num_tokens": 4474534.0, + "reward": 0.09238281846046448, + "reward_std": 0.021231699734926224, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.923828125, + "rewards/format_reward/std": 0.26553234457969666, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1690.0, + "completions/max_terminated_length": 1690.0, + "completions/mean_length": 560.474609375, + "completions/mean_terminated_length": 560.474609375, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.007163644504141482, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.28651886538300714, + "kl": 0.0016231536865234375, + "learning_rate": 2.3333333333333336e-06, + "loss": 0.0085, + "num_tokens": 5073001.0, + "reward": 0.09648437798023224, + "reward_std": 0.011988259851932526, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.96484375, + "rewards/format_reward/std": 0.1843547374010086, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1955.0, + "completions/max_terminated_length": 1955.0, + "completions/mean_length": 541.166015625, + "completions/mean_terminated_length": 541.166015625, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.008059100067159167, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.15014269470364927, + "kl": 0.00441741943359375, + "learning_rate": 2.666666666666667e-06, + "loss": 0.0115, + "num_tokens": 5643102.0, + "reward": 0.09921875596046448, + "reward_std": 0.0031250000465661287, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08812850713729858, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1346.0, + "completions/max_terminated_length": 1346.0, + "completions/mean_length": 506.53515625, + "completions/mean_terminated_length": 506.53515625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.008954555630176853, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.08665106882827825, + "kl": 0.00691986083984375, + "learning_rate": 3e-06, + "loss": 0.0027, + "num_tokens": 6187616.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1596.0, + "completions/max_terminated_length": 1596.0, + "completions/mean_length": 543.220703125, + "completions/mean_terminated_length": 543.220703125, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.009850011193194537, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.08459209304869837, + "kl": 0.0081939697265625, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.0014, + "num_tokens": 6786177.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1400.0, + "completions/max_terminated_length": 1400.0, + "completions/mean_length": 473.095703125, + "completions/mean_terminated_length": 473.095703125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.010745466756212223, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.1349393788135462, + "kl": 0.010772705078125, + "learning_rate": 3.6666666666666666e-06, + "loss": 0.001, + "num_tokens": 7333618.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1708.0, + "completions/mean_length": 477.83984375, + "completions/mean_terminated_length": 474.7671203613281, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.011640922319229908, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.2695951493546503, + "kl": 0.01611328125, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0198, + "num_tokens": 7893104.0, + "reward": 0.09882812201976776, + "reward_std": 0.004687500186264515, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.10772226005792618, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1003.0, + "completions/max_terminated_length": 1003.0, + "completions/mean_length": 467.693359375, + "completions/mean_terminated_length": 467.693359375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.012536377882247594, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.14830650422442648, + "kl": 0.027679443359375, + "learning_rate": 4.333333333333334e-06, + "loss": -0.0028, + "num_tokens": 8487571.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1680.0, + "completions/mean_length": 479.158203125, + "completions/mean_terminated_length": 457.4118957519531, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.013431833445265278, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.1292681901261042, + "kl": 0.0396728515625, + "learning_rate": 4.666666666666667e-06, + "loss": -0.0035, + "num_tokens": 9052324.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1809.0, + "completions/mean_length": 471.365234375, + "completions/mean_terminated_length": 451.6119079589844, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.014327289008282965, + "frac_reward_zero_std": 0.96875, + "grad_norm": 1.505905631710607, + "kl": 0.1810302734375, + "learning_rate": 5e-06, + "loss": -0.0003, + "num_tokens": 9618767.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1936.0, + "completions/mean_length": 445.05859375, + "completions/mean_terminated_length": 416.3777160644531, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.015222744571300649, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.11459480404649415, + "kl": 0.04522705078125, + "learning_rate": 4.999952797253148e-06, + "loss": -0.0005, + "num_tokens": 10131773.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1491.0, + "completions/mean_length": 450.349609375, + "completions/mean_terminated_length": 434.59368896484375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.016118200134318333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040197815529214365, + "kl": 0.0364990234375, + "learning_rate": 4.9998111909931225e-06, + "loss": 0.0004, + "num_tokens": 10705280.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1205.0, + "completions/mean_length": 385.060546875, + "completions/mean_terminated_length": 378.53924560546875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.01701365569733602, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04327269884725563, + "kl": 0.03729248046875, + "learning_rate": 4.999575187161439e-06, + "loss": 0.0004, + "num_tokens": 11188831.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1855.0, + "completions/max_terminated_length": 1765.0, + "completions/mean_length": 395.326171875, + "completions/mean_terminated_length": 389.7784423828125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.017909111260353706, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.14595779879986776, + "kl": 0.04998779296875, + "learning_rate": 4.9992447956603455e-06, + "loss": 0.0004, + "num_tokens": 11684854.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1312.0, + "completions/max_terminated_length": 1312.0, + "completions/mean_length": 395.77734375, + "completions/mean_terminated_length": 395.77734375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.01880456682337139, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.1270393251550353, + "kl": 0.0399169921875, + "learning_rate": 4.998820030352409e-06, + "loss": 0.0025, + "num_tokens": 12168180.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1431.0, + "completions/max_terminated_length": 1431.0, + "completions/mean_length": 445.04296875, + "completions/mean_terminated_length": 445.04296875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.019700022386389075, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.08325698207439201, + "kl": 0.029510498046875, + "learning_rate": 4.998300909059929e-06, + "loss": -0.0003, + "num_tokens": 12693066.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1276.0, + "completions/max_terminated_length": 1276.0, + "completions/mean_length": 472.298828125, + "completions/mean_terminated_length": 471.02935791015625, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.02059547794940676, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18844888408802188, + "kl": 0.0380859375, + "learning_rate": 4.997687453564198e-06, + "loss": 0.0004, + "num_tokens": 13205331.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1154.0, + "completions/max_terminated_length": 1154.0, + "completions/mean_length": 531.49609375, + "completions/mean_terminated_length": 531.49609375, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.021490933512424447, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.06203949637731368, + "kl": 0.022369384765625, + "learning_rate": 4.9969796896045775e-06, + "loss": 0.0007, + "num_tokens": 13830705.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1293.0, + "completions/max_terminated_length": 1293.0, + "completions/mean_length": 504.82421875, + "completions/mean_terminated_length": 504.82421875, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.02238638907544213, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.11074876133870112, + "kl": 0.0238037109375, + "learning_rate": 4.996177646877426e-06, + "loss": 0.0015, + "num_tokens": 14356775.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 1577.0, + "completions/max_terminated_length": 1577.0, + "completions/mean_length": 557.5546875, + "completions/mean_terminated_length": 554.3740234375, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.023281844638459816, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.5291093870609653, + "kl": 0.02923583984375, + "learning_rate": 4.995281359034851e-06, + "loss": 0.0003, + "num_tokens": 14946355.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 1999.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 515.958984375, + "completions/mean_terminated_length": 512.8192749023438, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.0241773002014775, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.07928872087363079, + "kl": 0.04559326171875, + "learning_rate": 4.994290863683296e-06, + "loss": 0.001, + "num_tokens": 15480734.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 1846.0, + "completions/max_terminated_length": 1846.0, + "completions/mean_length": 574.833984375, + "completions/mean_terminated_length": 571.451904296875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.025072755764495188, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.08454160430867592, + "kl": 0.05511474609375, + "learning_rate": 4.99320620238196e-06, + "loss": 0.0018, + "num_tokens": 16059817.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1895.0, + "completions/mean_length": 620.23828125, + "completions/mean_terminated_length": 614.6392822265625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.025968211327512872, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.15422752090531933, + "kl": 0.020233154296875, + "learning_rate": 4.99202742064106e-06, + "loss": 0.0008, + "num_tokens": 16731331.0, + "reward": 0.0986328125, + "reward_std": 0.004973640665411949, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.986328125, + "rewards/format_reward/std": 0.1162383034825325, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1925.0, + "completions/mean_length": 589.849609375, + "completions/mean_terminated_length": 581.8382568359375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.026863666890530557, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.09738347351320935, + "kl": 0.051025390625, + "learning_rate": 4.990754567919917e-06, + "loss": 0.0162, + "num_tokens": 17335926.0, + "reward": 0.09941406548023224, + "reward_std": 0.0018486406188458204, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.994140625, + "rewards/format_reward/std": 0.07639661431312561, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1488.0, + "completions/mean_length": 555.578125, + "completions/mean_terminated_length": 550.5304565429688, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.02775912245354824, + "frac_reward_zero_std": 0.96875, + "grad_norm": 1.0282571546038983, + "kl": 0.0521240234375, + "learning_rate": 4.989387697624881e-06, + "loss": 0.0115, + "num_tokens": 17923134.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1941.0, + "completions/mean_length": 646.40625, + "completions/mean_terminated_length": 634.2544555664062, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.02865457801656593, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.349618502264421, + "kl": 0.07720947265625, + "learning_rate": 4.987926867107095e-06, + "loss": 0.0187, + "num_tokens": 18598862.0, + "reward": 0.09941406548023224, + "reward_std": 0.0023437500931322575, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.994140625, + "rewards/format_reward/std": 0.07639661431312561, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1606.0, + "completions/mean_length": 605.197265625, + "completions/mean_terminated_length": 577.9679565429688, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.029550033579583614, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.851046169134026, + "kl": 0.17608642578125, + "learning_rate": 4.986372137660078e-06, + "loss": 0.0221, + "num_tokens": 19225795.0, + "reward": 0.09921874850988388, + "reward_std": 0.0026298905722796917, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08812850713729858, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.859375, + "completions/max_length": 1548.0, + "completions/max_terminated_length": 1548.0, + "completions/mean_length": 517.287109375, + "completions/mean_terminated_length": 504.3876647949219, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.030445489142601298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2087030864895066, + "kl": 0.1455078125, + "learning_rate": 4.984723574517165e-06, + "loss": 0.0015, + "num_tokens": 19769414.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1372.0, + "completions/mean_length": 550.6015625, + "completions/mean_terminated_length": 536.6250610351562, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.03134094470561898, + "frac_reward_zero_std": 0.9375, + "grad_norm": 1.9192969116714527, + "kl": 0.04638671875, + "learning_rate": 4.9829812468487655e-06, + "loss": 0.0164, + "num_tokens": 20371434.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.703125, + "completions/max_length": 1479.0, + "completions/max_terminated_length": 1479.0, + "completions/mean_length": 549.693359375, + "completions/mean_terminated_length": 530.4969482421875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.03223640026863667, + "frac_reward_zero_std": 1.0, + "grad_norm": 1263.958700074066, + "kl": 68.7724609375, + "learning_rate": 4.981145227759457e-06, + "loss": 0.6899, + "num_tokens": 20970829.0, + "reward": 0.09687499701976776, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.17416280508041382, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.921875, + "completions/max_length": 1371.0, + "completions/max_terminated_length": 1371.0, + "completions/mean_length": 545.916015625, + "completions/mean_terminated_length": 540.122314453125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.03313185583165435, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03698563879129744, + "kl": 0.086212158203125, + "learning_rate": 4.979215594284924e-06, + "loss": 0.0009, + "num_tokens": 21555490.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.859375, + "completions/max_length": 1982.0, + "completions/max_terminated_length": 1982.0, + "completions/mean_length": 558.373046875, + "completions/mean_terminated_length": 546.431396484375, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.03402731139467204, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.13576120693482005, + "kl": 0.2076416015625, + "learning_rate": 4.977192427388722e-06, + "loss": 0.0039, + "num_tokens": 22166497.0, + "reward": 0.09941406548023224, + "reward_std": 0.0023437500931322575, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.994140625, + "rewards/format_reward/std": 0.07639661431312561, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 1401.0, + "completions/max_terminated_length": 1401.0, + "completions/mean_length": 515.453125, + "completions/mean_terminated_length": 511.3834228515625, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.03492276695768973, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03263560038943874, + "kl": 0.10321044921875, + "learning_rate": 4.9750758119588824e-06, + "loss": 0.001, + "num_tokens": 22731609.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1499.0, + "completions/mean_length": 535.62109375, + "completions/mean_terminated_length": 519.460205078125, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.03581822252070741, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.09399632708462895, + "kl": 0.236114501953125, + "learning_rate": 4.972865836804349e-06, + "loss": 0.0212, + "num_tokens": 23296151.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 1385.0, + "completions/max_terminated_length": 1385.0, + "completions/mean_length": 443.244140625, + "completions/mean_terminated_length": 440.1240234375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.036713678083725096, + "frac_reward_zero_std": 0.96875, + "grad_norm": 5.057284700020039, + "kl": 0.08331298828125, + "learning_rate": 4.970562594651254e-06, + "loss": 0.0128, + "num_tokens": 23796340.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.859375, + "completions/max_length": 1558.0, + "completions/max_terminated_length": 1558.0, + "completions/mean_length": 479.05078125, + "completions/mean_terminated_length": 466.76934814453125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.03760913364674278, + "frac_reward_zero_std": 0.875, + "grad_norm": 1.0023024311910922, + "kl": 0.1585693359375, + "learning_rate": 4.968166182139026e-06, + "loss": 0.0061, + "num_tokens": 24369582.0, + "reward": 0.09902343899011612, + "reward_std": 0.003411140525713563, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.990234375, + "rewards/format_reward/std": 0.09843364357948303, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1148.0, + "completions/mean_length": 414.234375, + "completions/mean_terminated_length": 407.6094665527344, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.038504589209760465, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.15413699128802932, + "kl": 0.06298828125, + "learning_rate": 4.9656766998163306e-06, + "loss": 0.0226, + "num_tokens": 24845446.0, + "reward": 0.09941406548023224, + "reward_std": 0.0023437500931322575, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.994140625, + "rewards/format_reward/std": 0.07639661431312561, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.921875, + "completions/max_length": 1464.0, + "completions/max_terminated_length": 1314.0, + "completions/mean_length": 437.359375, + "completions/mean_terminated_length": 430.48126220703125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.03940004477277815, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.23779403700635063, + "kl": 0.090850830078125, + "learning_rate": 4.963094252136865e-06, + "loss": -0.0011, + "num_tokens": 25368766.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.921875, + "completions/max_length": 1683.0, + "completions/max_terminated_length": 1683.0, + "completions/mean_length": 430.693359375, + "completions/mean_terminated_length": 421.4102478027344, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.040295500335795834, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.3591338929523544, + "kl": 0.076690673828125, + "learning_rate": 4.960418947454958e-06, + "loss": 0.0039, + "num_tokens": 25917201.0, + "reward": 0.09941406548023224, + "reward_std": 0.0023437500931322575, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.994140625, + "rewards/format_reward/std": 0.07639661431312561, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.890625, + "completions/max_length": 1086.0, + "completions/max_terminated_length": 1086.0, + "completions/mean_length": 415.494140625, + "completions/mean_terminated_length": 408.1029968261719, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.04119095589881352, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16254673027144365, + "kl": 0.1004638671875, + "learning_rate": 4.957650898021038e-06, + "loss": 0.001, + "num_tokens": 26428750.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 1904.0, + "completions/max_terminated_length": 1904.0, + "completions/mean_length": 441.0390625, + "completions/mean_terminated_length": 431.62847900390625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.04208641146183121, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.19054061716212126, + "kl": 0.09002685546875, + "learning_rate": 4.954790219976915e-06, + "loss": 0.0128, + "num_tokens": 26966306.0, + "reward": 0.09941406548023224, + "reward_std": 0.0018486406188458204, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.994140625, + "rewards/format_reward/std": 0.07639661431312561, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.65625, + "completions/max_length": 1602.0, + "completions/max_terminated_length": 1602.0, + "completions/mean_length": 473.658203125, + "completions/mean_terminated_length": 448.3489685058594, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.042981867024848894, + "frac_reward_zero_std": 0.96875, + "grad_norm": 3.439149122044318, + "kl": 0.166748046875, + "learning_rate": 4.95183703335091e-06, + "loss": 0.0106, + "num_tokens": 27563907.0, + "reward": 0.09785155951976776, + "reward_std": 0.0014959799591451883, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.978515625, + "rewards/format_reward/std": 0.14513419568538666, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.65625, + "completions/max_length": 1410.0, + "completions/max_terminated_length": 1410.0, + "completions/mean_length": 435.626953125, + "completions/mean_terminated_length": 414.824462890625, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.04387732258786658, + "frac_reward_zero_std": 0.90625, + "grad_norm": 1.8935940647535439, + "kl": 0.1712646484375, + "learning_rate": 4.948791462052819e-06, + "loss": -0.0051, + "num_tokens": 28065876.0, + "reward": 0.09941406548023224, + "reward_std": 0.0023437500931322575, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.994140625, + "rewards/format_reward/std": 0.07639661431312561, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.796875, + "completions/max_length": 1225.0, + "completions/max_terminated_length": 1225.0, + "completions/mean_length": 428.220703125, + "completions/mean_terminated_length": 415.7835693359375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.04477277815088426, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.8868819030212512, + "kl": 0.1536865234375, + "learning_rate": 4.945653633868716e-06, + "loss": -0.0052, + "num_tokens": 28605125.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1110.0, + "completions/mean_length": 457.01953125, + "completions/mean_terminated_length": 444.8520202636719, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.04566823371390195, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.15304329957981105, + "kl": 0.14794921875, + "learning_rate": 4.942423680455584e-06, + "loss": 0.0218, + "num_tokens": 29147839.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.625, + "completions/max_length": 1240.0, + "completions/max_terminated_length": 1240.0, + "completions/mean_length": 432.66796875, + "completions/mean_terminated_length": 413.81146240234375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.04656368927691963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23482997740855524, + "kl": 0.2802734375, + "learning_rate": 4.939101737335802e-06, + "loss": 0.0028, + "num_tokens": 29658757.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.734375, + "completions/max_length": 1195.0, + "completions/max_terminated_length": 1195.0, + "completions/mean_length": 438.236328125, + "completions/mean_terminated_length": 422.03436279296875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.047459144839937316, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.6889933551461128, + "kl": 0.253662109375, + "learning_rate": 4.935687943891447e-06, + "loss": 0.0029, + "num_tokens": 30180190.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1311.0, + "completions/mean_length": 498.029296875, + "completions/mean_terminated_length": 469.6290588378906, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.048354600402955, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.3531726053710491, + "kl": 0.38623046875, + "learning_rate": 4.932182443358458e-06, + "loss": 0.0107, + "num_tokens": 30770989.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.625, + "completions/max_length": 1710.0, + "completions/max_terminated_length": 1710.0, + "completions/mean_length": 454.625, + "completions/mean_terminated_length": 438.0184326171875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.04925005596597269, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12648196237180884, + "kl": 0.4267578125, + "learning_rate": 4.928585382820616e-06, + "loss": 0.0043, + "num_tokens": 31263629.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.609375, + "completions/max_length": 1855.0, + "completions/max_terminated_length": 1855.0, + "completions/mean_length": 519.474609375, + "completions/mean_terminated_length": 495.954833984375, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.050145511528990376, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10102802285833903, + "kl": 0.53369140625, + "learning_rate": 4.924896913203376e-06, + "loss": 0.0053, + "num_tokens": 31840480.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.453125, + "completions/max_length": 1248.0, + "completions/max_terminated_length": 1248.0, + "completions/mean_length": 515.52734375, + "completions/mean_terminated_length": 483.0272521972656, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.05104096709200806, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11688839296654617, + "kl": 0.7001953125, + "learning_rate": 4.921117189267535e-06, + "loss": 0.007, + "num_tokens": 32412846.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.578125, + "completions/max_length": 1303.0, + "completions/max_terminated_length": 1303.0, + "completions/mean_length": 496.125, + "completions/mean_terminated_length": 476.0907287597656, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.051936422655025745, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.11134456981324262, + "kl": 0.49658203125, + "learning_rate": 4.917246369602742e-06, + "loss": 0.0064, + "num_tokens": 32981054.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.609375, + "completions/max_length": 1756.0, + "completions/max_terminated_length": 1756.0, + "completions/mean_length": 523.712890625, + "completions/mean_terminated_length": 500.2217712402344, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.05283187821804343, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06899093954508781, + "kl": 0.5185546875, + "learning_rate": 4.9132846166208355e-06, + "loss": 0.0052, + "num_tokens": 33561563.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.578125, + "completions/max_length": 1418.0, + "completions/max_terminated_length": 1418.0, + "completions/mean_length": 535.345703125, + "completions/mean_terminated_length": 509.7340393066406, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.053727333781061114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07653749811047808, + "kl": 0.53466796875, + "learning_rate": 4.9092320965490365e-06, + "loss": 0.0053, + "num_tokens": 34159484.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.625, + "completions/max_length": 1332.0, + "completions/max_terminated_length": 1332.0, + "completions/mean_length": 540.06640625, + "completions/mean_terminated_length": 515.6085815429688, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.0546227893440788, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06559465349000848, + "kl": 0.56689453125, + "learning_rate": 4.905088979422971e-06, + "loss": 0.0057, + "num_tokens": 34758014.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.71875, + "completions/max_length": 1254.0, + "completions/max_terminated_length": 1254.0, + "completions/mean_length": 516.626953125, + "completions/mean_terminated_length": 502.10528564453125, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.05551824490709648, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04968607276207747, + "kl": 0.39599609375, + "learning_rate": 4.900855439079536e-06, + "loss": 0.004, + "num_tokens": 35325679.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.71875, + "completions/max_length": 1670.0, + "completions/max_terminated_length": 1670.0, + "completions/mean_length": 544.69140625, + "completions/mean_terminated_length": 527.228759765625, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.056413700470114174, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050819547419717004, + "kl": 0.3779296875, + "learning_rate": 4.8965316531496055e-06, + "loss": 0.0038, + "num_tokens": 35917633.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.609375, + "completions/max_length": 1638.0, + "completions/max_terminated_length": 1638.0, + "completions/mean_length": 614.998046875, + "completions/mean_terminated_length": 586.9384155273438, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.05730915603313186, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.9281709893512259, + "kl": 0.490234375, + "learning_rate": 4.892117803050578e-06, + "loss": 0.0085, + "num_tokens": 36515376.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.78125, + "completions/max_length": 1189.0, + "completions/max_terminated_length": 1189.0, + "completions/mean_length": 561.8984375, + "completions/mean_terminated_length": 547.4457397460938, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.05820461159614954, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.12936559351174773, + "kl": 0.284423828125, + "learning_rate": 4.887614073978761e-06, + "loss": 0.0028, + "num_tokens": 37101836.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.78125, + "completions/max_length": 1885.0, + "completions/max_terminated_length": 1885.0, + "completions/mean_length": 589.90625, + "completions/mean_terminated_length": 573.6224975585938, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.05910006715916723, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11081938425830375, + "kl": 0.2110595703125, + "learning_rate": 4.883020654901609e-06, + "loss": 0.0021, + "num_tokens": 37733500.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 1577.0, + "completions/max_terminated_length": 1577.0, + "completions/mean_length": 540.2421875, + "completions/mean_terminated_length": 533.0988159179688, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.05999552272218491, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03058666366366206, + "kl": 0.062225341796875, + "learning_rate": 4.878337738549785e-06, + "loss": 0.0006, + "num_tokens": 38316408.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.921875, + "completions/max_length": 1758.0, + "completions/max_terminated_length": 1758.0, + "completions/mean_length": 600.439453125, + "completions/mean_terminated_length": 596.3037719726562, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.060890978285202596, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046953867799324686, + "kl": 0.05511474609375, + "learning_rate": 4.873565521409082e-06, + "loss": 0.0006, + "num_tokens": 38938953.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1493.0, + "completions/mean_length": 599.556640625, + "completions/mean_terminated_length": 586.1148681640625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.06178643384822028, + "frac_reward_zero_std": 0.9375, + "grad_norm": 1.8300083372883273, + "kl": 0.1490478515625, + "learning_rate": 4.868704203712173e-06, + "loss": 0.0082, + "num_tokens": 39537958.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 1958.0, + "completions/max_terminated_length": 1958.0, + "completions/mean_length": 573.798828125, + "completions/mean_terminated_length": 570.0748291015625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.06268188941123796, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.08740740558555117, + "kl": 0.055633544921875, + "learning_rate": 4.86375398943021e-06, + "loss": 0.0021, + "num_tokens": 40134975.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1962.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 576.38671875, + "completions/mean_terminated_length": 576.1604614257812, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.06357734497425566, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.05822616064224703, + "kl": 0.02789306640625, + "learning_rate": 4.858715086264274e-06, + "loss": -0.0002, + "num_tokens": 40738437.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1333.0, + "completions/mean_length": 604.00390625, + "completions/mean_terminated_length": 595.5443725585938, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.06447280053727333, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.08496351887302583, + "kl": 0.071563720703125, + "learning_rate": 4.853587705636646e-06, + "loss": 0.0108, + "num_tokens": 41380487.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1295.0, + "completions/max_terminated_length": 1295.0, + "completions/mean_length": 541.388671875, + "completions/mean_terminated_length": 541.388671875, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.06536825610029103, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006430184540198528, + "kl": 0.01727294921875, + "learning_rate": 4.84837206268195e-06, + "loss": 0.0002, + "num_tokens": 41942766.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1405.0, + "completions/max_terminated_length": 1405.0, + "completions/mean_length": 526.68359375, + "completions/mean_terminated_length": 525.0332641601562, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.0662637116633087, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050587940561448004, + "kl": 0.0487060546875, + "learning_rate": 4.8430683762381195e-06, + "loss": 0.0005, + "num_tokens": 42489644.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1577.0, + "completions/max_terminated_length": 1577.0, + "completions/mean_length": 607.765625, + "completions/mean_terminated_length": 605.7216186523438, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.0671591672263264, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.41589730301523176, + "kl": 0.049285888671875, + "learning_rate": 4.837676868837213e-06, + "loss": 0.0007, + "num_tokens": 43111732.0, + "reward": 0.09941406548023224, + "reward_std": 0.0023437500931322575, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.994140625, + "rewards/format_reward/std": 0.07639661431312561, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1516.0, + "completions/max_terminated_length": 1516.0, + "completions/mean_length": 543.322265625, + "completions/mean_terminated_length": 542.630126953125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.06805462278934409, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.06608406351030027, + "kl": 0.027587890625, + "learning_rate": 4.832197766696085e-06, + "loss": -0.0021, + "num_tokens": 43678265.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1981.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 591.98046875, + "completions/mean_terminated_length": 591.98046875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.06895007835236176, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007514927935081122, + "kl": 0.014678955078125, + "learning_rate": 4.826631299706887e-06, + "loss": 0.0001, + "num_tokens": 44288559.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1318.0, + "completions/max_terminated_length": 1318.0, + "completions/mean_length": 571.615234375, + "completions/mean_terminated_length": 571.2055053710938, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.06984553391537945, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015106634040085255, + "kl": 0.0263824462890625, + "learning_rate": 4.820977701427424e-06, + "loss": 0.0003, + "num_tokens": 44863674.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1360.0, + "completions/max_terminated_length": 1360.0, + "completions/mean_length": 562.982421875, + "completions/mean_terminated_length": 562.4638061523438, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.07074098947839713, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012163661806333983, + "kl": 0.02752685546875, + "learning_rate": 4.81523720907136e-06, + "loss": 0.0003, + "num_tokens": 45473345.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 1590.0, + "completions/max_terminated_length": 1590.0, + "completions/mean_length": 556.7109375, + "completions/mean_terminated_length": 552.409423828125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.07163644504141482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0356262021967284, + "kl": 0.081390380859375, + "learning_rate": 4.809410063498254e-06, + "loss": 0.0008, + "num_tokens": 46071037.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1244.0, + "completions/max_terminated_length": 1244.0, + "completions/mean_length": 548.291015625, + "completions/mean_terminated_length": 546.3294677734375, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.0725319006044325, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01599046315259318, + "kl": 0.039093017578125, + "learning_rate": 4.8034965092034656e-06, + "loss": 0.0004, + "num_tokens": 46675298.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1712.0, + "completions/max_terminated_length": 1712.0, + "completions/mean_length": 587.048828125, + "completions/mean_terminated_length": 585.8140869140625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.07342735616745019, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009076777613622609, + "kl": 0.02301025390625, + "learning_rate": 4.797496794307889e-06, + "loss": 0.0002, + "num_tokens": 47280811.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1663.0, + "completions/mean_length": 594.373046875, + "completions/mean_terminated_length": 587.3948974609375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.07432281173046787, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.15365831760972712, + "kl": 0.0318756103515625, + "learning_rate": 4.791411170547545e-06, + "loss": 0.0288, + "num_tokens": 47912698.0, + "reward": 0.09921875596046448, + "reward_std": 0.0031250000465661287, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08812850713729858, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1717.0, + "completions/max_terminated_length": 1717.0, + "completions/mean_length": 568.76953125, + "completions/mean_terminated_length": 568.76953125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.07521826729348556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006783503334953544, + "kl": 0.014373779296875, + "learning_rate": 4.785239893263017e-06, + "loss": 0.0001, + "num_tokens": 48532644.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1656.0, + "completions/max_terminated_length": 1656.0, + "completions/mean_length": 559.60546875, + "completions/mean_terminated_length": 559.60546875, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.07611372285650325, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006573209966206178, + "kl": 0.0153350830078125, + "learning_rate": 4.778983221388742e-06, + "loss": 0.0002, + "num_tokens": 49114330.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1991.0, + "completions/mean_length": 586.0703125, + "completions/mean_terminated_length": 580.0255737304688, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.07700917841952093, + "frac_reward_zero_std": 0.96875, + "grad_norm": 1.551142553430298, + "kl": 0.0923309326171875, + "learning_rate": 4.77264141744214e-06, + "loss": 0.0139, + "num_tokens": 49735342.0, + "reward": 0.099609375, + "reward_std": 0.0010673906654119492, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1488.0, + "completions/max_terminated_length": 1488.0, + "completions/mean_length": 599.73828125, + "completions/mean_terminated_length": 598.367919921875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.07790463398253862, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01035156923582944, + "kl": 0.025360107421875, + "learning_rate": 4.766214747512603e-06, + "loss": 0.0003, + "num_tokens": 50352280.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1399.0, + "completions/max_terminated_length": 1399.0, + "completions/mean_length": 556.302734375, + "completions/mean_terminated_length": 555.140869140625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.0788000895455563, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010699159061825052, + "kl": 0.034393310546875, + "learning_rate": 4.759703481250331e-06, + "loss": 0.0003, + "num_tokens": 50955907.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1160.0, + "completions/max_terminated_length": 1160.0, + "completions/mean_length": 543.453125, + "completions/mean_terminated_length": 541.4921875, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.07969554510857399, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013360408248908037, + "kl": 0.045135498046875, + "learning_rate": 4.753107891855015e-06, + "loss": 0.0005, + "num_tokens": 51588075.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1900.0, + "completions/max_terminated_length": 1900.0, + "completions/mean_length": 553.51953125, + "completions/mean_terminated_length": 552.8238525390625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.08059100067159167, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011168392202523601, + "kl": 0.02880859375, + "learning_rate": 4.746428256064375e-06, + "loss": 0.0003, + "num_tokens": 52190549.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1500.0, + "completions/max_terminated_length": 1500.0, + "completions/mean_length": 527.140625, + "completions/mean_terminated_length": 527.140625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.08148645623460936, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005992984864276156, + "kl": 0.01611328125, + "learning_rate": 4.7396648541425534e-06, + "loss": 0.0002, + "num_tokens": 52761421.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1572.0, + "completions/max_terminated_length": 1572.0, + "completions/mean_length": 521.013671875, + "completions/mean_terminated_length": 520.2974243164062, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.08238191179762704, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.05877329461584653, + "kl": 0.02740478515625, + "learning_rate": 4.732817969868348e-06, + "loss": -0.0054, + "num_tokens": 53347188.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1517.0, + "completions/max_terminated_length": 1517.0, + "completions/mean_length": 578.26171875, + "completions/mean_terminated_length": 577.1585083007812, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.08327736736064473, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011239280015537782, + "kl": 0.0374908447265625, + "learning_rate": 4.7258878905233095e-06, + "loss": 0.0004, + "num_tokens": 53968970.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1874.0, + "completions/mean_length": 574.3359375, + "completions/mean_terminated_length": 571.4520263671875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.08417282292366242, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008823775393731852, + "kl": 0.0159149169921875, + "learning_rate": 4.718874906879688e-06, + "loss": 0.0002, + "num_tokens": 54580806.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1479.0, + "completions/max_terminated_length": 1479.0, + "completions/mean_length": 530.111328125, + "completions/mean_terminated_length": 530.111328125, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.0850682784866801, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007847241033788286, + "kl": 0.01611328125, + "learning_rate": 4.711779313188231e-06, + "loss": 0.0002, + "num_tokens": 55117295.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1631.0, + "completions/max_terminated_length": 1631.0, + "completions/mean_length": 620.12890625, + "completions/mean_terminated_length": 618.1898193359375, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.08596373404969779, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009847757471650577, + "kl": 0.030364990234375, + "learning_rate": 4.70460140716584e-06, + "loss": 0.0003, + "num_tokens": 55756081.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1394.0, + "completions/mean_length": 533.216796875, + "completions/mean_terminated_length": 520.8560180664062, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.08685918961271547, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02188329125598179, + "kl": 0.063232421875, + "learning_rate": 4.697341489983076e-06, + "loss": 0.0006, + "num_tokens": 56308048.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 1538.0, + "completions/max_terminated_length": 1538.0, + "completions/mean_length": 602.330078125, + "completions/mean_terminated_length": 598.328125, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.08775464517573316, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014668515840607925, + "kl": 0.036651611328125, + "learning_rate": 4.6899998662515215e-06, + "loss": 0.0004, + "num_tokens": 56933033.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1898.0, + "completions/mean_length": 537.30078125, + "completions/mean_terminated_length": 527.7637939453125, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.08865010073875083, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.06815628049927762, + "kl": 0.0523681640625, + "learning_rate": 4.682576844011007e-06, + "loss": -0.0004, + "num_tokens": 57503027.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1472.0, + "completions/mean_length": 573.46484375, + "completions/mean_terminated_length": 568.1709594726562, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.08954555630176853, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0137112684812689, + "kl": 0.0355224609375, + "learning_rate": 4.675072734716678e-06, + "loss": 0.0004, + "num_tokens": 58080577.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1927.0, + "completions/mean_length": 621.76953125, + "completions/mean_terminated_length": 613.8447875976562, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.09044101186478622, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009438245136725277, + "kl": 0.0351409912109375, + "learning_rate": 4.667487853225931e-06, + "loss": 0.0004, + "num_tokens": 58743211.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1959.0, + "completions/mean_length": 620.9609375, + "completions/mean_terminated_length": 606.7233276367188, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.0913364674278039, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.09659170255219529, + "kl": 0.0374298095703125, + "learning_rate": 4.659822517785203e-06, + "loss": 0.0174, + "num_tokens": 59349367.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 611.19921875, + "completions/mean_terminated_length": 602.1830444335938, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.09223192299082159, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.08032704473046437, + "kl": 0.0544891357421875, + "learning_rate": 4.6520770500166165e-06, + "loss": 0.0096, + "num_tokens": 59975933.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1547.0, + "completions/mean_length": 639.21875, + "completions/mean_terminated_length": 633.039306640625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.09312737855383926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01748885143685336, + "kl": 0.0367431640625, + "learning_rate": 4.644251774904487e-06, + "loss": 0.0004, + "num_tokens": 60649933.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1570.0, + "completions/mean_length": 606.859375, + "completions/mean_terminated_length": 601.9310913085938, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.09402283411685695, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.10834880104883948, + "kl": 0.03607177734375, + "learning_rate": 4.636347020781684e-06, + "loss": 0.0093, + "num_tokens": 61248245.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1984.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 577.3125, + "completions/mean_terminated_length": 577.3125, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.09491828967987463, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.062441023022902345, + "kl": 0.0150604248046875, + "learning_rate": 4.6283631193158605e-06, + "loss": -0.0008, + "num_tokens": 61822373.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1943.0, + "completions/mean_length": 616.103515625, + "completions/mean_terminated_length": 609.403564453125, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.09581374524289232, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019687455098924667, + "kl": 0.029449462890625, + "learning_rate": 4.620300405495532e-06, + "loss": 0.0003, + "num_tokens": 62427850.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1849.0, + "completions/mean_length": 611.4140625, + "completions/mean_terminated_length": 607.3745727539062, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.09670920080591, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012668657725866108, + "kl": 0.0213623046875, + "learning_rate": 4.612159217616022e-06, + "loss": 0.0002, + "num_tokens": 63066814.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1751.0, + "completions/mean_length": 602.12109375, + "completions/mean_terminated_length": 593.2051391601562, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.09760465636892769, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04029833180728693, + "kl": 0.0290985107421875, + "learning_rate": 4.603939897265268e-06, + "loss": 0.0003, + "num_tokens": 63683004.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1547.0, + "completions/mean_length": 582.087890625, + "completions/mean_terminated_length": 576.3392333984375, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.09850011193194538, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006320606993388272, + "kl": 0.01507568359375, + "learning_rate": 4.595642789309492e-06, + "loss": 0.0002, + "num_tokens": 64273289.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1438.0, + "completions/mean_length": 595.115234375, + "completions/mean_terminated_length": 587.6627197265625, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.09939556749496306, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.09846182063050066, + "kl": 0.03070068359375, + "learning_rate": 4.587268241878724e-06, + "loss": 0.0086, + "num_tokens": 64844916.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1879.0, + "completions/mean_length": 601.66796875, + "completions/mean_terminated_length": 586.6806030273438, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.10029102305798075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05979181929406999, + "kl": 0.042999267578125, + "learning_rate": 4.578816606352205e-06, + "loss": 0.0004, + "num_tokens": 65471226.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1442.0, + "completions/mean_length": 603.990234375, + "completions/mean_terminated_length": 600.4745483398438, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.10118647862099843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01187867971560546, + "kl": 0.019927978515625, + "learning_rate": 4.570288237343632e-06, + "loss": 0.0002, + "num_tokens": 66080773.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1423.0, + "completions/max_terminated_length": 1423.0, + "completions/mean_length": 630.3046875, + "completions/mean_terminated_length": 630.3046875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.10208193418401612, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005428850450334548, + "kl": 0.012451171875, + "learning_rate": 4.561683492686289e-06, + "loss": 0.0001, + "num_tokens": 66714161.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1657.0, + "completions/mean_length": 601.828125, + "completions/mean_terminated_length": 596.7976684570312, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.1029773897470338, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11416624154361638, + "kl": 0.0304107666015625, + "learning_rate": 4.5530027334180285e-06, + "loss": 0.0003, + "num_tokens": 67311769.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1542.0, + "completions/mean_length": 570.79296875, + "completions/mean_terminated_length": 564.2440795898438, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.10387284531005149, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020105885250086865, + "kl": 0.026123046875, + "learning_rate": 4.544246323766122e-06, + "loss": 0.0003, + "num_tokens": 67915423.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 1785.0, + "completions/max_terminated_length": 1785.0, + "completions/mean_length": 596.521484375, + "completions/mean_terminated_length": 593.3208618164062, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.10476830087306917, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012545587800104133, + "kl": 0.022369384765625, + "learning_rate": 4.535414631131983e-06, + "loss": 0.0002, + "num_tokens": 68509082.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1363.0, + "completions/mean_length": 569.6796875, + "completions/mean_terminated_length": 562.622802734375, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.10566375643608686, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.04035915126510421, + "kl": 0.018463134765625, + "learning_rate": 4.526508026075746e-06, + "loss": 0.0, + "num_tokens": 69106406.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1713.0, + "completions/max_terminated_length": 1713.0, + "completions/mean_length": 595.96875, + "completions/mean_terminated_length": 595.4912109375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.10655921199910455, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007108927697730135, + "kl": 0.0175018310546875, + "learning_rate": 4.517526882300721e-06, + "loss": 0.0002, + "num_tokens": 69726022.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1847.0, + "completions/max_terminated_length": 1847.0, + "completions/mean_length": 571.841796875, + "completions/mean_terminated_length": 570.9823608398438, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.10745466756212223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011503710528882577, + "kl": 0.0205230712890625, + "learning_rate": 4.508471576637713e-06, + "loss": 0.0002, + "num_tokens": 70342293.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1537.0, + "completions/max_terminated_length": 1537.0, + "completions/mean_length": 570.453125, + "completions/mean_terminated_length": 570.453125, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.10835012312513992, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006772977452012986, + "kl": 0.012603759765625, + "learning_rate": 4.499342489029211e-06, + "loss": 0.0001, + "num_tokens": 70936061.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 1694.0, + "completions/max_terminated_length": 1694.0, + "completions/mean_length": 601.3359375, + "completions/mean_terminated_length": 597.08056640625, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.1092455786881576, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017466662282534594, + "kl": 0.0307464599609375, + "learning_rate": 4.490140002513449e-06, + "loss": 0.0003, + "num_tokens": 71557209.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1514.0, + "completions/mean_length": 557.890625, + "completions/mean_terminated_length": 551.7976684570312, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.11014103425117529, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012604297189502118, + "kl": 0.025177001953125, + "learning_rate": 4.48086450320833e-06, + "loss": 0.0003, + "num_tokens": 72139169.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1292.0, + "completions/max_terminated_length": 1292.0, + "completions/mean_length": 552.798828125, + "completions/mean_terminated_length": 551.892333984375, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.11103648981419297, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008976864650579949, + "kl": 0.018157958984375, + "learning_rate": 4.4715163802952266e-06, + "loss": 0.0002, + "num_tokens": 72725034.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1579.0, + "completions/max_terminated_length": 1579.0, + "completions/mean_length": 600.53125, + "completions/mean_terminated_length": 600.53125, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.11193194537721066, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00470251748733287, + "kl": 0.011016845703125, + "learning_rate": 4.462096026002655e-06, + "loss": 0.0001, + "num_tokens": 73320762.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 1402.0, + "completions/max_terminated_length": 1402.0, + "completions/mean_length": 578.86328125, + "completions/mean_terminated_length": 573.9291381835938, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.11282740094022835, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.23897883202723777, + "kl": 0.0271148681640625, + "learning_rate": 4.4526038355898144e-06, + "loss": 0.0038, + "num_tokens": 73929892.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1232.0, + "completions/max_terminated_length": 1232.0, + "completions/mean_length": 547.65234375, + "completions/mean_terminated_length": 546.6477661132812, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.11372285650324603, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.08286544589526837, + "kl": 0.0220489501953125, + "learning_rate": 4.4430402073300035e-06, + "loss": -0.0006, + "num_tokens": 74521522.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1391.0, + "completions/max_terminated_length": 1391.0, + "completions/mean_length": 567.892578125, + "completions/mean_terminated_length": 566.9041137695312, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.11461831206626372, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030056592458929755, + "kl": 0.0231781005859375, + "learning_rate": 4.433405542493909e-06, + "loss": 0.0002, + "num_tokens": 75106875.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1772.0, + "completions/max_terminated_length": 1772.0, + "completions/mean_length": 597.25390625, + "completions/mean_terminated_length": 596.0645751953125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.1155137676292814, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008412131791043819, + "kl": 0.0222015380859375, + "learning_rate": 4.4237002453327734e-06, + "loss": 0.0002, + "num_tokens": 75719277.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.921875, + "completions/max_length": 1290.0, + "completions/max_terminated_length": 1290.0, + "completions/mean_length": 565.88671875, + "completions/mean_terminated_length": 559.84619140625, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.11640922319229909, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.32429218735536774, + "kl": 0.0892486572265625, + "learning_rate": 4.4139247230614245e-06, + "loss": 0.0009, + "num_tokens": 76317571.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1534.0, + "completions/max_terminated_length": 1534.0, + "completions/mean_length": 547.5546875, + "completions/mean_terminated_length": 545.9549560546875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.11730467875531676, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014236274188526875, + "kl": 0.034820556640625, + "learning_rate": 4.404079385841201e-06, + "loss": 0.0003, + "num_tokens": 76885567.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1635.0, + "completions/mean_length": 600.884765625, + "completions/mean_terminated_length": 591.5767822265625, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.11820013431833445, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.06339289426546586, + "kl": 0.031524658203125, + "learning_rate": 4.394164646762734e-06, + "loss": 0.0003, + "num_tokens": 77486612.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1761.0, + "completions/mean_length": 532.310546875, + "completions/mean_terminated_length": 526.36669921875, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.11909558988135213, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.13312016968576001, + "kl": 0.018341064453125, + "learning_rate": 4.384180921828618e-06, + "loss": 0.0181, + "num_tokens": 78117891.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1484.0, + "completions/mean_length": 540.021484375, + "completions/mean_terminated_length": 533.6011962890625, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.11999104544436982, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.08648084234997525, + "kl": 0.046478271484375, + "learning_rate": 4.374128629935955e-06, + "loss": -0.0024, + "num_tokens": 78698590.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1468.0, + "completions/mean_length": 532.849609375, + "completions/mean_terminated_length": 523.9194946289062, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.12088650100738751, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.08052364773538578, + "kl": 0.02685546875, + "learning_rate": 4.364008192858781e-06, + "loss": -0.0032, + "num_tokens": 79283569.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1637.0, + "completions/mean_length": 528.208984375, + "completions/mean_terminated_length": 513.2208862304688, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.12178195657040519, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.11289715329207015, + "kl": 0.046234130859375, + "learning_rate": 4.353820035230366e-06, + "loss": -0.003, + "num_tokens": 79851820.0, + "reward": 0.09941406548023224, + "reward_std": 0.0023437500931322575, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.994140625, + "rewards/format_reward/std": 0.07639661431312561, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1748.0, + "completions/mean_length": 456.6640625, + "completions/mean_terminated_length": 453.5498962402344, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.12267741213342288, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.08924289327164087, + "kl": 0.03387451171875, + "learning_rate": 4.3435645845254e-06, + "loss": -0.001, + "num_tokens": 80383584.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1322.0, + "completions/mean_length": 508.736328125, + "completions/mean_terminated_length": 496.61614990234375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.12357286769644056, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.09347602654223215, + "kl": 0.03009033203125, + "learning_rate": 4.333242271042054e-06, + "loss": 0.0017, + "num_tokens": 80995369.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1145.0, + "completions/max_terminated_length": 1145.0, + "completions/mean_length": 465.349609375, + "completions/mean_terminated_length": 465.349609375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.12446832325945825, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013730702087078508, + "kl": 0.03009033203125, + "learning_rate": 4.32285352788393e-06, + "loss": 0.0003, + "num_tokens": 81553276.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1478.0, + "completions/max_terminated_length": 1478.0, + "completions/mean_length": 480.57421875, + "completions/mean_terminated_length": 480.57421875, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.12536377882247593, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.07481910987499471, + "kl": 0.02728271484375, + "learning_rate": 4.312398790941882e-06, + "loss": 0.0012, + "num_tokens": 82114402.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1834.0, + "completions/max_terminated_length": 1834.0, + "completions/mean_length": 450.7265625, + "completions/mean_terminated_length": 450.7265625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.1262592343854936, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011789371347269173, + "kl": 0.0286865234375, + "learning_rate": 4.301878498875735e-06, + "loss": 0.0003, + "num_tokens": 82631798.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1573.0, + "completions/mean_length": 455.955078125, + "completions/mean_terminated_length": 449.7117919921875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.1271546899485113, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.052832056240060994, + "kl": 0.02972412109375, + "learning_rate": 4.291293093095873e-06, + "loss": -0.0012, + "num_tokens": 83148911.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1352.0, + "completions/mean_length": 502.37109375, + "completions/mean_terminated_length": 499.34637451171875, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.128050145511529, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.04390275036916505, + "kl": 0.02667236328125, + "learning_rate": 4.280643017744723e-06, + "loss": 0.0001, + "num_tokens": 83749389.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1583.0, + "completions/mean_length": 556.841796875, + "completions/mean_terminated_length": 552.2809448242188, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.12894560107454667, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.08664061338905497, + "kl": 0.0445556640625, + "learning_rate": 4.269928719678117e-06, + "loss": 0.01, + "num_tokens": 84340956.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1205.0, + "completions/mean_length": 497.876953125, + "completions/mean_terminated_length": 494.84344482421875, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.12984105663756437, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010525690663272753, + "kl": 0.025848388671875, + "learning_rate": 4.2591506484465426e-06, + "loss": 0.0003, + "num_tokens": 84899501.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1429.0, + "completions/max_terminated_length": 1429.0, + "completions/mean_length": 517.29296875, + "completions/mean_terminated_length": 517.29296875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.13073651220058205, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008489301450088622, + "kl": 0.023101806640625, + "learning_rate": 4.248309256276283e-06, + "loss": 0.0002, + "num_tokens": 85466867.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1663.0, + "completions/max_terminated_length": 1663.0, + "completions/mean_length": 505.10546875, + "completions/mean_terminated_length": 505.10546875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.13163196776359973, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007370931688452669, + "kl": 0.021636962890625, + "learning_rate": 4.23740499805044e-06, + "loss": 0.0002, + "num_tokens": 85994841.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1625.0, + "completions/mean_length": 477.810546875, + "completions/mean_terminated_length": 474.7377624511719, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.1325274233266174, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0756510185292118, + "kl": 0.0216064453125, + "learning_rate": 4.22643833128985e-06, + "loss": 0.0105, + "num_tokens": 86546552.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1965.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 545.544921875, + "completions/mean_terminated_length": 545.544921875, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.1334228788896351, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0603333547239931, + "kl": 0.02008056640625, + "learning_rate": 4.215409716133885e-06, + "loss": 0.0014, + "num_tokens": 87128943.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1428.0, + "completions/max_terminated_length": 1428.0, + "completions/mean_length": 561.455078125, + "completions/mean_terminated_length": 559.75927734375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.1343183344526528, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009368817627326945, + "kl": 0.028350830078125, + "learning_rate": 4.204319615321151e-06, + "loss": 0.0003, + "num_tokens": 87733000.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1804.0, + "completions/max_terminated_length": 1804.0, + "completions/mean_length": 557.017578125, + "completions/mean_terminated_length": 555.7691040039062, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.13521379001567047, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.06317428308270816, + "kl": 0.02294921875, + "learning_rate": 4.193168494170065e-06, + "loss": 0.0009, + "num_tokens": 88335393.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1721.0, + "completions/mean_length": 580.064453125, + "completions/mean_terminated_length": 575.8349609375, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.13610924557868817, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0826749090914757, + "kl": 0.036865234375, + "learning_rate": 4.181956820559339e-06, + "loss": 0.0103, + "num_tokens": 88925634.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1690.0, + "completions/max_terminated_length": 1690.0, + "completions/mean_length": 553.10546875, + "completions/mean_terminated_length": 550.6451416015625, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.13700470114170585, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00995949815814587, + "kl": 0.027069091796875, + "learning_rate": 4.170685064908342e-06, + "loss": 0.0003, + "num_tokens": 89501912.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1344.0, + "completions/mean_length": 578.724609375, + "completions/mean_terminated_length": 575.8493041992188, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.13790015670472353, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.09221706487886625, + "kl": 0.0160675048828125, + "learning_rate": 4.159353700157365e-06, + "loss": 0.0126, + "num_tokens": 90088363.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1557.0, + "completions/max_terminated_length": 1557.0, + "completions/mean_length": 540.712890625, + "completions/mean_terminated_length": 540.712890625, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.1387956122677412, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006455095544717651, + "kl": 0.0162353515625, + "learning_rate": 4.14796320174778e-06, + "loss": 0.0002, + "num_tokens": 90652088.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1381.0, + "completions/max_terminated_length": 1381.0, + "completions/mean_length": 559.244140625, + "completions/mean_terminated_length": 559.244140625, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.1396910678307589, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005425296726901158, + "kl": 0.0153961181640625, + "learning_rate": 4.136514047602087e-06, + "loss": 0.0002, + "num_tokens": 91253445.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1577.0, + "completions/max_terminated_length": 1577.0, + "completions/mean_length": 564.927734375, + "completions/mean_terminated_length": 564.4657592773438, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.14058652339377659, + "frac_reward_zero_std": 0.96875, + "grad_norm": 2.504373695194436, + "kl": 0.021728515625, + "learning_rate": 4.1250067181038635e-06, + "loss": 0.003, + "num_tokens": 91860448.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1609.0, + "completions/max_terminated_length": 1609.0, + "completions/mean_length": 605.458984375, + "completions/mean_terminated_length": 604.2720336914062, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.14148197895679426, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00715206410917614, + "kl": 0.0187835693359375, + "learning_rate": 4.113441696077608e-06, + "loss": 0.0002, + "num_tokens": 92504011.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1686.0, + "completions/max_terminated_length": 1686.0, + "completions/mean_length": 573.283203125, + "completions/mean_terminated_length": 572.3072509765625, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.14237743451981194, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015028040699635488, + "kl": 0.02349853515625, + "learning_rate": 4.101819466768484e-06, + "loss": 0.0002, + "num_tokens": 93098252.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.921875, + "completions/max_length": 1435.0, + "completions/max_terminated_length": 1435.0, + "completions/mean_length": 575.388671875, + "completions/mean_terminated_length": 569.3017578125, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.14327289008282965, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09941954542796112, + "kl": 0.056854248046875, + "learning_rate": 4.0901405178219535e-06, + "loss": 0.0006, + "num_tokens": 93693747.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1406.0, + "completions/max_terminated_length": 1406.0, + "completions/mean_length": 538.818359375, + "completions/mean_terminated_length": 538.818359375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.14416834564584732, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00484019642481913, + "kl": 0.015655517578125, + "learning_rate": 4.078405339263326e-06, + "loss": 0.0002, + "num_tokens": 94259334.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1589.0, + "completions/max_terminated_length": 1589.0, + "completions/mean_length": 625.568359375, + "completions/mean_terminated_length": 622.6941528320312, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.145063801208865, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11710367237098084, + "kl": 0.0552978515625, + "learning_rate": 4.06661442347719e-06, + "loss": 0.0006, + "num_tokens": 94913801.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1388.0, + "completions/max_terminated_length": 1388.0, + "completions/mean_length": 580.35546875, + "completions/mean_terminated_length": 578.882568359375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.1459592567718827, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14896474611580432, + "kl": 0.045867919921875, + "learning_rate": 4.054768265186758e-06, + "loss": 0.0005, + "num_tokens": 95520063.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1146.0, + "completions/mean_length": 566.701171875, + "completions/mean_terminated_length": 562.4401245117188, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.14685471233490038, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.15854952145501405, + "kl": 0.0495452880859375, + "learning_rate": 4.0428673614331036e-06, + "loss": 0.0121, + "num_tokens": 96098582.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 1444.0, + "completions/max_terminated_length": 1444.0, + "completions/mean_length": 600.06640625, + "completions/mean_terminated_length": 596.2598266601562, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.14775016789791806, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20531760245321556, + "kl": 0.08526611328125, + "learning_rate": 4.030912211554316e-06, + "loss": 0.0009, + "num_tokens": 96741480.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1792.0, + "completions/max_terminated_length": 1792.0, + "completions/mean_length": 607.255859375, + "completions/mean_terminated_length": 607.255859375, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.14864562346093574, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003842422136667564, + "kl": 0.0128631591796875, + "learning_rate": 4.018903317164539e-06, + "loss": 0.0001, + "num_tokens": 97347403.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1490.0, + "completions/max_terminated_length": 1490.0, + "completions/mean_length": 578.634765625, + "completions/mean_terminated_length": 578.634765625, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.14954107902395344, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0038581929594324353, + "kl": 0.0126190185546875, + "learning_rate": 4.006841182132932e-06, + "loss": 0.0001, + "num_tokens": 97959888.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1215.0, + "completions/max_terminated_length": 1215.0, + "completions/mean_length": 584.98046875, + "completions/mean_terminated_length": 584.98046875, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.15043653458697112, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00375612876227885, + "kl": 0.0120697021484375, + "learning_rate": 3.9947263125625195e-06, + "loss": 0.0001, + "num_tokens": 98539894.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1811.0, + "completions/max_terminated_length": 1811.0, + "completions/mean_length": 611.880859375, + "completions/mean_terminated_length": 611.880859375, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.1513319901499888, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.046888330106465065, + "kl": 0.0120086669921875, + "learning_rate": 3.982559216768967e-06, + "loss": -0.0021, + "num_tokens": 99170681.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1564.0, + "completions/max_terminated_length": 1564.0, + "completions/mean_length": 559.029296875, + "completions/mean_terminated_length": 557.5929565429688, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.1522274457130065, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005528583561475633, + "kl": 0.0159912109375, + "learning_rate": 3.970340405259245e-06, + "loss": 0.0002, + "num_tokens": 99749576.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1260.0, + "completions/max_terminated_length": 1260.0, + "completions/mean_length": 546.486328125, + "completions/mean_terminated_length": 546.486328125, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.15312290127602418, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003966195156768382, + "kl": 0.0129852294921875, + "learning_rate": 3.958070390710214e-06, + "loss": 0.0001, + "num_tokens": 100314545.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1621.0, + "completions/max_terminated_length": 1621.0, + "completions/mean_length": 579.455078125, + "completions/mean_terminated_length": 579.455078125, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.15401835683904186, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0036733548406718934, + "kl": 0.0113525390625, + "learning_rate": 3.945749687947109e-06, + "loss": 0.0001, + "num_tokens": 100910106.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1348.0, + "completions/max_terminated_length": 1348.0, + "completions/mean_length": 542.794921875, + "completions/mean_terminated_length": 542.794921875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.15491381240205954, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0035849946550755496, + "kl": 0.0124359130859375, + "learning_rate": 3.933378813921942e-06, + "loss": 0.0001, + "num_tokens": 101456417.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1490.0, + "completions/max_terminated_length": 1490.0, + "completions/mean_length": 608.80078125, + "completions/mean_terminated_length": 608.80078125, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.15580926796507724, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003298089597570542, + "kl": 0.0116729736328125, + "learning_rate": 3.920958287691811e-06, + "loss": 0.0001, + "num_tokens": 102074587.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1334.0, + "completions/max_terminated_length": 1334.0, + "completions/mean_length": 568.365234375, + "completions/mean_terminated_length": 566.866943359375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.15670472352809492, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006980512317555958, + "kl": 0.01434326171875, + "learning_rate": 3.908488630397121e-06, + "loss": 0.0001, + "num_tokens": 102675462.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1912.0, + "completions/max_terminated_length": 1912.0, + "completions/mean_length": 570.677734375, + "completions/mean_terminated_length": 570.677734375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.1576001790911126, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0031480434475734697, + "kl": 0.01202392578125, + "learning_rate": 3.8959703652397175e-06, + "loss": 0.0001, + "num_tokens": 103274369.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1623.0, + "completions/max_terminated_length": 1623.0, + "completions/mean_length": 604.94140625, + "completions/mean_terminated_length": 604.94140625, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.1584956346541303, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003733587566945661, + "kl": 0.0122222900390625, + "learning_rate": 3.883404017460935e-06, + "loss": 0.0001, + "num_tokens": 103883331.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1402.0, + "completions/max_terminated_length": 1402.0, + "completions/mean_length": 562.93359375, + "completions/mean_terminated_length": 562.93359375, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.15939109021714798, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.07129197589795944, + "kl": 0.011749267578125, + "learning_rate": 3.870790114319559e-06, + "loss": 0.0028, + "num_tokens": 104461825.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1662.0, + "completions/mean_length": 552.26953125, + "completions/mean_terminated_length": 548.3549194335938, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.16028654578016566, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.05920464764711308, + "kl": 0.0140838623046875, + "learning_rate": 3.858129185069701e-06, + "loss": 0.0146, + "num_tokens": 105027531.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1596.0, + "completions/mean_length": 573.619140625, + "completions/mean_terminated_length": 570.7338256835938, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.16118200134318333, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.055155168174674904, + "kl": 0.01177978515625, + "learning_rate": 3.845421760938597e-06, + "loss": 0.0117, + "num_tokens": 105645832.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1295.0, + "completions/max_terminated_length": 1295.0, + "completions/mean_length": 535.0, + "completions/mean_terminated_length": 535.0, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.16207745690620104, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003039648303141946, + "kl": 0.011962890625, + "learning_rate": 3.832668375104312e-06, + "loss": 0.0001, + "num_tokens": 106216184.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1236.0, + "completions/max_terminated_length": 1236.0, + "completions/mean_length": 581.462890625, + "completions/mean_terminated_length": 581.462890625, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.16297291246921872, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003132059428993178, + "kl": 0.0121917724609375, + "learning_rate": 3.8198695626733725e-06, + "loss": 0.0001, + "num_tokens": 106828949.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1339.0, + "completions/mean_length": 580.763671875, + "completions/mean_terminated_length": 577.892333984375, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.1638683680322364, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0653143471467512, + "kl": 0.0109405517578125, + "learning_rate": 3.8070258606583156e-06, + "loss": 0.008, + "num_tokens": 107424508.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1318.0, + "completions/max_terminated_length": 1318.0, + "completions/mean_length": 589.708984375, + "completions/mean_terminated_length": 588.2837524414062, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.16476382359525407, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010691685971411858, + "kl": 0.0192108154296875, + "learning_rate": 3.7941378079551544e-06, + "loss": 0.0002, + "num_tokens": 108042327.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 1538.0, + "completions/max_terminated_length": 1538.0, + "completions/mean_length": 572.59375, + "completions/mean_terminated_length": 569.33203125, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.16565927915827178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015248976686925637, + "kl": 0.0164031982421875, + "learning_rate": 3.7812059453207677e-06, + "loss": 0.0002, + "num_tokens": 108650695.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1289.0, + "completions/max_terminated_length": 1289.0, + "completions/mean_length": 556.603515625, + "completions/mean_terminated_length": 554.9431762695312, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.16655473472128945, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01722797259757862, + "kl": 0.0211944580078125, + "learning_rate": 3.768230815350213e-06, + "loss": 0.0002, + "num_tokens": 109250460.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1452.0, + "completions/max_terminated_length": 1452.0, + "completions/mean_length": 539.919921875, + "completions/mean_terminated_length": 539.3933715820312, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.16745019028430713, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0034258710759817617, + "kl": 0.013336181640625, + "learning_rate": 3.7552129624539557e-06, + "loss": 0.0001, + "num_tokens": 109827459.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1466.0, + "completions/max_terminated_length": 1466.0, + "completions/mean_length": 532.275390625, + "completions/mean_terminated_length": 532.275390625, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.16834564584732484, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0031267033439639287, + "kl": 0.0120391845703125, + "learning_rate": 3.7421529328350316e-06, + "loss": 0.0001, + "num_tokens": 110402736.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1563.0, + "completions/mean_length": 591.84375, + "completions/mean_terminated_length": 588.994140625, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.16924110141034251, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0715876333086449, + "kl": 0.0121917724609375, + "learning_rate": 3.7290512744661274e-06, + "loss": 0.0127, + "num_tokens": 111027280.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1416.0, + "completions/max_terminated_length": 1416.0, + "completions/mean_length": 573.173828125, + "completions/mean_terminated_length": 571.5667114257812, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.1701365569733602, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008349329533868787, + "kl": 0.0147705078125, + "learning_rate": 3.715908537066589e-06, + "loss": 0.0001, + "num_tokens": 111619129.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1375.0, + "completions/max_terminated_length": 1375.0, + "completions/mean_length": 539.93359375, + "completions/mean_terminated_length": 539.93359375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.17103201253637787, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003295209257109422, + "kl": 0.0115509033203125, + "learning_rate": 3.7027252720793538e-06, + "loss": 0.0001, + "num_tokens": 112199271.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1103.0, + "completions/max_terminated_length": 1103.0, + "completions/mean_length": 533.30859375, + "completions/mean_terminated_length": 533.30859375, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.17192746809939558, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003764296136813722, + "kl": 0.010955810546875, + "learning_rate": 3.689502032647817e-06, + "loss": 0.0001, + "num_tokens": 112777429.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1428.0, + "completions/mean_length": 530.67578125, + "completions/mean_terminated_length": 526.7824096679688, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.17282292366241325, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.07152586735877586, + "kl": 0.01251220703125, + "learning_rate": 3.6762393735926245e-06, + "loss": 0.0006, + "num_tokens": 113329615.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1394.0, + "completions/mean_length": 530.80859375, + "completions/mean_terminated_length": 527.8395385742188, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.17371837922543093, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.04504173515888175, + "kl": 0.0119781494140625, + "learning_rate": 3.6629378513883852e-06, + "loss": 0.018, + "num_tokens": 113909309.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1531.0, + "completions/max_terminated_length": 1531.0, + "completions/mean_length": 542.884765625, + "completions/mean_terminated_length": 541.2078857421875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.17461383478844864, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010134304705067215, + "kl": 0.0190582275390625, + "learning_rate": 3.6495980241403307e-06, + "loss": 0.0002, + "num_tokens": 114465426.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1534.0, + "completions/max_terminated_length": 1534.0, + "completions/mean_length": 614.9296875, + "completions/mean_terminated_length": 613.6771240234375, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.1755092903514663, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007632044399021584, + "kl": 0.0135498046875, + "learning_rate": 3.636220451560896e-06, + "loss": 0.0001, + "num_tokens": 115087886.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1953.0, + "completions/mean_length": 569.10546875, + "completions/mean_terminated_length": 563.4527587890625, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.176404745914484, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.08856901666593459, + "kl": 0.02618408203125, + "learning_rate": 3.622805694946235e-06, + "loss": 0.0097, + "num_tokens": 115703236.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 1152.0, + "completions/max_terminated_length": 1152.0, + "completions/mean_length": 566.34375, + "completions/mean_terminated_length": 564.7583618164062, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.17730020147750167, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013044392386165083, + "kl": 0.0165252685546875, + "learning_rate": 3.609354317152667e-06, + "loss": 0.0002, + "num_tokens": 116294900.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1673.0, + "completions/max_terminated_length": 1673.0, + "completions/mean_length": 563.509765625, + "completions/mean_terminated_length": 563.509765625, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.17819565704051937, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003209897085657959, + "kl": 0.0105133056640625, + "learning_rate": 3.595866882573063e-06, + "loss": 0.0001, + "num_tokens": 116880745.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1799.0, + "completions/max_terminated_length": 1799.0, + "completions/mean_length": 531.884765625, + "completions/mean_terminated_length": 531.884765625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.17909111260353705, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004143644301975902, + "kl": 0.012176513671875, + "learning_rate": 3.5823439571131675e-06, + "loss": 0.0001, + "num_tokens": 117463918.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 1421.0, + "completions/max_terminated_length": 1421.0, + "completions/mean_length": 569.326171875, + "completions/mean_terminated_length": 566.1650390625, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.17998656816655473, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012133384997274272, + "kl": 0.0173492431640625, + "learning_rate": 3.5687861081678477e-06, + "loss": 0.0002, + "num_tokens": 118052981.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1425.0, + "completions/max_terminated_length": 1425.0, + "completions/mean_length": 561.556640625, + "completions/mean_terminated_length": 560.632080078125, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.18088202372957243, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003834334830708197, + "kl": 0.011871337890625, + "learning_rate": 3.555193904597291e-06, + "loss": 0.0001, + "num_tokens": 118648194.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1372.0, + "completions/max_terminated_length": 1372.0, + "completions/mean_length": 572.849609375, + "completions/mean_terminated_length": 571.554931640625, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.1817774792925901, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00980280583780432, + "kl": 0.0140380859375, + "learning_rate": 3.541567916703138e-06, + "loss": 0.0001, + "num_tokens": 119223221.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1629.0, + "completions/mean_length": 623.349609375, + "completions/mean_terminated_length": 619.7882690429688, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.1826729348556078, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.10399564452044252, + "kl": 0.0112762451171875, + "learning_rate": 3.5279087162045517e-06, + "loss": 0.0159, + "num_tokens": 119855848.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1155.0, + "completions/max_terminated_length": 1155.0, + "completions/mean_length": 598.236328125, + "completions/mean_terminated_length": 598.236328125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.18356839041862547, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0035681655006297246, + "kl": 0.011077880859375, + "learning_rate": 3.5142168762142265e-06, + "loss": 0.0001, + "num_tokens": 120499377.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1511.0, + "completions/max_terminated_length": 1511.0, + "completions/mean_length": 636.830078125, + "completions/mean_terminated_length": 635.0784912109375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.18446384598164317, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.6828457034908066, + "kl": 0.0124969482421875, + "learning_rate": 3.500492971214347e-06, + "loss": 0.0033, + "num_tokens": 121137194.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1620.0, + "completions/max_terminated_length": 1620.0, + "completions/mean_length": 558.048828125, + "completions/mean_terminated_length": 558.048828125, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.18535930154466085, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0032938623101590554, + "kl": 0.0101318359375, + "learning_rate": 3.48673757703248e-06, + "loss": 0.0001, + "num_tokens": 121723699.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 1521.0, + "completions/max_terminated_length": 1521.0, + "completions/mean_length": 573.240234375, + "completions/mean_terminated_length": 571.7741088867188, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.18625475710767853, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21324968563726487, + "kl": 0.0446624755859375, + "learning_rate": 3.472951270817418e-06, + "loss": 0.0004, + "num_tokens": 122331454.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1333.0, + "completions/max_terminated_length": 1333.0, + "completions/mean_length": 563.048828125, + "completions/mean_terminated_length": 563.048828125, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.1871502126706962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003241167135534498, + "kl": 0.0105133056640625, + "learning_rate": 3.4591346310149578e-06, + "loss": 0.0001, + "num_tokens": 122922103.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1402.0, + "completions/max_terminated_length": 1402.0, + "completions/mean_length": 584.478515625, + "completions/mean_terminated_length": 582.925537109375, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.1880456682337139, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09870587380257911, + "kl": 0.0258941650390625, + "learning_rate": 3.445288237343632e-06, + "loss": 0.0003, + "num_tokens": 123554636.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1589.0, + "completions/max_terminated_length": 1589.0, + "completions/mean_length": 631.953125, + "completions/mean_terminated_length": 630.5029296875, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.18894112379673159, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.2456670428591226, + "kl": 1.09356689453125, + "learning_rate": 3.4314126707703895e-06, + "loss": 0.0109, + "num_tokens": 124211300.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1262.0, + "completions/max_terminated_length": 1262.0, + "completions/mean_length": 549.259765625, + "completions/mean_terminated_length": 547.682373046875, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.18983657935974926, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.133757572612665, + "kl": 0.0540771484375, + "learning_rate": 3.4175085134862128e-06, + "loss": -0.0032, + "num_tokens": 124809529.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 1313.0, + "completions/max_terminated_length": 1313.0, + "completions/mean_length": 582.845703125, + "completions/mean_terminated_length": 580.4695434570312, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.19073203492276697, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026589642468070985, + "kl": 0.0259552001953125, + "learning_rate": 3.4035763488816953e-06, + "loss": 0.0003, + "num_tokens": 125429338.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1780.0, + "completions/max_terminated_length": 1780.0, + "completions/mean_length": 563.369140625, + "completions/mean_terminated_length": 562.3972778320312, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.19162749048578465, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006533161291379581, + "kl": 0.0159912109375, + "learning_rate": 3.3896167615225594e-06, + "loss": 0.0002, + "num_tokens": 126038519.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1084.0, + "completions/max_terminated_length": 1084.0, + "completions/mean_length": 520.470703125, + "completions/mean_terminated_length": 518.7960815429688, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.19252294604880232, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0627678652899011, + "kl": 0.025634765625, + "learning_rate": 3.375630337125133e-06, + "loss": -0.003, + "num_tokens": 126581912.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1381.0, + "completions/max_terminated_length": 1381.0, + "completions/mean_length": 573.263671875, + "completions/mean_terminated_length": 571.9099731445312, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.19341840161182, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006775091474281142, + "kl": 0.0161285400390625, + "learning_rate": 3.361617662531772e-06, + "loss": 0.0002, + "num_tokens": 127155263.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1498.0, + "completions/max_terminated_length": 1498.0, + "completions/mean_length": 575.451171875, + "completions/mean_terminated_length": 575.0, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.1943138571748377, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005273121720543155, + "kl": 0.012603759765625, + "learning_rate": 3.347579325686237e-06, + "loss": 0.0001, + "num_tokens": 127753062.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1395.0, + "completions/max_terminated_length": 1395.0, + "completions/mean_length": 550.294921875, + "completions/mean_terminated_length": 550.294921875, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.19520931273785538, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0035793800730764247, + "kl": 0.009857177734375, + "learning_rate": 3.333515915609027e-06, + "loss": 0.0001, + "num_tokens": 128324701.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1949.0, + "completions/max_terminated_length": 1949.0, + "completions/mean_length": 565.615234375, + "completions/mean_terminated_length": 564.639892578125, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.19610476830087306, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007094075305124351, + "kl": 0.016448974609375, + "learning_rate": 3.3194280223726616e-06, + "loss": 0.0002, + "num_tokens": 128957480.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1772.0, + "completions/max_terminated_length": 1772.0, + "completions/mean_length": 572.2734375, + "completions/mean_terminated_length": 572.2734375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.19700022386389077, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0038221420485777383, + "kl": 0.0097198486328125, + "learning_rate": 3.305316237076927e-06, + "loss": 0.0001, + "num_tokens": 129585172.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1259.0, + "completions/max_terminated_length": 1259.0, + "completions/mean_length": 532.0625, + "completions/mean_terminated_length": 530.8823852539062, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.19789567942690844, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008623426068599522, + "kl": 0.0181884765625, + "learning_rate": 3.291181151824071e-06, + "loss": 0.0002, + "num_tokens": 130132276.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1149.0, + "completions/max_terminated_length": 1149.0, + "completions/mean_length": 533.65625, + "completions/mean_terminated_length": 532.3901977539062, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.19879113498992612, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0099887725406859, + "kl": 0.0251617431640625, + "learning_rate": 3.27702335969396e-06, + "loss": 0.0003, + "num_tokens": 130689172.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 1245.0, + "completions/max_terminated_length": 1245.0, + "completions/mean_length": 560.853515625, + "completions/mean_terminated_length": 558.616943359375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.1996865905529438, + "frac_reward_zero_std": 0.96875, + "grad_norm": 2.441640256939799, + "kl": 0.0290985107421875, + "learning_rate": 3.2628434547191985e-06, + "loss": 0.0064, + "num_tokens": 131272473.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1462.0, + "completions/max_terminated_length": 1462.0, + "completions/mean_length": 559.45703125, + "completions/mean_terminated_length": 558.0880737304688, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.2005820461159615, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006144070704434375, + "kl": 0.018341064453125, + "learning_rate": 3.2486420318601973e-06, + "loss": 0.0002, + "num_tokens": 131859091.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1201.0, + "completions/max_terminated_length": 1201.0, + "completions/mean_length": 569.279296875, + "completions/mean_terminated_length": 567.7745361328125, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.20147750167897918, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008590736065801137, + "kl": 0.015380859375, + "learning_rate": 3.2344196869802187e-06, + "loss": 0.0002, + "num_tokens": 132477154.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1571.0, + "completions/max_terminated_length": 1571.0, + "completions/mean_length": 562.7421875, + "completions/mean_terminated_length": 561.2000122070312, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.20237295724199686, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13583527088514458, + "kl": 0.0154571533203125, + "learning_rate": 3.2201770168203694e-06, + "loss": 0.0002, + "num_tokens": 133030798.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1399.0, + "completions/max_terminated_length": 1399.0, + "completions/mean_length": 607.732421875, + "completions/mean_terminated_length": 605.4078979492188, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.20326841280501456, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.2425459218180856, + "kl": 0.0305938720703125, + "learning_rate": 3.205914618974563e-06, + "loss": -0.0023, + "num_tokens": 133661173.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 1169.0, + "completions/max_terminated_length": 1169.0, + "completions/mean_length": 573.162109375, + "completions/mean_terminated_length": 571.7249755859375, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.20416386836803224, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20645976549639367, + "kl": 0.043853759765625, + "learning_rate": 3.1916330918644496e-06, + "loss": 0.0004, + "num_tokens": 134255000.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1607.0, + "completions/max_terminated_length": 1607.0, + "completions/mean_length": 559.875, + "completions/mean_terminated_length": 558.2764892578125, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.20505932393104992, + "frac_reward_zero_std": 0.96875, + "grad_norm": 4.117987738900867, + "kl": 0.018524169921875, + "learning_rate": 3.177333034714303e-06, + "loss": 0.0078, + "num_tokens": 134835192.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1874.0, + "completions/mean_length": 589.134765625, + "completions/mean_terminated_length": 583.9607543945312, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.2059547794940676, + "frac_reward_zero_std": 0.96875, + "grad_norm": 3.824308935913339, + "kl": 0.377777099609375, + "learning_rate": 3.1630150475258813e-06, + "loss": 0.0145, + "num_tokens": 135457933.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1135.0, + "completions/max_terminated_length": 1135.0, + "completions/mean_length": 558.9375, + "completions/mean_terminated_length": 557.1549072265625, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.2068502350570853, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.1961945020346105, + "kl": 0.935028076171875, + "learning_rate": 3.148679731053252e-06, + "loss": 0.0093, + "num_tokens": 136009357.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1904.0, + "completions/max_terminated_length": 1904.0, + "completions/mean_length": 548.19921875, + "completions/mean_terminated_length": 544.4666748046875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.20774569062010298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08870372403833915, + "kl": 0.0277252197265625, + "learning_rate": 3.1343276867775805e-06, + "loss": 0.0003, + "num_tokens": 136578547.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1743.0, + "completions/max_terminated_length": 1743.0, + "completions/mean_length": 549.17578125, + "completions/mean_terminated_length": 547.7156982421875, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.20864114618312066, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.07043683406575273, + "kl": 0.0178375244140625, + "learning_rate": 3.1199595168819043e-06, + "loss": -0.0027, + "num_tokens": 137174157.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1349.0, + "completions/max_terminated_length": 1349.0, + "completions/mean_length": 571.9453125, + "completions/mean_terminated_length": 570.641845703125, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.20953660174613833, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006257229542658951, + "kl": 0.0132293701171875, + "learning_rate": 3.105575824225852e-06, + "loss": 0.0001, + "num_tokens": 137780577.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1432.0, + "completions/max_terminated_length": 1432.0, + "completions/mean_length": 598.96875, + "completions/mean_terminated_length": 598.1624145507812, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.21043205730915604, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011157997315646496, + "kl": 0.012359619140625, + "learning_rate": 3.091177212320363e-06, + "loss": 0.0001, + "num_tokens": 138408513.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1565.0, + "completions/max_terminated_length": 1565.0, + "completions/mean_length": 549.392578125, + "completions/mean_terminated_length": 548.8963012695312, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.21132751287217372, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.6590655154847698, + "kl": 0.011138916015625, + "learning_rate": 3.0767642853023538e-06, + "loss": 0.0025, + "num_tokens": 138965722.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1930.0, + "completions/mean_length": 593.787109375, + "completions/mean_terminated_length": 579.1386108398438, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.2122229684351914, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15619148204537786, + "kl": 0.075897216796875, + "learning_rate": 3.062337647909376e-06, + "loss": 0.0008, + "num_tokens": 139558861.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1522.0, + "completions/max_terminated_length": 1522.0, + "completions/mean_length": 565.580078125, + "completions/mean_terminated_length": 565.580078125, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.2131184239982091, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0038175994456020293, + "kl": 0.0096282958984375, + "learning_rate": 3.04789790545424e-06, + "loss": 0.0001, + "num_tokens": 140136790.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1869.0, + "completions/max_terminated_length": 1869.0, + "completions/mean_length": 584.841796875, + "completions/mean_terminated_length": 584.135009765625, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.21401387956122678, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12342934098552011, + "kl": 0.040069580078125, + "learning_rate": 3.033445663799621e-06, + "loss": 0.0004, + "num_tokens": 140736069.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1629.0, + "completions/max_terminated_length": 1629.0, + "completions/mean_length": 599.724609375, + "completions/mean_terminated_length": 598.7847290039062, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.21490933512424445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.32029486598751655, + "kl": 0.10284423828125, + "learning_rate": 3.018981529332633e-06, + "loss": 0.001, + "num_tokens": 141330328.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1605.0, + "completions/max_terminated_length": 1605.0, + "completions/mean_length": 597.66015625, + "completions/mean_terminated_length": 596.3972778320312, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.21580479068726213, + "frac_reward_zero_std": 0.96875, + "grad_norm": 2.3221797862117994, + "kl": 0.07763671875, + "learning_rate": 3.00450610893939e-06, + "loss": 0.0065, + "num_tokens": 141958138.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1319.0, + "completions/max_terminated_length": 1319.0, + "completions/mean_length": 564.0703125, + "completions/mean_terminated_length": 563.2250366210938, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.21670024625027984, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003764847986352615, + "kl": 0.0100250244140625, + "learning_rate": 2.9900200099795396e-06, + "loss": 0.0001, + "num_tokens": 142555854.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1203.0, + "completions/max_terminated_length": 1203.0, + "completions/mean_length": 541.349609375, + "completions/mean_terminated_length": 539.8588256835938, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.21759570181329752, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07286080033895995, + "kl": 0.0408782958984375, + "learning_rate": 2.9755238402607826e-06, + "loss": 0.0004, + "num_tokens": 143140897.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1174.0, + "completions/max_terminated_length": 1174.0, + "completions/mean_length": 596.396484375, + "completions/mean_terminated_length": 595.5283813476562, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.2184911573763152, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09176379905515818, + "kl": 0.038543701171875, + "learning_rate": 2.961018208013367e-06, + "loss": 0.0004, + "num_tokens": 143784588.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1427.0, + "completions/max_terminated_length": 1427.0, + "completions/mean_length": 613.876953125, + "completions/mean_terminated_length": 613.876953125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.2193866129393329, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0031006039422018894, + "kl": 0.008758544921875, + "learning_rate": 2.9465037218645694e-06, + "loss": 0.0001, + "num_tokens": 144437757.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1250.0, + "completions/max_terminated_length": 1250.0, + "completions/mean_length": 577.76171875, + "completions/mean_terminated_length": 577.76171875, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.22028206850235058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002758364992869889, + "kl": 0.0091705322265625, + "learning_rate": 2.9319809908131604e-06, + "loss": 0.0001, + "num_tokens": 145024083.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1413.0, + "completions/mean_length": 592.3359375, + "completions/mean_terminated_length": 589.4873046875, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.22117752406536825, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0861900059464085, + "kl": 0.0089569091796875, + "learning_rate": 2.917450624203847e-06, + "loss": 0.0127, + "num_tokens": 145636767.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1265.0, + "completions/max_terminated_length": 1265.0, + "completions/mean_length": 571.796875, + "completions/mean_terminated_length": 571.796875, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.22207297962838593, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0030091220440551336, + "kl": 0.0094146728515625, + "learning_rate": 2.9029132317017118e-06, + "loss": 0.0001, + "num_tokens": 146239815.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1273.0, + "completions/max_terminated_length": 1273.0, + "completions/mean_length": 544.17578125, + "completions/mean_terminated_length": 543.2152709960938, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.22296843519140364, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01973309618938481, + "kl": 0.0178375244140625, + "learning_rate": 2.888369423266629e-06, + "loss": 0.0002, + "num_tokens": 146801057.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1987.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 568.916015625, + "completions/mean_terminated_length": 567.5333862304688, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.2238638907544213, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.06952648422286142, + "kl": 0.024871826171875, + "learning_rate": 2.8738198091276712e-06, + "loss": -0.0006, + "num_tokens": 147403766.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 1170.0, + "completions/max_terminated_length": 1170.0, + "completions/mean_length": 538.427734375, + "completions/mean_terminated_length": 536.07666015625, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.224759346317439, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016991407978815883, + "kl": 0.018768310546875, + "learning_rate": 2.859264999757509e-06, + "loss": 0.0002, + "num_tokens": 147944977.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1444.0, + "completions/max_terminated_length": 1444.0, + "completions/mean_length": 580.37109375, + "completions/mean_terminated_length": 580.37109375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.2256548018804567, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006941913355644624, + "kl": 0.01361083984375, + "learning_rate": 2.8447056058467928e-06, + "loss": 0.0001, + "num_tokens": 148568847.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1309.0, + "completions/max_terminated_length": 1309.0, + "completions/mean_length": 592.466796875, + "completions/mean_terminated_length": 592.466796875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.22655025744347437, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0566631220034882, + "kl": 0.0107574462890625, + "learning_rate": 2.830142238278531e-06, + "loss": 0.0014, + "num_tokens": 149210606.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1268.0, + "completions/max_terminated_length": 1268.0, + "completions/mean_length": 607.787109375, + "completions/mean_terminated_length": 605.1980590820312, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.22744571300649205, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05797446748776696, + "kl": 0.02349853515625, + "learning_rate": 2.81557550810246e-06, + "loss": 0.0002, + "num_tokens": 149810001.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1490.0, + "completions/max_terminated_length": 1490.0, + "completions/mean_length": 589.015625, + "completions/mean_terminated_length": 586.2667236328125, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.22834116856950973, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018451201824408743, + "kl": 0.0198516845703125, + "learning_rate": 2.8010060265094026e-06, + "loss": 0.0002, + "num_tokens": 150404329.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1295.0, + "completions/mean_length": 578.8125, + "completions/mean_terminated_length": 570.1532592773438, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.22923662413252743, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.10756325823969777, + "kl": 0.011627197265625, + "learning_rate": 2.786434404805629e-06, + "loss": 0.0255, + "num_tokens": 151005641.0, + "reward": 0.099609375, + "reward_std": 0.0015625000232830644, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1245.0, + "completions/max_terminated_length": 1245.0, + "completions/mean_length": 610.44140625, + "completions/mean_terminated_length": 610.1741333007812, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.2301320796955451, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0655357885338102, + "kl": 0.0149688720703125, + "learning_rate": 2.771861254387199e-06, + "loss": 0.0002, + "num_tokens": 151662571.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1345.0, + "completions/max_terminated_length": 1345.0, + "completions/mean_length": 612.0078125, + "completions/mean_terminated_length": 612.0078125, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.2310275352585628, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0029556844817003707, + "kl": 0.0084381103515625, + "learning_rate": 2.7572871867143204e-06, + "loss": 0.0001, + "num_tokens": 152275167.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1673.0, + "completions/max_terminated_length": 1673.0, + "completions/mean_length": 573.62890625, + "completions/mean_terminated_length": 572.8493041992188, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.23192299082158047, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007288985341644463, + "kl": 0.0105438232421875, + "learning_rate": 2.742712813285681e-06, + "loss": 0.0001, + "num_tokens": 152895521.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1782.0, + "completions/max_terminated_length": 1782.0, + "completions/mean_length": 614.169921875, + "completions/mean_terminated_length": 614.169921875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.23281844638459817, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004161180805146349, + "kl": 0.009002685546875, + "learning_rate": 2.7281387456128017e-06, + "loss": 0.0001, + "num_tokens": 153559304.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1427.0, + "completions/max_terminated_length": 1427.0, + "completions/mean_length": 611.6015625, + "completions/mean_terminated_length": 611.6015625, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.23371390194761585, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006474119488780102, + "kl": 0.011688232421875, + "learning_rate": 2.7135655951943716e-06, + "loss": 0.0001, + "num_tokens": 154212716.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1293.0, + "completions/max_terminated_length": 1293.0, + "completions/mean_length": 571.82421875, + "completions/mean_terminated_length": 571.82421875, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.23460935751063353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005382814191726183, + "kl": 0.0092315673828125, + "learning_rate": 2.698993973490598e-06, + "loss": 0.0001, + "num_tokens": 154846866.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1159.0, + "completions/max_terminated_length": 1159.0, + "completions/mean_length": 547.623046875, + "completions/mean_terminated_length": 546.890380859375, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.23550481307365123, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005042127493023264, + "kl": 0.0117950439453125, + "learning_rate": 2.6844244918975416e-06, + "loss": 0.0001, + "num_tokens": 155407633.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1403.0, + "completions/max_terminated_length": 1403.0, + "completions/mean_length": 570.494140625, + "completions/mean_terminated_length": 568.864990234375, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.2364002686366689, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007056110275466197, + "kl": 0.0125274658203125, + "learning_rate": 2.66985776172147e-06, + "loss": 0.0001, + "num_tokens": 155986862.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1280.0, + "completions/max_terminated_length": 1280.0, + "completions/mean_length": 587.935546875, + "completions/mean_terminated_length": 587.1487426757812, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.2372957241996866, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.06552415384762432, + "kl": 0.0103302001953125, + "learning_rate": 2.6552943941532088e-06, + "loss": -0.0, + "num_tokens": 156603789.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1545.0, + "completions/max_terminated_length": 1545.0, + "completions/mean_length": 582.896484375, + "completions/mean_terminated_length": 582.896484375, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.23819117976270426, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004462483031820937, + "kl": 0.0096893310546875, + "learning_rate": 2.6407350002424927e-06, + "loss": 0.0001, + "num_tokens": 157225640.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1307.0, + "completions/max_terminated_length": 1307.0, + "completions/mean_length": 576.552734375, + "completions/mean_terminated_length": 575.3463745117188, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.23908663532572197, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012787334540345264, + "kl": 0.0113067626953125, + "learning_rate": 2.626180190872329e-06, + "loss": 0.0001, + "num_tokens": 157805571.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1102.0, + "completions/max_terminated_length": 1102.0, + "completions/mean_length": 564.31640625, + "completions/mean_terminated_length": 563.1961059570312, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.23998209088873965, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007419547207026392, + "kl": 0.0146636962890625, + "learning_rate": 2.611630576733372e-06, + "loss": 0.0001, + "num_tokens": 158406629.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1268.0, + "completions/max_terminated_length": 1268.0, + "completions/mean_length": 566.529296875, + "completions/mean_terminated_length": 566.529296875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.24087754645175732, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0031746798716117243, + "kl": 0.0085906982421875, + "learning_rate": 2.5970867682982885e-06, + "loss": 0.0001, + "num_tokens": 158995780.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1686.0, + "completions/max_terminated_length": 1686.0, + "completions/mean_length": 569.357421875, + "completions/mean_terminated_length": 569.0841674804688, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.24177300201477503, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00388552281980107, + "kl": 0.010498046875, + "learning_rate": 2.582549375796154e-06, + "loss": 0.0001, + "num_tokens": 159625131.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1386.0, + "completions/max_terminated_length": 1386.0, + "completions/mean_length": 576.232421875, + "completions/mean_terminated_length": 573.8862915039062, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.2426684575777927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004227836528309876, + "kl": 0.0115814208984375, + "learning_rate": 2.568019009186841e-06, + "loss": 0.0001, + "num_tokens": 160182066.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1246.0, + "completions/max_terminated_length": 1246.0, + "completions/mean_length": 516.759765625, + "completions/mean_terminated_length": 516.759765625, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.24356391314081038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0027793874560609663, + "kl": 0.0087738037109375, + "learning_rate": 2.5534962781354317e-06, + "loss": 0.0001, + "num_tokens": 160749847.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1465.0, + "completions/max_terminated_length": 1465.0, + "completions/mean_length": 564.66796875, + "completions/mean_terminated_length": 564.66796875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.24445936870382806, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0028601609302890164, + "kl": 0.0087890625, + "learning_rate": 2.538981791986634e-06, + "loss": 0.0001, + "num_tokens": 161335181.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1863.0, + "completions/max_terminated_length": 1863.0, + "completions/mean_length": 569.353515625, + "completions/mean_terminated_length": 569.353515625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.24535482426684577, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.07083375076553232, + "kl": 0.0089111328125, + "learning_rate": 2.524476159739218e-06, + "loss": 0.0016, + "num_tokens": 161951522.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1158.0, + "completions/max_terminated_length": 1158.0, + "completions/mean_length": 558.08984375, + "completions/mean_terminated_length": 558.08984375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.24625027982986344, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003647237114248715, + "kl": 0.0092620849609375, + "learning_rate": 2.5099799900204607e-06, + "loss": 0.0001, + "num_tokens": 162541488.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1869.0, + "completions/max_terminated_length": 1869.0, + "completions/mean_length": 588.4140625, + "completions/mean_terminated_length": 588.4140625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.24714573539288112, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0035135948008930802, + "kl": 0.00830078125, + "learning_rate": 2.4954938910606108e-06, + "loss": 0.0001, + "num_tokens": 163117476.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1634.0, + "completions/max_terminated_length": 1634.0, + "completions/mean_length": 532.267578125, + "completions/mean_terminated_length": 531.6927490234375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.24804119095589883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005236208278342104, + "kl": 0.0112762451171875, + "learning_rate": 2.481018470667368e-06, + "loss": 0.0001, + "num_tokens": 163672957.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1685.0, + "completions/max_terminated_length": 1685.0, + "completions/mean_length": 560.439453125, + "completions/mean_terminated_length": 558.9628295898438, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.2489366465189165, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009090939542478412, + "kl": 0.014678955078125, + "learning_rate": 2.4665543362003802e-06, + "loss": 0.0001, + "num_tokens": 164243550.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1400.0, + "completions/max_terminated_length": 1400.0, + "completions/mean_length": 560.4140625, + "completions/mean_terminated_length": 560.4140625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.24983210208193418, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0034273376840632665, + "kl": 0.00921630859375, + "learning_rate": 2.4521020945457615e-06, + "loss": 0.0001, + "num_tokens": 164843138.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1345.0, + "completions/max_terminated_length": 1345.0, + "completions/mean_length": 555.576171875, + "completions/mean_terminated_length": 554.9921875, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.25072755764495186, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005580408966216022, + "kl": 0.010986328125, + "learning_rate": 2.4376623520906255e-06, + "loss": 0.0001, + "num_tokens": 165444489.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1674.0, + "completions/mean_length": 583.470703125, + "completions/mean_terminated_length": 580.0020141601562, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.25162301320796954, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0055618289088762655, + "kl": 0.013427734375, + "learning_rate": 2.4232357146976478e-06, + "loss": 0.0001, + "num_tokens": 166058394.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1390.0, + "completions/max_terminated_length": 1390.0, + "completions/mean_length": 585.46484375, + "completions/mean_terminated_length": 585.46484375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.2525184687709872, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003961106040419591, + "kl": 0.00867462158203125, + "learning_rate": 2.408822787679637e-06, + "loss": 0.0001, + "num_tokens": 166646264.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1254.0, + "completions/mean_length": 549.4453125, + "completions/mean_terminated_length": 546.5126953125, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.25341392433400495, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.06666940524903293, + "kl": 0.0089263916015625, + "learning_rate": 2.3944241757741475e-06, + "loss": 0.0215, + "num_tokens": 167224716.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1587.0, + "completions/max_terminated_length": 1587.0, + "completions/mean_length": 582.55859375, + "completions/mean_terminated_length": 582.55859375, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.2543093798970226, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003758111123823731, + "kl": 0.0085906982421875, + "learning_rate": 2.380040483118097e-06, + "loss": 0.0001, + "num_tokens": 167802602.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1457.0, + "completions/max_terminated_length": 1457.0, + "completions/mean_length": 566.951171875, + "completions/mean_terminated_length": 565.4912109375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.2552048354600403, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007744807329269595, + "kl": 0.01422119140625, + "learning_rate": 2.365672313222419e-06, + "loss": 0.0001, + "num_tokens": 168397089.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1523.0, + "completions/max_terminated_length": 1523.0, + "completions/mean_length": 604.19140625, + "completions/mean_terminated_length": 604.19140625, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.256100291023058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0036783542825833512, + "kl": 0.009063720703125, + "learning_rate": 2.351320268946749e-06, + "loss": 0.0001, + "num_tokens": 169041395.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1111.0, + "completions/max_terminated_length": 1111.0, + "completions/mean_length": 515.228515625, + "completions/mean_terminated_length": 514.5733642578125, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.25699574658607566, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006435814982045985, + "kl": 0.015289306640625, + "learning_rate": 2.336984952474119e-06, + "loss": 0.0002, + "num_tokens": 169572568.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1268.0, + "completions/max_terminated_length": 1268.0, + "completions/mean_length": 539.8515625, + "completions/mean_terminated_length": 539.8515625, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.25789120214909333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003547633403718012, + "kl": 0.0092926025390625, + "learning_rate": 2.322666965285697e-06, + "loss": 0.0001, + "num_tokens": 170114844.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1611.0, + "completions/max_terminated_length": 1611.0, + "completions/mean_length": 565.12109375, + "completions/mean_terminated_length": 565.12109375, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.258786657712111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00270849039290505, + "kl": 0.008544921875, + "learning_rate": 2.3083669081355507e-06, + "loss": 0.0001, + "num_tokens": 170712010.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1170.0, + "completions/max_terminated_length": 1170.0, + "completions/mean_length": 546.095703125, + "completions/mean_terminated_length": 546.095703125, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.25968211327512875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003303479243740684, + "kl": 0.0097198486328125, + "learning_rate": 2.2940853810254377e-06, + "loss": 0.0001, + "num_tokens": 171289019.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1353.0, + "completions/max_terminated_length": 1353.0, + "completions/mean_length": 555.048828125, + "completions/mean_terminated_length": 555.048828125, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.2605775688381464, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0024881563588516143, + "kl": 0.0089111328125, + "learning_rate": 2.2798229831796313e-06, + "loss": 0.0001, + "num_tokens": 171833188.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1684.0, + "completions/max_terminated_length": 1684.0, + "completions/mean_length": 611.33203125, + "completions/mean_terminated_length": 611.33203125, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.2614730244011641, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002174622474287534, + "kl": 0.00872802734375, + "learning_rate": 2.2655803130197816e-06, + "loss": 0.0001, + "num_tokens": 172440414.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1332.0, + "completions/max_terminated_length": 1332.0, + "completions/mean_length": 547.375, + "completions/mean_terminated_length": 547.375, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.2623684799641818, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002280527991701211, + "kl": 0.0083465576171875, + "learning_rate": 2.2513579681398034e-06, + "loss": 0.0001, + "num_tokens": 173010702.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1255.0, + "completions/max_terminated_length": 1255.0, + "completions/mean_length": 524.984375, + "completions/mean_terminated_length": 524.984375, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.26326393552719946, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0023409260698287688, + "kl": 0.0089874267578125, + "learning_rate": 2.237156545280803e-06, + "loss": 0.0001, + "num_tokens": 173542710.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1383.0, + "completions/max_terminated_length": 1383.0, + "completions/mean_length": 564.89453125, + "completions/mean_terminated_length": 564.89453125, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.26415939109021713, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002285607375968095, + "kl": 0.00818634033203125, + "learning_rate": 2.2229766403060403e-06, + "loss": 0.0001, + "num_tokens": 174119952.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1364.0, + "completions/max_terminated_length": 1364.0, + "completions/mean_length": 575.5234375, + "completions/mean_terminated_length": 574.9177856445312, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.2650548466532348, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003670069805512377, + "kl": 0.009918212890625, + "learning_rate": 2.2088188481759305e-06, + "loss": 0.0001, + "num_tokens": 174704316.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1883.0, + "completions/max_terminated_length": 1883.0, + "completions/mean_length": 594.546875, + "completions/mean_terminated_length": 592.0254516601562, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.26595030221625254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0036341999433888387, + "kl": 0.00913238525390625, + "learning_rate": 2.194683762923073e-06, + "loss": 0.0001, + "num_tokens": 175324692.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1260.0, + "completions/max_terminated_length": 1260.0, + "completions/mean_length": 544.1875, + "completions/mean_terminated_length": 544.1875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.2668457577792702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002395755802286087, + "kl": 0.0089874267578125, + "learning_rate": 2.1805719776273387e-06, + "loss": 0.0001, + "num_tokens": 175885508.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1519.0, + "completions/max_terminated_length": 1519.0, + "completions/mean_length": 584.34765625, + "completions/mean_terminated_length": 584.34765625, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.2677412133422879, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0025381966746362396, + "kl": 0.00786590576171875, + "learning_rate": 2.166484084390974e-06, + "loss": 0.0001, + "num_tokens": 176479942.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1815.0, + "completions/max_terminated_length": 1815.0, + "completions/mean_length": 565.154296875, + "completions/mean_terminated_length": 565.154296875, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.2686366689053056, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002541608780619109, + "kl": 0.0078125, + "learning_rate": 2.1524206743137636e-06, + "loss": 0.0001, + "num_tokens": 177059461.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1600.0, + "completions/max_terminated_length": 1600.0, + "completions/mean_length": 589.416015625, + "completions/mean_terminated_length": 589.416015625, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.26953212446832325, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0022764176721727304, + "kl": 0.00820159912109375, + "learning_rate": 2.1383823374682287e-06, + "loss": 0.0001, + "num_tokens": 177675898.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1248.0, + "completions/max_terminated_length": 1248.0, + "completions/mean_length": 560.4921875, + "completions/mean_terminated_length": 559.6829833984375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.27042758003134093, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004655460261495098, + "kl": 0.010650634765625, + "learning_rate": 2.124369662874868e-06, + "loss": 0.0001, + "num_tokens": 178267958.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1839.0, + "completions/max_terminated_length": 1839.0, + "completions/mean_length": 543.73046875, + "completions/mean_terminated_length": 543.73046875, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.2713230355943586, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0046579640130100415, + "kl": 0.0096588134765625, + "learning_rate": 2.110383238477441e-06, + "loss": 0.0001, + "num_tokens": 178811980.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1387.0, + "completions/max_terminated_length": 1387.0, + "completions/mean_length": 622.8203125, + "completions/mean_terminated_length": 621.3248291015625, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.27221849115737634, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008564486512419778, + "kl": 0.010589599609375, + "learning_rate": 2.096423651118305e-06, + "loss": 0.0001, + "num_tokens": 179447200.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1633.0, + "completions/max_terminated_length": 1633.0, + "completions/mean_length": 568.5234375, + "completions/mean_terminated_length": 568.5234375, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.273113946720394, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0019657571004460936, + "kl": 0.008331298828125, + "learning_rate": 2.082491486513788e-06, + "loss": 0.0001, + "num_tokens": 180021660.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1380.0, + "completions/mean_length": 563.29296875, + "completions/mean_terminated_length": 560.387451171875, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.2740094022834117, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.07915972206940354, + "kl": 0.00823211669921875, + "learning_rate": 2.0685873292296116e-06, + "loss": 0.0116, + "num_tokens": 180586674.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1513.0, + "completions/max_terminated_length": 1513.0, + "completions/mean_length": 562.177734375, + "completions/mean_terminated_length": 562.177734375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.2749048578464294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0022284860749790833, + "kl": 0.0083160400390625, + "learning_rate": 2.054711762656369e-06, + "loss": 0.0001, + "num_tokens": 181184941.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1528.0, + "completions/max_terminated_length": 1528.0, + "completions/mean_length": 596.412109375, + "completions/mean_terminated_length": 596.412109375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.27580031340944705, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002106883421172668, + "kl": 0.0083770751953125, + "learning_rate": 2.040865368985044e-06, + "loss": 0.0001, + "num_tokens": 181774192.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1301.0, + "completions/max_terminated_length": 1301.0, + "completions/mean_length": 627.328125, + "completions/mean_terminated_length": 624.9509887695312, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.27669576897246473, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003819372938022129, + "kl": 0.010894775390625, + "learning_rate": 2.027048729182583e-06, + "loss": 0.0001, + "num_tokens": 182406376.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1993.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 608.412109375, + "completions/mean_terminated_length": 608.412109375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.2775912245354824, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002263166404069423, + "kl": 0.00876617431640625, + "learning_rate": 2.0132624229675205e-06, + "loss": 0.0001, + "num_tokens": 183054571.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1391.0, + "completions/max_terminated_length": 1391.0, + "completions/mean_length": 603.818359375, + "completions/mean_terminated_length": 603.818359375, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.27848668009850014, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002363976493271142, + "kl": 0.008575439453125, + "learning_rate": 1.9995070287856546e-06, + "loss": 0.0001, + "num_tokens": 183651086.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1508.0, + "completions/max_terminated_length": 1508.0, + "completions/mean_length": 567.517578125, + "completions/mean_terminated_length": 567.517578125, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.2793821356615178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002390736098018006, + "kl": 0.0080413818359375, + "learning_rate": 1.985783123785774e-06, + "loss": 0.0001, + "num_tokens": 184225447.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1203.0, + "completions/max_terminated_length": 1203.0, + "completions/mean_length": 571.00390625, + "completions/mean_terminated_length": 571.00390625, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.2802775912245355, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002798288761742838, + "kl": 0.008148193359375, + "learning_rate": 1.9720912837954486e-06, + "loss": 0.0001, + "num_tokens": 184818617.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1704.0, + "completions/max_terminated_length": 1704.0, + "completions/mean_length": 581.486328125, + "completions/mean_terminated_length": 581.486328125, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.28117304678755317, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0029589375969960724, + "kl": 0.00795745849609375, + "learning_rate": 1.958432083296862e-06, + "loss": 0.0001, + "num_tokens": 185410322.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1300.0, + "completions/max_terminated_length": 1300.0, + "completions/mean_length": 563.876953125, + "completions/mean_terminated_length": 563.876953125, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.28206850235057085, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0033537999148738644, + "kl": 0.0084381103515625, + "learning_rate": 1.9448060954027093e-06, + "loss": 0.0001, + "num_tokens": 185972355.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1499.0, + "completions/max_terminated_length": 1499.0, + "completions/mean_length": 580.744140625, + "completions/mean_terminated_length": 580.744140625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.2829639579135885, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0032118940001133137, + "kl": 0.00843048095703125, + "learning_rate": 1.931213891832153e-06, + "loss": 0.0001, + "num_tokens": 186569616.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1737.0, + "completions/max_terminated_length": 1737.0, + "completions/mean_length": 586.671875, + "completions/mean_terminated_length": 586.671875, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.2838594134766062, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0024987795534890324, + "kl": 0.0081939697265625, + "learning_rate": 1.9176560428868336e-06, + "loss": 0.0001, + "num_tokens": 187160296.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1873.0, + "completions/max_terminated_length": 1873.0, + "completions/mean_length": 626.505859375, + "completions/mean_terminated_length": 626.505859375, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.2847548690396239, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004381529661568308, + "kl": 0.0082855224609375, + "learning_rate": 1.9041331174269373e-06, + "loss": 0.0001, + "num_tokens": 187822187.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1373.0, + "completions/max_terminated_length": 1373.0, + "completions/mean_length": 580.373046875, + "completions/mean_terminated_length": 580.373046875, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.2856503246026416, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0022586600629337017, + "kl": 0.00856781005859375, + "learning_rate": 1.8906456828473341e-06, + "loss": 0.0001, + "num_tokens": 188435834.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1222.0, + "completions/max_terminated_length": 1222.0, + "completions/mean_length": 583.76171875, + "completions/mean_terminated_length": 583.3483276367188, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.2865457801656593, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0029577044291860563, + "kl": 0.00909423828125, + "learning_rate": 1.8771943050537656e-06, + "loss": 0.0001, + "num_tokens": 189024848.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1491.0, + "completions/mean_length": 583.115234375, + "completions/mean_terminated_length": 577.37060546875, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.28744123572867697, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.04177669181803199, + "kl": 0.00832366943359375, + "learning_rate": 1.8637795484391046e-06, + "loss": 0.0156, + "num_tokens": 189647579.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1411.0, + "completions/max_terminated_length": 1411.0, + "completions/mean_length": 567.107421875, + "completions/mean_terminated_length": 567.107421875, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.28833669129169465, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004438078937032764, + "kl": 0.0081787109375, + "learning_rate": 1.8504019758596698e-06, + "loss": 0.0001, + "num_tokens": 190261506.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1618.0, + "completions/mean_length": 565.990234375, + "completions/mean_terminated_length": 563.0900268554688, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.2892321468547123, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.09708382659198318, + "kl": 0.00830078125, + "learning_rate": 1.8370621486116163e-06, + "loss": 0.0111, + "num_tokens": 190851965.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1230.0, + "completions/max_terminated_length": 1230.0, + "completions/mean_length": 591.560546875, + "completions/mean_terminated_length": 591.560546875, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.29012760241773, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002022024961051474, + "kl": 0.00794219970703125, + "learning_rate": 1.823760626407377e-06, + "loss": 0.0001, + "num_tokens": 191493660.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1408.0, + "completions/max_terminated_length": 1408.0, + "completions/mean_length": 619.765625, + "completions/mean_terminated_length": 619.765625, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.2910230579807477, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001926941860733007, + "kl": 0.0073699951171875, + "learning_rate": 1.8104979673521838e-06, + "loss": 0.0001, + "num_tokens": 192138196.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1575.0, + "completions/max_terminated_length": 1575.0, + "completions/mean_length": 593.982421875, + "completions/mean_terminated_length": 593.982421875, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.2919185135437654, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003129085512985726, + "kl": 0.00824737548828125, + "learning_rate": 1.7972747279206482e-06, + "loss": 0.0001, + "num_tokens": 192743115.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1933.0, + "completions/max_terminated_length": 1933.0, + "completions/mean_length": 587.041015625, + "completions/mean_terminated_length": 587.041015625, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.2928139691067831, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0023860447198865166, + "kl": 0.008087158203125, + "learning_rate": 1.7840914629334122e-06, + "loss": 0.0001, + "num_tokens": 193366816.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1294.0, + "completions/max_terminated_length": 1294.0, + "completions/mean_length": 590.390625, + "completions/mean_terminated_length": 590.390625, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.29370942466980077, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0025688619292091523, + "kl": 0.0076141357421875, + "learning_rate": 1.7709487255338731e-06, + "loss": 0.0001, + "num_tokens": 193997736.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1800.0, + "completions/max_terminated_length": 1800.0, + "completions/mean_length": 620.236328125, + "completions/mean_terminated_length": 620.236328125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.29460488023281844, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0024549393405397333, + "kl": 0.0081329345703125, + "learning_rate": 1.7578470671649684e-06, + "loss": 0.0001, + "num_tokens": 194659121.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1329.0, + "completions/max_terminated_length": 1329.0, + "completions/mean_length": 554.986328125, + "completions/mean_terminated_length": 554.4657592773438, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.2955003357958361, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002308138559561996, + "kl": 0.0091552734375, + "learning_rate": 1.744787037546045e-06, + "loss": 0.0001, + "num_tokens": 195268170.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1778.0, + "completions/max_terminated_length": 1778.0, + "completions/mean_length": 600.041015625, + "completions/mean_terminated_length": 597.876708984375, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.2963957913588538, + "frac_reward_zero_std": 0.96875, + "grad_norm": 3.3471337491266375, + "kl": 0.01007843017578125, + "learning_rate": 1.731769184649788e-06, + "loss": 0.0085, + "num_tokens": 195921263.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1248.0, + "completions/max_terminated_length": 1248.0, + "completions/mean_length": 586.232421875, + "completions/mean_terminated_length": 585.7279663085938, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.2972912469218715, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0026083962766531095, + "kl": 0.0088653564453125, + "learning_rate": 1.7187940546792325e-06, + "loss": 0.0001, + "num_tokens": 196514582.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1857.0, + "completions/max_terminated_length": 1857.0, + "completions/mean_length": 596.150390625, + "completions/mean_terminated_length": 596.150390625, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.2981867024848892, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0022097826939779137, + "kl": 0.00794219970703125, + "learning_rate": 1.7058621920448465e-06, + "loss": 0.0001, + "num_tokens": 197126915.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1471.0, + "completions/max_terminated_length": 1471.0, + "completions/mean_length": 591.99609375, + "completions/mean_terminated_length": 591.99609375, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.2990821580479069, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.050800034708301765, + "kl": 0.0082550048828125, + "learning_rate": 1.6929741393416855e-06, + "loss": -0.001, + "num_tokens": 197742593.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1474.0, + "completions/max_terminated_length": 1474.0, + "completions/mean_length": 547.67578125, + "completions/mean_terminated_length": 547.67578125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.29997761361092457, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0020199348769265493, + "kl": 0.007354736328125, + "learning_rate": 1.6801304373266286e-06, + "loss": 0.0001, + "num_tokens": 198303307.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1248.0, + "completions/max_terminated_length": 1248.0, + "completions/mean_length": 563.806640625, + "completions/mean_terminated_length": 563.806640625, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.30087306917394224, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002861527132021575, + "kl": 0.00794219970703125, + "learning_rate": 1.667331624895689e-06, + "loss": 0.0001, + "num_tokens": 198905048.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1389.0, + "completions/max_terminated_length": 1389.0, + "completions/mean_length": 598.009765625, + "completions/mean_terminated_length": 595.9451293945312, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.3017685247369599, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.7597664781521267, + "kl": 0.2764434814453125, + "learning_rate": 1.6545782390614037e-06, + "loss": 0.0028, + "num_tokens": 199507085.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1202.0, + "completions/max_terminated_length": 1202.0, + "completions/mean_length": 533.5234375, + "completions/mean_terminated_length": 533.5234375, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.3026639802999776, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002991528444840403, + "kl": 0.0081024169921875, + "learning_rate": 1.6418708149302992e-06, + "loss": 0.0001, + "num_tokens": 200047417.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1345.0, + "completions/max_terminated_length": 1345.0, + "completions/mean_length": 570.22265625, + "completions/mean_terminated_length": 569.5225219726562, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.3035594358629953, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05531426793152702, + "kl": 0.02051544189453125, + "learning_rate": 1.6292098856804423e-06, + "loss": 0.0002, + "num_tokens": 200636411.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1506.0, + "completions/max_terminated_length": 1506.0, + "completions/mean_length": 631.513671875, + "completions/mean_terminated_length": 630.5577392578125, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.304454891426013, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3438139590956559, + "kl": 0.0713653564453125, + "learning_rate": 1.6165959825390661e-06, + "loss": 0.0007, + "num_tokens": 201340434.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1585.0, + "completions/mean_length": 600.896484375, + "completions/mean_terminated_length": 598.0645751953125, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.3053503469890307, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.08071543782786494, + "kl": 0.0074005126953125, + "learning_rate": 1.604029634760284e-06, + "loss": 0.0075, + "num_tokens": 201952957.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1298.0, + "completions/max_terminated_length": 1298.0, + "completions/mean_length": 569.55859375, + "completions/mean_terminated_length": 569.55859375, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.30624580255204836, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0021342847414251354, + "kl": 0.007659912109375, + "learning_rate": 1.59151136960288e-06, + "loss": 0.0001, + "num_tokens": 202574059.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1491.0, + "completions/max_terminated_length": 1491.0, + "completions/mean_length": 569.984375, + "completions/mean_terminated_length": 569.984375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.30714125811506604, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002683723340893326, + "kl": 0.0081024169921875, + "learning_rate": 1.5790417123081903e-06, + "loss": 0.0001, + "num_tokens": 203144787.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1322.0, + "completions/max_terminated_length": 1322.0, + "completions/mean_length": 557.97265625, + "completions/mean_terminated_length": 557.1643676757812, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.3080367136780837, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.06623886544112349, + "kl": 0.0140838623046875, + "learning_rate": 1.5666211860780583e-06, + "loss": 0.0019, + "num_tokens": 203702405.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1489.0, + "completions/max_terminated_length": 1489.0, + "completions/mean_length": 584.458984375, + "completions/mean_terminated_length": 584.458984375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.3089321692411014, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0021213763100731576, + "kl": 0.0073699951171875, + "learning_rate": 1.5542503120528918e-06, + "loss": 0.0001, + "num_tokens": 204315472.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1827.0, + "completions/max_terminated_length": 1827.0, + "completions/mean_length": 604.306640625, + "completions/mean_terminated_length": 603.7084350585938, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.3098276248041191, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.06004751525363387, + "kl": 0.00905609130859375, + "learning_rate": 1.5419296092897866e-06, + "loss": -0.0006, + "num_tokens": 204897949.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1574.0, + "completions/max_terminated_length": 1574.0, + "completions/mean_length": 542.888671875, + "completions/mean_terminated_length": 542.888671875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.3107230803671368, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0023636199931338707, + "kl": 0.00792694091796875, + "learning_rate": 1.529659594740755e-06, + "loss": 0.0001, + "num_tokens": 205452628.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1816.0, + "completions/max_terminated_length": 1816.0, + "completions/mean_length": 604.447265625, + "completions/mean_terminated_length": 604.447265625, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.3116185359301545, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004080786857883244, + "kl": 0.010345458984375, + "learning_rate": 1.5174407832310338e-06, + "loss": 0.0001, + "num_tokens": 206092585.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1350.0, + "completions/max_terminated_length": 1350.0, + "completions/mean_length": 602.81640625, + "completions/mean_terminated_length": 602.81640625, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.31251399149317216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002084310059902067, + "kl": 0.00750732421875, + "learning_rate": 1.5052736874374815e-06, + "loss": 0.0001, + "num_tokens": 206717835.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1481.0, + "completions/max_terminated_length": 1481.0, + "completions/mean_length": 554.201171875, + "completions/mean_terminated_length": 554.201171875, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.31340944705618984, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0019713549988119757, + "kl": 0.0074462890625, + "learning_rate": 1.4931588178670695e-06, + "loss": 0.0001, + "num_tokens": 207287058.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1399.0, + "completions/max_terminated_length": 1399.0, + "completions/mean_length": 577.13671875, + "completions/mean_terminated_length": 574.8176879882812, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.3143049026192075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008723630209304048, + "kl": 0.014678955078125, + "learning_rate": 1.4810966828354605e-06, + "loss": 0.0001, + "num_tokens": 207936440.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1527.0, + "completions/mean_length": 574.25390625, + "completions/mean_terminated_length": 571.369873046875, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.3152003581822252, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0815697141471301, + "kl": 0.008209228515625, + "learning_rate": 1.469087788445684e-06, + "loss": 0.0096, + "num_tokens": 208534426.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1372.0, + "completions/max_terminated_length": 1372.0, + "completions/mean_length": 564.59765625, + "completions/mean_terminated_length": 561.8922119140625, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.31609581374524287, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01895225120802759, + "kl": 0.018096923828125, + "learning_rate": 1.4571326385668965e-06, + "loss": 0.0002, + "num_tokens": 209152124.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1622.0, + "completions/max_terminated_length": 1622.0, + "completions/mean_length": 576.927734375, + "completions/mean_terminated_length": 576.927734375, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.3169912693082606, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002275555025051679, + "kl": 0.00794219970703125, + "learning_rate": 1.4452317348132434e-06, + "loss": 0.0001, + "num_tokens": 209762903.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1488.0, + "completions/max_terminated_length": 1488.0, + "completions/mean_length": 559.623046875, + "completions/mean_terminated_length": 559.623046875, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.3178867248712783, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002180480749374212, + "kl": 0.00786590576171875, + "learning_rate": 1.4333855765228104e-06, + "loss": 0.0001, + "num_tokens": 210355718.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1631.0, + "completions/max_terminated_length": 1631.0, + "completions/mean_length": 615.33203125, + "completions/mean_terminated_length": 614.1036987304688, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.31878218043429596, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004830947234790907, + "kl": 0.01129913330078125, + "learning_rate": 1.421594660736675e-06, + "loss": 0.0001, + "num_tokens": 210982272.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1258.0, + "completions/max_terminated_length": 1258.0, + "completions/mean_length": 543.94921875, + "completions/mean_terminated_length": 543.94921875, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.31967763599731364, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00195334360931251, + "kl": 0.008026123046875, + "learning_rate": 1.4098594821780476e-06, + "loss": 0.0001, + "num_tokens": 211534966.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1349.0, + "completions/max_terminated_length": 1349.0, + "completions/mean_length": 570.296875, + "completions/mean_terminated_length": 569.3424682617188, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.3205730915603313, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039919552987063324, + "kl": 0.021514892578125, + "learning_rate": 1.3981805332315174e-06, + "loss": 0.0002, + "num_tokens": 212140958.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1803.0, + "completions/max_terminated_length": 1803.0, + "completions/mean_length": 578.76953125, + "completions/mean_terminated_length": 578.76953125, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.321468547123349, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018167346867108132, + "kl": 0.0074005126953125, + "learning_rate": 1.3865583039223929e-06, + "loss": 0.0001, + "num_tokens": 212733736.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1317.0, + "completions/max_terminated_length": 1317.0, + "completions/mean_length": 559.138671875, + "completions/mean_terminated_length": 559.138671875, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.32236400268636667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0020262721751415323, + "kl": 0.00768280029296875, + "learning_rate": 1.374993281896137e-06, + "loss": 0.0001, + "num_tokens": 213319055.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1004.0, + "completions/max_terminated_length": 1004.0, + "completions/mean_length": 531.10546875, + "completions/mean_terminated_length": 531.10546875, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.3232594582493844, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002543562893281148, + "kl": 0.00804901123046875, + "learning_rate": 1.3634859523979134e-06, + "loss": 0.0001, + "num_tokens": 213873925.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1381.0, + "completions/max_terminated_length": 1381.0, + "completions/mean_length": 567.673828125, + "completions/mean_terminated_length": 566.7005615234375, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.3241549138124021, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005511610704299057, + "kl": 0.0147247314453125, + "learning_rate": 1.3520367982522208e-06, + "loss": 0.0001, + "num_tokens": 214450414.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1237.0, + "completions/max_terminated_length": 1237.0, + "completions/mean_length": 550.005859375, + "completions/mean_terminated_length": 550.005859375, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.32505036937541976, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0019536587834060857, + "kl": 0.00791168212890625, + "learning_rate": 1.3406462998426358e-06, + "loss": 0.0001, + "num_tokens": 215014849.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1516.0, + "completions/max_terminated_length": 1516.0, + "completions/mean_length": 619.970703125, + "completions/mean_terminated_length": 618.8082275390625, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.32594582493843743, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.05185554275164731, + "kl": 0.01287078857421875, + "learning_rate": 1.3293149350916595e-06, + "loss": 0.0022, + "num_tokens": 215649426.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1464.0, + "completions/max_terminated_length": 1464.0, + "completions/mean_length": 581.537109375, + "completions/mean_terminated_length": 581.537109375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.3268412805014551, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0019177365493093504, + "kl": 0.0078887939453125, + "learning_rate": 1.3180431794406623e-06, + "loss": 0.0001, + "num_tokens": 216246581.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1507.0, + "completions/max_terminated_length": 1507.0, + "completions/mean_length": 579.71875, + "completions/mean_terminated_length": 578.8375854492188, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.3277367360644728, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03819867611252609, + "kl": 0.00963592529296875, + "learning_rate": 1.3068315058299358e-06, + "loss": 0.0001, + "num_tokens": 216843077.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1385.0, + "completions/mean_length": 596.908203125, + "completions/mean_terminated_length": 592.9371337890625, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.32863219162749047, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.08687211737432748, + "kl": 0.0146636962890625, + "learning_rate": 1.2956803846788503e-06, + "loss": 0.0118, + "num_tokens": 217450982.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1498.0, + "completions/max_terminated_length": 1498.0, + "completions/mean_length": 569.4765625, + "completions/mean_terminated_length": 569.4765625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.32952764719050814, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0021320786493948195, + "kl": 0.008209228515625, + "learning_rate": 1.284590283866116e-06, + "loss": 0.0001, + "num_tokens": 218059290.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 2018.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 642.796875, + "completions/mean_terminated_length": 642.796875, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.3304231027535259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00458793107523592, + "kl": 0.00799560546875, + "learning_rate": 1.2735616687101518e-06, + "loss": 0.0001, + "num_tokens": 218764274.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1426.0, + "completions/max_terminated_length": 1426.0, + "completions/mean_length": 581.8828125, + "completions/mean_terminated_length": 581.2681274414062, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.33131855831654355, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0035059983661826573, + "kl": 0.01142120361328125, + "learning_rate": 1.2625950019495614e-06, + "loss": 0.0001, + "num_tokens": 219369398.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1706.0, + "completions/max_terminated_length": 1706.0, + "completions/mean_length": 566.892578125, + "completions/mean_terminated_length": 566.892578125, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.33221401387956123, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0022668222341737827, + "kl": 0.00775146484375, + "learning_rate": 1.251690743723718e-06, + "loss": 0.0001, + "num_tokens": 219967663.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1261.0, + "completions/max_terminated_length": 1261.0, + "completions/mean_length": 576.318359375, + "completions/mean_terminated_length": 575.4148559570312, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.3331094694425789, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.9091138792849112, + "kl": 0.0088348388671875, + "learning_rate": 1.2408493515534581e-06, + "loss": 0.0058, + "num_tokens": 220548930.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1551.0, + "completions/max_terminated_length": 1551.0, + "completions/mean_length": 587.615234375, + "completions/mean_terminated_length": 587.615234375, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.3340049250055966, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0019110468464679068, + "kl": 0.00717926025390625, + "learning_rate": 1.2300712803218834e-06, + "loss": 0.0001, + "num_tokens": 221138909.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1904.0, + "completions/max_terminated_length": 1904.0, + "completions/mean_length": 551.5546875, + "completions/mean_terminated_length": 551.5546875, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.33490038056861426, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.07259043152980842, + "kl": 0.00775909423828125, + "learning_rate": 1.2193569822552772e-06, + "loss": 0.0, + "num_tokens": 221735193.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1330.0, + "completions/max_terminated_length": 1330.0, + "completions/mean_length": 564.779296875, + "completions/mean_terminated_length": 564.0684814453125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.33579583613163194, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005943532205458808, + "kl": 0.01444244384765625, + "learning_rate": 1.2087069069041268e-06, + "loss": 0.0001, + "num_tokens": 222321752.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1611.0, + "completions/max_terminated_length": 1611.0, + "completions/mean_length": 604.189453125, + "completions/mean_terminated_length": 604.189453125, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.3366912916946497, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00325431853032737, + "kl": 0.007415771484375, + "learning_rate": 1.1981215011242654e-06, + "loss": 0.0001, + "num_tokens": 222941337.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1424.0, + "completions/max_terminated_length": 1424.0, + "completions/mean_length": 566.115234375, + "completions/mean_terminated_length": 566.115234375, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.33758674725766735, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003970803920716204, + "kl": 0.00750732421875, + "learning_rate": 1.1876012090581184e-06, + "loss": 0.0001, + "num_tokens": 223545140.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1824.0, + "completions/max_terminated_length": 1824.0, + "completions/mean_length": 578.40234375, + "completions/mean_terminated_length": 578.40234375, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.33848220282068503, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0021426645687828397, + "kl": 0.00771331787109375, + "learning_rate": 1.177146472116071e-06, + "loss": 0.0001, + "num_tokens": 224156050.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1314.0, + "completions/max_terminated_length": 1314.0, + "completions/mean_length": 565.841796875, + "completions/mean_terminated_length": 565.841796875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.3393776583837027, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0034682607368906044, + "kl": 0.0072784423828125, + "learning_rate": 1.1667577289579462e-06, + "loss": 0.0001, + "num_tokens": 224759297.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1559.0, + "completions/max_terminated_length": 1559.0, + "completions/mean_length": 571.685546875, + "completions/mean_terminated_length": 571.685546875, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.3402731139467204, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.06140587993687195, + "kl": 0.00710296630859375, + "learning_rate": 1.1564354154746007e-06, + "loss": -0.0015, + "num_tokens": 225360416.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1228.0, + "completions/max_terminated_length": 1228.0, + "completions/mean_length": 547.423828125, + "completions/mean_terminated_length": 547.423828125, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.34116856950973806, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002907153664249217, + "kl": 0.00756072998046875, + "learning_rate": 1.146179964769635e-06, + "loss": 0.0001, + "num_tokens": 225934137.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1186.0, + "completions/max_terminated_length": 1186.0, + "completions/mean_length": 578.251953125, + "completions/mean_terminated_length": 578.251953125, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.34206402507275574, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.07039698042789873, + "kl": 0.0072784423828125, + "learning_rate": 1.1359918071412195e-06, + "loss": 0.0014, + "num_tokens": 226518314.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1157.0, + "completions/max_terminated_length": 1157.0, + "completions/mean_length": 567.06640625, + "completions/mean_terminated_length": 567.06640625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.3429594806357735, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0028870283229968058, + "kl": 0.00708770751953125, + "learning_rate": 1.1258713700640456e-06, + "loss": 0.0001, + "num_tokens": 227092300.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1617.0, + "completions/max_terminated_length": 1617.0, + "completions/mean_length": 563.90234375, + "completions/mean_terminated_length": 563.90234375, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.34385493619879115, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003288040257158907, + "kl": 0.00762939453125, + "learning_rate": 1.115819078171383e-06, + "loss": 0.0001, + "num_tokens": 227668122.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1478.0, + "completions/max_terminated_length": 1478.0, + "completions/mean_length": 564.939453125, + "completions/mean_terminated_length": 564.939453125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.3447503917618088, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0027093794936639158, + "kl": 0.00782012939453125, + "learning_rate": 1.1058353532372667e-06, + "loss": 0.0001, + "num_tokens": 228249659.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1451.0, + "completions/max_terminated_length": 1451.0, + "completions/mean_length": 568.140625, + "completions/mean_terminated_length": 568.140625, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.3456458473248265, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004926334510014925, + "kl": 0.0101318359375, + "learning_rate": 1.0959206141587998e-06, + "loss": 0.0001, + "num_tokens": 228855667.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1269.0, + "completions/max_terminated_length": 1269.0, + "completions/mean_length": 573.947265625, + "completions/mean_terminated_length": 573.947265625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.3465413028878442, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0023365650433403236, + "kl": 0.00733184814453125, + "learning_rate": 1.0860752769385766e-06, + "loss": 0.0001, + "num_tokens": 229438296.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1745.0, + "completions/max_terminated_length": 1745.0, + "completions/mean_length": 604.919921875, + "completions/mean_terminated_length": 604.919921875, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.34743675845086186, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0022590886365615234, + "kl": 0.00737762451171875, + "learning_rate": 1.0762997546672279e-06, + "loss": 0.0001, + "num_tokens": 230048767.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1621.0, + "completions/max_terminated_length": 1621.0, + "completions/mean_length": 537.255859375, + "completions/mean_terminated_length": 537.255859375, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.34833221401387954, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0024853798289972955, + "kl": 0.0074615478515625, + "learning_rate": 1.0665944575060914e-06, + "loss": 0.0001, + "num_tokens": 230632434.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1692.0, + "completions/max_terminated_length": 1692.0, + "completions/mean_length": 588.02734375, + "completions/mean_terminated_length": 588.02734375, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.34922766957689727, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002699862715476615, + "kl": 0.008087158203125, + "learning_rate": 1.056959792669997e-06, + "loss": 0.0001, + "num_tokens": 231250640.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1940.0, + "completions/max_terminated_length": 1940.0, + "completions/mean_length": 585.18359375, + "completions/mean_terminated_length": 585.18359375, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.35012312513991495, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002385195268313762, + "kl": 0.0077667236328125, + "learning_rate": 1.0473961644101856e-06, + "loss": 0.0001, + "num_tokens": 231891710.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1245.0, + "completions/max_terminated_length": 1245.0, + "completions/mean_length": 602.560546875, + "completions/mean_terminated_length": 602.560546875, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.3510185807029326, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0020758379602187934, + "kl": 0.00725555419921875, + "learning_rate": 1.037903973997345e-06, + "loss": 0.0001, + "num_tokens": 232539213.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1703.0, + "completions/max_terminated_length": 1703.0, + "completions/mean_length": 574.234375, + "completions/mean_terminated_length": 574.234375, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.3519140362659503, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0019861421190973914, + "kl": 0.0070648193359375, + "learning_rate": 1.0284836197047737e-06, + "loss": 0.0001, + "num_tokens": 233154341.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1179.0, + "completions/max_terminated_length": 1179.0, + "completions/mean_length": 536.86328125, + "completions/mean_terminated_length": 536.86328125, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.352809491828968, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00770820639635427, + "kl": 0.00865936279296875, + "learning_rate": 1.0191354967916712e-06, + "loss": 0.0001, + "num_tokens": 233729551.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1199.0, + "completions/max_terminated_length": 1199.0, + "completions/mean_length": 587.541015625, + "completions/mean_terminated_length": 586.3444213867188, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.35370494739198566, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06988031546833616, + "kl": 0.03610992431640625, + "learning_rate": 1.0098599974865515e-06, + "loss": 0.0004, + "num_tokens": 234325188.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1573.0, + "completions/max_terminated_length": 1573.0, + "completions/mean_length": 588.3984375, + "completions/mean_terminated_length": 588.3984375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.35460040295500334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0021996142953093563, + "kl": 0.00753021240234375, + "learning_rate": 1.0006575109707898e-06, + "loss": 0.0001, + "num_tokens": 234951888.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1947.0, + "completions/max_terminated_length": 1947.0, + "completions/mean_length": 564.466796875, + "completions/mean_terminated_length": 564.466796875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.35549585851802107, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0027486815550077717, + "kl": 0.00732421875, + "learning_rate": 9.915284233622877e-07, + "loss": 0.0001, + "num_tokens": 235557407.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1270.0, + "completions/max_terminated_length": 1270.0, + "completions/mean_length": 573.923828125, + "completions/mean_terminated_length": 573.923828125, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.35639131408103875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007078971381052134, + "kl": 0.00754547119140625, + "learning_rate": 9.824731176992796e-07, + "loss": 0.0001, + "num_tokens": 236116952.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1424.0, + "completions/max_terminated_length": 1424.0, + "completions/mean_length": 578.4453125, + "completions/mean_terminated_length": 578.4453125, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.3572867696440564, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002419395267570303, + "kl": 0.00762176513671875, + "learning_rate": 9.734919739242543e-07, + "loss": 0.0001, + "num_tokens": 236716412.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1630.0, + "completions/max_terminated_length": 1630.0, + "completions/mean_length": 611.896484375, + "completions/mean_terminated_length": 611.896484375, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.3581822252070741, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002399603414187874, + "kl": 0.00739288330078125, + "learning_rate": 9.645853688680177e-07, + "loss": 0.0001, + "num_tokens": 237356471.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1379.0, + "completions/max_terminated_length": 1379.0, + "completions/mean_length": 560.94921875, + "completions/mean_terminated_length": 560.94921875, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.3590776807700918, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0019044463057180108, + "kl": 0.00698089599609375, + "learning_rate": 9.557536762338786e-07, + "loss": 0.0001, + "num_tokens": 237930573.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1735.0, + "completions/max_terminated_length": 1735.0, + "completions/mean_length": 555.783203125, + "completions/mean_terminated_length": 555.0528564453125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.35997313633310946, + "frac_reward_zero_std": 0.96875, + "grad_norm": 2.32710618983278, + "kl": 0.0330963134765625, + "learning_rate": 9.46997266581973e-07, + "loss": 0.003, + "num_tokens": 238499726.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1618.0, + "completions/max_terminated_length": 1618.0, + "completions/mean_length": 620.208984375, + "completions/mean_terminated_length": 620.208984375, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.36086859189612713, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018904971951806233, + "kl": 0.00704193115234375, + "learning_rate": 9.383165073137115e-07, + "loss": 0.0001, + "num_tokens": 239118313.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1901.0, + "completions/max_terminated_length": 1901.0, + "completions/mean_length": 590.24609375, + "completions/mean_terminated_length": 590.24609375, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.36176404745914487, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001994524648071092, + "kl": 0.00737762451171875, + "learning_rate": 9.297117626563687e-07, + "loss": 0.0001, + "num_tokens": 239715831.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1062.0, + "completions/max_terminated_length": 1062.0, + "completions/mean_length": 572.150390625, + "completions/mean_terminated_length": 572.150390625, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.36265950302216254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002142527225070906, + "kl": 0.007049560546875, + "learning_rate": 9.211833936477957e-07, + "loss": 0.0001, + "num_tokens": 240289108.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1752.0, + "completions/max_terminated_length": 1752.0, + "completions/mean_length": 582.8046875, + "completions/mean_terminated_length": 582.8046875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.3635549585851802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002373460468135557, + "kl": 0.0075225830078125, + "learning_rate": 9.127317581212753e-07, + "loss": 0.0001, + "num_tokens": 240883216.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1577.0, + "completions/max_terminated_length": 1577.0, + "completions/mean_length": 560.4140625, + "completions/mean_terminated_length": 560.4140625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.3644504141481979, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001969697468176816, + "kl": 0.00736236572265625, + "learning_rate": 9.043572106905084e-07, + "loss": 0.0001, + "num_tokens": 241493444.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1697.0, + "completions/max_terminated_length": 1697.0, + "completions/mean_length": 561.98828125, + "completions/mean_terminated_length": 561.98828125, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.3653458697112156, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018965289544797352, + "kl": 0.00738525390625, + "learning_rate": 8.960601027347321e-07, + "loss": 0.0001, + "num_tokens": 242087262.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1161.0, + "completions/max_terminated_length": 1161.0, + "completions/mean_length": 546.6484375, + "completions/mean_terminated_length": 546.6484375, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.36624132527423325, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001865530385427833, + "kl": 0.0068817138671875, + "learning_rate": 8.878407823839788e-07, + "loss": 0.0001, + "num_tokens": 242652954.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1475.0, + "completions/max_terminated_length": 1475.0, + "completions/mean_length": 589.279296875, + "completions/mean_terminated_length": 589.279296875, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.36713678083725093, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018847578722863511, + "kl": 0.0071868896484375, + "learning_rate": 8.796995945044689e-07, + "loss": 0.0001, + "num_tokens": 243260969.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1590.0, + "completions/max_terminated_length": 1590.0, + "completions/mean_length": 582.318359375, + "completions/mean_terminated_length": 582.318359375, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.36803223640026866, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00203077251152688, + "kl": 0.00757598876953125, + "learning_rate": 8.716368806841405e-07, + "loss": 0.0001, + "num_tokens": 243868332.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1192.0, + "completions/max_terminated_length": 1192.0, + "completions/mean_length": 576.03125, + "completions/mean_terminated_length": 574.8258056640625, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.36892769196328634, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4546602572793698, + "kl": 0.19781494140625, + "learning_rate": 8.636529792183171e-07, + "loss": 0.002, + "num_tokens": 244456780.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1558.0, + "completions/max_terminated_length": 1558.0, + "completions/mean_length": 575.224609375, + "completions/mean_terminated_length": 573.8218994140625, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.369823147526304, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.672365377691588, + "kl": 2.31805419921875, + "learning_rate": 8.557482250955144e-07, + "loss": 0.0231, + "num_tokens": 245059183.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1814.0, + "completions/max_terminated_length": 1814.0, + "completions/mean_length": 616.26171875, + "completions/mean_terminated_length": 616.26171875, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.3707186030893217, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0022327067094846478, + "kl": 0.00756072998046875, + "learning_rate": 8.479229499833844e-07, + "loss": 0.0001, + "num_tokens": 245699077.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1379.0, + "completions/mean_length": 560.572265625, + "completions/mean_terminated_length": 557.6614379882812, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.3716140586523394, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.05507044422176956, + "kl": 0.00771331787109375, + "learning_rate": 8.401774822147976e-07, + "loss": 0.0133, + "num_tokens": 246220810.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1511.0, + "completions/max_terminated_length": 1511.0, + "completions/mean_length": 565.83203125, + "completions/mean_terminated_length": 565.83203125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.37250951421535705, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017348792946901474, + "kl": 0.0075531005859375, + "learning_rate": 8.325121467740695e-07, + "loss": 0.0001, + "num_tokens": 246831508.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1365.0, + "completions/max_terminated_length": 1365.0, + "completions/mean_length": 579.80078125, + "completions/mean_terminated_length": 579.80078125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.37340496977837473, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001879141414398478, + "kl": 0.00748443603515625, + "learning_rate": 8.249272652833226e-07, + "loss": 0.0001, + "num_tokens": 247454382.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1437.0, + "completions/max_terminated_length": 1437.0, + "completions/mean_length": 549.587890625, + "completions/mean_terminated_length": 549.24658203125, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.3743004253413924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13101577924907357, + "kl": 0.058868408203125, + "learning_rate": 8.174231559889931e-07, + "loss": 0.0006, + "num_tokens": 248031675.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1120.0, + "completions/max_terminated_length": 1120.0, + "completions/mean_length": 531.248046875, + "completions/mean_terminated_length": 531.248046875, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.37519588090441014, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002574748354672376, + "kl": 0.00739288330078125, + "learning_rate": 8.100001337484787e-07, + "loss": 0.0001, + "num_tokens": 248567770.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1304.0, + "completions/max_terminated_length": 1304.0, + "completions/mean_length": 574.470703125, + "completions/mean_terminated_length": 574.470703125, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.3760913364674278, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0022981337737840012, + "kl": 0.00763702392578125, + "learning_rate": 8.026585100169251e-07, + "loss": 0.0001, + "num_tokens": 249183803.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1862.0, + "completions/max_terminated_length": 1862.0, + "completions/mean_length": 616.513671875, + "completions/mean_terminated_length": 616.513671875, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.3769867920304455, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.05132811797882821, + "kl": 0.007354736328125, + "learning_rate": 7.953985928341601e-07, + "loss": -0.0038, + "num_tokens": 249820402.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1447.0, + "completions/max_terminated_length": 1447.0, + "completions/mean_length": 609.41796875, + "completions/mean_terminated_length": 609.41796875, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.37788224759346317, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004694027579070297, + "kl": 0.00716400146484375, + "learning_rate": 7.882206868117693e-07, + "loss": 0.0001, + "num_tokens": 250431752.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1434.0, + "completions/max_terminated_length": 1434.0, + "completions/mean_length": 590.93359375, + "completions/mean_terminated_length": 590.93359375, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.37877770315648085, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016517934518544754, + "kl": 0.006988525390625, + "learning_rate": 7.81125093120313e-07, + "loss": 0.0001, + "num_tokens": 251055830.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1515.0, + "completions/mean_length": 576.03515625, + "completions/mean_terminated_length": 573.1546020507812, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.3796731587194985, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.07157949363599238, + "kl": 0.00707244873046875, + "learning_rate": 7.741121094766916e-07, + "loss": 0.0014, + "num_tokens": 251677096.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1469.0, + "completions/max_terminated_length": 1469.0, + "completions/mean_length": 546.15625, + "completions/mean_terminated_length": 546.15625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.3805686142825162, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001872346621404152, + "kl": 0.0073394775390625, + "learning_rate": 7.671820301316532e-07, + "loss": 0.0001, + "num_tokens": 252233400.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1275.0, + "completions/max_terminated_length": 1275.0, + "completions/mean_length": 569.955078125, + "completions/mean_terminated_length": 569.955078125, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.38146406984553394, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018266300286058845, + "kl": 0.00664520263671875, + "learning_rate": 7.603351458574474e-07, + "loss": 0.0001, + "num_tokens": 252816513.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 2036.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 581.267578125, + "completions/mean_terminated_length": 581.267578125, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.3823595254085516, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007915979461939848, + "kl": 0.0125885009765625, + "learning_rate": 7.535717439356255e-07, + "loss": 0.0001, + "num_tokens": 253493402.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1351.0, + "completions/max_terminated_length": 1351.0, + "completions/mean_length": 553.25, + "completions/mean_terminated_length": 553.25, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.3832549809715693, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0020653643768265492, + "kl": 0.0070037841796875, + "learning_rate": 7.46892108144986e-07, + "loss": 0.0001, + "num_tokens": 254064650.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1549.0, + "completions/max_terminated_length": 1549.0, + "completions/mean_length": 590.99609375, + "completions/mean_terminated_length": 590.99609375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.38415043653458697, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0019496797346678291, + "kl": 0.0080413818359375, + "learning_rate": 7.402965187496697e-07, + "loss": 0.0001, + "num_tokens": 254686536.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1464.0, + "completions/max_terminated_length": 1464.0, + "completions/mean_length": 588.392578125, + "completions/mean_terminated_length": 588.392578125, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.38504589209760465, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0020164854597262494, + "kl": 0.0069580078125, + "learning_rate": 7.337852524873974e-07, + "loss": 0.0001, + "num_tokens": 255299089.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1393.0, + "completions/max_terminated_length": 1393.0, + "completions/mean_length": 581.587890625, + "completions/mean_terminated_length": 581.587890625, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.3859413476606223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0020515273257323617, + "kl": 0.00724029541015625, + "learning_rate": 7.273585825578608e-07, + "loss": 0.0001, + "num_tokens": 255868046.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1504.0, + "completions/max_terminated_length": 1504.0, + "completions/mean_length": 583.171875, + "completions/mean_terminated_length": 583.171875, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.38683680322364, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017395511754316104, + "kl": 0.006988525390625, + "learning_rate": 7.21016778611259e-07, + "loss": 0.0001, + "num_tokens": 256477958.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1372.0, + "completions/max_terminated_length": 1372.0, + "completions/mean_length": 566.326171875, + "completions/mean_terminated_length": 566.326171875, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.38773225878665774, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017721868753958108, + "kl": 0.00708770751953125, + "learning_rate": 7.147601067369835e-07, + "loss": 0.0001, + "num_tokens": 257066797.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1493.0, + "completions/max_terminated_length": 1493.0, + "completions/mean_length": 595.302734375, + "completions/mean_terminated_length": 595.302734375, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.3886277143496754, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0024752450007139593, + "kl": 0.00705718994140625, + "learning_rate": 7.085888294524561e-07, + "loss": 0.0001, + "num_tokens": 257678216.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1581.0, + "completions/max_terminated_length": 1581.0, + "completions/mean_length": 608.5546875, + "completions/mean_terminated_length": 608.5546875, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.3895231699126931, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017861341278110877, + "kl": 0.00673675537109375, + "learning_rate": 7.025032056921117e-07, + "loss": 0.0001, + "num_tokens": 258294132.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1337.0, + "completions/max_terminated_length": 1337.0, + "completions/mean_length": 581.275390625, + "completions/mean_terminated_length": 581.275390625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.39041862547571077, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017281723694945767, + "kl": 0.00643157958984375, + "learning_rate": 6.965034907965349e-07, + "loss": 0.0001, + "num_tokens": 258913697.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1387.0, + "completions/max_terminated_length": 1387.0, + "completions/mean_length": 560.501953125, + "completions/mean_terminated_length": 560.501953125, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.39131408103872845, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00201222494298188, + "kl": 0.00742340087890625, + "learning_rate": 6.905899365017462e-07, + "loss": 0.0001, + "num_tokens": 259487154.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1567.0, + "completions/max_terminated_length": 1567.0, + "completions/mean_length": 584.263671875, + "completions/mean_terminated_length": 584.263671875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.3922095366017461, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004694895011256875, + "kl": 0.00927734375, + "learning_rate": 6.847627909286409e-07, + "loss": 0.0001, + "num_tokens": 260127929.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1430.0, + "completions/max_terminated_length": 1430.0, + "completions/mean_length": 595.798828125, + "completions/mean_terminated_length": 595.1781005859375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.3931049921647638, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027166240735075925, + "kl": 0.01947021484375, + "learning_rate": 6.790222985725761e-07, + "loss": 0.0002, + "num_tokens": 260732914.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.75, + "completions/max_length": 1496.0, + "completions/max_terminated_length": 1496.0, + "completions/mean_length": 573.078125, + "completions/mean_terminated_length": 558.9515991210938, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.39400044772778153, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.38007504110459894, + "kl": 0.184814453125, + "learning_rate": 6.733687002931141e-07, + "loss": 0.0019, + "num_tokens": 261329754.0, + "reward": 0.09687499701976776, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.17416280508041382, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1409.0, + "completions/max_terminated_length": 1409.0, + "completions/mean_length": 586.638671875, + "completions/mean_terminated_length": 586.638671875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.3948959032907992, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017975609609217182, + "kl": 0.00704193115234375, + "learning_rate": 6.678022333039158e-07, + "loss": 0.0001, + "num_tokens": 261915617.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1625.0, + "completions/mean_length": 624.666015625, + "completions/mean_terminated_length": 621.880615234375, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.3957913588538169, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.08108864832847747, + "kl": 0.00714874267578125, + "learning_rate": 6.623231311627876e-07, + "loss": 0.0088, + "num_tokens": 262555238.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1683.0, + "completions/max_terminated_length": 1683.0, + "completions/mean_length": 621.076171875, + "completions/mean_terminated_length": 621.076171875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.39668681441683457, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018508512221771447, + "kl": 0.0065765380859375, + "learning_rate": 6.569316237618811e-07, + "loss": 0.0001, + "num_tokens": 263162077.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 1854.0, + "completions/max_terminated_length": 1583.0, + "completions/mean_length": 611.12109375, + "completions/mean_terminated_length": 603.7957153320312, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.39758226997985224, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0839393069811805, + "kl": 0.00927734375, + "learning_rate": 6.516279373180499e-07, + "loss": 0.0125, + "num_tokens": 263810891.0, + "reward": 0.099609375, + "reward_std": 0.0010673906654119492, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.06243881583213806, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1471.0, + "completions/max_terminated_length": 1471.0, + "completions/mean_length": 578.388671875, + "completions/mean_terminated_length": 578.388671875, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.3984777255428699, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0034021836651294195, + "kl": 0.0074615478515625, + "learning_rate": 6.464122943633543e-07, + "loss": 0.0001, + "num_tokens": 264436114.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1832.0, + "completions/mean_length": 595.1796875, + "completions/mean_terminated_length": 592.3366088867188, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.3993731811058876, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018822311558751052, + "kl": 0.007232666015625, + "learning_rate": 6.412849137357271e-07, + "loss": 0.0001, + "num_tokens": 265059486.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1697.0, + "completions/mean_length": 572.162109375, + "completions/mean_terminated_length": 569.2739868164062, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.40026863666890533, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.07171075527282686, + "kl": 0.00722503662109375, + "learning_rate": 6.3624601056979e-07, + "loss": 0.0126, + "num_tokens": 265664641.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1395.0, + "completions/max_terminated_length": 1395.0, + "completions/mean_length": 598.486328125, + "completions/mean_terminated_length": 598.486328125, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.401164092231923, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0021966762722874593, + "kl": 0.00708770751953125, + "learning_rate": 6.312957962878278e-07, + "loss": 0.0001, + "num_tokens": 266243818.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1488.0, + "completions/mean_length": 601.158203125, + "completions/mean_terminated_length": 598.3267822265625, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.4020595477949407, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017733959837999802, + "kl": 0.007049560546875, + "learning_rate": 6.264344785909181e-07, + "loss": 0.0001, + "num_tokens": 266845387.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1653.0, + "completions/max_terminated_length": 1653.0, + "completions/mean_length": 608.20703125, + "completions/mean_terminated_length": 608.20703125, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.40295500335795836, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017065534051172028, + "kl": 0.0072784423828125, + "learning_rate": 6.216622614502149e-07, + "loss": 0.0001, + "num_tokens": 267495125.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1912.0, + "completions/max_terminated_length": 1912.0, + "completions/mean_length": 636.064453125, + "completions/mean_terminated_length": 636.064453125, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.40385045892097604, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018493003144563498, + "kl": 0.007049560546875, + "learning_rate": 6.169793450983916e-07, + "loss": 0.0001, + "num_tokens": 268135174.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1495.0, + "completions/max_terminated_length": 1495.0, + "completions/mean_length": 625.46875, + "completions/mean_terminated_length": 625.46875, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.4047459144839937, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018044428594794818, + "kl": 0.00771331787109375, + "learning_rate": 6.123859260212393e-07, + "loss": 0.0001, + "num_tokens": 268799078.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1325.0, + "completions/max_terminated_length": 1325.0, + "completions/mean_length": 565.87109375, + "completions/mean_terminated_length": 565.87109375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.4056413700470114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001915458727608025, + "kl": 0.0070648193359375, + "learning_rate": 6.07882196949423e-07, + "loss": 0.0001, + "num_tokens": 269389476.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1871.0, + "completions/max_terminated_length": 1871.0, + "completions/mean_length": 613.662109375, + "completions/mean_terminated_length": 613.662109375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.40653682561002913, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0021355970538105215, + "kl": 0.00687408447265625, + "learning_rate": 6.034683468503948e-07, + "loss": 0.0001, + "num_tokens": 269983703.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.78125, + "completions/max_length": 1599.0, + "completions/max_terminated_length": 1599.0, + "completions/mean_length": 607.080078125, + "completions/mean_terminated_length": 594.6846923828125, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.4074322811730468, + "frac_reward_zero_std": 0.96875, + "grad_norm": 1.8621664812287209, + "kl": 0.216064453125, + "learning_rate": 5.991445609204641e-07, + "loss": 0.0119, + "num_tokens": 270637712.0, + "reward": 0.09726563096046448, + "reward_std": 0.0010673906654119492, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.97265625, + "rewards/format_reward/std": 0.16324250400066376, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1247.0, + "completions/max_terminated_length": 1247.0, + "completions/mean_length": 572.48046875, + "completions/mean_terminated_length": 572.48046875, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.4083277367360645, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002494812575327247, + "kl": 0.00701141357421875, + "learning_rate": 5.949110205770292e-07, + "loss": 0.0001, + "num_tokens": 271264118.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1556.0, + "completions/max_terminated_length": 1556.0, + "completions/mean_length": 591.2734375, + "completions/mean_terminated_length": 591.2734375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.40922319229908216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001660612262807166, + "kl": 0.00687408447265625, + "learning_rate": 5.90767903450964e-07, + "loss": 0.0001, + "num_tokens": 271874258.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1187.0, + "completions/mean_length": 591.591796875, + "completions/mean_terminated_length": 588.74169921875, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.41011864786209984, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0020372028375238455, + "kl": 0.00670623779296875, + "learning_rate": 5.867153833791652e-07, + "loss": 0.0001, + "num_tokens": 272459937.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1403.0, + "completions/max_terminated_length": 1403.0, + "completions/mean_length": 596.318359375, + "completions/mean_terminated_length": 595.5655517578125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.4110141034251175, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005391106128944126, + "kl": 0.0085601806640625, + "learning_rate": 5.827536303972587e-07, + "loss": 0.0001, + "num_tokens": 273035332.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1457.0, + "completions/max_terminated_length": 1457.0, + "completions/mean_length": 586.96875, + "completions/mean_terminated_length": 586.96875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.4119095589881352, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017693725274782219, + "kl": 0.00725555419921875, + "learning_rate": 5.78882810732465e-07, + "loss": 0.0001, + "num_tokens": 273617428.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1641.0, + "completions/max_terminated_length": 1641.0, + "completions/mean_length": 566.490234375, + "completions/mean_terminated_length": 564.8590698242188, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.4128050145511529, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21273820513553693, + "kl": 0.09808349609375, + "learning_rate": 5.75103086796625e-07, + "loss": 0.001, + "num_tokens": 274228799.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1315.0, + "completions/max_terminated_length": 1315.0, + "completions/mean_length": 629.357421875, + "completions/mean_terminated_length": 629.357421875, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.4137004701141706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001867753287103506, + "kl": 0.00658416748046875, + "learning_rate": 5.714146171793846e-07, + "loss": 0.0001, + "num_tokens": 274894902.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1661.0, + "completions/mean_length": 663.58984375, + "completions/mean_terminated_length": 660.880615234375, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.4145959256771883, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.06420642626966507, + "kl": 0.0066986083984375, + "learning_rate": 5.678175566415422e-07, + "loss": 0.0118, + "num_tokens": 275600276.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1383.0, + "completions/max_terminated_length": 1383.0, + "completions/mean_length": 593.947265625, + "completions/mean_terminated_length": 593.1526489257812, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.41549138124020596, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0033834079144248906, + "kl": 0.00829315185546875, + "learning_rate": 5.643120561085528e-07, + "loss": 0.0001, + "num_tokens": 276232089.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1853.0, + "completions/mean_length": 599.5, + "completions/mean_terminated_length": 596.6653442382812, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.41638683680322364, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.06985689858353016, + "kl": 0.00630950927734375, + "learning_rate": 5.608982626641991e-07, + "loss": 0.0092, + "num_tokens": 276867353.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1496.0, + "completions/max_terminated_length": 1496.0, + "completions/mean_length": 561.34375, + "completions/mean_terminated_length": 561.34375, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.4172822923662413, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001829682064071725, + "kl": 0.00696563720703125, + "learning_rate": 5.575763195444166e-07, + "loss": 0.0001, + "num_tokens": 277402361.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1561.0, + "completions/max_terminated_length": 1561.0, + "completions/mean_length": 583.193359375, + "completions/mean_terminated_length": 583.193359375, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.418177747929259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018341209530430575, + "kl": 0.00695037841796875, + "learning_rate": 5.543463661312847e-07, + "loss": 0.0001, + "num_tokens": 278019500.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1841.0, + "completions/max_terminated_length": 1841.0, + "completions/mean_length": 643.275390625, + "completions/mean_terminated_length": 643.275390625, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.41907320349227667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018558159963784368, + "kl": 0.0070343017578125, + "learning_rate": 5.512085379471808e-07, + "loss": 0.0001, + "num_tokens": 278693369.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1303.0, + "completions/max_terminated_length": 1303.0, + "completions/mean_length": 561.912109375, + "completions/mean_terminated_length": 561.912109375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.4199686590552944, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018229106338354208, + "kl": 0.00714111328125, + "learning_rate": 5.481629666490903e-07, + "loss": 0.0001, + "num_tokens": 279291708.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1657.0, + "completions/max_terminated_length": 1657.0, + "completions/mean_length": 587.775390625, + "completions/mean_terminated_length": 587.775390625, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.4208641146183121, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0019173223019730975, + "kl": 0.0071563720703125, + "learning_rate": 5.452097800230853e-07, + "loss": 0.0001, + "num_tokens": 279906937.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1276.0, + "completions/max_terminated_length": 1276.0, + "completions/mean_length": 604.33203125, + "completions/mean_terminated_length": 604.33203125, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.42175957018132976, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017790077243040982, + "kl": 0.0069580078125, + "learning_rate": 5.423491019789623e-07, + "loss": 0.0001, + "num_tokens": 280532259.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1233.0, + "completions/max_terminated_length": 1233.0, + "completions/mean_length": 578.6875, + "completions/mean_terminated_length": 578.6875, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.42265502574434743, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017562351994829582, + "kl": 0.0067901611328125, + "learning_rate": 5.395810525450425e-07, + "loss": 0.0001, + "num_tokens": 281134947.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1296.0, + "completions/max_terminated_length": 1296.0, + "completions/mean_length": 585.494140625, + "completions/mean_terminated_length": 585.494140625, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.4235504813073651, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00176205230185391, + "kl": 0.0071563720703125, + "learning_rate": 5.369057478631359e-07, + "loss": 0.0001, + "num_tokens": 281772608.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1653.0, + "completions/max_terminated_length": 1653.0, + "completions/mean_length": 574.505859375, + "completions/mean_terminated_length": 572.3745727539062, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.4244459368703828, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0708422514046771, + "kl": 0.010009765625, + "learning_rate": 5.343233001836694e-07, + "loss": 0.0059, + "num_tokens": 282387587.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.953125, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1454.0, + "completions/mean_length": 607.232421875, + "completions/mean_terminated_length": 601.7583618164062, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.42534139243340047, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.047813434399653715, + "kl": 0.007781982421875, + "learning_rate": 5.318338178609754e-07, + "loss": 0.0118, + "num_tokens": 283014490.0, + "reward": 0.09941406548023224, + "reward_std": 0.0012597277527675033, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.994140625, + "rewards/format_reward/std": 0.07639661431312561, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1361.0, + "completions/max_terminated_length": 1361.0, + "completions/mean_length": 557.375, + "completions/mean_terminated_length": 557.375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.4262368479964182, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016574825912189395, + "kl": 0.00710296630859375, + "learning_rate": 5.294374053487459e-07, + "loss": 0.0001, + "num_tokens": 283582986.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1126.0, + "completions/max_terminated_length": 1126.0, + "completions/mean_length": 533.109375, + "completions/mean_terminated_length": 532.0665283203125, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.4271323035594359, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046049687083547626, + "kl": 0.0193328857421875, + "learning_rate": 5.271341631956511e-07, + "loss": 0.0002, + "num_tokens": 284136722.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1507.0, + "completions/max_terminated_length": 1507.0, + "completions/mean_length": 567.68359375, + "completions/mean_terminated_length": 567.68359375, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.42802775912245355, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018915923111198715, + "kl": 0.007598876953125, + "learning_rate": 5.249241880411181e-07, + "loss": 0.0001, + "num_tokens": 284745520.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1766.0, + "completions/max_terminated_length": 1766.0, + "completions/mean_length": 613.716796875, + "completions/mean_terminated_length": 613.716796875, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.42892321468547123, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017940656131543965, + "kl": 0.0068359375, + "learning_rate": 5.228075726112785e-07, + "loss": 0.0001, + "num_tokens": 285394559.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1412.0, + "completions/max_terminated_length": 1412.0, + "completions/mean_length": 594.916015625, + "completions/mean_terminated_length": 594.916015625, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.4298186702484889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018361691126511986, + "kl": 0.0076141357421875, + "learning_rate": 5.207844057150768e-07, + "loss": 0.0001, + "num_tokens": 286027476.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1486.0, + "completions/max_terminated_length": 1486.0, + "completions/mean_length": 570.41796875, + "completions/mean_terminated_length": 570.41796875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.4307141258115066, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018222271820144726, + "kl": 0.007171630859375, + "learning_rate": 5.188547722405437e-07, + "loss": 0.0001, + "num_tokens": 286647402.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 1866.0, + "completions/max_terminated_length": 1866.0, + "completions/mean_length": 606.814453125, + "completions/mean_terminated_length": 605.6647338867188, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.43160958137452426, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058165050748120006, + "kl": 0.022003173828125, + "learning_rate": 5.170187531512351e-07, + "loss": 0.0002, + "num_tokens": 287290171.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1386.0, + "completions/max_terminated_length": 1386.0, + "completions/mean_length": 569.34765625, + "completions/mean_terminated_length": 569.34765625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.432505036937542, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016401191771389131, + "kl": 0.00669097900390625, + "learning_rate": 5.152764254828348e-07, + "loss": 0.0001, + "num_tokens": 287903117.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1577.0, + "completions/max_terminated_length": 1577.0, + "completions/mean_length": 581.685546875, + "completions/mean_terminated_length": 581.685546875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.4334004925005597, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002024954882989428, + "kl": 0.00728607177734375, + "learning_rate": 5.136278623399225e-07, + "loss": 0.0001, + "num_tokens": 288528300.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1311.0, + "completions/max_terminated_length": 1311.0, + "completions/mean_length": 561.482421875, + "completions/mean_terminated_length": 561.482421875, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.43429594806357735, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001827238886228116, + "kl": 0.00690460205078125, + "learning_rate": 5.120731328929058e-07, + "loss": 0.0001, + "num_tokens": 289124643.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1280.0, + "completions/max_terminated_length": 1280.0, + "completions/mean_length": 589.880859375, + "completions/mean_terminated_length": 589.880859375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.43519140362659503, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016826621612342748, + "kl": 0.00684356689453125, + "learning_rate": 5.106123023751187e-07, + "loss": 0.0001, + "num_tokens": 289724726.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1495.0, + "completions/max_terminated_length": 1495.0, + "completions/mean_length": 618.705078125, + "completions/mean_terminated_length": 618.705078125, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.4360868591896127, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.054507559806292745, + "kl": 0.00732421875, + "learning_rate": 5.092454320800833e-07, + "loss": -0.001, + "num_tokens": 290383343.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 1853.0, + "completions/max_terminated_length": 1853.0, + "completions/mean_length": 574.888671875, + "completions/mean_terminated_length": 573.9080200195312, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.4369823147526304, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007263700317682045, + "kl": 0.0087127685546875, + "learning_rate": 5.079725793589405e-07, + "loss": 0.0001, + "num_tokens": 290988102.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1360.0, + "completions/max_terminated_length": 1360.0, + "completions/mean_length": 585.91796875, + "completions/mean_terminated_length": 585.91796875, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.43787777031564806, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016686153159851426, + "kl": 0.00637054443359375, + "learning_rate": 5.067937976180407e-07, + "loss": 0.0001, + "num_tokens": 291592652.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1547.0, + "completions/max_terminated_length": 1547.0, + "completions/mean_length": 578.55078125, + "completions/mean_terminated_length": 578.55078125, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.4387732258786658, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.05563500405432078, + "kl": 0.0074005126953125, + "learning_rate": 5.057091363167046e-07, + "loss": 0.0005, + "num_tokens": 292205798.0, + "reward": 0.09980468451976776, + "reward_std": 0.0007812500116415322, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 0.998046875, + "rewards/format_reward/std": 0.04419417306780815, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1225.0, + "completions/max_terminated_length": 1225.0, + "completions/mean_length": 555.208984375, + "completions/mean_terminated_length": 555.208984375, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.4396686814416835, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018705801444356403, + "kl": 0.007080078125, + "learning_rate": 5.047186409651489e-07, + "loss": 0.0001, + "num_tokens": 292786673.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1638.0, + "completions/max_terminated_length": 1638.0, + "completions/mean_length": 612.251953125, + "completions/mean_terminated_length": 612.251953125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.44056413700470115, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.001757747553131733, + "kl": 0.00656890869140625, + "learning_rate": 5.038223531225742e-07, + "loss": 0.0001, + "num_tokens": 293394562.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1297.0, + "completions/max_terminated_length": 1297.0, + "completions/mean_length": 558.96484375, + "completions/mean_terminated_length": 558.96484375, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.44145959256771883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009906982476004701, + "kl": 0.00750732421875, + "learning_rate": 5.030203103954232e-07, + "loss": 0.0001, + "num_tokens": 293980288.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1399.0, + "completions/max_terminated_length": 1399.0, + "completions/mean_length": 574.599609375, + "completions/mean_terminated_length": 574.599609375, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.4423550481307365, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0020012748176014904, + "kl": 0.00701904296875, + "learning_rate": 5.023125464358026e-07, + "loss": 0.0001, + "num_tokens": 294557651.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1582.0, + "completions/max_terminated_length": 1582.0, + "completions/mean_length": 561.36328125, + "completions/mean_terminated_length": 561.36328125, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.4432505036937542, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002266287153034044, + "kl": 0.00643157958984375, + "learning_rate": 5.016990909400709e-07, + "loss": 0.0001, + "num_tokens": 295120541.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -6.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1673.0, + "completions/mean_length": 622.67578125, + "completions/mean_terminated_length": 619.886474609375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.44414595925677186, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0018880986832274613, + "kl": 0.0064239501953125, + "learning_rate": 5.011799696475915e-07, + "loss": 0.0001, + "num_tokens": 295733943.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1410.0, + "completions/max_terminated_length": 1410.0, + "completions/mean_length": 550.1484375, + "completions/mean_terminated_length": 550.1484375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.4450414148197896, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00175187557341818, + "kl": 0.00661468505859375, + "learning_rate": 5.007552043396547e-07, + "loss": 0.0001, + "num_tokens": 296317315.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1519.0, + "completions/max_terminated_length": 1519.0, + "completions/mean_length": 561.388671875, + "completions/mean_terminated_length": 561.388671875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.44593687038280727, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0029581018858531676, + "kl": 0.00745391845703125, + "learning_rate": 5.004248128385618e-07, + "loss": 0.0001, + "num_tokens": 296929690.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1419.0, + "completions/max_terminated_length": 1419.0, + "completions/mean_length": 571.595703125, + "completions/mean_terminated_length": 571.595703125, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.44683232594582495, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002798351333064508, + "kl": 0.00785064697265625, + "learning_rate": 5.001888090068784e-07, + "loss": 0.0001, + "num_tokens": 297506603.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 1407.0, + "completions/max_terminated_length": 1407.0, + "completions/mean_length": 583.40234375, + "completions/mean_terminated_length": 583.40234375, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.4477277815088426, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0016961154289319366, + "kl": 0.00682830810546875, + "learning_rate": 5.000472027468528e-07, + "loss": 0.0001, + "num_tokens": 298150537.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/code_reward/mean": 0.0, + "rewards/code_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 500 + }, + { + "epoch": 0.4477277815088426, + "step": 500, + "total_flos": 0.0, + "train_loss": 0.000565722431474569, + "train_runtime": 12129.6023, + "train_samples_per_second": 21.105, + "train_steps_per_second": 0.041 + } + ], + "logging_steps": 1, + "max_steps": 500, + "num_input_tokens_seen": 298150537, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}