{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5372011818426, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7188.0, "completions/max_terminated_length": 7188.0, "completions/mean_length": 731.5078125, "completions/mean_terminated_length": 731.5078125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0010744023636852001, "grad_norm": 0.422673509158199, "kl": 4.0784478187561035e-05, "learning_rate": 0.0, "loss": 0.0189, "num_tokens": 654724.0, "reward": 0.3906328082084656, "reward_std": 0.17963062226772308, "rewards/correct_answer_reward_func": 0.1796875, "rewards/format_reward_func": 0.9592187404632568, "rewards/python_attempt_reward_func": 0.2265625, "rewards/python_count_reward_func": 0.10546875, "rewards/python_reward_func": 0.09648437798023224, "rewards/tool_execution_reward_func": 0.09550781548023224, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0021488047273704003, "grad_norm": 0.42278123238986814, "kl": 4.0784478187561035e-05, "learning_rate": 1e-07, "loss": 0.0189, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7311.0, "completions/max_terminated_length": 7311.0, "completions/mean_length": 766.943359375, "completions/mean_terminated_length": 766.943359375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0032232070910556, "grad_norm": 0.5282676312129658, "kl": 4.011392593383789e-05, "learning_rate": 2e-07, "loss": 0.0457, "num_tokens": 1327591.0, "reward": 0.40129464864730835, "reward_std": 0.17162421345710754, "rewards/correct_answer_reward_func": 0.19140625, "rewards/format_reward_func": 0.9559012651443481, "rewards/python_attempt_reward_func": 0.283203125, "rewards/python_count_reward_func": 0.107421875, "rewards/python_reward_func": 0.09458240866661072, "rewards/tool_execution_reward_func": 0.0935407355427742, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0042976094547408005, "grad_norm": 0.5280240028511717, "kl": 3.851205110549927e-05, "learning_rate": 3e-07, "loss": 0.0457, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7064.0, "completions/max_terminated_length": 7064.0, "completions/mean_length": 569.625, "completions/mean_terminated_length": 569.625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.005372011818426001, "grad_norm": 0.3266312438848178, "kl": 3.5002827644348145e-05, "learning_rate": 4e-07, "loss": 0.0647, "num_tokens": 1902567.0, "reward": 0.6631051898002625, "reward_std": 0.2504645884037018, "rewards/correct_answer_reward_func": 0.447265625, "rewards/format_reward_func": 0.9777979850769043, "rewards/python_attempt_reward_func": 0.255859375, "rewards/python_count_reward_func": 0.119140625, "rewards/python_reward_func": 0.1023763045668602, "rewards/tool_execution_reward_func": 0.1013997420668602, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0064464141821112, "grad_norm": 0.32968026279399176, "kl": 4.0471553802490234e-05, "learning_rate": 5e-07, "loss": 0.0647, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2821.0, "completions/max_terminated_length": 2821.0, "completions/mean_length": 466.11328125, "completions/mean_terminated_length": 466.11328125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.007520816545796401, "grad_norm": 0.24914674295210712, "kl": 3.7904828786849976e-05, "learning_rate": 6e-07, "loss": -0.0182, "num_tokens": 2401569.0, "reward": 0.8384803533554077, "reward_std": 0.2415131777524948, "rewards/correct_answer_reward_func": 0.634765625, "rewards/format_reward_func": 0.9831380248069763, "rewards/python_attempt_reward_func": 0.09765625, "rewards/python_count_reward_func": 0.041015625, "rewards/python_reward_func": 0.0369001105427742, "rewards/tool_execution_reward_func": 0.0354352667927742, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.008595218909481601, "grad_norm": 0.25185293811376974, "kl": 4.707649350166321e-05, "learning_rate": 7e-07, "loss": -0.0182, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7409.0, "completions/max_terminated_length": 7409.0, "completions/mean_length": 665.333984375, "completions/mean_terminated_length": 665.333984375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.009669621273166801, "grad_norm": 0.8064104120591891, "kl": 0.00012042373418807983, "learning_rate": 8e-07, "loss": 0.0292, "num_tokens": 3011564.0, "reward": 0.7223383188247681, "reward_std": 0.15446436405181885, "rewards/correct_answer_reward_func": 0.517578125, "rewards/format_reward_func": 0.9495822787284851, "rewards/python_attempt_reward_func": 0.17578125, "rewards/python_count_reward_func": 0.078125, "rewards/python_reward_func": 0.07421875, "rewards/tool_execution_reward_func": 0.07421875, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.010744023636852002, "grad_norm": 0.7980908062797426, "kl": 0.0002219676971435547, "learning_rate": 9e-07, "loss": 0.0292, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6872.0, "completions/max_terminated_length": 6872.0, "completions/mean_length": 632.28515625, "completions/mean_terminated_length": 632.28515625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0118184260005372, "grad_norm": 0.5978692820884397, "kl": 0.00034165382385253906, "learning_rate": 1e-06, "loss": 0.0545, "num_tokens": 3629214.0, "reward": 0.5684857368469238, "reward_std": 0.19066008925437927, "rewards/correct_answer_reward_func": 0.3671875, "rewards/format_reward_func": 0.9697070121765137, "rewards/python_attempt_reward_func": 0.107421875, "rewards/python_count_reward_func": 0.0390625, "rewards/python_reward_func": 0.0377604179084301, "rewards/tool_execution_reward_func": 0.0367838554084301, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0128928283642224, "grad_norm": 0.5706063305055135, "kl": 0.0008192509412765503, "learning_rate": 1e-06, "loss": 0.0545, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5228.0, "completions/max_terminated_length": 5228.0, "completions/mean_length": 470.583984375, "completions/mean_terminated_length": 470.583984375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.013967230727907601, "grad_norm": 0.23512306518577303, "kl": 0.000746503472328186, "learning_rate": 1e-06, "loss": 0.0175, "num_tokens": 4136201.0, "reward": 0.8459792137145996, "reward_std": 0.23187264800071716, "rewards/correct_answer_reward_func": 0.642578125, "rewards/format_reward_func": 0.9873828291893005, "rewards/python_attempt_reward_func": 0.146484375, "rewards/python_count_reward_func": 0.033203125, "rewards/python_reward_func": 0.02962239645421505, "rewards/tool_execution_reward_func": 0.02962239645421505, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.015041633091592801, "grad_norm": 0.2330094726805987, "kl": 0.0012967437505722046, "learning_rate": 1e-06, "loss": 0.0175, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7250.0, "completions/max_terminated_length": 7250.0, "completions/mean_length": 539.611328125, "completions/mean_terminated_length": 539.611328125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.016116035455278, "grad_norm": 0.5473411088938623, "kl": 0.0058386921882629395, "learning_rate": 1e-06, "loss": 0.0237, "num_tokens": 4705538.0, "reward": 0.6663488149642944, "reward_std": 0.15986457467079163, "rewards/correct_answer_reward_func": 0.453125, "rewards/format_reward_func": 0.9700905084609985, "rewards/python_attempt_reward_func": 0.193359375, "rewards/python_count_reward_func": 0.107421875, "rewards/python_reward_func": 0.0970052108168602, "rewards/tool_execution_reward_func": 0.0960286483168602, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.017190437818963202, "grad_norm": 0.5141753847666146, "kl": 0.014339849352836609, "learning_rate": 1e-06, "loss": 0.0237, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7496.0, "completions/max_terminated_length": 7496.0, "completions/mean_length": 698.7109375, "completions/mean_terminated_length": 698.7109375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0182648401826484, "grad_norm": 0.43671800838998254, "kl": 0.06924819946289062, "learning_rate": 1e-06, "loss": 0.0496, "num_tokens": 5355598.0, "reward": 0.4543941020965576, "reward_std": 0.17994269728660583, "rewards/correct_answer_reward_func": 0.25390625, "rewards/format_reward_func": 0.9471006393432617, "rewards/python_attempt_reward_func": 0.169921875, "rewards/python_count_reward_func": 0.064453125, "rewards/python_reward_func": 0.0553385429084301, "rewards/tool_execution_reward_func": 0.0553385429084301, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.019339242546333603, "grad_norm": 0.3895310493998681, "kl": 0.16898536682128906, "learning_rate": 1e-06, "loss": 0.0497, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5285.0, "completions/max_terminated_length": 5285.0, "completions/mean_length": 570.900390625, "completions/mean_terminated_length": 570.900390625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0204136449100188, "grad_norm": 0.23163938814817034, "kl": 0.061587199568748474, "learning_rate": 1e-06, "loss": 0.0294, "num_tokens": 5938075.0, "reward": 0.6298880577087402, "reward_std": 0.1261117160320282, "rewards/correct_answer_reward_func": 0.423828125, "rewards/format_reward_func": 0.9840755462646484, "rewards/python_attempt_reward_func": 0.130859375, "rewards/python_count_reward_func": 0.052734375, "rewards/python_reward_func": 0.0462239608168602, "rewards/tool_execution_reward_func": 0.0462239608168602, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.021488047273704004, "grad_norm": 0.23789839270174667, "kl": 0.1234617829322815, "learning_rate": 1e-06, "loss": 0.0295, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5912.0, "completions/max_terminated_length": 5912.0, "completions/mean_length": 516.22265625, "completions/mean_terminated_length": 516.22265625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.022562449637389202, "grad_norm": 0.25779946883759136, "kl": 0.25330890715122223, "learning_rate": 1e-06, "loss": 0.0383, "num_tokens": 6472205.0, "reward": 0.6552734375, "reward_std": 0.214857816696167, "rewards/correct_answer_reward_func": 0.443359375, "rewards/format_reward_func": 0.9814453125, "rewards/python_attempt_reward_func": 0.162109375, "rewards/python_count_reward_func": 0.08203125, "rewards/python_reward_func": 0.078125, "rewards/tool_execution_reward_func": 0.078125, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0236368520010744, "grad_norm": 0.2755964862243062, "kl": 0.43601472675800323, "learning_rate": 1e-06, "loss": 0.0385, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6814.0, "completions/max_terminated_length": 6814.0, "completions/mean_length": 638.552734375, "completions/mean_terminated_length": 638.552734375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.024711254364759603, "grad_norm": 0.3529252178378096, "kl": 0.48722201585769653, "learning_rate": 1e-06, "loss": -0.0659, "num_tokens": 7077320.0, "reward": 0.46296095848083496, "reward_std": 0.11096877604722977, "rewards/correct_answer_reward_func": 0.248046875, "rewards/format_reward_func": 0.9596614837646484, "rewards/python_attempt_reward_func": 0.23046875, "rewards/python_count_reward_func": 0.138671875, "rewards/python_reward_func": 0.1149088516831398, "rewards/tool_execution_reward_func": 0.1149088516831398, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0257856567284448, "grad_norm": 0.716470568134732, "kl": 1.0456394255161285, "learning_rate": 1e-06, "loss": -0.0653, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7375.0, "completions/max_terminated_length": 7375.0, "completions/mean_length": 598.146484375, "completions/mean_terminated_length": 598.146484375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.026860059092130004, "grad_norm": 3.2908768316355403, "kl": 4.9970316886901855, "learning_rate": 1e-06, "loss": 0.033, "num_tokens": 7672115.0, "reward": 0.4335381090641022, "reward_std": 0.14894729852676392, "rewards/correct_answer_reward_func": 0.2265625, "rewards/format_reward_func": 0.9772608280181885, "rewards/python_attempt_reward_func": 0.12890625, "rewards/python_count_reward_func": 0.064453125, "rewards/python_reward_func": 0.0576171875, "rewards/tool_execution_reward_func": 0.0576171875, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.027934461455815202, "grad_norm": 2.6758823398024183, "kl": 4.681436479091644, "learning_rate": 1e-06, "loss": 0.0327, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5833.0, "completions/max_terminated_length": 5833.0, "completions/mean_length": 438.447265625, "completions/mean_terminated_length": 438.447265625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.029008863819500404, "grad_norm": 0.7851619051504651, "kl": 1.6955763697624207, "learning_rate": 1e-06, "loss": -0.027, "num_tokens": 8165592.0, "reward": 0.5963294506072998, "reward_std": 0.1357751488685608, "rewards/correct_answer_reward_func": 0.37890625, "rewards/format_reward_func": 0.9873437881469727, "rewards/python_attempt_reward_func": 0.185546875, "rewards/python_count_reward_func": 0.125, "rewards/python_reward_func": 0.0997721403837204, "rewards/tool_execution_reward_func": 0.0997721403837204, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.030083266183185603, "grad_norm": 0.5787260777848525, "kl": 1.3097735047340393, "learning_rate": 1e-06, "loss": -0.0274, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7033.0, "completions/max_terminated_length": 7033.0, "completions/mean_length": 646.125, "completions/mean_terminated_length": 646.125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0311576685468708, "grad_norm": 0.3881463945943215, "kl": 0.8679580688476562, "learning_rate": 1e-06, "loss": -0.0366, "num_tokens": 8768984.0, "reward": 0.5729016065597534, "reward_std": 0.23287412524223328, "rewards/correct_answer_reward_func": 0.3671875, "rewards/format_reward_func": 0.9629129767417908, "rewards/python_attempt_reward_func": 0.24609375, "rewards/python_count_reward_func": 0.095703125, "rewards/python_reward_func": 0.06647135317325592, "rewards/tool_execution_reward_func": 0.06565755605697632, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.032232070910556, "grad_norm": 0.32867756236420853, "kl": 0.7668304443359375, "learning_rate": 1e-06, "loss": -0.0367, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6393.0, "completions/max_terminated_length": 6393.0, "completions/mean_length": 539.203125, "completions/mean_terminated_length": 539.203125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0333064732742412, "grad_norm": 0.42128065772800394, "kl": 1.128803789615631, "learning_rate": 1e-06, "loss": 0.0011, "num_tokens": 9326496.0, "reward": 0.4357031285762787, "reward_std": 0.17815448343753815, "rewards/correct_answer_reward_func": 0.21875, "rewards/format_reward_func": 0.9826497435569763, "rewards/python_attempt_reward_func": 0.255859375, "rewards/python_count_reward_func": 0.12109375, "rewards/python_reward_func": 0.10276693105697632, "rewards/tool_execution_reward_func": 0.10211588442325592, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.034380875637926404, "grad_norm": 0.3751953243245928, "kl": 1.034396767616272, "learning_rate": 1e-06, "loss": 0.001, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6776.0, "completions/max_terminated_length": 6776.0, "completions/mean_length": 577.392578125, "completions/mean_terminated_length": 577.392578125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.035455278001611606, "grad_norm": 0.2699487521746226, "kl": 0.7010124921798706, "learning_rate": 1e-06, "loss": -0.0099, "num_tokens": 9909225.0, "reward": 0.6010091304779053, "reward_std": 0.205041766166687, "rewards/correct_answer_reward_func": 0.392578125, "rewards/format_reward_func": 0.9830728769302368, "rewards/python_attempt_reward_func": 0.126953125, "rewards/python_count_reward_func": 0.064453125, "rewards/python_reward_func": 0.05908203125, "rewards/tool_execution_reward_func": 0.05908203125, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0365296803652968, "grad_norm": 0.2480083205311938, "kl": 0.6783121228218079, "learning_rate": 1e-06, "loss": -0.0099, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2768.0, "completions/max_terminated_length": 2768.0, "completions/mean_length": 457.94921875, "completions/mean_terminated_length": 457.94921875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.037604082728982004, "grad_norm": 0.35670190890859205, "kl": 0.645153820514679, "learning_rate": 1e-06, "loss": -0.0931, "num_tokens": 10407151.0, "reward": 0.5918294191360474, "reward_std": 0.2737268805503845, "rewards/correct_answer_reward_func": 0.36328125, "rewards/format_reward_func": 0.9723306894302368, "rewards/python_attempt_reward_func": 0.423828125, "rewards/python_count_reward_func": 0.212890625, "rewards/python_reward_func": 0.1733398586511612, "rewards/tool_execution_reward_func": 0.17041015625, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.038678485092667206, "grad_norm": 0.33662197326841614, "kl": 0.6320639848709106, "learning_rate": 1e-06, "loss": -0.0931, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5871.0, "completions/max_terminated_length": 5871.0, "completions/mean_length": 560.802734375, "completions/mean_terminated_length": 560.802734375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0397528874563524, "grad_norm": 0.2187664202839913, "kl": 0.6941529512405396, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 10962474.0, "reward": 0.5715937614440918, "reward_std": 0.1944807469844818, "rewards/correct_answer_reward_func": 0.3671875, "rewards/format_reward_func": 0.9738541841506958, "rewards/python_attempt_reward_func": 0.099609375, "rewards/python_count_reward_func": 0.052734375, "rewards/python_reward_func": 0.0504557304084301, "rewards/tool_execution_reward_func": 0.0481770858168602, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0408272898200376, "grad_norm": 0.21733124934969478, "kl": 0.6671816110610962, "learning_rate": 1e-06, "loss": -0.0007, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7489.0, "completions/max_terminated_length": 7489.0, "completions/mean_length": 522.65625, "completions/mean_terminated_length": 522.65625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.041901692183722805, "grad_norm": 0.2825324888424224, "kl": 0.9119611978530884, "learning_rate": 1e-06, "loss": 0.0051, "num_tokens": 11499994.0, "reward": 0.754798173904419, "reward_std": 0.2613673806190491, "rewards/correct_answer_reward_func": 0.533203125, "rewards/format_reward_func": 0.9779296517372131, "rewards/python_attempt_reward_func": 0.236328125, "rewards/python_count_reward_func": 0.142578125, "rewards/python_reward_func": 0.13134765625, "rewards/tool_execution_reward_func": 0.1300455778837204, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04297609454740801, "grad_norm": 0.25965545716531724, "kl": 0.8216338753700256, "learning_rate": 1e-06, "loss": 0.005, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7511.0, "completions/max_terminated_length": 7511.0, "completions/mean_length": 670.619140625, "completions/mean_terminated_length": 670.619140625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0440504969110932, "grad_norm": 0.23378144149354474, "kl": 0.8895367383956909, "learning_rate": 1e-06, "loss": 0.0146, "num_tokens": 12115095.0, "reward": 0.4506792426109314, "reward_std": 0.2128063291311264, "rewards/correct_answer_reward_func": 0.2421875, "rewards/format_reward_func": 0.9697048664093018, "rewards/python_attempt_reward_func": 0.17578125, "rewards/python_count_reward_func": 0.080078125, "rewards/python_reward_func": 0.0730794295668602, "rewards/tool_execution_reward_func": 0.07275390625, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.045124899274778404, "grad_norm": 0.2378948919825617, "kl": 0.889232337474823, "learning_rate": 1e-06, "loss": 0.0146, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7672.0, "completions/max_terminated_length": 7672.0, "completions/mean_length": 628.029296875, "completions/mean_terminated_length": 628.029296875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.04619930163846361, "grad_norm": 0.27290066482713904, "kl": 0.7163619995117188, "learning_rate": 1e-06, "loss": -0.0232, "num_tokens": 12732998.0, "reward": 0.4593140482902527, "reward_std": 0.14964768290519714, "rewards/correct_answer_reward_func": 0.255859375, "rewards/format_reward_func": 0.9633181095123291, "rewards/python_attempt_reward_func": 0.162109375, "rewards/python_count_reward_func": 0.06640625, "rewards/python_reward_func": 0.0539550818502903, "rewards/tool_execution_reward_func": 0.0539550818502903, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0472737040021488, "grad_norm": 0.27614568405191176, "kl": 0.7776848673820496, "learning_rate": 1e-06, "loss": -0.0231, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6607.0, "completions/max_terminated_length": 6607.0, "completions/mean_length": 664.6875, "completions/mean_terminated_length": 664.6875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.048348106365834004, "grad_norm": 0.24392502410220013, "kl": 0.7164177894592285, "learning_rate": 1e-06, "loss": 0.0369, "num_tokens": 13349030.0, "reward": 0.5969592332839966, "reward_std": 0.15018638968467712, "rewards/correct_answer_reward_func": 0.380859375, "rewards/format_reward_func": 0.9816384315490723, "rewards/python_attempt_reward_func": 0.240234375, "rewards/python_count_reward_func": 0.12109375, "rewards/python_reward_func": 0.09902343899011612, "rewards/tool_execution_reward_func": 0.09886067360639572, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.049422508729519206, "grad_norm": 0.23116351403447843, "kl": 0.760589599609375, "learning_rate": 1e-06, "loss": 0.0369, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7246.0, "completions/max_terminated_length": 7246.0, "completions/mean_length": 593.9296875, "completions/mean_terminated_length": 593.9296875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.05049691109320441, "grad_norm": 0.22851276686194322, "kl": 0.6500476598739624, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 13931938.0, "reward": 0.610074520111084, "reward_std": 0.1966492235660553, "rewards/correct_answer_reward_func": 0.376953125, "rewards/format_reward_func": 0.9895814657211304, "rewards/python_attempt_reward_func": 0.322265625, "rewards/python_count_reward_func": 0.205078125, "rewards/python_reward_func": 0.17706707119941711, "rewards/tool_execution_reward_func": 0.176025390625, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0515713134568896, "grad_norm": 0.24014843110296397, "kl": 0.7743828296661377, "learning_rate": 1e-06, "loss": 0.0065, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6844.0, "completions/max_terminated_length": 6844.0, "completions/mean_length": 554.81640625, "completions/mean_terminated_length": 554.81640625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.052645715820574805, "grad_norm": 0.32765926881655766, "kl": 1.2967171669006348, "learning_rate": 1e-06, "loss": 0.0068, "num_tokens": 14488804.0, "reward": 0.7126038074493408, "reward_std": 0.25725820660591125, "rewards/correct_answer_reward_func": 0.5078125, "rewards/format_reward_func": 0.9777321815490723, "rewards/python_attempt_reward_func": 0.130859375, "rewards/python_count_reward_func": 0.052734375, "rewards/python_reward_func": 0.0462239570915699, "rewards/tool_execution_reward_func": 0.0462239570915699, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.05372011818426001, "grad_norm": 0.3569800315858644, "kl": 1.454946517944336, "learning_rate": 1e-06, "loss": 0.007, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3377.0, "completions/max_terminated_length": 3377.0, "completions/mean_length": 508.076171875, "completions/mean_terminated_length": 508.076171875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0547945205479452, "grad_norm": 0.32952411464631703, "kl": 1.659942626953125, "learning_rate": 1e-06, "loss": 0.0279, "num_tokens": 15016267.0, "reward": 0.7970638275146484, "reward_std": 0.18395200371742249, "rewards/correct_answer_reward_func": 0.578125, "rewards/format_reward_func": 0.9822266101837158, "rewards/python_attempt_reward_func": 0.1796875, "rewards/python_count_reward_func": 0.119140625, "rewards/python_reward_func": 0.1124674528837204, "rewards/tool_execution_reward_func": 0.1124674528837204, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.055868922911630405, "grad_norm": 0.2752093984561332, "kl": 1.5597953796386719, "learning_rate": 1e-06, "loss": 0.0278, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3243.0, "completions/max_terminated_length": 3243.0, "completions/mean_length": 452.712890625, "completions/mean_terminated_length": 452.712890625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.05694332527531561, "grad_norm": 0.25177567448007604, "kl": 0.8945102691650391, "learning_rate": 1e-06, "loss": -0.0257, "num_tokens": 15511384.0, "reward": 0.5802316665649414, "reward_std": 0.21151582896709442, "rewards/correct_answer_reward_func": 0.37109375, "rewards/format_reward_func": 0.98779296875, "rewards/python_attempt_reward_func": 0.169921875, "rewards/python_count_reward_func": 0.080078125, "rewards/python_reward_func": 0.0598958358168602, "rewards/tool_execution_reward_func": 0.0578962080180645, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.05801772763900081, "grad_norm": 0.24772791189596075, "kl": 0.8605780601501465, "learning_rate": 1e-06, "loss": -0.0258, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7541.0, "completions/max_terminated_length": 7541.0, "completions/mean_length": 645.138671875, "completions/mean_terminated_length": 645.138671875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.059092130002686004, "grad_norm": 0.22220990645205937, "kl": 0.9538493156433105, "learning_rate": 1e-06, "loss": -0.026, "num_tokens": 16122207.0, "reward": 0.4852113723754883, "reward_std": 0.10728442668914795, "rewards/correct_answer_reward_func": 0.271484375, "rewards/format_reward_func": 0.9824044704437256, "rewards/python_attempt_reward_func": 0.26953125, "rewards/python_count_reward_func": 0.11328125, "rewards/python_reward_func": 0.08720703423023224, "rewards/tool_execution_reward_func": 0.08623047173023224, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.060166532366371206, "grad_norm": 0.20618097509581662, "kl": 0.9465702772140503, "learning_rate": 1e-06, "loss": -0.026, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7514.0, "completions/max_terminated_length": 7514.0, "completions/mean_length": 556.33984375, "completions/mean_terminated_length": 556.33984375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.06124093473005641, "grad_norm": 0.24631821866870257, "kl": 1.1780188083648682, "learning_rate": 1e-06, "loss": -0.0467, "num_tokens": 16683757.0, "reward": 0.6617369651794434, "reward_std": 0.20540699362754822, "rewards/correct_answer_reward_func": 0.443359375, "rewards/format_reward_func": 0.9756119847297668, "rewards/python_attempt_reward_func": 0.248046875, "rewards/python_count_reward_func": 0.12890625, "rewards/python_reward_func": 0.11627604812383652, "rewards/tool_execution_reward_func": 0.11627604812383652, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0623153370937416, "grad_norm": 0.22642239573056164, "kl": 1.1152443885803223, "learning_rate": 1e-06, "loss": -0.0467, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6773.0, "completions/max_terminated_length": 6773.0, "completions/mean_length": 551.880859375, "completions/mean_terminated_length": 551.880859375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0633897394574268, "grad_norm": 0.18350884744448637, "kl": 0.3809540271759033, "learning_rate": 1e-06, "loss": 0.0152, "num_tokens": 17242192.0, "reward": 0.7287598848342896, "reward_std": 0.28533095121383667, "rewards/correct_answer_reward_func": 0.509765625, "rewards/format_reward_func": 0.989665150642395, "rewards/python_attempt_reward_func": 0.244140625, "rewards/python_count_reward_func": 0.126953125, "rewards/python_reward_func": 0.1053059920668602, "rewards/tool_execution_reward_func": 0.1053059920668602, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.064464141821112, "grad_norm": 0.18119227939244234, "kl": 0.35698628425598145, "learning_rate": 1e-06, "loss": 0.0152, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7013.0, "completions/max_terminated_length": 7013.0, "completions/mean_length": 606.3515625, "completions/mean_terminated_length": 606.3515625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.06553854418479721, "grad_norm": 0.1854057254430237, "kl": 0.23478460311889648, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 17836228.0, "reward": 0.5818214416503906, "reward_std": 0.21726202964782715, "rewards/correct_answer_reward_func": 0.357421875, "rewards/format_reward_func": 0.990405797958374, "rewards/python_attempt_reward_func": 0.3125, "rewards/python_count_reward_func": 0.150390625, "rewards/python_reward_func": 0.1328125, "rewards/tool_execution_reward_func": 0.131591796875, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0666129465484824, "grad_norm": 0.18548619762394203, "kl": 0.2370905876159668, "learning_rate": 1e-06, "loss": -0.0009, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7554.0, "completions/max_terminated_length": 7554.0, "completions/mean_length": 791.04296875, "completions/mean_terminated_length": 791.04296875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.06768734891216761, "grad_norm": 0.19332874133648173, "kl": 0.6362118721008301, "learning_rate": 1e-06, "loss": -0.0257, "num_tokens": 18533210.0, "reward": 0.43970760703086853, "reward_std": 0.16351783275604248, "rewards/correct_answer_reward_func": 0.22265625, "rewards/format_reward_func": 0.9801785349845886, "rewards/python_attempt_reward_func": 0.390625, "rewards/python_count_reward_func": 0.134765625, "rewards/python_reward_func": 0.10559895634651184, "rewards/tool_execution_reward_func": 0.10507813096046448, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.06876175127585281, "grad_norm": 0.20060722244411272, "kl": 0.6624758243560791, "learning_rate": 1e-06, "loss": -0.0257, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6144.0, "completions/max_terminated_length": 6144.0, "completions/mean_length": 635.248046875, "completions/mean_terminated_length": 635.248046875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.069836153639538, "grad_norm": 0.19205313792834616, "kl": 0.5466327667236328, "learning_rate": 1e-06, "loss": -0.0279, "num_tokens": 19135225.0, "reward": 0.6720768213272095, "reward_std": 0.30561724305152893, "rewards/correct_answer_reward_func": 0.453125, "rewards/format_reward_func": 0.9813151359558105, "rewards/python_attempt_reward_func": 0.2265625, "rewards/python_count_reward_func": 0.1328125, "rewards/python_reward_func": 0.1140950471162796, "rewards/tool_execution_reward_func": 0.1134440153837204, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.07091055600322321, "grad_norm": 0.18638682792626687, "kl": 0.48171520233154297, "learning_rate": 1e-06, "loss": -0.028, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3474.0, "completions/max_terminated_length": 3474.0, "completions/mean_length": 498.373046875, "completions/mean_terminated_length": 498.373046875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.07198495836690841, "grad_norm": 0.19854810353862182, "kl": 0.3688240051269531, "learning_rate": 1e-06, "loss": -0.033, "num_tokens": 19653688.0, "reward": 0.7807265520095825, "reward_std": 0.27144283056259155, "rewards/correct_answer_reward_func": 0.556640625, "rewards/format_reward_func": 0.9876171946525574, "rewards/python_attempt_reward_func": 0.20703125, "rewards/python_count_reward_func": 0.14453125, "rewards/python_reward_func": 0.1328125, "rewards/tool_execution_reward_func": 0.1328125, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0730593607305936, "grad_norm": 0.19617594221860576, "kl": 0.33297061920166016, "learning_rate": 1e-06, "loss": -0.0331, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7454.0, "completions/max_terminated_length": 7454.0, "completions/mean_length": 653.9375, "completions/mean_terminated_length": 653.9375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.07413376309427881, "grad_norm": 0.2274845063411033, "kl": 1.0947084426879883, "learning_rate": 1e-06, "loss": -0.0412, "num_tokens": 20264696.0, "reward": 0.4946131706237793, "reward_std": 0.21573424339294434, "rewards/correct_answer_reward_func": 0.27734375, "rewards/format_reward_func": 0.9548366069793701, "rewards/python_attempt_reward_func": 0.30859375, "rewards/python_count_reward_func": 0.173828125, "rewards/python_reward_func": 0.1324869841337204, "rewards/tool_execution_reward_func": 0.1315104216337204, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.07520816545796401, "grad_norm": 0.2151193297669159, "kl": 1.0380728244781494, "learning_rate": 1e-06, "loss": -0.0413, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7266.0, "completions/max_terminated_length": 7266.0, "completions/mean_length": 607.935546875, "completions/mean_terminated_length": 607.935546875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0762825678216492, "grad_norm": 0.19963355304856245, "kl": 0.3727908134460449, "learning_rate": 1e-06, "loss": -0.0802, "num_tokens": 20848567.0, "reward": 0.6591888666152954, "reward_std": 0.19971919059753418, "rewards/correct_answer_reward_func": 0.43359375, "rewards/format_reward_func": 0.981458306312561, "rewards/python_attempt_reward_func": 0.33203125, "rewards/python_count_reward_func": 0.173828125, "rewards/python_reward_func": 0.14801432192325592, "rewards/tool_execution_reward_func": 0.14651691913604736, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.07735697018533441, "grad_norm": 0.19985141889973237, "kl": 0.3663978576660156, "learning_rate": 1e-06, "loss": -0.0802, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3405.0, "completions/max_terminated_length": 3405.0, "completions/mean_length": 467.076171875, "completions/mean_terminated_length": 467.076171875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0784313725490196, "grad_norm": 0.24560340649894466, "kl": 0.87725830078125, "learning_rate": 1e-06, "loss": -0.0837, "num_tokens": 21355518.0, "reward": 0.5361849069595337, "reward_std": 0.15991619229316711, "rewards/correct_answer_reward_func": 0.298828125, "rewards/format_reward_func": 0.9675781726837158, "rewards/python_attempt_reward_func": 0.439453125, "rewards/python_count_reward_func": 0.263671875, "rewards/python_reward_func": 0.22148437798023224, "rewards/tool_execution_reward_func": 0.21920573711395264, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0795057749127048, "grad_norm": 0.24249816431786816, "kl": 0.8611855506896973, "learning_rate": 1e-06, "loss": -0.0837, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3836.0, "completions/max_terminated_length": 3836.0, "completions/mean_length": 662.537109375, "completions/mean_terminated_length": 662.537109375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.08058017727639001, "grad_norm": 0.19597132224672675, "kl": 0.30386924743652344, "learning_rate": 1e-06, "loss": -0.0781, "num_tokens": 21981105.0, "reward": 0.4932091236114502, "reward_std": 0.17331062257289886, "rewards/correct_answer_reward_func": 0.26171875, "rewards/format_reward_func": 0.9889713525772095, "rewards/python_attempt_reward_func": 0.33984375, "rewards/python_count_reward_func": 0.216796875, "rewards/python_reward_func": 0.17310267686843872, "rewards/tool_execution_reward_func": 0.1684802770614624, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0816545796400752, "grad_norm": 0.19361829397870217, "kl": 0.30319738388061523, "learning_rate": 1e-06, "loss": -0.0781, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3478.0, "completions/max_terminated_length": 3478.0, "completions/mean_length": 558.08984375, "completions/mean_terminated_length": 558.08984375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.08272898200376042, "grad_norm": 0.16966138188596389, "kl": 0.2281637191772461, "learning_rate": 1e-06, "loss": -0.0344, "num_tokens": 22540959.0, "reward": 0.6010416746139526, "reward_std": 0.13987550139427185, "rewards/correct_answer_reward_func": 0.38671875, "rewards/format_reward_func": 0.9921875, "rewards/python_attempt_reward_func": 0.205078125, "rewards/python_count_reward_func": 0.109375, "rewards/python_reward_func": 0.0797526016831398, "rewards/tool_execution_reward_func": 0.0794270783662796, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.08380338436744561, "grad_norm": 0.16823980908926442, "kl": 0.23378849029541016, "learning_rate": 1e-06, "loss": -0.0343, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2498.0, "completions/max_terminated_length": 2498.0, "completions/mean_length": 621.962890625, "completions/mean_terminated_length": 621.962890625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0848777867311308, "grad_norm": 0.20126579285613713, "kl": 0.5362334251403809, "learning_rate": 1e-06, "loss": -0.0528, "num_tokens": 23147884.0, "reward": 0.6057213544845581, "reward_std": 0.22327746450901031, "rewards/correct_answer_reward_func": 0.388671875, "rewards/format_reward_func": 0.9771745204925537, "rewards/python_attempt_reward_func": 0.20703125, "rewards/python_count_reward_func": 0.13671875, "rewards/python_reward_func": 0.1097005307674408, "rewards/tool_execution_reward_func": 0.1080729216337204, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.08595218909481601, "grad_norm": 0.20008133809838127, "kl": 0.560577392578125, "learning_rate": 1e-06, "loss": -0.0527, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2965.0, "completions/max_terminated_length": 2965.0, "completions/mean_length": 607.359375, "completions/mean_terminated_length": 607.359375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.08702659145850121, "grad_norm": 0.2025347252921362, "kl": 0.7452774047851562, "learning_rate": 1e-06, "loss": -0.0819, "num_tokens": 23737924.0, "reward": 0.5824848413467407, "reward_std": 0.1892421543598175, "rewards/correct_answer_reward_func": 0.35546875, "rewards/format_reward_func": 0.974207878112793, "rewards/python_attempt_reward_func": 0.3203125, "rewards/python_count_reward_func": 0.20703125, "rewards/python_reward_func": 0.16266277432441711, "rewards/tool_execution_reward_func": 0.16087239980697632, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0881009938221864, "grad_norm": 0.20055702150542165, "kl": 0.7675552368164062, "learning_rate": 1e-06, "loss": -0.0819, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3959.0, "completions/max_terminated_length": 3959.0, "completions/mean_length": 603.66796875, "completions/mean_terminated_length": 603.66796875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.08917539618587161, "grad_norm": 0.20217408970852815, "kl": 0.7464570999145508, "learning_rate": 1e-06, "loss": -0.0742, "num_tokens": 24326234.0, "reward": 0.6575562953948975, "reward_std": 0.21810975670814514, "rewards/correct_answer_reward_func": 0.43359375, "rewards/format_reward_func": 0.9703612923622131, "rewards/python_attempt_reward_func": 0.314453125, "rewards/python_count_reward_func": 0.1875, "rewards/python_reward_func": 0.15240885317325592, "rewards/tool_execution_reward_func": 0.14945125579833984, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.09024979854955681, "grad_norm": 0.19971073671420764, "kl": 0.7682285308837891, "learning_rate": 1e-06, "loss": -0.0742, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3286.0, "completions/max_terminated_length": 3286.0, "completions/mean_length": 440.19140625, "completions/mean_terminated_length": 440.19140625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.091324200913242, "grad_norm": 0.2467541315070497, "kl": 0.3160133361816406, "learning_rate": 1e-06, "loss": -0.043, "num_tokens": 24825692.0, "reward": 0.7999497652053833, "reward_std": 0.3305312991142273, "rewards/correct_answer_reward_func": 0.556640625, "rewards/format_reward_func": 0.9960286617279053, "rewards/python_attempt_reward_func": 0.36328125, "rewards/python_count_reward_func": 0.25390625, "rewards/python_reward_func": 0.22265625, "rewards/tool_execution_reward_func": 0.2205171138048172, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.09239860327692721, "grad_norm": 0.24743127051298594, "kl": 0.3356924057006836, "learning_rate": 1e-06, "loss": -0.043, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4589.0, "completions/max_terminated_length": 4589.0, "completions/mean_length": 630.322265625, "completions/mean_terminated_length": 630.322265625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.09347300564061241, "grad_norm": 0.19087807879757454, "kl": 0.13110637664794922, "learning_rate": 1e-06, "loss": -0.0097, "num_tokens": 25424417.0, "reward": 0.619391918182373, "reward_std": 0.23611268401145935, "rewards/correct_answer_reward_func": 0.37890625, "rewards/format_reward_func": 0.9974479675292969, "rewards/python_attempt_reward_func": 0.392578125, "rewards/python_count_reward_func": 0.248046875, "rewards/python_reward_func": 0.20530599355697632, "rewards/tool_execution_reward_func": 0.20498046278953552, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0945474080042976, "grad_norm": 0.1905821996671793, "kl": 0.1331472396850586, "learning_rate": 1e-06, "loss": -0.0097, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4833.0, "completions/max_terminated_length": 4833.0, "completions/mean_length": 626.29296875, "completions/mean_terminated_length": 626.29296875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.09562181036798281, "grad_norm": 0.18837771669443762, "kl": 0.4132537841796875, "learning_rate": 1e-06, "loss": -0.0292, "num_tokens": 26017079.0, "reward": 0.5661834478378296, "reward_std": 0.17254231870174408, "rewards/correct_answer_reward_func": 0.326171875, "rewards/format_reward_func": 0.9891666173934937, "rewards/python_attempt_reward_func": 0.443359375, "rewards/python_count_reward_func": 0.267578125, "rewards/python_reward_func": 0.2118675708770752, "rewards/tool_execution_reward_func": 0.2108910083770752, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.09669621273166801, "grad_norm": 0.18824471492095998, "kl": 0.42687416076660156, "learning_rate": 1e-06, "loss": -0.0292, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5514.0, "completions/max_terminated_length": 5514.0, "completions/mean_length": 785.83203125, "completions/mean_terminated_length": 785.83203125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.09777061509535322, "grad_norm": 0.2286782724327448, "kl": 0.9786891937255859, "learning_rate": 1e-06, "loss": -0.1041, "num_tokens": 26710401.0, "reward": 0.6496062278747559, "reward_std": 0.22434821724891663, "rewards/correct_answer_reward_func": 0.41015625, "rewards/format_reward_func": 0.9702697396278381, "rewards/python_attempt_reward_func": 0.572265625, "rewards/python_count_reward_func": 0.326171875, "rewards/python_reward_func": 0.22990992665290833, "rewards/tool_execution_reward_func": 0.22698025405406952, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.09884501745903841, "grad_norm": 0.2258044111215503, "kl": 0.9523410797119141, "learning_rate": 1e-06, "loss": -0.1041, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4907.0, "completions/max_terminated_length": 4907.0, "completions/mean_length": 641.12890625, "completions/mean_terminated_length": 641.12890625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0999194198227236, "grad_norm": 0.1953320322483021, "kl": 1.152261734008789, "learning_rate": 1e-06, "loss": -0.091, "num_tokens": 27320419.0, "reward": 0.6456731557846069, "reward_std": 0.24772222340106964, "rewards/correct_answer_reward_func": 0.408203125, "rewards/format_reward_func": 0.9632291793823242, "rewards/python_attempt_reward_func": 0.47265625, "rewards/python_count_reward_func": 0.2890625, "rewards/python_reward_func": 0.22578668594360352, "rewards/tool_execution_reward_func": 0.22412109375, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.10099382218640882, "grad_norm": 0.18810623110412805, "kl": 1.0844545364379883, "learning_rate": 1e-06, "loss": -0.091, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7134.0, "completions/max_terminated_length": 7134.0, "completions/mean_length": 686.951171875, "completions/mean_terminated_length": 686.951171875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.10206822455009401, "grad_norm": 0.20052076655836884, "kl": 0.46610069274902344, "learning_rate": 1e-06, "loss": -0.0967, "num_tokens": 27963882.0, "reward": 0.4921613335609436, "reward_std": 0.24272781610488892, "rewards/correct_answer_reward_func": 0.234375, "rewards/format_reward_func": 0.9822547435760498, "rewards/python_attempt_reward_func": 0.708984375, "rewards/python_count_reward_func": 0.42578125, "rewards/python_reward_func": 0.3101329803466797, "rewards/tool_execution_reward_func": 0.30667704343795776, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1031426269137792, "grad_norm": 0.2012438305899308, "kl": 0.4481163024902344, "learning_rate": 1e-06, "loss": -0.0967, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7485.0, "completions/max_terminated_length": 7485.0, "completions/mean_length": 668.7265625, "completions/mean_terminated_length": 668.7265625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.10421702927746442, "grad_norm": 0.22591357675543772, "kl": 0.8233146667480469, "learning_rate": 1e-06, "loss": -0.1096, "num_tokens": 28583966.0, "reward": 0.5738232135772705, "reward_std": 0.23653070628643036, "rewards/correct_answer_reward_func": 0.337890625, "rewards/format_reward_func": 0.9759440422058105, "rewards/python_attempt_reward_func": 0.4296875, "rewards/python_count_reward_func": 0.291015625, "rewards/python_reward_func": 0.20890842378139496, "rewards/tool_execution_reward_func": 0.20371869206428528, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.10529143164114961, "grad_norm": 0.22477155903660886, "kl": 0.7876548767089844, "learning_rate": 1e-06, "loss": -0.1096, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3640.0, "completions/max_terminated_length": 3640.0, "completions/mean_length": 544.185546875, "completions/mean_terminated_length": 544.185546875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.1063658340048348, "grad_norm": 0.21366799188538735, "kl": 0.21801376342773438, "learning_rate": 1e-06, "loss": -0.024, "num_tokens": 29135453.0, "reward": 0.5218055248260498, "reward_std": 0.22050708532333374, "rewards/correct_answer_reward_func": 0.267578125, "rewards/format_reward_func": 0.9925390481948853, "rewards/python_attempt_reward_func": 0.5078125, "rewards/python_count_reward_func": 0.34765625, "rewards/python_reward_func": 0.28147321939468384, "rewards/tool_execution_reward_func": 0.27859777212142944, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.10744023636852001, "grad_norm": 0.21341018139203774, "kl": 0.2110595703125, "learning_rate": 1e-06, "loss": -0.024, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3971.0, "completions/max_terminated_length": 3971.0, "completions/mean_length": 584.697265625, "completions/mean_terminated_length": 584.697265625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.10851463873220521, "grad_norm": 0.23467730392590627, "kl": 0.013135433197021484, "learning_rate": 1e-06, "loss": -0.0096, "num_tokens": 29698178.0, "reward": 0.6198717951774597, "reward_std": 0.22528144717216492, "rewards/correct_answer_reward_func": 0.34765625, "rewards/format_reward_func": 0.9858853816986084, "rewards/python_attempt_reward_func": 0.7578125, "rewards/python_count_reward_func": 0.486328125, "rewards/python_reward_func": 0.37665706872940063, "rewards/tool_execution_reward_func": 0.37519222497940063, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1095890410958904, "grad_norm": 0.2348291820150738, "kl": 0.013751983642578125, "learning_rate": 1e-06, "loss": -0.0096, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7558.0, "completions/max_terminated_length": 7558.0, "completions/mean_length": 946.111328125, "completions/mean_terminated_length": 946.111328125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.11066344345957561, "grad_norm": 0.19082770888805842, "kl": 0.023172378540039062, "learning_rate": 1e-06, "loss": -0.0289, "num_tokens": 30466331.0, "reward": 0.563745379447937, "reward_std": 0.33444541692733765, "rewards/correct_answer_reward_func": 0.30078125, "rewards/format_reward_func": 0.9704910516738892, "rewards/python_attempt_reward_func": 0.97265625, "rewards/python_count_reward_func": 0.548828125, "rewards/python_reward_func": 0.35754045844078064, "rewards/tool_execution_reward_func": 0.34432971477508545, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.11173784582326081, "grad_norm": 0.1903121839876226, "kl": 0.025722503662109375, "learning_rate": 1e-06, "loss": -0.0289, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6014.0, "completions/max_terminated_length": 6014.0, "completions/mean_length": 580.76171875, "completions/mean_terminated_length": 580.76171875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.11281224818694602, "grad_norm": 0.19080372707883472, "kl": 0.006140232086181641, "learning_rate": 1e-06, "loss": -0.0128, "num_tokens": 31055649.0, "reward": 0.7579362392425537, "reward_std": 0.28464215993881226, "rewards/correct_answer_reward_func": 0.505859375, "rewards/format_reward_func": 0.9967447519302368, "rewards/python_attempt_reward_func": 0.505859375, "rewards/python_count_reward_func": 0.3203125, "rewards/python_reward_func": 0.26656901836395264, "rewards/tool_execution_reward_func": 0.26363933086395264, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.11388665055063121, "grad_norm": 0.1909821929983437, "kl": 0.007145881652832031, "learning_rate": 1e-06, "loss": -0.0128, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7294.0, "completions/max_terminated_length": 7294.0, "completions/mean_length": 740.78515625, "completions/mean_terminated_length": 740.78515625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.11496105291431641, "grad_norm": 0.23769712252441955, "kl": 0.022179603576660156, "learning_rate": 1e-06, "loss": -0.0323, "num_tokens": 31728019.0, "reward": 0.650765061378479, "reward_std": 0.31070852279663086, "rewards/correct_answer_reward_func": 0.380859375, "rewards/format_reward_func": 0.99079430103302, "rewards/python_attempt_reward_func": 0.869140625, "rewards/python_count_reward_func": 0.45703125, "rewards/python_reward_func": 0.36187297105789185, "rewards/tool_execution_reward_func": 0.35873404145240784, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.11603545527800162, "grad_norm": 0.23734392049057842, "kl": 0.026244163513183594, "learning_rate": 1e-06, "loss": -0.0323, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7051.0, "completions/max_terminated_length": 7051.0, "completions/mean_length": 563.4921875, "completions/mean_terminated_length": 563.4921875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.11710985764168681, "grad_norm": 0.21353191541322428, "kl": 0.03123950958251953, "learning_rate": 1e-06, "loss": -0.0724, "num_tokens": 32289231.0, "reward": 0.6626380681991577, "reward_std": 0.20601725578308105, "rewards/correct_answer_reward_func": 0.4140625, "rewards/format_reward_func": 0.9885156154632568, "rewards/python_attempt_reward_func": 0.51171875, "rewards/python_count_reward_func": 0.333984375, "rewards/python_reward_func": 0.25550130009651184, "rewards/tool_execution_reward_func": 0.25436198711395264, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.11818426000537201, "grad_norm": 0.2129839606806815, "kl": 0.045807838439941406, "learning_rate": 1e-06, "loss": -0.0724, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4498.0, "completions/max_terminated_length": 4498.0, "completions/mean_length": 658.173828125, "completions/mean_terminated_length": 658.173828125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.11925866236905722, "grad_norm": 0.22984175341215135, "kl": 0.06760025024414062, "learning_rate": 1e-06, "loss": -0.0219, "num_tokens": 32918248.0, "reward": 0.5886523723602295, "reward_std": 0.19920453429222107, "rewards/correct_answer_reward_func": 0.28515625, "rewards/format_reward_func": 0.9909830689430237, "rewards/python_attempt_reward_func": 0.98046875, "rewards/python_count_reward_func": 0.640625, "rewards/python_reward_func": 0.5272297859191895, "rewards/tool_execution_reward_func": 0.5264973640441895, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.12033306473274241, "grad_norm": 0.23038505685189462, "kl": 0.0901956558227539, "learning_rate": 1e-06, "loss": -0.0219, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6528.0, "completions/max_terminated_length": 6528.0, "completions/mean_length": 704.6640625, "completions/mean_terminated_length": 704.6640625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.12140746709642761, "grad_norm": 0.21364869768150066, "kl": 0.06395626068115234, "learning_rate": 1e-06, "loss": -0.0387, "num_tokens": 33548380.0, "reward": 0.5936071872711182, "reward_std": 0.18968228995800018, "rewards/correct_answer_reward_func": 0.322265625, "rewards/format_reward_func": 0.9932942986488342, "rewards/python_attempt_reward_func": 0.90234375, "rewards/python_count_reward_func": 0.52734375, "rewards/python_reward_func": 0.36997073888778687, "rewards/tool_execution_reward_func": 0.36341381072998047, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.12248186946011282, "grad_norm": 0.21495881788042823, "kl": 0.0816802978515625, "learning_rate": 1e-06, "loss": -0.0387, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5335.0, "completions/max_terminated_length": 5335.0, "completions/mean_length": 723.998046875, "completions/mean_terminated_length": 723.998046875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.12355627182379801, "grad_norm": 0.2387311503299123, "kl": 0.19479656219482422, "learning_rate": 1e-06, "loss": -0.0273, "num_tokens": 34190555.0, "reward": 0.6483234167098999, "reward_std": 0.22812014818191528, "rewards/correct_answer_reward_func": 0.375, "rewards/format_reward_func": 0.9908202886581421, "rewards/python_attempt_reward_func": 0.708984375, "rewards/python_count_reward_func": 0.4921875, "rewards/python_reward_func": 0.38577473163604736, "rewards/tool_execution_reward_func": 0.37579673528671265, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1246306741874832, "grad_norm": 0.24044222984706032, "kl": 0.2297067642211914, "learning_rate": 1e-06, "loss": -0.0272, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4211.0, "completions/max_terminated_length": 4211.0, "completions/mean_length": 645.984375, "completions/mean_terminated_length": 645.984375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.12570507655116842, "grad_norm": 0.26672803828969505, "kl": 0.08442115783691406, "learning_rate": 1e-06, "loss": -0.0421, "num_tokens": 34801619.0, "reward": 0.6638801097869873, "reward_std": 0.1512978971004486, "rewards/correct_answer_reward_func": 0.369140625, "rewards/format_reward_func": 0.9966402053833008, "rewards/python_attempt_reward_func": 0.98828125, "rewards/python_count_reward_func": 0.6640625, "rewards/python_reward_func": 0.47929224371910095, "rewards/tool_execution_reward_func": 0.47705698013305664, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1267794789148536, "grad_norm": 0.26729683008650673, "kl": 0.08513832092285156, "learning_rate": 1e-06, "loss": -0.0421, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4417.0, "completions/max_terminated_length": 4417.0, "completions/mean_length": 659.00390625, "completions/mean_terminated_length": 659.00390625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.1278538812785388, "grad_norm": 0.247360447895365, "kl": 0.3147773742675781, "learning_rate": 1e-06, "loss": 0.0046, "num_tokens": 35413205.0, "reward": 0.7946230173110962, "reward_std": 0.347296804189682, "rewards/correct_answer_reward_func": 0.5, "rewards/format_reward_func": 0.9885481595993042, "rewards/python_attempt_reward_func": 0.99609375, "rewards/python_count_reward_func": 0.60546875, "rewards/python_reward_func": 0.49134117364883423, "rewards/tool_execution_reward_func": 0.48456722497940063, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.128928283642224, "grad_norm": 0.23787718599225305, "kl": 0.3175697326660156, "learning_rate": 1e-06, "loss": 0.0046, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7415.0, "completions/max_terminated_length": 7415.0, "completions/mean_length": 942.119140625, "completions/mean_terminated_length": 942.119140625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.13000268600590922, "grad_norm": 0.22680106136501893, "kl": 0.22305679321289062, "learning_rate": 1e-06, "loss": -0.0386, "num_tokens": 36184626.0, "reward": 0.6462082266807556, "reward_std": 0.21996113657951355, "rewards/correct_answer_reward_func": 0.37109375, "rewards/format_reward_func": 0.9891883730888367, "rewards/python_attempt_reward_func": 1.103515625, "rewards/python_count_reward_func": 0.578125, "rewards/python_reward_func": 0.39501953125, "rewards/tool_execution_reward_func": 0.38638395071029663, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.13107708836959442, "grad_norm": 0.2403592414930314, "kl": 0.2853546142578125, "learning_rate": 1e-06, "loss": -0.0385, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7642.0, "completions/max_terminated_length": 7642.0, "completions/mean_length": 766.734375, "completions/mean_terminated_length": 766.734375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.13215149073327961, "grad_norm": 0.44064645381957, "kl": 0.93389892578125, "learning_rate": 1e-06, "loss": -0.0758, "num_tokens": 36869322.0, "reward": 0.6366337537765503, "reward_std": 0.1877112090587616, "rewards/correct_answer_reward_func": 0.337890625, "rewards/format_reward_func": 0.9781543016433716, "rewards/python_attempt_reward_func": 1.044921875, "rewards/python_count_reward_func": 0.75, "rewards/python_reward_func": 0.521267294883728, "rewards/tool_execution_reward_func": 0.5155614614486694, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1332258930969648, "grad_norm": 0.4772102921256755, "kl": 0.97454833984375, "learning_rate": 1e-06, "loss": -0.0757, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7104.0, "completions/max_terminated_length": 7104.0, "completions/mean_length": 832.189453125, "completions/mean_terminated_length": 832.189453125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.13430029546065, "grad_norm": 0.32575440344778894, "kl": 0.4351654052734375, "learning_rate": 1e-06, "loss": -0.0416, "num_tokens": 37560811.0, "reward": 0.6553205251693726, "reward_std": 0.25574129819869995, "rewards/correct_answer_reward_func": 0.3671875, "rewards/format_reward_func": 0.987967848777771, "rewards/python_attempt_reward_func": 0.9375, "rewards/python_count_reward_func": 0.646484375, "rewards/python_reward_func": 0.45974236726760864, "rewards/tool_execution_reward_func": 0.45269718766212463, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.13537469782433523, "grad_norm": 0.3021652944891372, "kl": 0.4197235107421875, "learning_rate": 1e-06, "loss": -0.0417, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7643.0, "completions/max_terminated_length": 7643.0, "completions/mean_length": 972.48828125, "completions/mean_terminated_length": 972.48828125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.13644910018802042, "grad_norm": 0.47144602852601003, "kl": 1.36688232421875, "learning_rate": 1e-06, "loss": -0.0045, "num_tokens": 38324869.0, "reward": 0.6969922184944153, "reward_std": 0.21087324619293213, "rewards/correct_answer_reward_func": 0.404296875, "rewards/format_reward_func": 0.9801952838897705, "rewards/python_attempt_reward_func": 1.466796875, "rewards/python_count_reward_func": 0.822265625, "rewards/python_reward_func": 0.49179691076278687, "rewards/tool_execution_reward_func": 0.4832814335823059, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.13752350255170562, "grad_norm": 0.3185084641814634, "kl": 1.1292724609375, "learning_rate": 1e-06, "loss": -0.0047, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6440.0, "completions/max_terminated_length": 6440.0, "completions/mean_length": 796.146484375, "completions/mean_terminated_length": 796.146484375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.1385979049153908, "grad_norm": 0.28669889866852877, "kl": 0.79986572265625, "learning_rate": 1e-06, "loss": -0.0051, "num_tokens": 38993584.0, "reward": 0.8627153038978577, "reward_std": 0.22562414407730103, "rewards/correct_answer_reward_func": 0.556640625, "rewards/format_reward_func": 0.9941970109939575, "rewards/python_attempt_reward_func": 1.48828125, "rewards/python_count_reward_func": 0.83984375, "rewards/python_reward_func": 0.5460999011993408, "rewards/tool_execution_reward_func": 0.5361762046813965, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.139672307279076, "grad_norm": 0.2381290270558057, "kl": 0.62579345703125, "learning_rate": 1e-06, "loss": -0.0053, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5982.0, "completions/max_terminated_length": 5982.0, "completions/mean_length": 802.34375, "completions/mean_terminated_length": 802.34375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.1407467096427612, "grad_norm": 0.26838334417081017, "kl": 0.38581085205078125, "learning_rate": 1e-06, "loss": 0.0092, "num_tokens": 39685952.0, "reward": 0.7600735425949097, "reward_std": 0.2963756024837494, "rewards/correct_answer_reward_func": 0.466796875, "rewards/format_reward_func": 0.9903255701065063, "rewards/python_attempt_reward_func": 1.0390625, "rewards/python_count_reward_func": 0.693359375, "rewards/python_reward_func": 0.4806501269340515, "rewards/tool_execution_reward_func": 0.4760579466819763, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.14182111200644643, "grad_norm": 0.26661714519573837, "kl": 0.4087677001953125, "learning_rate": 1e-06, "loss": 0.0092, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7629.0, "completions/max_terminated_length": 7629.0, "completions/mean_length": 773.978515625, "completions/mean_terminated_length": 773.978515625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.14289551437013162, "grad_norm": 0.20843400010526072, "kl": 0.4949493408203125, "learning_rate": 1e-06, "loss": -0.0585, "num_tokens": 40364021.0, "reward": 0.8757146596908569, "reward_std": 0.2284032255411148, "rewards/correct_answer_reward_func": 0.56640625, "rewards/format_reward_func": 0.9932768940925598, "rewards/python_attempt_reward_func": 1.203125, "rewards/python_count_reward_func": 0.783203125, "rewards/python_reward_func": 0.5602562427520752, "rewards/tool_execution_reward_func": 0.5532653331756592, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.14396991673381682, "grad_norm": 0.21164563869214434, "kl": 0.5514984130859375, "learning_rate": 1e-06, "loss": -0.0584, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7344.0, "completions/max_terminated_length": 7344.0, "completions/mean_length": 920.419921875, "completions/mean_terminated_length": 920.419921875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.145044319097502, "grad_norm": 0.30670645278654085, "kl": 0.93798828125, "learning_rate": 1e-06, "loss": 0.0604, "num_tokens": 41100492.0, "reward": 0.9157981872558594, "reward_std": 0.24134208261966705, "rewards/correct_answer_reward_func": 0.607421875, "rewards/format_reward_func": 0.9776078462600708, "rewards/python_attempt_reward_func": 1.353515625, "rewards/python_count_reward_func": 0.919921875, "rewards/python_reward_func": 0.5738226771354675, "rewards/tool_execution_reward_func": 0.5642740726470947, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1461187214611872, "grad_norm": 0.3067350110519889, "kl": 1.0177001953125, "learning_rate": 1e-06, "loss": 0.0605, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6788.0, "completions/max_terminated_length": 6788.0, "completions/mean_length": 700.529296875, "completions/mean_terminated_length": 700.529296875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.1471931238248724, "grad_norm": 0.20327813029518688, "kl": 0.6613807678222656, "learning_rate": 1e-06, "loss": 0.0024, "num_tokens": 41737563.0, "reward": 0.8853088617324829, "reward_std": 0.2581197917461395, "rewards/correct_answer_reward_func": 0.5859375, "rewards/format_reward_func": 0.9882226586341858, "rewards/python_attempt_reward_func": 0.970703125, "rewards/python_count_reward_func": 0.68359375, "rewards/python_reward_func": 0.51318359375, "rewards/tool_execution_reward_func": 0.508634090423584, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.14826752618855762, "grad_norm": 0.1978279584430828, "kl": 0.7326469421386719, "learning_rate": 1e-06, "loss": 0.0024, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7522.0, "completions/max_terminated_length": 7522.0, "completions/mean_length": 984.49609375, "completions/mean_terminated_length": 984.49609375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.14934192855224282, "grad_norm": 0.22632871078992167, "kl": 0.7314453125, "learning_rate": 1e-06, "loss": -0.0541, "num_tokens": 42524505.0, "reward": 0.8438538908958435, "reward_std": 0.21459010243415833, "rewards/correct_answer_reward_func": 0.546875, "rewards/format_reward_func": 0.9938448667526245, "rewards/python_attempt_reward_func": 1.470703125, "rewards/python_count_reward_func": 0.87890625, "rewards/python_reward_func": 0.5000317692756653, "rewards/tool_execution_reward_func": 0.49104970693588257, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.15041633091592801, "grad_norm": 0.27494112946262583, "kl": 0.82208251953125, "learning_rate": 1e-06, "loss": -0.0541, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7402.0, "completions/max_terminated_length": 7402.0, "completions/mean_length": 986.14453125, "completions/mean_terminated_length": 986.14453125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.1514907332796132, "grad_norm": 0.25777798830782805, "kl": 0.9907379150390625, "learning_rate": 1e-06, "loss": 0.0167, "num_tokens": 43308547.0, "reward": 0.732244610786438, "reward_std": 0.23545396327972412, "rewards/correct_answer_reward_func": 0.421875, "rewards/format_reward_func": 0.995763897895813, "rewards/python_attempt_reward_func": 1.46875, "rewards/python_count_reward_func": 0.916015625, "rewards/python_reward_func": 0.5638911724090576, "rewards/tool_execution_reward_func": 0.556084156036377, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1525651356432984, "grad_norm": 0.24221381114875812, "kl": 0.9665069580078125, "learning_rate": 1e-06, "loss": 0.0167, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4611.0, "completions/max_terminated_length": 4611.0, "completions/mean_length": 662.564453125, "completions/mean_terminated_length": 662.564453125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.15363953800698363, "grad_norm": 0.33693714692658416, "kl": 2.0987548828125, "learning_rate": 1e-06, "loss": 0.0341, "num_tokens": 43920420.0, "reward": 0.900015652179718, "reward_std": 0.2951924204826355, "rewards/correct_answer_reward_func": 0.583984375, "rewards/format_reward_func": 0.9951822757720947, "rewards/python_attempt_reward_func": 1.150390625, "rewards/python_count_reward_func": 0.837890625, "rewards/python_reward_func": 0.5944653749465942, "rewards/tool_execution_reward_func": 0.5849741101264954, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.15471394037066882, "grad_norm": 0.32416474424872, "kl": 2.00836181640625, "learning_rate": 1e-06, "loss": 0.034, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7516.0, "completions/max_terminated_length": 7516.0, "completions/mean_length": 814.86328125, "completions/mean_terminated_length": 814.86328125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.15578834273435402, "grad_norm": 0.2300001785229148, "kl": 1.22021484375, "learning_rate": 1e-06, "loss": -0.028, "num_tokens": 44612766.0, "reward": 0.9035619497299194, "reward_std": 0.2257683277130127, "rewards/correct_answer_reward_func": 0.5859375, "rewards/format_reward_func": 0.9895095229148865, "rewards/python_attempt_reward_func": 1.4375, "rewards/python_count_reward_func": 0.935546875, "rewards/python_reward_func": 0.6082481145858765, "rewards/tool_execution_reward_func": 0.5986126661300659, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1568627450980392, "grad_norm": 0.2237831941533481, "kl": 1.1923828125, "learning_rate": 1e-06, "loss": -0.028, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4917.0, "completions/max_terminated_length": 4917.0, "completions/mean_length": 814.494140625, "completions/mean_terminated_length": 814.494140625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.1579371474617244, "grad_norm": 0.30259020032882117, "kl": 1.4819984436035156, "learning_rate": 1e-06, "loss": -0.0371, "num_tokens": 45306427.0, "reward": 0.7285399436950684, "reward_std": 0.24780330061912537, "rewards/correct_answer_reward_func": 0.400390625, "rewards/format_reward_func": 0.993567705154419, "rewards/python_attempt_reward_func": 1.328125, "rewards/python_count_reward_func": 0.916015625, "rewards/python_reward_func": 0.6544108390808105, "rewards/tool_execution_reward_func": 0.6471787691116333, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1590115498254096, "grad_norm": 0.24903509884631406, "kl": 1.2524490356445312, "learning_rate": 1e-06, "loss": -0.0373, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7522.0, "completions/max_terminated_length": 7522.0, "completions/mean_length": 993.490234375, "completions/mean_terminated_length": 993.490234375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.16008595218909483, "grad_norm": 0.19849327789950735, "kl": 0.873565673828125, "learning_rate": 1e-06, "loss": -0.0422, "num_tokens": 46100534.0, "reward": 0.7380402088165283, "reward_std": 0.29725003242492676, "rewards/correct_answer_reward_func": 0.4140625, "rewards/format_reward_func": 0.9921115636825562, "rewards/python_attempt_reward_func": 1.646484375, "rewards/python_count_reward_func": 1.15625, "rewards/python_reward_func": 0.6426711678504944, "rewards/tool_execution_reward_func": 0.6277770400047302, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.16116035455278002, "grad_norm": 0.19635961729727056, "kl": 0.8181915283203125, "learning_rate": 1e-06, "loss": -0.0423, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7372.0, "completions/max_terminated_length": 7372.0, "completions/mean_length": 1104.91796875, "completions/mean_terminated_length": 1104.91796875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.16223475691646522, "grad_norm": 0.24897727196911873, "kl": 0.9105224609375, "learning_rate": 1e-06, "loss": 0.0266, "num_tokens": 46941836.0, "reward": 0.8712106943130493, "reward_std": 0.31053128838539124, "rewards/correct_answer_reward_func": 0.533203125, "rewards/format_reward_func": 0.9924045205116272, "rewards/python_attempt_reward_func": 1.9453125, "rewards/python_count_reward_func": 1.294921875, "rewards/python_reward_func": 0.7074179649353027, "rewards/tool_execution_reward_func": 0.6976338028907776, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1633091592801504, "grad_norm": 0.24866056897043184, "kl": 0.8956298828125, "learning_rate": 1e-06, "loss": 0.0266, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7491.0, "completions/max_terminated_length": 7491.0, "completions/mean_length": 953.466796875, "completions/mean_terminated_length": 953.466796875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.1643835616438356, "grad_norm": 0.2364656558337386, "kl": 0.6204910278320312, "learning_rate": 1e-06, "loss": 0.0534, "num_tokens": 47695547.0, "reward": 0.6423863768577576, "reward_std": 0.2180616706609726, "rewards/correct_answer_reward_func": 0.310546875, "rewards/format_reward_func": 0.9953248500823975, "rewards/python_attempt_reward_func": 1.642578125, "rewards/python_count_reward_func": 1.076171875, "rewards/python_reward_func": 0.6763733625411987, "rewards/tool_execution_reward_func": 0.6638725996017456, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.16545796400752083, "grad_norm": 0.23083214515348444, "kl": 0.679840087890625, "learning_rate": 1e-06, "loss": 0.0535, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7647.0, "completions/max_terminated_length": 7647.0, "completions/mean_length": 965.5546875, "completions/mean_terminated_length": 965.5546875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.16653236637120603, "grad_norm": 0.22865921670287156, "kl": 0.88580322265625, "learning_rate": 1e-06, "loss": -0.0088, "num_tokens": 48471031.0, "reward": 0.7883422374725342, "reward_std": 0.17044253647327423, "rewards/correct_answer_reward_func": 0.478515625, "rewards/format_reward_func": 0.9853844046592712, "rewards/python_attempt_reward_func": 1.587890625, "rewards/python_count_reward_func": 1.099609375, "rewards/python_reward_func": 0.5727546811103821, "rewards/tool_execution_reward_func": 0.5637485980987549, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.16760676873489122, "grad_norm": 0.23335632687165114, "kl": 0.9998779296875, "learning_rate": 1e-06, "loss": -0.0087, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7904.0, "completions/max_terminated_length": 7904.0, "completions/mean_length": 1128.275390625, "completions/mean_terminated_length": 1128.275390625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.16868117109857642, "grad_norm": 0.27847104846778636, "kl": 1.26214599609375, "learning_rate": 1e-06, "loss": -0.0523, "num_tokens": 49328708.0, "reward": 0.6507740616798401, "reward_std": 0.18220829963684082, "rewards/correct_answer_reward_func": 0.326171875, "rewards/format_reward_func": 0.9876468181610107, "rewards/python_attempt_reward_func": 1.830078125, "rewards/python_count_reward_func": 1.14453125, "rewards/python_reward_func": 0.6436740756034851, "rewards/tool_execution_reward_func": 0.6353639364242554, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1697555734622616, "grad_norm": 0.2750305802512771, "kl": 1.35693359375, "learning_rate": 1e-06, "loss": -0.0522, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7344.0, "completions/max_terminated_length": 7344.0, "completions/mean_length": 959.59375, "completions/mean_terminated_length": 959.59375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.1708299758259468, "grad_norm": 0.23613410972204055, "kl": 1.334716796875, "learning_rate": 1e-06, "loss": 0.035, "num_tokens": 50103668.0, "reward": 0.7099658250808716, "reward_std": 0.2486053854227066, "rewards/correct_answer_reward_func": 0.373046875, "rewards/format_reward_func": 0.9906944036483765, "rewards/python_attempt_reward_func": 1.81640625, "rewards/python_count_reward_func": 1.26953125, "rewards/python_reward_func": 0.7001596689224243, "rewards/tool_execution_reward_func": 0.6939003467559814, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.17190437818963203, "grad_norm": 0.2366198339604775, "kl": 1.385498046875, "learning_rate": 1e-06, "loss": 0.0351, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7383.0, "completions/max_terminated_length": 7383.0, "completions/mean_length": 1481.63671875, "completions/mean_terminated_length": 1481.63671875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.17297878055331722, "grad_norm": 0.2554006177599805, "kl": 1.85797119140625, "learning_rate": 1e-06, "loss": -0.0045, "num_tokens": 51139962.0, "reward": 0.5833898782730103, "reward_std": 0.24890509247779846, "rewards/correct_answer_reward_func": 0.2578125, "rewards/format_reward_func": 0.9764392375946045, "rewards/python_attempt_reward_func": 2.576171875, "rewards/python_count_reward_func": 1.658203125, "rewards/python_reward_func": 0.6678401231765747, "rewards/tool_execution_reward_func": 0.6514477729797363, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.17405318291700242, "grad_norm": 0.23500017379940996, "kl": 1.89495849609375, "learning_rate": 1e-06, "loss": -0.0045, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6638.0, "completions/max_terminated_length": 6638.0, "completions/mean_length": 880.212890625, "completions/mean_terminated_length": 880.212890625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.17512758528068761, "grad_norm": 0.26660575688649757, "kl": 1.1519775390625, "learning_rate": 1e-06, "loss": -0.0048, "num_tokens": 51867463.0, "reward": 0.9868017435073853, "reward_std": 0.24281340837478638, "rewards/correct_answer_reward_func": 0.65625, "rewards/format_reward_func": 0.9948046803474426, "rewards/python_attempt_reward_func": 1.384765625, "rewards/python_count_reward_func": 1.048828125, "rewards/python_reward_func": 0.6665806174278259, "rewards/tool_execution_reward_func": 0.6579543352127075, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1762019876443728, "grad_norm": 0.27401484133323906, "kl": 1.2177734375, "learning_rate": 1e-06, "loss": -0.0047, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7480.0, "completions/max_terminated_length": 7480.0, "completions/mean_length": 1333.07421875, "completions/mean_terminated_length": 1333.07421875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.177276390008058, "grad_norm": 0.2993531597164075, "kl": 2.77978515625, "learning_rate": 1e-06, "loss": 0.0375, "num_tokens": 52830317.0, "reward": 0.8195416927337646, "reward_std": 0.3531012535095215, "rewards/correct_answer_reward_func": 0.482421875, "rewards/format_reward_func": 0.9950850009918213, "rewards/python_attempt_reward_func": 2.12890625, "rewards/python_count_reward_func": 1.4140625, "rewards/python_reward_func": 0.714287281036377, "rewards/tool_execution_reward_func": 0.6905142068862915, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.17835079237174323, "grad_norm": 0.2870213691492818, "kl": 2.749755859375, "learning_rate": 1e-06, "loss": 0.0375, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7485.0, "completions/max_terminated_length": 7485.0, "completions/mean_length": 1141.302734375, "completions/mean_terminated_length": 1141.302734375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.17942519473542842, "grad_norm": 0.24074560900384037, "kl": 1.764984130859375, "learning_rate": 1e-06, "loss": 0.0646, "num_tokens": 53691272.0, "reward": 0.7149697542190552, "reward_std": 0.24138948321342468, "rewards/correct_answer_reward_func": 0.375, "rewards/format_reward_func": 0.9919022917747498, "rewards/python_attempt_reward_func": 1.958984375, "rewards/python_count_reward_func": 1.392578125, "rewards/python_reward_func": 0.7202039957046509, "rewards/tool_execution_reward_func": 0.7079465985298157, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.18049959709911362, "grad_norm": 0.2199049949999439, "kl": 1.642852783203125, "learning_rate": 1e-06, "loss": 0.0645, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4795.0, "completions/max_terminated_length": 4795.0, "completions/mean_length": 686.283203125, "completions/mean_terminated_length": 686.283203125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.1815739994627988, "grad_norm": 0.2857515374765721, "kl": 1.479095458984375, "learning_rate": 1e-06, "loss": 0.0377, "num_tokens": 54306393.0, "reward": 1.0564508438110352, "reward_std": 0.20868231356143951, "rewards/correct_answer_reward_func": 0.724609375, "rewards/format_reward_func": 0.9982031583786011, "rewards/python_attempt_reward_func": 1.341796875, "rewards/python_count_reward_func": 0.9921875, "rewards/python_reward_func": 0.665947437286377, "rewards/tool_execution_reward_func": 0.661004900932312, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.182648401826484, "grad_norm": 0.2880975636835659, "kl": 1.504241943359375, "learning_rate": 1e-06, "loss": 0.0377, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4578.0, "completions/max_terminated_length": 4578.0, "completions/mean_length": 793.228515625, "completions/mean_terminated_length": 793.228515625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.18372280419016923, "grad_norm": 0.2915098295493895, "kl": 1.862060546875, "learning_rate": 1e-06, "loss": 0.0402, "num_tokens": 54997934.0, "reward": 0.9323036670684814, "reward_std": 0.32551535964012146, "rewards/correct_answer_reward_func": 0.58203125, "rewards/format_reward_func": 0.9946354031562805, "rewards/python_attempt_reward_func": 1.673828125, "rewards/python_count_reward_func": 1.267578125, "rewards/python_reward_func": 0.7654420733451843, "rewards/tool_execution_reward_func": 0.7567266225814819, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.18479720655385443, "grad_norm": 0.2960240962588208, "kl": 1.955322265625, "learning_rate": 1e-06, "loss": 0.0403, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7936.0, "completions/max_terminated_length": 7936.0, "completions/mean_length": 1340.7734375, "completions/mean_terminated_length": 1340.7734375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.18587160891753962, "grad_norm": 0.3280164273356732, "kl": 2.300384521484375, "learning_rate": 1e-06, "loss": 0.0737, "num_tokens": 55983258.0, "reward": 0.6804744005203247, "reward_std": 0.2821764349937439, "rewards/correct_answer_reward_func": 0.33203125, "rewards/format_reward_func": 0.9947324395179749, "rewards/python_attempt_reward_func": 2.2890625, "rewards/python_count_reward_func": 1.634765625, "rewards/python_reward_func": 0.7638702988624573, "rewards/tool_execution_reward_func": 0.7474834322929382, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.18694601128122482, "grad_norm": 0.26538801534928824, "kl": 2.33740234375, "learning_rate": 1e-06, "loss": 0.0738, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6887.0, "completions/max_terminated_length": 6887.0, "completions/mean_length": 873.611328125, "completions/mean_terminated_length": 873.611328125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.18802041364491, "grad_norm": 0.31707500336817074, "kl": 1.3775634765625, "learning_rate": 1e-06, "loss": 0.0456, "num_tokens": 56694387.0, "reward": 0.9454830884933472, "reward_std": 0.14642590284347534, "rewards/correct_answer_reward_func": 0.580078125, "rewards/format_reward_func": 0.9990462064743042, "rewards/python_attempt_reward_func": 1.9453125, "rewards/python_count_reward_func": 1.486328125, "rewards/python_reward_func": 0.8364420533180237, "rewards/tool_execution_reward_func": 0.8279784917831421, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1890948160085952, "grad_norm": 0.30706390175723025, "kl": 1.5810546875, "learning_rate": 1e-06, "loss": 0.0458, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7282.0, "completions/max_terminated_length": 7282.0, "completions/mean_length": 1077.88671875, "completions/mean_terminated_length": 1077.88671875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.19016921837228043, "grad_norm": 0.35946004285395716, "kl": 2.9658203125, "learning_rate": 1e-06, "loss": -0.0112, "num_tokens": 57526617.0, "reward": 1.003706455230713, "reward_std": 0.2776513397693634, "rewards/correct_answer_reward_func": 0.65234375, "rewards/format_reward_func": 0.9930481314659119, "rewards/python_attempt_reward_func": 1.927734375, "rewards/python_count_reward_func": 1.44140625, "rewards/python_reward_func": 0.7742288112640381, "rewards/tool_execution_reward_func": 0.7637656927108765, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.19124362073596562, "grad_norm": 0.3634310644081999, "kl": 3.17333984375, "learning_rate": 1e-06, "loss": -0.011, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7506.0, "completions/max_terminated_length": 7506.0, "completions/mean_length": 1141.26953125, "completions/mean_terminated_length": 1141.26953125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.19231802309965082, "grad_norm": 1.2811052426735237, "kl": 6.50634765625, "learning_rate": 1e-06, "loss": 0.0761, "num_tokens": 58400099.0, "reward": 0.7914488315582275, "reward_std": 0.22771939635276794, "rewards/correct_answer_reward_func": 0.45703125, "rewards/format_reward_func": 0.9921576976776123, "rewards/python_attempt_reward_func": 1.919921875, "rewards/python_count_reward_func": 1.025390625, "rewards/python_reward_func": 0.6885509490966797, "rewards/tool_execution_reward_func": 0.6799300909042358, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.19339242546333602, "grad_norm": 0.6728082442681859, "kl": 5.6026611328125, "learning_rate": 1e-06, "loss": 0.0752, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7929.0, "completions/max_terminated_length": 7929.0, "completions/mean_length": 1215.99609375, "completions/mean_terminated_length": 1215.99609375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.1944668278270212, "grad_norm": 0.328745558025499, "kl": 3.82568359375, "learning_rate": 1e-06, "loss": 0.0771, "num_tokens": 59304641.0, "reward": 0.8904491066932678, "reward_std": 0.24833080172538757, "rewards/correct_answer_reward_func": 0.537109375, "rewards/format_reward_func": 0.997413158416748, "rewards/python_attempt_reward_func": 2.15625, "rewards/python_count_reward_func": 1.5078125, "rewards/python_reward_func": 0.7887942790985107, "rewards/tool_execution_reward_func": 0.7692855596542358, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.19554123019070643, "grad_norm": 0.3155244191653759, "kl": 3.767578125, "learning_rate": 1e-06, "loss": 0.077, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7414.0, "completions/max_terminated_length": 7414.0, "completions/mean_length": 1211.556640625, "completions/mean_terminated_length": 1211.556640625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.19661563255439163, "grad_norm": 0.3719181494715338, "kl": 3.38525390625, "learning_rate": 1e-06, "loss": 0.0202, "num_tokens": 60202942.0, "reward": 0.7607579231262207, "reward_std": 0.27335304021835327, "rewards/correct_answer_reward_func": 0.416015625, "rewards/format_reward_func": 0.9936068058013916, "rewards/python_attempt_reward_func": 2.166015625, "rewards/python_count_reward_func": 1.427734375, "rewards/python_reward_func": 0.7414140105247498, "rewards/tool_execution_reward_func": 0.7301045060157776, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.19769003491807682, "grad_norm": 0.34534593537248537, "kl": 3.25, "learning_rate": 1e-06, "loss": 0.02, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7305.0, "completions/max_terminated_length": 7305.0, "completions/mean_length": 1154.302734375, "completions/mean_terminated_length": 1154.302734375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.19876443728176202, "grad_norm": 0.27253270682956426, "kl": 2.59423828125, "learning_rate": 1e-06, "loss": 0.0423, "num_tokens": 61065369.0, "reward": 0.8091665506362915, "reward_std": 0.33404308557510376, "rewards/correct_answer_reward_func": 0.45703125, "rewards/format_reward_func": 0.9956777095794678, "rewards/python_attempt_reward_func": 2.1328125, "rewards/python_count_reward_func": 1.451171875, "rewards/python_reward_func": 0.7700009346008301, "rewards/tool_execution_reward_func": 0.7649987936019897, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1998388396454472, "grad_norm": 0.22880857788893605, "kl": 2.361328125, "learning_rate": 1e-06, "loss": 0.042, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8250.0, "completions/max_terminated_length": 8250.0, "completions/mean_length": 996.294921875, "completions/mean_terminated_length": 996.294921875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2009132420091324, "grad_norm": 0.33274490814419067, "kl": 3.40478515625, "learning_rate": 1e-06, "loss": 0.0343, "num_tokens": 61849232.0, "reward": 0.9090028405189514, "reward_std": 0.3073886036872864, "rewards/correct_answer_reward_func": 0.564453125, "rewards/format_reward_func": 0.9915637373924255, "rewards/python_attempt_reward_func": 1.6953125, "rewards/python_count_reward_func": 1.314453125, "rewards/python_reward_func": 0.7484948635101318, "rewards/tool_execution_reward_func": 0.7311849594116211, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.20198764437281763, "grad_norm": 0.3108539953495422, "kl": 3.233154296875, "learning_rate": 1e-06, "loss": 0.0341, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5588.0, "completions/max_terminated_length": 5588.0, "completions/mean_length": 1004.17578125, "completions/mean_terminated_length": 1004.17578125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.20306204673650283, "grad_norm": 0.2153698297359144, "kl": 2.33587646484375, "learning_rate": 1e-06, "loss": 0.0573, "num_tokens": 62652074.0, "reward": 0.8259343504905701, "reward_std": 0.2549676299095154, "rewards/correct_answer_reward_func": 0.46875, "rewards/format_reward_func": 0.9930338263511658, "rewards/python_attempt_reward_func": 1.986328125, "rewards/python_count_reward_func": 1.470703125, "rewards/python_reward_func": 0.8039217591285706, "rewards/tool_execution_reward_func": 0.7928881645202637, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.20413644910018802, "grad_norm": 0.21609210055169029, "kl": 2.15167236328125, "learning_rate": 1e-06, "loss": 0.0571, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7344.0, "completions/max_terminated_length": 7344.0, "completions/mean_length": 1072.5859375, "completions/mean_terminated_length": 1072.5859375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.20521085146387322, "grad_norm": 1.8126503763526618, "kl": 4.24267578125, "learning_rate": 1e-06, "loss": 0.1085, "num_tokens": 63480406.0, "reward": 0.8541597127914429, "reward_std": 0.24349257349967957, "rewards/correct_answer_reward_func": 0.501953125, "rewards/format_reward_func": 0.9960528016090393, "rewards/python_attempt_reward_func": 2.017578125, "rewards/python_count_reward_func": 1.46484375, "rewards/python_reward_func": 0.7842168807983398, "rewards/tool_execution_reward_func": 0.7649801969528198, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2062852538275584, "grad_norm": 0.5190025643998739, "kl": 3.036865234375, "learning_rate": 1e-06, "loss": 0.1073, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7248.0, "completions/max_terminated_length": 7248.0, "completions/mean_length": 788.87890625, "completions/mean_terminated_length": 788.87890625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.2073596561912436, "grad_norm": 0.2549181916085058, "kl": 2.2529296875, "learning_rate": 1e-06, "loss": 0.0258, "num_tokens": 64151224.0, "reward": 1.0420122146606445, "reward_std": 0.2245749831199646, "rewards/correct_answer_reward_func": 0.6953125, "rewards/format_reward_func": 0.9989546537399292, "rewards/python_attempt_reward_func": 1.671875, "rewards/python_count_reward_func": 1.255859375, "rewards/python_reward_func": 0.7458914518356323, "rewards/tool_execution_reward_func": 0.7345439791679382, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.20843405855492883, "grad_norm": 0.24648750318525628, "kl": 2.21875, "learning_rate": 1e-06, "loss": 0.0257, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5993.0, "completions/max_terminated_length": 5993.0, "completions/mean_length": 1079.15625, "completions/mean_terminated_length": 1079.15625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.20950846091861403, "grad_norm": 0.3459661297616505, "kl": 2.8326416015625, "learning_rate": 1e-06, "loss": 0.0654, "num_tokens": 64975848.0, "reward": 0.7157565951347351, "reward_std": 0.2364378273487091, "rewards/correct_answer_reward_func": 0.3671875, "rewards/format_reward_func": 0.9962332248687744, "rewards/python_attempt_reward_func": 2.25390625, "rewards/python_count_reward_func": 1.55859375, "rewards/python_reward_func": 0.7591122388839722, "rewards/tool_execution_reward_func": 0.7466123104095459, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.21058286328229922, "grad_norm": 0.3505316667299284, "kl": 3.122802734375, "learning_rate": 1e-06, "loss": 0.0656, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3383.0, "completions/max_terminated_length": 3383.0, "completions/mean_length": 664.810546875, "completions/mean_terminated_length": 664.810546875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.21165726564598442, "grad_norm": 0.34010007502087447, "kl": 2.83935546875, "learning_rate": 1e-06, "loss": 0.016, "num_tokens": 65581351.0, "reward": 0.9867193698883057, "reward_std": 0.20384810864925385, "rewards/correct_answer_reward_func": 0.6328125, "rewards/format_reward_func": 0.9984592199325562, "rewards/python_attempt_reward_func": 1.5234375, "rewards/python_count_reward_func": 1.173828125, "rewards/python_reward_func": 0.7802091240882874, "rewards/tool_execution_reward_func": 0.7710751891136169, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2127316680096696, "grad_norm": 0.3267099121123493, "kl": 2.8663330078125, "learning_rate": 1e-06, "loss": 0.016, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6701.0, "completions/max_terminated_length": 6701.0, "completions/mean_length": 941.85546875, "completions/mean_terminated_length": 941.85546875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.21380607037335483, "grad_norm": 0.30469796722995224, "kl": 1.745849609375, "learning_rate": 1e-06, "loss": 0.055, "num_tokens": 66352957.0, "reward": 0.7681488990783691, "reward_std": 0.21544794738292694, "rewards/correct_answer_reward_func": 0.416015625, "rewards/format_reward_func": 0.9961133003234863, "rewards/python_attempt_reward_func": 1.7890625, "rewards/python_count_reward_func": 1.392578125, "rewards/python_reward_func": 0.7727283239364624, "rewards/tool_execution_reward_func": 0.7645531296730042, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.21488047273704003, "grad_norm": 0.28201191985249213, "kl": 1.751953125, "learning_rate": 1e-06, "loss": 0.055, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6720.0, "completions/max_terminated_length": 6720.0, "completions/mean_length": 958.0625, "completions/mean_terminated_length": 958.0625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.21595487510072522, "grad_norm": 0.44272699887872774, "kl": 0.5402679443359375, "learning_rate": 1e-06, "loss": 0.0367, "num_tokens": 67122269.0, "reward": 0.7662996649742126, "reward_std": 0.24068810045719147, "rewards/correct_answer_reward_func": 0.416015625, "rewards/format_reward_func": 0.9867903590202332, "rewards/python_attempt_reward_func": 1.75, "rewards/python_count_reward_func": 1.349609375, "rewards/python_reward_func": 0.7718773484230042, "rewards/tool_execution_reward_func": 0.7646298408508301, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.21702927746441042, "grad_norm": 0.43510529346531357, "kl": 0.600372314453125, "learning_rate": 1e-06, "loss": 0.0368, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7446.0, "completions/max_terminated_length": 7446.0, "completions/mean_length": 1247.15234375, "completions/mean_terminated_length": 1247.15234375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.21810367982809561, "grad_norm": 0.3930062333087388, "kl": 0.9957275390625, "learning_rate": 1e-06, "loss": 0.0415, "num_tokens": 68050859.0, "reward": 0.7801809310913086, "reward_std": 0.2143365889787674, "rewards/correct_answer_reward_func": 0.44921875, "rewards/format_reward_func": 0.9931836128234863, "rewards/python_attempt_reward_func": 2.369140625, "rewards/python_count_reward_func": 1.662109375, "rewards/python_reward_func": 0.6730585098266602, "rewards/tool_execution_reward_func": 0.6616273522377014, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2191780821917808, "grad_norm": 0.36915755309932446, "kl": 1.0137939453125, "learning_rate": 1e-06, "loss": 0.0415, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7208.0, "completions/max_terminated_length": 7208.0, "completions/mean_length": 990.87890625, "completions/mean_terminated_length": 990.87890625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.22025248455546603, "grad_norm": 0.31282721356602583, "kl": 0.72283935546875, "learning_rate": 1e-06, "loss": 0.0642, "num_tokens": 68845453.0, "reward": 0.8166325688362122, "reward_std": 0.23826858401298523, "rewards/correct_answer_reward_func": 0.462890625, "rewards/format_reward_func": 0.9967782497406006, "rewards/python_attempt_reward_func": 1.916015625, "rewards/python_count_reward_func": 1.490234375, "rewards/python_reward_func": 0.7780429124832153, "rewards/tool_execution_reward_func": 0.7719315886497498, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.22132688691915123, "grad_norm": 0.30532812611792975, "kl": 0.8726806640625, "learning_rate": 1e-06, "loss": 0.0643, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7312.0, "completions/max_terminated_length": 7312.0, "completions/mean_length": 1388.958984375, "completions/mean_terminated_length": 1388.958984375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.22240128928283642, "grad_norm": 0.3875068990517743, "kl": 1.7373046875, "learning_rate": 1e-06, "loss": 0.0139, "num_tokens": 69841368.0, "reward": 0.7827950119972229, "reward_std": 0.2220541089773178, "rewards/correct_answer_reward_func": 0.427734375, "rewards/format_reward_func": 0.9918427467346191, "rewards/python_attempt_reward_func": 2.462890625, "rewards/python_count_reward_func": 1.763671875, "rewards/python_reward_func": 0.8002464771270752, "rewards/tool_execution_reward_func": 0.7834604978561401, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.22347569164652162, "grad_norm": 0.28923511974157634, "kl": 1.906494140625, "learning_rate": 1e-06, "loss": 0.0141, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7354.0, "completions/max_terminated_length": 7354.0, "completions/mean_length": 1194.611328125, "completions/mean_terminated_length": 1194.611328125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2245500940102068, "grad_norm": 0.2735647719160465, "kl": 2.147705078125, "learning_rate": 1e-06, "loss": 0.0306, "num_tokens": 70740081.0, "reward": 0.8746207356452942, "reward_std": 0.27493181824684143, "rewards/correct_answer_reward_func": 0.51171875, "rewards/format_reward_func": 0.9949113130569458, "rewards/python_attempt_reward_func": 2.23828125, "rewards/python_count_reward_func": 1.75390625, "rewards/python_reward_func": 0.8339626789093018, "rewards/tool_execution_reward_func": 0.8195986747741699, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.22562449637389204, "grad_norm": 0.2952077623948561, "kl": 2.582275390625, "learning_rate": 1e-06, "loss": 0.0311, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7612.0, "completions/max_terminated_length": 7612.0, "completions/mean_length": 1007.609375, "completions/mean_terminated_length": 1007.609375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.22669889873757723, "grad_norm": 0.964016954550929, "kl": 4.3134765625, "learning_rate": 1e-06, "loss": 0.0796, "num_tokens": 71538601.0, "reward": 1.0122286081314087, "reward_std": 0.31813332438468933, "rewards/correct_answer_reward_func": 0.65234375, "rewards/format_reward_func": 0.9928992986679077, "rewards/python_attempt_reward_func": 1.994140625, "rewards/python_count_reward_func": 1.462890625, "rewards/python_reward_func": 0.8195871114730835, "rewards/tool_execution_reward_func": 0.8065251708030701, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.22777330110126243, "grad_norm": 0.8001628441709349, "kl": 4.60595703125, "learning_rate": 1e-06, "loss": 0.0799, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7526.0, "completions/max_terminated_length": 7526.0, "completions/mean_length": 1187.81640625, "completions/mean_terminated_length": 1187.81640625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.22884770346494762, "grad_norm": 0.6009544698374307, "kl": 3.863525390625, "learning_rate": 1e-06, "loss": 0.0444, "num_tokens": 72418443.0, "reward": 0.9286837577819824, "reward_std": 0.1935826539993286, "rewards/correct_answer_reward_func": 0.572265625, "rewards/format_reward_func": 0.9952245950698853, "rewards/python_attempt_reward_func": 2.1796875, "rewards/python_count_reward_func": 1.654296875, "rewards/python_reward_func": 0.8003131151199341, "rewards/tool_execution_reward_func": 0.7868660688400269, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.22992210582863282, "grad_norm": 0.39227021151289126, "kl": 3.768310546875, "learning_rate": 1e-06, "loss": 0.0443, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5734.0, "completions/max_terminated_length": 5734.0, "completions/mean_length": 1038.798828125, "completions/mean_terminated_length": 1038.798828125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.230996508192318, "grad_norm": 0.47063741494550587, "kl": 4.43505859375, "learning_rate": 1e-06, "loss": 0.0559, "num_tokens": 73239172.0, "reward": 0.834419846534729, "reward_std": 0.266712486743927, "rewards/correct_answer_reward_func": 0.48828125, "rewards/format_reward_func": 0.9945619106292725, "rewards/python_attempt_reward_func": 1.9765625, "rewards/python_count_reward_func": 1.443359375, "rewards/python_reward_func": 0.7473183274269104, "rewards/tool_execution_reward_func": 0.7361312508583069, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.23207091055600323, "grad_norm": 0.4374438071246729, "kl": 4.337158203125, "learning_rate": 1e-06, "loss": 0.0558, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7474.0, "completions/max_terminated_length": 7474.0, "completions/mean_length": 1350.86328125, "completions/mean_terminated_length": 1350.86328125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.23314531291968843, "grad_norm": 0.38449531759915434, "kl": 4.4296875, "learning_rate": 1e-06, "loss": 0.0838, "num_tokens": 74216638.0, "reward": 0.8545554876327515, "reward_std": 0.2969059944152832, "rewards/correct_answer_reward_func": 0.49609375, "rewards/format_reward_func": 0.9930658340454102, "rewards/python_attempt_reward_func": 2.525390625, "rewards/python_count_reward_func": 1.828125, "rewards/python_reward_func": 0.8110677003860474, "rewards/tool_execution_reward_func": 0.7992427349090576, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.23421971528337363, "grad_norm": 0.3347831508571784, "kl": 4.21240234375, "learning_rate": 1e-06, "loss": 0.0836, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6086.0, "completions/max_terminated_length": 6086.0, "completions/mean_length": 846.935546875, "completions/mean_terminated_length": 846.935546875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.23529411764705882, "grad_norm": 0.39244425727217047, "kl": 3.273193359375, "learning_rate": 1e-06, "loss": 0.0545, "num_tokens": 74922685.0, "reward": 0.940233588218689, "reward_std": 0.1665063202381134, "rewards/correct_answer_reward_func": 0.5703125, "rewards/format_reward_func": 0.9984667897224426, "rewards/python_attempt_reward_func": 1.703125, "rewards/python_count_reward_func": 1.341796875, "rewards/python_reward_func": 0.8584534525871277, "rewards/tool_execution_reward_func": 0.8511385917663574, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.23636852001074402, "grad_norm": 0.3671289280649945, "kl": 3.1669921875, "learning_rate": 1e-06, "loss": 0.0543, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8055.0, "completions/max_terminated_length": 8055.0, "completions/mean_length": 1004.23828125, "completions/mean_terminated_length": 1004.23828125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.2374429223744292, "grad_norm": 4.719428468417647, "kl": 7.929931640625, "learning_rate": 1e-06, "loss": 0.0583, "num_tokens": 75720183.0, "reward": 0.7979421615600586, "reward_std": 0.19189885258674622, "rewards/correct_answer_reward_func": 0.431640625, "rewards/format_reward_func": 0.9972529411315918, "rewards/python_attempt_reward_func": 1.69140625, "rewards/python_count_reward_func": 1.419921875, "rewards/python_reward_func": 0.8474826812744141, "rewards/tool_execution_reward_func": 0.8342548608779907, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.23851732473811443, "grad_norm": 1.633050701596008, "kl": 5.0794677734375, "learning_rate": 1e-06, "loss": 0.0555, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7379.0, "completions/max_terminated_length": 7379.0, "completions/mean_length": 1140.44921875, "completions/mean_terminated_length": 1140.44921875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.23959172710179963, "grad_norm": 0.3795314153058355, "kl": 4.8115234375, "learning_rate": 1e-06, "loss": 0.0695, "num_tokens": 76575613.0, "reward": 0.8959815502166748, "reward_std": 0.320169061422348, "rewards/correct_answer_reward_func": 0.53125, "rewards/format_reward_func": 0.9877474308013916, "rewards/python_attempt_reward_func": 2.4609375, "rewards/python_count_reward_func": 1.904296875, "rewards/python_reward_func": 0.8473454713821411, "rewards/tool_execution_reward_func": 0.8359103798866272, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.24066612946548482, "grad_norm": 0.3311500541420088, "kl": 4.9091796875, "learning_rate": 1e-06, "loss": 0.0696, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8455.0, "completions/max_terminated_length": 8455.0, "completions/mean_length": 1252.404296875, "completions/mean_terminated_length": 1252.404296875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.24174053182917002, "grad_norm": 2.4147601790659565, "kl": 7.45556640625, "learning_rate": 1e-06, "loss": 0.0932, "num_tokens": 77500396.0, "reward": 0.950271487236023, "reward_std": 0.2508386969566345, "rewards/correct_answer_reward_func": 0.59375, "rewards/format_reward_func": 0.9959781169891357, "rewards/python_attempt_reward_func": 2.296875, "rewards/python_count_reward_func": 1.712890625, "rewards/python_reward_func": 0.7964541912078857, "rewards/tool_execution_reward_func": 0.7866296768188477, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.24281493419285521, "grad_norm": 0.9538582473569891, "kl": 5.82373046875, "learning_rate": 1e-06, "loss": 0.0916, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6703.0, "completions/max_terminated_length": 6703.0, "completions/mean_length": 683.466796875, "completions/mean_terminated_length": 683.466796875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.24388933655654044, "grad_norm": 0.4353781721616983, "kl": 5.201904296875, "learning_rate": 1e-06, "loss": 0.0712, "num_tokens": 78119835.0, "reward": 0.9478119611740112, "reward_std": 0.1921873688697815, "rewards/correct_answer_reward_func": 0.578125, "rewards/format_reward_func": 0.9979166984558105, "rewards/python_attempt_reward_func": 1.599609375, "rewards/python_count_reward_func": 1.3359375, "rewards/python_reward_func": 0.863116979598999, "rewards/tool_execution_reward_func": 0.850518524646759, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.24496373892022563, "grad_norm": 0.3569099809321047, "kl": 4.8414306640625, "learning_rate": 1e-06, "loss": 0.0708, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6084.0, "completions/max_terminated_length": 6084.0, "completions/mean_length": 860.611328125, "completions/mean_terminated_length": 860.611328125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.24603814128391083, "grad_norm": 0.8403275073582749, "kl": 9.31689453125, "learning_rate": 1e-06, "loss": 0.0777, "num_tokens": 78832436.0, "reward": 0.9972695708274841, "reward_std": 0.17482107877731323, "rewards/correct_answer_reward_func": 0.6484375, "rewards/format_reward_func": 0.993058979511261, "rewards/python_attempt_reward_func": 1.779296875, "rewards/python_count_reward_func": 1.41796875, "rewards/python_reward_func": 0.7705527544021606, "rewards/tool_execution_reward_func": 0.7511013746261597, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.24711254364759602, "grad_norm": 0.6181514871147658, "kl": 8.08935546875, "learning_rate": 1e-06, "loss": 0.0765, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4876.0, "completions/max_terminated_length": 4876.0, "completions/mean_length": 943.396484375, "completions/mean_terminated_length": 943.396484375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.24818694601128122, "grad_norm": 0.3229147340015564, "kl": 3.882568359375, "learning_rate": 1e-06, "loss": 0.0408, "num_tokens": 79594367.0, "reward": 0.801682710647583, "reward_std": 0.30930235981941223, "rewards/correct_answer_reward_func": 0.43359375, "rewards/format_reward_func": 0.9945703148841858, "rewards/python_attempt_reward_func": 2.083984375, "rewards/python_count_reward_func": 1.666015625, "rewards/python_reward_func": 0.8549959659576416, "rewards/tool_execution_reward_func": 0.8458744287490845, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2492613483749664, "grad_norm": 0.2523609575439555, "kl": 3.3427734375, "learning_rate": 1e-06, "loss": 0.0402, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6701.0, "completions/max_terminated_length": 6701.0, "completions/mean_length": 1030.5703125, "completions/mean_terminated_length": 1030.5703125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2503357507386516, "grad_norm": 0.30136701729418924, "kl": 3.6898193359375, "learning_rate": 1e-06, "loss": 0.0432, "num_tokens": 80405763.0, "reward": 0.9304864406585693, "reward_std": 0.24752673506736755, "rewards/correct_answer_reward_func": 0.568359375, "rewards/format_reward_func": 0.9946949481964111, "rewards/python_attempt_reward_func": 2.16796875, "rewards/python_count_reward_func": 1.6328125, "rewards/python_reward_func": 0.8242225646972656, "rewards/tool_execution_reward_func": 0.8159404397010803, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.25141015310233683, "grad_norm": 0.3020465051549465, "kl": 3.4061279296875, "learning_rate": 1e-06, "loss": 0.0429, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6865.0, "completions/max_terminated_length": 6865.0, "completions/mean_length": 1574.7265625, "completions/mean_terminated_length": 1574.7265625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.252484555466022, "grad_norm": 0.21528629616688705, "kl": 2.597412109375, "learning_rate": 1e-06, "loss": 0.0408, "num_tokens": 81497463.0, "reward": 0.7755173444747925, "reward_std": 0.26789799332618713, "rewards/correct_answer_reward_func": 0.416015625, "rewards/format_reward_func": 0.9942734241485596, "rewards/python_attempt_reward_func": 2.767578125, "rewards/python_count_reward_func": 2.0625, "rewards/python_reward_func": 0.8152785301208496, "rewards/tool_execution_reward_func": 0.8032350540161133, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2535589578297072, "grad_norm": 0.2045121767031371, "kl": 2.433349609375, "learning_rate": 1e-06, "loss": 0.0407, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7571.0, "completions/max_terminated_length": 7571.0, "completions/mean_length": 1098.421875, "completions/mean_terminated_length": 1098.421875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.25463336019339244, "grad_norm": 0.30831225015193464, "kl": 3.0888671875, "learning_rate": 1e-06, "loss": 0.0512, "num_tokens": 82330095.0, "reward": 0.8915045261383057, "reward_std": 0.2799481153488159, "rewards/correct_answer_reward_func": 0.537109375, "rewards/format_reward_func": 0.9926695823669434, "rewards/python_attempt_reward_func": 2.107421875, "rewards/python_count_reward_func": 1.6484375, "rewards/python_reward_func": 0.7989296913146973, "rewards/tool_execution_reward_func": 0.779306173324585, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2557077625570776, "grad_norm": 0.3093077473928091, "kl": 3.235595703125, "learning_rate": 1e-06, "loss": 0.0513, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7367.0, "completions/max_terminated_length": 7367.0, "completions/mean_length": 1149.53515625, "completions/mean_terminated_length": 1149.53515625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.25678216492076283, "grad_norm": 0.34299644573467697, "kl": 2.7286376953125, "learning_rate": 1e-06, "loss": 0.0892, "num_tokens": 83196929.0, "reward": 0.877892792224884, "reward_std": 0.2363237589597702, "rewards/correct_answer_reward_func": 0.515625, "rewards/format_reward_func": 0.9951747059822083, "rewards/python_attempt_reward_func": 2.2734375, "rewards/python_count_reward_func": 1.779296875, "rewards/python_reward_func": 0.822381854057312, "rewards/tool_execution_reward_func": 0.816164493560791, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.257856567284448, "grad_norm": 0.3567958560819511, "kl": 3.1865234375, "learning_rate": 1e-06, "loss": 0.0897, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7114.0, "completions/max_terminated_length": 7114.0, "completions/mean_length": 1052.146484375, "completions/mean_terminated_length": 1052.146484375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2589309696481332, "grad_norm": 0.39152016026674896, "kl": 3.7958984375, "learning_rate": 1e-06, "loss": 0.0566, "num_tokens": 84003628.0, "reward": 1.085820198059082, "reward_std": 0.2424648404121399, "rewards/correct_answer_reward_func": 0.716796875, "rewards/format_reward_func": 0.99602210521698, "rewards/python_attempt_reward_func": 2.255859375, "rewards/python_count_reward_func": 1.720703125, "rewards/python_reward_func": 0.8560926914215088, "rewards/tool_execution_reward_func": 0.8490947484970093, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.26000537201181845, "grad_norm": 0.2832748547494978, "kl": 3.85498046875, "learning_rate": 1e-06, "loss": 0.0567, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7424.0, "completions/max_terminated_length": 7424.0, "completions/mean_length": 1298.306640625, "completions/mean_terminated_length": 1298.306640625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2610797743755036, "grad_norm": 0.5314879235210641, "kl": 7.5546875, "learning_rate": 1e-06, "loss": 0.0706, "num_tokens": 84963049.0, "reward": 0.7816027998924255, "reward_std": 0.358371376991272, "rewards/correct_answer_reward_func": 0.42578125, "rewards/format_reward_func": 0.9907143115997314, "rewards/python_attempt_reward_func": 2.580078125, "rewards/python_count_reward_func": 1.787109375, "rewards/python_reward_func": 0.803045928478241, "rewards/tool_execution_reward_func": 0.7883936166763306, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.26215417673918884, "grad_norm": 0.43718795565440516, "kl": 7.578125, "learning_rate": 1e-06, "loss": 0.0707, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5834.0, "completions/max_terminated_length": 5834.0, "completions/mean_length": 868.22265625, "completions/mean_terminated_length": 868.22265625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.263228579102874, "grad_norm": 0.5783648805390917, "kl": 4.75506591796875, "learning_rate": 1e-06, "loss": 0.065, "num_tokens": 85680315.0, "reward": 1.0560284852981567, "reward_std": 0.24768708646297455, "rewards/correct_answer_reward_func": 0.6875, "rewards/format_reward_func": 0.9990885257720947, "rewards/python_attempt_reward_func": 1.86328125, "rewards/python_count_reward_func": 1.474609375, "rewards/python_reward_func": 0.849239706993103, "rewards/tool_execution_reward_func": 0.843553900718689, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.26430298146655923, "grad_norm": 0.5018060984862908, "kl": 4.52587890625, "learning_rate": 1e-06, "loss": 0.0648, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7182.0, "completions/max_terminated_length": 7182.0, "completions/mean_length": 1334.931640625, "completions/mean_terminated_length": 1334.931640625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.26537738383024445, "grad_norm": 0.4266105303554584, "kl": 5.056640625, "learning_rate": 1e-06, "loss": 0.0245, "num_tokens": 86647768.0, "reward": 0.9107307195663452, "reward_std": 0.3422984480857849, "rewards/correct_answer_reward_func": 0.552734375, "rewards/format_reward_func": 0.9944162368774414, "rewards/python_attempt_reward_func": 2.39453125, "rewards/python_count_reward_func": 1.6796875, "rewards/python_reward_func": 0.8098485469818115, "rewards/tool_execution_reward_func": 0.7955659627914429, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2664517861939296, "grad_norm": 0.3587072099028023, "kl": 4.7713623046875, "learning_rate": 1e-06, "loss": 0.0242, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7294.0, "completions/max_terminated_length": 7294.0, "completions/mean_length": 1387.69140625, "completions/mean_terminated_length": 1387.69140625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.26752618855761484, "grad_norm": 0.46142741513027985, "kl": 7.18212890625, "learning_rate": 1e-06, "loss": 0.0976, "num_tokens": 87633978.0, "reward": 0.811184823513031, "reward_std": 0.2305002212524414, "rewards/correct_answer_reward_func": 0.462890625, "rewards/format_reward_func": 0.9942535161972046, "rewards/python_attempt_reward_func": 2.607421875, "rewards/python_count_reward_func": 1.8125, "rewards/python_reward_func": 0.7804772853851318, "rewards/tool_execution_reward_func": 0.7472175359725952, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2686005909213, "grad_norm": 0.35652814840545327, "kl": 6.57861328125, "learning_rate": 1e-06, "loss": 0.097, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6885.0, "completions/max_terminated_length": 6885.0, "completions/mean_length": 1141.103515625, "completions/mean_terminated_length": 1141.103515625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.26967499328498523, "grad_norm": 0.4434411966323654, "kl": 4.1226806640625, "learning_rate": 1e-06, "loss": 0.04, "num_tokens": 88485295.0, "reward": 0.872504711151123, "reward_std": 0.23566101491451263, "rewards/correct_answer_reward_func": 0.505859375, "rewards/format_reward_func": 0.9959244728088379, "rewards/python_attempt_reward_func": 2.515625, "rewards/python_count_reward_func": 1.890625, "rewards/python_reward_func": 0.8439569473266602, "rewards/tool_execution_reward_func": 0.8373023271560669, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.27074939564867045, "grad_norm": 0.3231589431682435, "kl": 3.692138671875, "learning_rate": 1e-06, "loss": 0.0396, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6960.0, "completions/max_terminated_length": 6960.0, "completions/mean_length": 1127.0703125, "completions/mean_terminated_length": 1127.0703125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2718237980123556, "grad_norm": 0.3314408623063777, "kl": 3.487548828125, "learning_rate": 1e-06, "loss": 0.0341, "num_tokens": 89335411.0, "reward": 0.912482500076294, "reward_std": 0.16836607456207275, "rewards/correct_answer_reward_func": 0.5546875, "rewards/format_reward_func": 0.9961718320846558, "rewards/python_attempt_reward_func": 2.400390625, "rewards/python_count_reward_func": 1.818359375, "rewards/python_reward_func": 0.8039302825927734, "rewards/tool_execution_reward_func": 0.7928029298782349, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.27289820037604084, "grad_norm": 0.28088691860286286, "kl": 3.2529296875, "learning_rate": 1e-06, "loss": 0.0339, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7375.0, "completions/max_terminated_length": 7375.0, "completions/mean_length": 1253.10546875, "completions/mean_terminated_length": 1253.10546875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.273972602739726, "grad_norm": 0.20629286951652578, "kl": 2.376953125, "learning_rate": 1e-06, "loss": 0.0245, "num_tokens": 90271273.0, "reward": 0.8955801129341125, "reward_std": 0.2396700382232666, "rewards/correct_answer_reward_func": 0.533203125, "rewards/format_reward_func": 0.9940125942230225, "rewards/python_attempt_reward_func": 2.45703125, "rewards/python_count_reward_func": 1.701171875, "rewards/python_reward_func": 0.8215091824531555, "rewards/tool_execution_reward_func": 0.8178726434707642, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.27504700510341124, "grad_norm": 0.20228122800935705, "kl": 2.3583984375, "learning_rate": 1e-06, "loss": 0.0245, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6862.0, "completions/max_terminated_length": 6862.0, "completions/mean_length": 1254.4765625, "completions/mean_terminated_length": 1254.4765625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2761214074670964, "grad_norm": 0.35324629649709804, "kl": 2.94921875, "learning_rate": 1e-06, "loss": -0.0321, "num_tokens": 91202845.0, "reward": 0.8933559656143188, "reward_std": 0.2493988275527954, "rewards/correct_answer_reward_func": 0.541015625, "rewards/format_reward_func": 0.9814918041229248, "rewards/python_attempt_reward_func": 2.271484375, "rewards/python_count_reward_func": 1.6015625, "rewards/python_reward_func": 0.7866877317428589, "rewards/tool_execution_reward_func": 0.7802098989486694, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2771958098307816, "grad_norm": 0.29605958734193044, "kl": 2.8046875, "learning_rate": 1e-06, "loss": -0.0322, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7228.0, "completions/max_terminated_length": 7228.0, "completions/mean_length": 1143.15625, "completions/mean_terminated_length": 1143.15625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.27827021219446685, "grad_norm": 0.4679693003435708, "kl": 3.01708984375, "learning_rate": 1e-06, "loss": 0.0782, "num_tokens": 92073357.0, "reward": 0.9647353291511536, "reward_std": 0.19333884119987488, "rewards/correct_answer_reward_func": 0.611328125, "rewards/format_reward_func": 0.99776691198349, "rewards/python_attempt_reward_func": 2.2421875, "rewards/python_count_reward_func": 1.607421875, "rewards/python_reward_func": 0.7768725156784058, "rewards/tool_execution_reward_func": 0.7692692875862122, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.279344614558152, "grad_norm": 0.4698435419980313, "kl": 2.982421875, "learning_rate": 1e-06, "loss": 0.0781, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6858.0, "completions/max_terminated_length": 6858.0, "completions/mean_length": 1063.947265625, "completions/mean_terminated_length": 1063.947265625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.28041901692183724, "grad_norm": 0.4552849258736135, "kl": 4.1064453125, "learning_rate": 1e-06, "loss": 0.0821, "num_tokens": 92893298.0, "reward": 0.9417753219604492, "reward_std": 0.27139800786972046, "rewards/correct_answer_reward_func": 0.578125, "rewards/format_reward_func": 0.9990918040275574, "rewards/python_attempt_reward_func": 2.55078125, "rewards/python_count_reward_func": 1.91015625, "rewards/python_reward_func": 0.8251030445098877, "rewards/tool_execution_reward_func": 0.819159984588623, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2814934192855224, "grad_norm": 0.4136230006281678, "kl": 4.4638671875, "learning_rate": 1e-06, "loss": 0.0825, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5961.0, "completions/max_terminated_length": 5961.0, "completions/mean_length": 971.28125, "completions/mean_terminated_length": 971.28125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.28256782164920763, "grad_norm": 0.48261348618947464, "kl": 4.1259765625, "learning_rate": 1e-06, "loss": 0.092, "num_tokens": 93660642.0, "reward": 0.7837750911712646, "reward_std": 0.20174726843833923, "rewards/correct_answer_reward_func": 0.41796875, "rewards/format_reward_func": 0.9980664253234863, "rewards/python_attempt_reward_func": 2.107421875, "rewards/python_count_reward_func": 1.529296875, "rewards/python_reward_func": 0.8383386135101318, "rewards/tool_execution_reward_func": 0.8309655785560608, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.28364222401289285, "grad_norm": 0.4808406241148242, "kl": 4.98291015625, "learning_rate": 1e-06, "loss": 0.0929, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6989.0, "completions/max_terminated_length": 6989.0, "completions/mean_length": 1246.337890625, "completions/mean_terminated_length": 1246.337890625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.284716626376578, "grad_norm": 0.6880701175566324, "kl": 4.62109375, "learning_rate": 1e-06, "loss": 0.0227, "num_tokens": 94588303.0, "reward": 0.8699891567230225, "reward_std": 0.21225351095199585, "rewards/correct_answer_reward_func": 0.513671875, "rewards/format_reward_func": 0.9924153089523315, "rewards/python_attempt_reward_func": 2.30078125, "rewards/python_count_reward_func": 1.732421875, "rewards/python_reward_func": 0.802823543548584, "rewards/tool_execution_reward_func": 0.789171040058136, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.28579102874026324, "grad_norm": 0.7091778432597814, "kl": 5.134765625, "learning_rate": 1e-06, "loss": 0.0232, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6553.0, "completions/max_terminated_length": 6553.0, "completions/mean_length": 970.9140625, "completions/mean_terminated_length": 970.9140625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.2868654311039484, "grad_norm": 0.37307298589528887, "kl": 4.310546875, "learning_rate": 1e-06, "loss": 0.0621, "num_tokens": 95371619.0, "reward": 0.8533837199211121, "reward_std": 0.2156163454055786, "rewards/correct_answer_reward_func": 0.484375, "rewards/format_reward_func": 0.9984375238418579, "rewards/python_attempt_reward_func": 2.234375, "rewards/python_count_reward_func": 1.86328125, "rewards/python_reward_func": 0.850946307182312, "rewards/tool_execution_reward_func": 0.8466060757637024, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.28793983346763363, "grad_norm": 0.30451851906267763, "kl": 4.2314453125, "learning_rate": 1e-06, "loss": 0.0621, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6939.0, "completions/max_terminated_length": 6939.0, "completions/mean_length": 1200.587890625, "completions/mean_terminated_length": 1200.587890625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.28901423583131886, "grad_norm": 0.3546875890456202, "kl": 3.976806640625, "learning_rate": 1e-06, "loss": 0.0393, "num_tokens": 96272688.0, "reward": 0.777186930179596, "reward_std": 0.21054261922836304, "rewards/correct_answer_reward_func": 0.416015625, "rewards/format_reward_func": 0.9993445873260498, "rewards/python_attempt_reward_func": 2.20703125, "rewards/python_count_reward_func": 1.69921875, "rewards/python_reward_func": 0.8161667585372925, "rewards/tool_execution_reward_func": 0.8065119981765747, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.290088638195004, "grad_norm": 0.3345537631574293, "kl": 3.851806640625, "learning_rate": 1e-06, "loss": 0.0392, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4018.0, "completions/max_terminated_length": 4018.0, "completions/mean_length": 726.23046875, "completions/mean_terminated_length": 726.23046875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.29116304055868925, "grad_norm": 0.37748744531399453, "kl": 3.7386474609375, "learning_rate": 1e-06, "loss": 0.0892, "num_tokens": 96910502.0, "reward": 0.9568201303482056, "reward_std": 0.16565418243408203, "rewards/correct_answer_reward_func": 0.58203125, "rewards/format_reward_func": 0.997473955154419, "rewards/python_attempt_reward_func": 1.810546875, "rewards/python_count_reward_func": 1.552734375, "rewards/python_reward_func": 0.8786094188690186, "rewards/tool_execution_reward_func": 0.8764702677726746, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2922374429223744, "grad_norm": 0.3746818464522976, "kl": 3.6004791259765625, "learning_rate": 1e-06, "loss": 0.089, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7319.0, "completions/max_terminated_length": 7319.0, "completions/mean_length": 1161.267578125, "completions/mean_terminated_length": 1161.267578125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.29331184528605964, "grad_norm": 0.47421806897272173, "kl": 7.60498046875, "learning_rate": 1e-06, "loss": 0.0523, "num_tokens": 97774159.0, "reward": 0.7042557597160339, "reward_std": 0.24961255490779877, "rewards/correct_answer_reward_func": 0.34765625, "rewards/format_reward_func": 0.9971182346343994, "rewards/python_attempt_reward_func": 2.15234375, "rewards/python_count_reward_func": 1.505859375, "rewards/python_reward_func": 0.7954962253570557, "rewards/tool_execution_reward_func": 0.785879373550415, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2943862476497448, "grad_norm": 0.4305981751968214, "kl": 7.3134765625, "learning_rate": 1e-06, "loss": 0.052, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7419.0, "completions/max_terminated_length": 7419.0, "completions/mean_length": 1129.650390625, "completions/mean_terminated_length": 1129.650390625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.29546065001343, "grad_norm": 0.34391699492690864, "kl": 5.841796875, "learning_rate": 1e-06, "loss": 0.0927, "num_tokens": 98624284.0, "reward": 0.8376463055610657, "reward_std": 0.15532733500003815, "rewards/correct_answer_reward_func": 0.466796875, "rewards/format_reward_func": 0.9967866539955139, "rewards/python_attempt_reward_func": 2.7421875, "rewards/python_count_reward_func": 2.07421875, "rewards/python_reward_func": 0.8660706281661987, "rewards/tool_execution_reward_func": 0.857460618019104, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.29653505237711525, "grad_norm": 0.36127629424339963, "kl": 5.67578125, "learning_rate": 1e-06, "loss": 0.0925, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7467.0, "completions/max_terminated_length": 7467.0, "completions/mean_length": 1025.146484375, "completions/mean_terminated_length": 1025.146484375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2976094547408004, "grad_norm": 0.3061464836121289, "kl": 4.154052734375, "learning_rate": 1e-06, "loss": 0.0376, "num_tokens": 99426439.0, "reward": 0.8263065814971924, "reward_std": 0.15674075484275818, "rewards/correct_answer_reward_func": 0.458984375, "rewards/format_reward_func": 0.9930533766746521, "rewards/python_attempt_reward_func": 2.11328125, "rewards/python_count_reward_func": 1.65625, "rewards/python_reward_func": 0.8535621166229248, "rewards/tool_execution_reward_func": 0.8435578346252441, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.29868385710448564, "grad_norm": 0.293573354073546, "kl": 4.28466796875, "learning_rate": 1e-06, "loss": 0.0377, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7321.0, "completions/max_terminated_length": 7321.0, "completions/mean_length": 969.294921875, "completions/mean_terminated_length": 969.294921875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2997582594681708, "grad_norm": 0.25326074556918493, "kl": 3.95849609375, "learning_rate": 1e-06, "loss": 0.0682, "num_tokens": 100207230.0, "reward": 0.9373247623443604, "reward_std": 0.12485288083553314, "rewards/correct_answer_reward_func": 0.572265625, "rewards/format_reward_func": 0.9939583539962769, "rewards/python_attempt_reward_func": 1.97265625, "rewards/python_count_reward_func": 1.63671875, "rewards/python_reward_func": 0.8410420417785645, "rewards/tool_execution_reward_func": 0.8313376307487488, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.30083266183185603, "grad_norm": 0.2624739755399176, "kl": 4.19677734375, "learning_rate": 1e-06, "loss": 0.0685, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6995.0, "completions/max_terminated_length": 6995.0, "completions/mean_length": 1020.251953125, "completions/mean_terminated_length": 1020.251953125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.30190706419554125, "grad_norm": 0.30799885158706974, "kl": 4.38134765625, "learning_rate": 1e-06, "loss": 0.1022, "num_tokens": 101005727.0, "reward": 0.827180802822113, "reward_std": 0.17602810263633728, "rewards/correct_answer_reward_func": 0.453125, "rewards/format_reward_func": 0.9945294260978699, "rewards/python_attempt_reward_func": 2.17578125, "rewards/python_count_reward_func": 1.794921875, "rewards/python_reward_func": 0.8815515041351318, "rewards/tool_execution_reward_func": 0.8757494688034058, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3029814665592264, "grad_norm": 0.3106693695524203, "kl": 4.647705078125, "learning_rate": 1e-06, "loss": 0.1025, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7019.0, "completions/max_terminated_length": 7019.0, "completions/mean_length": 935.63671875, "completions/mean_terminated_length": 935.63671875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.30405586892291164, "grad_norm": 0.3348697860455815, "kl": 6.6201171875, "learning_rate": 1e-06, "loss": 0.0979, "num_tokens": 101760261.0, "reward": 0.833647608757019, "reward_std": 0.2169898897409439, "rewards/correct_answer_reward_func": 0.470703125, "rewards/format_reward_func": 0.9975846409797668, "rewards/python_attempt_reward_func": 1.921875, "rewards/python_count_reward_func": 1.478515625, "rewards/python_reward_func": 0.8243682980537415, "rewards/tool_execution_reward_func": 0.8171378374099731, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3051302712865968, "grad_norm": 0.3049581311388452, "kl": 6.859619140625, "learning_rate": 1e-06, "loss": 0.0981, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7831.0, "completions/max_terminated_length": 7831.0, "completions/mean_length": 1096.005859375, "completions/mean_terminated_length": 1096.005859375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.30620467365028203, "grad_norm": 0.32907897198743447, "kl": 6.064453125, "learning_rate": 1e-06, "loss": 0.0343, "num_tokens": 102593160.0, "reward": 0.9224615693092346, "reward_std": 0.2010919600725174, "rewards/correct_answer_reward_func": 0.548828125, "rewards/format_reward_func": 0.9916775226593018, "rewards/python_attempt_reward_func": 2.263671875, "rewards/python_count_reward_func": 1.76953125, "rewards/python_reward_func": 0.8846741914749146, "rewards/tool_execution_reward_func": 0.8764896392822266, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.30727907601396726, "grad_norm": 0.31267035918433955, "kl": 6.0849609375, "learning_rate": 1e-06, "loss": 0.0343, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7123.0, "completions/max_terminated_length": 7123.0, "completions/mean_length": 797.33984375, "completions/mean_terminated_length": 797.33984375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.3083534783776524, "grad_norm": 0.42447404155494195, "kl": 6.5830078125, "learning_rate": 1e-06, "loss": 0.1399, "num_tokens": 103276246.0, "reward": 0.8188380599021912, "reward_std": 0.1865319311618805, "rewards/correct_answer_reward_func": 0.44140625, "rewards/format_reward_func": 0.9994140267372131, "rewards/python_attempt_reward_func": 1.908203125, "rewards/python_count_reward_func": 1.580078125, "rewards/python_reward_func": 0.8933353424072266, "rewards/tool_execution_reward_func": 0.8877449035644531, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.30942788074133765, "grad_norm": 0.40639939833322997, "kl": 6.8212890625, "learning_rate": 1e-06, "loss": 0.1401, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6992.0, "completions/max_terminated_length": 6992.0, "completions/mean_length": 1200.46875, "completions/mean_terminated_length": 1200.46875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3105022831050228, "grad_norm": 0.693549036095002, "kl": 8.810546875, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 104165478.0, "reward": 0.904310941696167, "reward_std": 0.2529483139514923, "rewards/correct_answer_reward_func": 0.546875, "rewards/format_reward_func": 0.9847460985183716, "rewards/python_attempt_reward_func": 2.447265625, "rewards/python_count_reward_func": 1.87890625, "rewards/python_reward_func": 0.8088293671607971, "rewards/tool_execution_reward_func": 0.8024336099624634, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.31157668546870804, "grad_norm": 0.685985511192068, "kl": 8.818359375, "learning_rate": 1e-06, "loss": -0.0003, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7312.0, "completions/max_terminated_length": 7312.0, "completions/mean_length": 1184.91796875, "completions/mean_terminated_length": 1184.91796875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3126510878323932, "grad_norm": 1.0633678919838028, "kl": 10.19921875, "learning_rate": 1e-06, "loss": 0.0575, "num_tokens": 105049244.0, "reward": 0.8679827451705933, "reward_std": 0.1353558897972107, "rewards/correct_answer_reward_func": 0.501953125, "rewards/format_reward_func": 0.9964061975479126, "rewards/python_attempt_reward_func": 2.482421875, "rewards/python_count_reward_func": 1.78515625, "rewards/python_reward_func": 0.841903805732727, "rewards/tool_execution_reward_func": 0.8337417840957642, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3137254901960784, "grad_norm": 0.3771334684922118, "kl": 9.041015625, "learning_rate": 1e-06, "loss": 0.0563, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6146.0, "completions/max_terminated_length": 6146.0, "completions/mean_length": 1310.5078125, "completions/mean_terminated_length": 1310.5078125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.31479989255976365, "grad_norm": 0.5427524388900242, "kl": 10.03515625, "learning_rate": 1e-06, "loss": 0.0719, "num_tokens": 106012864.0, "reward": 0.8725683689117432, "reward_std": 0.2535207271575928, "rewards/correct_answer_reward_func": 0.51171875, "rewards/format_reward_func": 0.9912128448486328, "rewards/python_attempt_reward_func": 2.705078125, "rewards/python_count_reward_func": 2.029296875, "rewards/python_reward_func": 0.8195599317550659, "rewards/tool_execution_reward_func": 0.8130355477333069, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3158742949234488, "grad_norm": 0.43841829028076246, "kl": 9.0537109375, "learning_rate": 1e-06, "loss": 0.0709, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6875.0, "completions/max_terminated_length": 6875.0, "completions/mean_length": 860.580078125, "completions/mean_terminated_length": 860.580078125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.31694869728713404, "grad_norm": 0.2692157479833395, "kl": 3.734375, "learning_rate": 1e-06, "loss": 0.0437, "num_tokens": 106717193.0, "reward": 1.0660090446472168, "reward_std": 0.13595138490200043, "rewards/correct_answer_reward_func": 0.685546875, "rewards/format_reward_func": 0.997960090637207, "rewards/python_attempt_reward_func": 2.0625, "rewards/python_count_reward_func": 1.703125, "rewards/python_reward_func": 0.907969057559967, "rewards/tool_execution_reward_func": 0.9043511152267456, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3180230996508192, "grad_norm": 0.27099883705913397, "kl": 3.451416015625, "learning_rate": 1e-06, "loss": 0.0434, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6906.0, "completions/max_terminated_length": 6906.0, "completions/mean_length": 1136.453125, "completions/mean_terminated_length": 1136.453125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.31909750201450443, "grad_norm": 0.462611534396906, "kl": 4.99609375, "learning_rate": 1e-06, "loss": 0.0997, "num_tokens": 107566769.0, "reward": 0.9160683155059814, "reward_std": 0.20744861662387848, "rewards/correct_answer_reward_func": 0.548828125, "rewards/format_reward_func": 0.9989508986473083, "rewards/python_attempt_reward_func": 2.650390625, "rewards/python_count_reward_func": 1.955078125, "rewards/python_reward_func": 0.8413891792297363, "rewards/tool_execution_reward_func": 0.8372504711151123, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.32017190437818965, "grad_norm": 0.44624322958098933, "kl": 5.07958984375, "learning_rate": 1e-06, "loss": 0.0997, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7374.0, "completions/max_terminated_length": 7374.0, "completions/mean_length": 1337.1875, "completions/mean_terminated_length": 1337.1875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3212463067418748, "grad_norm": 0.3722886939803352, "kl": 7.3330078125, "learning_rate": 1e-06, "loss": 0.0956, "num_tokens": 108520273.0, "reward": 0.891697347164154, "reward_std": 0.32055869698524475, "rewards/correct_answer_reward_func": 0.529296875, "rewards/format_reward_func": 0.9920231699943542, "rewards/python_attempt_reward_func": 2.83203125, "rewards/python_count_reward_func": 2.099609375, "rewards/python_reward_func": 0.8350330591201782, "rewards/tool_execution_reward_func": 0.8199792504310608, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.32232070910556004, "grad_norm": 0.3720014312941975, "kl": 7.28662109375, "learning_rate": 1e-06, "loss": 0.0955, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5881.0, "completions/max_terminated_length": 5881.0, "completions/mean_length": 1238.7109375, "completions/mean_terminated_length": 1238.7109375, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.3233951114692452, "grad_norm": 0.8184066364827487, "kl": 1.57781982421875, "learning_rate": 1e-06, "loss": 0.0808, "num_tokens": 109427293.0, "reward": 0.9172317981719971, "reward_std": 0.23061399161815643, "rewards/correct_answer_reward_func": 0.546875, "rewards/format_reward_func": 0.9974631071090698, "rewards/python_attempt_reward_func": 2.861328125, "rewards/python_count_reward_func": 2.0859375, "rewards/python_reward_func": 0.8665473461151123, "rewards/tool_execution_reward_func": 0.8543208837509155, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.32446951383293043, "grad_norm": 0.8176939141881384, "kl": 1.593170166015625, "learning_rate": 1e-06, "loss": 0.0808, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7051.0, "completions/max_terminated_length": 7051.0, "completions/mean_length": 1246.421875, "completions/mean_terminated_length": 1246.421875, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.32554391619661566, "grad_norm": 0.7271718835781644, "kl": 1.7904052734375, "learning_rate": 1e-06, "loss": 0.0749, "num_tokens": 110337749.0, "reward": 0.9503316879272461, "reward_std": 0.2673990726470947, "rewards/correct_answer_reward_func": 0.580078125, "rewards/format_reward_func": 0.9973111748695374, "rewards/python_attempt_reward_func": 2.529296875, "rewards/python_count_reward_func": 1.9921875, "rewards/python_reward_func": 0.8680331110954285, "rewards/tool_execution_reward_func": 0.8539566993713379, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3266183185603008, "grad_norm": 0.700444255705869, "kl": 2.0306396484375, "learning_rate": 1e-06, "loss": 0.0752, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5196.0, "completions/max_terminated_length": 5196.0, "completions/mean_length": 812.40234375, "completions/mean_terminated_length": 812.40234375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.32769272092398605, "grad_norm": 0.5804280202081895, "kl": 1.453369140625, "learning_rate": 1e-06, "loss": 0.072, "num_tokens": 111022211.0, "reward": 0.8069182634353638, "reward_std": 0.18226929008960724, "rewards/correct_answer_reward_func": 0.431640625, "rewards/format_reward_func": 0.9988671541213989, "rewards/python_attempt_reward_func": 2.111328125, "rewards/python_count_reward_func": 1.826171875, "rewards/python_reward_func": 0.88593590259552, "rewards/tool_execution_reward_func": 0.8775212168693542, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3287671232876712, "grad_norm": 0.5422347836966864, "kl": 1.9068603515625, "learning_rate": 1e-06, "loss": 0.0724, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7142.0, "completions/max_terminated_length": 7142.0, "completions/mean_length": 1341.771484375, "completions/mean_terminated_length": 1341.771484375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.32984152565135644, "grad_norm": 1.0171771248594133, "kl": 4.63134765625, "learning_rate": 1e-06, "loss": 0.0451, "num_tokens": 111982894.0, "reward": 0.8721405863761902, "reward_std": 0.2619967460632324, "rewards/correct_answer_reward_func": 0.515625, "rewards/format_reward_func": 0.99699467420578, "rewards/python_attempt_reward_func": 2.94921875, "rewards/python_count_reward_func": 2.12890625, "rewards/python_reward_func": 0.7970563769340515, "rewards/tool_execution_reward_func": 0.7855833172798157, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.33091592801504166, "grad_norm": 1.0528348636791434, "kl": 5.8056640625, "learning_rate": 1e-06, "loss": 0.0462, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7384.0, "completions/max_terminated_length": 7384.0, "completions/mean_length": 1369.32421875, "completions/mean_terminated_length": 1369.32421875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.33199033037872683, "grad_norm": 1.0197239442850687, "kl": 9.7578125, "learning_rate": 1e-06, "loss": 0.0626, "num_tokens": 112973108.0, "reward": 0.7650319337844849, "reward_std": 0.18703925609588623, "rewards/correct_answer_reward_func": 0.40625, "rewards/format_reward_func": 0.9973589777946472, "rewards/python_attempt_reward_func": 2.791015625, "rewards/python_count_reward_func": 1.955078125, "rewards/python_reward_func": 0.811015784740448, "rewards/tool_execution_reward_func": 0.7965510487556458, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.33306473274241205, "grad_norm": 0.9066912798139816, "kl": 10.8173828125, "learning_rate": 1e-06, "loss": 0.0637, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6782.0, "completions/max_terminated_length": 6782.0, "completions/mean_length": 1332.224609375, "completions/mean_terminated_length": 1332.224609375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.3341391351060972, "grad_norm": 0.8891776758424454, "kl": 11.2091064453125, "learning_rate": 1e-06, "loss": 0.1349, "num_tokens": 113935623.0, "reward": 0.8227730989456177, "reward_std": 0.19914670288562775, "rewards/correct_answer_reward_func": 0.4609375, "rewards/format_reward_func": 0.9967959523200989, "rewards/python_attempt_reward_func": 2.544921875, "rewards/python_count_reward_func": 1.7265625, "rewards/python_reward_func": 0.819326639175415, "rewards/tool_execution_reward_func": 0.8123822212219238, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.33521353746978244, "grad_norm": 0.8128728382627733, "kl": 11.3111572265625, "learning_rate": 1e-06, "loss": 0.135, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7508.0, "completions/max_terminated_length": 7508.0, "completions/mean_length": 2068.341796875, "completions/mean_terminated_length": 2068.341796875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3362879398334676, "grad_norm": 0.4613327783473211, "kl": 7.0107421875, "learning_rate": 1e-06, "loss": 0.0455, "num_tokens": 115281110.0, "reward": 0.8294681906700134, "reward_std": 0.20745569467544556, "rewards/correct_answer_reward_func": 0.482421875, "rewards/format_reward_func": 0.9927722215652466, "rewards/python_attempt_reward_func": 3.34375, "rewards/python_count_reward_func": 2.23046875, "rewards/python_reward_func": 0.752071738243103, "rewards/tool_execution_reward_func": 0.7424595355987549, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.33736234219715283, "grad_norm": 0.391680965330086, "kl": 6.7275390625, "learning_rate": 1e-06, "loss": 0.0453, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5934.0, "completions/max_terminated_length": 5934.0, "completions/mean_length": 1085.447265625, "completions/mean_terminated_length": 1085.447265625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.33843674456083805, "grad_norm": 0.6398863683212199, "kl": 10.05078125, "learning_rate": 1e-06, "loss": 0.0925, "num_tokens": 116104923.0, "reward": 0.8426446914672852, "reward_std": 0.1787755787372589, "rewards/correct_answer_reward_func": 0.48046875, "rewards/format_reward_func": 0.9948990941047668, "rewards/python_attempt_reward_func": 2.10546875, "rewards/python_count_reward_func": 1.57421875, "rewards/python_reward_func": 0.8236646056175232, "rewards/tool_execution_reward_func": 0.8159807324409485, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3395111469245232, "grad_norm": 0.5661222267845725, "kl": 10.060302734375, "learning_rate": 1e-06, "loss": 0.0925, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6081.0, "completions/max_terminated_length": 6081.0, "completions/mean_length": 1084.462890625, "completions/mean_terminated_length": 1084.462890625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.34058554928820844, "grad_norm": 0.7209763158064156, "kl": 9.87109375, "learning_rate": 1e-06, "loss": 0.0683, "num_tokens": 116936744.0, "reward": 0.9359917640686035, "reward_std": 0.31436675786972046, "rewards/correct_answer_reward_func": 0.56640625, "rewards/format_reward_func": 0.9953320026397705, "rewards/python_attempt_reward_func": 2.291015625, "rewards/python_count_reward_func": 1.828125, "rewards/python_reward_func": 0.862179160118103, "rewards/tool_execution_reward_func": 0.8525956869125366, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3416599516518936, "grad_norm": 0.7968888416775589, "kl": 10.15625, "learning_rate": 1e-06, "loss": 0.0686, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5288.0, "completions/max_terminated_length": 5288.0, "completions/mean_length": 1176.763671875, "completions/mean_terminated_length": 1176.763671875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.34273435401557883, "grad_norm": 0.8693136250457819, "kl": 10.6494140625, "learning_rate": 1e-06, "loss": 0.1411, "num_tokens": 117811727.0, "reward": 0.949396550655365, "reward_std": 0.2499719113111496, "rewards/correct_answer_reward_func": 0.580078125, "rewards/format_reward_func": 0.9969773292541504, "rewards/python_attempt_reward_func": 2.595703125, "rewards/python_count_reward_func": 1.974609375, "rewards/python_reward_func": 0.8630836009979248, "rewards/tool_execution_reward_func": 0.8496148586273193, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.34380875637926406, "grad_norm": 0.5333212350007965, "kl": 9.900390625, "learning_rate": 1e-06, "loss": 0.1404, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5054.0, "completions/max_terminated_length": 5054.0, "completions/mean_length": 1075.650390625, "completions/mean_terminated_length": 1075.650390625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3448831587429492, "grad_norm": 0.4958488055306517, "kl": 10.8359375, "learning_rate": 1e-06, "loss": 0.1006, "num_tokens": 118648028.0, "reward": 0.8093916177749634, "reward_std": 0.17847992479801178, "rewards/correct_answer_reward_func": 0.435546875, "rewards/format_reward_func": 0.9960630536079407, "rewards/python_attempt_reward_func": 2.205078125, "rewards/python_count_reward_func": 1.90234375, "rewards/python_reward_func": 0.884813666343689, "rewards/tool_execution_reward_func": 0.8731608390808105, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.34595756110663445, "grad_norm": 0.4713856330502625, "kl": 10.99755859375, "learning_rate": 1e-06, "loss": 0.1008, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7560.0, "completions/max_terminated_length": 7560.0, "completions/mean_length": 970.787109375, "completions/mean_terminated_length": 970.787109375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.3470319634703196, "grad_norm": 0.5503001741747028, "kl": 10.844505310058594, "learning_rate": 1e-06, "loss": 0.1028, "num_tokens": 119408047.0, "reward": 1.0383081436157227, "reward_std": 0.24911047518253326, "rewards/correct_answer_reward_func": 0.658203125, "rewards/format_reward_func": 0.9987152814865112, "rewards/python_attempt_reward_func": 2.01953125, "rewards/python_count_reward_func": 1.705078125, "rewards/python_reward_func": 0.9161938428878784, "rewards/tool_execution_reward_func": 0.9018096923828125, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.34810636583400484, "grad_norm": 0.5145210910041135, "kl": 11.976402282714844, "learning_rate": 1e-06, "loss": 0.104, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7236.0, "completions/max_terminated_length": 7236.0, "completions/mean_length": 1339.212890625, "completions/mean_terminated_length": 1339.212890625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.34918076819769006, "grad_norm": 1.11854091780033, "kl": 15.408203125, "learning_rate": 1e-06, "loss": 0.0871, "num_tokens": 120368892.0, "reward": 0.8930552005767822, "reward_std": 0.2748866379261017, "rewards/correct_answer_reward_func": 0.53515625, "rewards/format_reward_func": 0.997199535369873, "rewards/python_attempt_reward_func": 2.314453125, "rewards/python_count_reward_func": 1.798828125, "rewards/python_reward_func": 0.8036093711853027, "rewards/tool_execution_reward_func": 0.7922952175140381, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.35025517056137523, "grad_norm": 1.1251423537180596, "kl": 15.607421875, "learning_rate": 1e-06, "loss": 0.0873, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7599.0, "completions/max_terminated_length": 7599.0, "completions/mean_length": 1279.19140625, "completions/mean_terminated_length": 1279.19140625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.35132957292506045, "grad_norm": 1.2723481278708273, "kl": 24.1748046875, "learning_rate": 1e-06, "loss": 0.1527, "num_tokens": 121290270.0, "reward": 0.9262570142745972, "reward_std": 0.25314611196517944, "rewards/correct_answer_reward_func": 0.5703125, "rewards/format_reward_func": 0.9986154437065125, "rewards/python_attempt_reward_func": 2.56640625, "rewards/python_count_reward_func": 1.771484375, "rewards/python_reward_func": 0.794070839881897, "rewards/tool_execution_reward_func": 0.7811074256896973, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3524039752887456, "grad_norm": 0.9417675601696023, "kl": 21.51171875, "learning_rate": 1e-06, "loss": 0.1501, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6307.0, "completions/max_terminated_length": 6307.0, "completions/mean_length": 1161.853515625, "completions/mean_terminated_length": 1161.853515625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.35347837765243084, "grad_norm": 0.8410309989127575, "kl": 10.3740234375, "learning_rate": 1e-06, "loss": 0.1252, "num_tokens": 122159923.0, "reward": 0.7525599002838135, "reward_std": 0.14981023967266083, "rewards/correct_answer_reward_func": 0.380859375, "rewards/format_reward_func": 0.9983965754508972, "rewards/python_attempt_reward_func": 2.650390625, "rewards/python_count_reward_func": 2.22265625, "rewards/python_reward_func": 0.8699474334716797, "rewards/tool_execution_reward_func": 0.8601059317588806, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.354552780016116, "grad_norm": 0.8592508511911222, "kl": 10.0009765625, "learning_rate": 1e-06, "loss": 0.1248, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6610.0, "completions/max_terminated_length": 6610.0, "completions/mean_length": 922.25, "completions/mean_terminated_length": 922.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.35562718237980123, "grad_norm": 0.3995504477715787, "kl": 8.7666015625, "learning_rate": 1e-06, "loss": 0.0664, "num_tokens": 122893139.0, "reward": 0.9793539643287659, "reward_std": 0.1993875503540039, "rewards/correct_answer_reward_func": 0.60546875, "rewards/format_reward_func": 0.9973914623260498, "rewards/python_attempt_reward_func": 2.150390625, "rewards/python_count_reward_func": 1.66015625, "rewards/python_reward_func": 0.8800641894340515, "rewards/tool_execution_reward_func": 0.8720346689224243, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.35670158474348646, "grad_norm": 0.38405497044964937, "kl": 9.1611328125, "learning_rate": 1e-06, "loss": 0.0668, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7550.0, "completions/max_terminated_length": 7550.0, "completions/mean_length": 1242.4921875, "completions/mean_terminated_length": 1242.4921875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3577759871071716, "grad_norm": 0.6754030708230367, "kl": 13.03759765625, "learning_rate": 1e-06, "loss": 0.1157, "num_tokens": 123792079.0, "reward": 0.9526435732841492, "reward_std": 0.2000865340232849, "rewards/correct_answer_reward_func": 0.58984375, "rewards/format_reward_func": 0.9953320026397705, "rewards/python_attempt_reward_func": 2.443359375, "rewards/python_count_reward_func": 1.96875, "rewards/python_reward_func": 0.8284063339233398, "rewards/tool_execution_reward_func": 0.8186670541763306, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.35885038947085685, "grad_norm": 0.5304425905530655, "kl": 13.5654296875, "learning_rate": 1e-06, "loss": 0.1162, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6998.0, "completions/max_terminated_length": 6998.0, "completions/mean_length": 906.4296875, "completions/mean_terminated_length": 906.4296875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.359924791834542, "grad_norm": 0.6113736207347797, "kl": 10.108795166015625, "learning_rate": 1e-06, "loss": 0.0854, "num_tokens": 124522059.0, "reward": 1.050907015800476, "reward_std": 0.24836012721061707, "rewards/correct_answer_reward_func": 0.6796875, "rewards/format_reward_func": 0.9978255033493042, "rewards/python_attempt_reward_func": 2.09375, "rewards/python_count_reward_func": 1.6484375, "rewards/python_reward_func": 0.8656753897666931, "rewards/tool_execution_reward_func": 0.858272135257721, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.36099919419822724, "grad_norm": 0.534551548592428, "kl": 9.725860595703125, "learning_rate": 1e-06, "loss": 0.085, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4880.0, "completions/max_terminated_length": 4880.0, "completions/mean_length": 961.650390625, "completions/mean_terminated_length": 961.650390625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.36207359656191246, "grad_norm": 0.429926795273153, "kl": 9.7568359375, "learning_rate": 1e-06, "loss": 0.0716, "num_tokens": 125291512.0, "reward": 0.9744051098823547, "reward_std": 0.2640829086303711, "rewards/correct_answer_reward_func": 0.609375, "rewards/format_reward_func": 0.9986718893051147, "rewards/python_attempt_reward_func": 2.11328125, "rewards/python_count_reward_func": 1.796875, "rewards/python_reward_func": 0.8365234136581421, "rewards/tool_execution_reward_func": 0.8264787793159485, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3631479989255976, "grad_norm": 0.31830942916290933, "kl": 8.868896484375, "learning_rate": 1e-06, "loss": 0.0708, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6252.0, "completions/max_terminated_length": 6252.0, "completions/mean_length": 1005.79296875, "completions/mean_terminated_length": 1005.79296875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.36422240128928285, "grad_norm": 0.5725578425117273, "kl": 7.89031982421875, "learning_rate": 1e-06, "loss": 0.0747, "num_tokens": 126078158.0, "reward": 0.8794068098068237, "reward_std": 0.17705777287483215, "rewards/correct_answer_reward_func": 0.498046875, "rewards/format_reward_func": 0.9985416531562805, "rewards/python_attempt_reward_func": 2.126953125, "rewards/python_count_reward_func": 1.822265625, "rewards/python_reward_func": 0.9149312973022461, "rewards/tool_execution_reward_func": 0.9082581996917725, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.365296803652968, "grad_norm": 0.5674520127074292, "kl": 8.352712631225586, "learning_rate": 1e-06, "loss": 0.0752, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6585.0, "completions/max_terminated_length": 6585.0, "completions/mean_length": 1490.890625, "completions/mean_terminated_length": 1490.890625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.36637120601665324, "grad_norm": 0.6679843322768971, "kl": 12.167724609375, "learning_rate": 1e-06, "loss": 0.0385, "num_tokens": 127115766.0, "reward": 0.9480749368667603, "reward_std": 0.2804620862007141, "rewards/correct_answer_reward_func": 0.587890625, "rewards/format_reward_func": 0.9920312762260437, "rewards/python_attempt_reward_func": 2.990234375, "rewards/python_count_reward_func": 2.1875, "rewards/python_reward_func": 0.8199552297592163, "rewards/tool_execution_reward_func": 0.8088905811309814, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.36744560838033846, "grad_norm": 0.6450446057182583, "kl": 12.67236328125, "learning_rate": 1e-06, "loss": 0.039, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4897.0, "completions/max_terminated_length": 4897.0, "completions/mean_length": 985.650390625, "completions/mean_terminated_length": 985.650390625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.36852001074402363, "grad_norm": 0.5182942368294806, "kl": 12.37109375, "learning_rate": 1e-06, "loss": 0.1127, "num_tokens": 127899427.0, "reward": 0.9941079616546631, "reward_std": 0.23645541071891785, "rewards/correct_answer_reward_func": 0.619140625, "rewards/format_reward_func": 0.9968340992927551, "rewards/python_attempt_reward_func": 2.208984375, "rewards/python_count_reward_func": 1.90234375, "rewards/python_reward_func": 0.8842587471008301, "rewards/tool_execution_reward_func": 0.8780025243759155, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.36959441310770885, "grad_norm": 0.44962448389006093, "kl": 12.87451171875, "learning_rate": 1e-06, "loss": 0.1132, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5335.0, "completions/max_terminated_length": 5335.0, "completions/mean_length": 760.73828125, "completions/mean_terminated_length": 760.73828125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.370668815471394, "grad_norm": 0.44640608367196594, "kl": 11.7757568359375, "learning_rate": 1e-06, "loss": 0.113, "num_tokens": 128551837.0, "reward": 1.026365876197815, "reward_std": 0.142514169216156, "rewards/correct_answer_reward_func": 0.642578125, "rewards/format_reward_func": 0.9992882013320923, "rewards/python_attempt_reward_func": 1.94140625, "rewards/python_count_reward_func": 1.66796875, "rewards/python_reward_func": 0.9232816696166992, "rewards/tool_execution_reward_func": 0.919650673866272, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.37174321783507924, "grad_norm": 0.5162823600142957, "kl": 12.38238525390625, "learning_rate": 1e-06, "loss": 0.1135, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6255.0, "completions/max_terminated_length": 6255.0, "completions/mean_length": 1083.333984375, "completions/mean_terminated_length": 1083.333984375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3728176201987644, "grad_norm": 0.4871932516340003, "kl": 9.424560546875, "learning_rate": 1e-06, "loss": 0.0633, "num_tokens": 129379080.0, "reward": 0.9098683595657349, "reward_std": 0.2785007953643799, "rewards/correct_answer_reward_func": 0.533203125, "rewards/format_reward_func": 0.9975911378860474, "rewards/python_attempt_reward_func": 2.490234375, "rewards/python_count_reward_func": 2.138671875, "rewards/python_reward_func": 0.8917053937911987, "rewards/tool_execution_reward_func": 0.8857351541519165, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.37389202256244963, "grad_norm": 0.3736974535978776, "kl": 9.149169921875, "learning_rate": 1e-06, "loss": 0.0631, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6742.0, "completions/max_terminated_length": 6742.0, "completions/mean_length": 1391.2109375, "completions/mean_terminated_length": 1391.2109375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.37496642492613486, "grad_norm": 0.341231460228584, "kl": 9.611328125, "learning_rate": 1e-06, "loss": 0.0813, "num_tokens": 130377492.0, "reward": 0.8440700173377991, "reward_std": 0.20659494400024414, "rewards/correct_answer_reward_func": 0.4765625, "rewards/format_reward_func": 0.9975651502609253, "rewards/python_attempt_reward_func": 3.080078125, "rewards/python_count_reward_func": 2.568359375, "rewards/python_reward_func": 0.8498713970184326, "rewards/tool_execution_reward_func": 0.8399724364280701, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.37604082728982, "grad_norm": 0.34347450164771515, "kl": 9.021484375, "learning_rate": 1e-06, "loss": 0.0807, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6631.0, "completions/max_terminated_length": 6631.0, "completions/mean_length": 1113.158203125, "completions/mean_terminated_length": 1113.158203125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.37711522965350525, "grad_norm": 0.3412724038932783, "kl": 10.885009765625, "learning_rate": 1e-06, "loss": 0.0903, "num_tokens": 131219205.0, "reward": 0.9898566007614136, "reward_std": 0.23378436267375946, "rewards/correct_answer_reward_func": 0.61328125, "rewards/format_reward_func": 0.9953906536102295, "rewards/python_attempt_reward_func": 2.36328125, "rewards/python_count_reward_func": 1.984375, "rewards/python_reward_func": 0.8949389457702637, "rewards/tool_execution_reward_func": 0.8874861001968384, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3781896320171904, "grad_norm": 0.3000927049182839, "kl": 10.2838134765625, "learning_rate": 1e-06, "loss": 0.0897, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6073.0, "completions/max_terminated_length": 6073.0, "completions/mean_length": 776.5703125, "completions/mean_terminated_length": 776.5703125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.37926403438087564, "grad_norm": 0.37802206812128125, "kl": 5.5444793701171875, "learning_rate": 1e-06, "loss": 0.077, "num_tokens": 131887689.0, "reward": 1.1151009798049927, "reward_std": 0.21421536803245544, "rewards/correct_answer_reward_func": 0.728515625, "rewards/format_reward_func": 0.9991015195846558, "rewards/python_attempt_reward_func": 1.892578125, "rewards/python_count_reward_func": 1.736328125, "rewards/python_reward_func": 0.9385672211647034, "rewards/tool_execution_reward_func": 0.9338254928588867, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.38033843674456086, "grad_norm": 0.36400528690096223, "kl": 5.8037261962890625, "learning_rate": 1e-06, "loss": 0.0772, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4976.0, "completions/max_terminated_length": 4976.0, "completions/mean_length": 1070.318359375, "completions/mean_terminated_length": 1070.318359375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.381412839108246, "grad_norm": 0.5449594633422674, "kl": 9.595703125, "learning_rate": 1e-06, "loss": 0.1078, "num_tokens": 132719500.0, "reward": 1.0243639945983887, "reward_std": 0.2668827176094055, "rewards/correct_answer_reward_func": 0.65625, "rewards/format_reward_func": 0.9978342056274414, "rewards/python_attempt_reward_func": 2.4140625, "rewards/python_count_reward_func": 1.966796875, "rewards/python_reward_func": 0.8475562930107117, "rewards/tool_execution_reward_func": 0.8427354693412781, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.38248724147193125, "grad_norm": 0.43568337979767363, "kl": 9.834716796875, "learning_rate": 1e-06, "loss": 0.1081, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6636.0, "completions/max_terminated_length": 6636.0, "completions/mean_length": 1044.587890625, "completions/mean_terminated_length": 1044.587890625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.3835616438356164, "grad_norm": 4.145727622015888, "kl": 11.164649963378906, "learning_rate": 1e-06, "loss": 0.0835, "num_tokens": 133520537.0, "reward": 1.0049166679382324, "reward_std": 0.163167804479599, "rewards/correct_answer_reward_func": 0.619140625, "rewards/format_reward_func": 0.9983029365539551, "rewards/python_attempt_reward_func": 2.40625, "rewards/python_count_reward_func": 2.103515625, "rewards/python_reward_func": 0.9382734894752502, "rewards/tool_execution_reward_func": 0.930577278137207, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.38463604619930164, "grad_norm": 0.9940840761752334, "kl": 8.113685607910156, "learning_rate": 1e-06, "loss": 0.0805, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7136.0, "completions/max_terminated_length": 7136.0, "completions/mean_length": 1221.607421875, "completions/mean_terminated_length": 1221.607421875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.38571044856298686, "grad_norm": 0.34111260913335484, "kl": 6.197509765625, "learning_rate": 1e-06, "loss": 0.0815, "num_tokens": 134432176.0, "reward": 0.9131537675857544, "reward_std": 0.17117191851139069, "rewards/correct_answer_reward_func": 0.5390625, "rewards/format_reward_func": 0.9968973398208618, "rewards/python_attempt_reward_func": 2.2578125, "rewards/python_count_reward_func": 1.677734375, "rewards/python_reward_func": 0.8813345432281494, "rewards/tool_execution_reward_func": 0.8735591769218445, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.38678485092667203, "grad_norm": 0.32833455546322304, "kl": 6.3212890625, "learning_rate": 1e-06, "loss": 0.0817, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7824.0, "completions/max_terminated_length": 7824.0, "completions/mean_length": 1297.662109375, "completions/mean_terminated_length": 1297.662109375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.38785925329035725, "grad_norm": 0.29229743398023267, "kl": 8.042236328125, "learning_rate": 1e-06, "loss": 0.0692, "num_tokens": 135379011.0, "reward": 0.9176347255706787, "reward_std": 0.21869143843650818, "rewards/correct_answer_reward_func": 0.5390625, "rewards/format_reward_func": 0.9919405579566956, "rewards/python_attempt_reward_func": 2.72265625, "rewards/python_count_reward_func": 2.33984375, "rewards/python_reward_func": 0.9095199108123779, "rewards/tool_execution_reward_func": 0.9009207487106323, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3889336556540424, "grad_norm": 0.2639650233981796, "kl": 7.91943359375, "learning_rate": 1e-06, "loss": 0.0691, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7294.0, "completions/max_terminated_length": 7294.0, "completions/mean_length": 1112.6484375, "completions/mean_terminated_length": 1112.6484375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.39000805801772764, "grad_norm": 0.38949734736543906, "kl": 8.78173828125, "learning_rate": 1e-06, "loss": 0.0715, "num_tokens": 136213423.0, "reward": 0.9991697669029236, "reward_std": 0.19649773836135864, "rewards/correct_answer_reward_func": 0.62109375, "rewards/format_reward_func": 0.9993359446525574, "rewards/python_attempt_reward_func": 2.345703125, "rewards/python_count_reward_func": 1.869140625, "rewards/python_reward_func": 0.8929563760757446, "rewards/tool_execution_reward_func": 0.8910443186759949, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.39108246038141287, "grad_norm": 0.31364462693655637, "kl": 8.279296875, "learning_rate": 1e-06, "loss": 0.071, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5512.0, "completions/max_terminated_length": 5512.0, "completions/mean_length": 916.0625, "completions/mean_terminated_length": 916.0625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.39215686274509803, "grad_norm": 0.5342141134731385, "kl": 12.38134765625, "learning_rate": 1e-06, "loss": 0.1298, "num_tokens": 136979183.0, "reward": 0.8892212510108948, "reward_std": 0.19064562022686005, "rewards/correct_answer_reward_func": 0.5078125, "rewards/format_reward_func": 0.9993554353713989, "rewards/python_attempt_reward_func": 1.962890625, "rewards/python_count_reward_func": 1.708984375, "rewards/python_reward_func": 0.9169914126396179, "rewards/tool_execution_reward_func": 0.9076884984970093, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.39323126510878326, "grad_norm": 0.5508788873723565, "kl": 12.25146484375, "learning_rate": 1e-06, "loss": 0.1297, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7440.0, "completions/max_terminated_length": 7440.0, "completions/mean_length": 1180.6484375, "completions/mean_terminated_length": 1180.6484375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3943056674724684, "grad_norm": 0.3417045677884488, "kl": 4.9375, "learning_rate": 1e-06, "loss": 0.0941, "num_tokens": 137872123.0, "reward": 0.8369513750076294, "reward_std": 0.1903403103351593, "rewards/correct_answer_reward_func": 0.458984375, "rewards/format_reward_func": 0.9968818426132202, "rewards/python_attempt_reward_func": 2.41015625, "rewards/python_count_reward_func": 1.99609375, "rewards/python_reward_func": 0.8967781066894531, "rewards/tool_execution_reward_func": 0.8929532766342163, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.39538006983615365, "grad_norm": 0.2932001991955244, "kl": 5.22216796875, "learning_rate": 1e-06, "loss": 0.0943, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7250.0, "completions/max_terminated_length": 7250.0, "completions/mean_length": 767.49609375, "completions/mean_terminated_length": 767.49609375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3964544721998388, "grad_norm": 0.4721144471871444, "kl": 8.129905700683594, "learning_rate": 1e-06, "loss": 0.0331, "num_tokens": 138529433.0, "reward": 1.067448377609253, "reward_std": 0.13674569129943848, "rewards/correct_answer_reward_func": 0.68359375, "rewards/format_reward_func": 0.9936346411705017, "rewards/python_attempt_reward_func": 2.02734375, "rewards/python_count_reward_func": 1.748046875, "rewards/python_reward_func": 0.927971601486206, "rewards/tool_execution_reward_func": 0.9256386756896973, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.39752887456352404, "grad_norm": 0.6015168425919566, "kl": 8.754087448120117, "learning_rate": 1e-06, "loss": 0.0337, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6859.0, "completions/max_terminated_length": 6859.0, "completions/mean_length": 786.560546875, "completions/mean_terminated_length": 786.560546875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.39860327692720926, "grad_norm": 0.5457970263433025, "kl": 9.26171875, "learning_rate": 1e-06, "loss": 0.0901, "num_tokens": 139206840.0, "reward": 0.7655340433120728, "reward_std": 0.11326411366462708, "rewards/correct_answer_reward_func": 0.380859375, "rewards/format_reward_func": 0.9947507381439209, "rewards/python_attempt_reward_func": 1.93359375, "rewards/python_count_reward_func": 1.673828125, "rewards/python_reward_func": 0.9299517869949341, "rewards/tool_execution_reward_func": 0.9286226034164429, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3996776792908944, "grad_norm": 0.4463638779729484, "kl": 9.008544921875, "learning_rate": 1e-06, "loss": 0.0898, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4818.0, "completions/max_terminated_length": 4818.0, "completions/mean_length": 974.51171875, "completions/mean_terminated_length": 974.51171875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.40075208165457965, "grad_norm": 0.43434247982822544, "kl": 9.3203125, "learning_rate": 1e-06, "loss": 0.065, "num_tokens": 139983806.0, "reward": 0.9120253324508667, "reward_std": 0.28385812044143677, "rewards/correct_answer_reward_func": 0.533203125, "rewards/format_reward_func": 0.9967364072799683, "rewards/python_attempt_reward_func": 1.92578125, "rewards/python_count_reward_func": 1.703125, "rewards/python_reward_func": 0.9067661762237549, "rewards/tool_execution_reward_func": 0.8973749279975891, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.4018264840182648, "grad_norm": 0.35233316020250915, "kl": 8.46923828125, "learning_rate": 1e-06, "loss": 0.0642, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7349.0, "completions/max_terminated_length": 7349.0, "completions/mean_length": 1375.3046875, "completions/mean_terminated_length": 1375.3046875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.40290088638195004, "grad_norm": 0.5659163934060515, "kl": 13.7734375, "learning_rate": 1e-06, "loss": 0.1015, "num_tokens": 140960058.0, "reward": 0.8166643381118774, "reward_std": 0.18438641726970673, "rewards/correct_answer_reward_func": 0.44921875, "rewards/format_reward_func": 0.9967665076255798, "rewards/python_attempt_reward_func": 2.8828125, "rewards/python_count_reward_func": 2.26953125, "rewards/python_reward_func": 0.8470842838287354, "rewards/tool_execution_reward_func": 0.8404614925384521, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.40397528874563526, "grad_norm": 0.46560496206303575, "kl": 12.4609375, "learning_rate": 1e-06, "loss": 0.1002, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8307.0, "completions/max_terminated_length": 8307.0, "completions/mean_length": 1220.794921875, "completions/mean_terminated_length": 1220.794921875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.40504969110932043, "grad_norm": 0.31509110489496217, "kl": 5.89453125, "learning_rate": 1e-06, "loss": 0.0087, "num_tokens": 141859089.0, "reward": 1.0297322273254395, "reward_std": 0.22608184814453125, "rewards/correct_answer_reward_func": 0.65234375, "rewards/format_reward_func": 0.9929737448692322, "rewards/python_attempt_reward_func": 2.345703125, "rewards/python_count_reward_func": 1.958984375, "rewards/python_reward_func": 0.8996086120605469, "rewards/tool_execution_reward_func": 0.8939685821533203, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.40612409347300565, "grad_norm": 0.27878884069422705, "kl": 5.3076171875, "learning_rate": 1e-06, "loss": 0.0082, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6340.0, "completions/max_terminated_length": 6340.0, "completions/mean_length": 841.134765625, "completions/mean_terminated_length": 841.134765625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.4071984958366908, "grad_norm": 0.5994894021018481, "kl": 6.22509765625, "learning_rate": 1e-06, "loss": 0.1121, "num_tokens": 142576630.0, "reward": 1.0130892992019653, "reward_std": 0.19868680834770203, "rewards/correct_answer_reward_func": 0.64453125, "rewards/format_reward_func": 0.9966406226158142, "rewards/python_attempt_reward_func": 1.79296875, "rewards/python_count_reward_func": 1.390625, "rewards/python_reward_func": 0.8495365381240845, "rewards/tool_execution_reward_func": 0.8461495637893677, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.40827289820037604, "grad_norm": 0.6038283376567827, "kl": 6.02197265625, "learning_rate": 1e-06, "loss": 0.1119, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4843.0, "completions/max_terminated_length": 4843.0, "completions/mean_length": 1040.14453125, "completions/mean_terminated_length": 1040.14453125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.40934730056406127, "grad_norm": 0.6896145236872965, "kl": 8.81640625, "learning_rate": 1e-06, "loss": 0.1133, "num_tokens": 143382272.0, "reward": 0.9671135544776917, "reward_std": 0.2141873836517334, "rewards/correct_answer_reward_func": 0.59765625, "rewards/format_reward_func": 0.9980273246765137, "rewards/python_attempt_reward_func": 2.177734375, "rewards/python_count_reward_func": 1.693359375, "rewards/python_reward_func": 0.8577156066894531, "rewards/tool_execution_reward_func": 0.849259078502655, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.41042170292774643, "grad_norm": 0.4641403765227848, "kl": 8.9716796875, "learning_rate": 1e-06, "loss": 0.1135, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7008.0, "completions/max_terminated_length": 7008.0, "completions/mean_length": 1189.697265625, "completions/mean_terminated_length": 1189.697265625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.41149610529143166, "grad_norm": 0.2721016579985213, "kl": 4.708839416503906, "learning_rate": 1e-06, "loss": 0.067, "num_tokens": 144270053.0, "reward": 0.9727884531021118, "reward_std": 0.25386500358581543, "rewards/correct_answer_reward_func": 0.6015625, "rewards/format_reward_func": 0.996006965637207, "rewards/python_attempt_reward_func": 2.298828125, "rewards/python_count_reward_func": 1.828125, "rewards/python_reward_func": 0.8654289245605469, "rewards/tool_execution_reward_func": 0.8601229190826416, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.4125705076551168, "grad_norm": 0.22903514439382955, "kl": 4.7505645751953125, "learning_rate": 1e-06, "loss": 0.067, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7722.0, "completions/max_terminated_length": 7722.0, "completions/mean_length": 1229.59375, "completions/mean_terminated_length": 1229.59375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.41364491001880205, "grad_norm": 0.2749868891858453, "kl": 4.117919921875, "learning_rate": 1e-06, "loss": 0.0686, "num_tokens": 145172981.0, "reward": 0.9498521089553833, "reward_std": 0.2620600163936615, "rewards/correct_answer_reward_func": 0.5703125, "rewards/format_reward_func": 0.9963566660881042, "rewards/python_attempt_reward_func": 2.56640625, "rewards/python_count_reward_func": 2.134765625, "rewards/python_reward_func": 0.9035388827323914, "rewards/tool_execution_reward_func": 0.9013415575027466, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.4147193123824872, "grad_norm": 0.27474974446665057, "kl": 4.257080078125, "learning_rate": 1e-06, "loss": 0.0687, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7821.0, "completions/max_terminated_length": 7821.0, "completions/mean_length": 1266.783203125, "completions/mean_terminated_length": 1266.783203125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.41579371474617244, "grad_norm": 0.42292459768638846, "kl": 10.78955078125, "learning_rate": 1e-06, "loss": 0.1063, "num_tokens": 146104262.0, "reward": 0.9169184565544128, "reward_std": 0.17725276947021484, "rewards/correct_answer_reward_func": 0.544921875, "rewards/format_reward_func": 0.9942820072174072, "rewards/python_attempt_reward_func": 2.578125, "rewards/python_count_reward_func": 2.041015625, "rewards/python_reward_func": 0.8749186396598816, "rewards/tool_execution_reward_func": 0.8657009601593018, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.41686811710985766, "grad_norm": 0.40856060089095525, "kl": 11.5791015625, "learning_rate": 1e-06, "loss": 0.1071, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5837.0, "completions/max_terminated_length": 5837.0, "completions/mean_length": 1043.08984375, "completions/mean_terminated_length": 1043.08984375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.41794251947354283, "grad_norm": 0.37556160735074573, "kl": 5.61669921875, "learning_rate": 1e-06, "loss": 0.0891, "num_tokens": 146922548.0, "reward": 1.1119906902313232, "reward_std": 0.25168174505233765, "rewards/correct_answer_reward_func": 0.734375, "rewards/format_reward_func": 0.9933203458786011, "rewards/python_attempt_reward_func": 2.24609375, "rewards/python_count_reward_func": 1.9296875, "rewards/python_reward_func": 0.9015191197395325, "rewards/tool_execution_reward_func": 0.8947583436965942, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.41901692183722805, "grad_norm": 0.34604355743562704, "kl": 5.8466796875, "learning_rate": 1e-06, "loss": 0.0893, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5765.0, "completions/max_terminated_length": 5765.0, "completions/mean_length": 909.802734375, "completions/mean_terminated_length": 909.802734375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.4200913242009132, "grad_norm": 0.42371862211108097, "kl": 11.5537109375, "learning_rate": 1e-06, "loss": 0.0916, "num_tokens": 147651695.0, "reward": 1.0037949085235596, "reward_std": 0.21471911668777466, "rewards/correct_answer_reward_func": 0.623046875, "rewards/format_reward_func": 0.9937313795089722, "rewards/python_attempt_reward_func": 1.87890625, "rewards/python_count_reward_func": 1.6484375, "rewards/python_reward_func": 0.9180617928504944, "rewards/tool_execution_reward_func": 0.91000896692276, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.42116572656459844, "grad_norm": 0.39000718247199795, "kl": 11.37890625, "learning_rate": 1e-06, "loss": 0.0915, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6764.0, "completions/max_terminated_length": 6764.0, "completions/mean_length": 1262.998046875, "completions/mean_terminated_length": 1262.998046875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.42224012892828366, "grad_norm": 0.8496721635415044, "kl": 12.525390625, "learning_rate": 1e-06, "loss": 0.0495, "num_tokens": 148570670.0, "reward": 0.8867596983909607, "reward_std": 0.23902955651283264, "rewards/correct_answer_reward_func": 0.5234375, "rewards/format_reward_func": 0.9931454658508301, "rewards/python_attempt_reward_func": 2.322265625, "rewards/python_count_reward_func": 1.861328125, "rewards/python_reward_func": 0.8293898701667786, "rewards/tool_execution_reward_func": 0.8234653472900391, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.42331453129196883, "grad_norm": 0.6087130317835717, "kl": 11.171875, "learning_rate": 1e-06, "loss": 0.0482, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7385.0, "completions/max_terminated_length": 7385.0, "completions/mean_length": 952.228515625, "completions/mean_terminated_length": 952.228515625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.42438893365565405, "grad_norm": 0.41896329417210093, "kl": 8.91748046875, "learning_rate": 1e-06, "loss": 0.1323, "num_tokens": 149341411.0, "reward": 1.0129499435424805, "reward_std": 0.24128751456737518, "rewards/correct_answer_reward_func": 0.634765625, "rewards/format_reward_func": 0.9908463954925537, "rewards/python_attempt_reward_func": 1.935546875, "rewards/python_count_reward_func": 1.6796875, "rewards/python_reward_func": 0.9034761190414429, "rewards/tool_execution_reward_func": 0.9000751972198486, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.4254633360193392, "grad_norm": 0.4559605601670968, "kl": 8.287353515625, "learning_rate": 1e-06, "loss": 0.1317, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7306.0, "completions/max_terminated_length": 7306.0, "completions/mean_length": 1098.08203125, "completions/mean_terminated_length": 1098.08203125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.42653773838302445, "grad_norm": 0.5572748295045867, "kl": 8.978515625, "learning_rate": 1e-06, "loss": 0.1157, "num_tokens": 150189965.0, "reward": 0.8198864459991455, "reward_std": 0.1334153264760971, "rewards/correct_answer_reward_func": 0.447265625, "rewards/format_reward_func": 0.9779911041259766, "rewards/python_attempt_reward_func": 1.689453125, "rewards/python_count_reward_func": 1.353515625, "rewards/python_reward_func": 0.8870388269424438, "rewards/tool_execution_reward_func": 0.8851128816604614, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.42761214074670967, "grad_norm": 0.4002572209674466, "kl": 8.32177734375, "learning_rate": 1e-06, "loss": 0.115, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7423.0, "completions/max_terminated_length": 7423.0, "completions/mean_length": 1142.119140625, "completions/mean_terminated_length": 1142.119140625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.42868654311039484, "grad_norm": 0.8482477281579598, "kl": 8.0244140625, "learning_rate": 1e-06, "loss": 0.1224, "num_tokens": 151041802.0, "reward": 0.9451149106025696, "reward_std": 0.22358539700508118, "rewards/correct_answer_reward_func": 0.572265625, "rewards/format_reward_func": 0.9678165316581726, "rewards/python_attempt_reward_func": 1.599609375, "rewards/python_count_reward_func": 1.419921875, "rewards/python_reward_func": 0.9023545980453491, "rewards/tool_execution_reward_func": 0.8964301347732544, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.42976094547408006, "grad_norm": 0.3981470021509354, "kl": 7.6171875, "learning_rate": 1e-06, "loss": 0.122, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7279.0, "completions/max_terminated_length": 7279.0, "completions/mean_length": 1174.095703125, "completions/mean_terminated_length": 1174.095703125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.4308353478377652, "grad_norm": 1.5163171437831482, "kl": 1.90185546875, "learning_rate": 1e-06, "loss": 0.1258, "num_tokens": 151915611.0, "reward": 0.9475641250610352, "reward_std": 0.15621915459632874, "rewards/correct_answer_reward_func": 0.587890625, "rewards/format_reward_func": 0.971981942653656, "rewards/python_attempt_reward_func": 1.466796875, "rewards/python_count_reward_func": 1.228515625, "rewards/python_reward_func": 0.8294131755828857, "rewards/tool_execution_reward_func": 0.8263857960700989, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.43190975020145045, "grad_norm": 1.4902623175393022, "kl": 2.09765625, "learning_rate": 1e-06, "loss": 0.126, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7528.0, "completions/max_terminated_length": 7528.0, "completions/mean_length": 1387.15234375, "completions/mean_terminated_length": 1387.15234375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.4329841525651356, "grad_norm": 0.8467122130093326, "kl": 1.98583984375, "learning_rate": 1e-06, "loss": 0.1584, "num_tokens": 152917641.0, "reward": 0.7541037797927856, "reward_std": 0.20857344567775726, "rewards/correct_answer_reward_func": 0.3828125, "rewards/format_reward_func": 0.9649066925048828, "rewards/python_attempt_reward_func": 1.861328125, "rewards/python_count_reward_func": 1.587890625, "rewards/python_reward_func": 0.8979259729385376, "rewards/tool_execution_reward_func": 0.8915496468544006, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.43405855492882084, "grad_norm": 0.7648404532702334, "kl": 2.54638671875, "learning_rate": 1e-06, "loss": 0.159, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7416.0, "completions/max_terminated_length": 7416.0, "completions/mean_length": 1215.2890625, "completions/mean_terminated_length": 1215.2890625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.43513295729250606, "grad_norm": 0.48859863480836146, "kl": 2.8590087890625, "learning_rate": 1e-06, "loss": 0.0613, "num_tokens": 153818045.0, "reward": 0.8367078304290771, "reward_std": 0.20720063149929047, "rewards/correct_answer_reward_func": 0.4609375, "rewards/format_reward_func": 0.9680566787719727, "rewards/python_attempt_reward_func": 1.634765625, "rewards/python_count_reward_func": 1.453125, "rewards/python_reward_func": 0.9165349006652832, "rewards/tool_execution_reward_func": 0.9107949137687683, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.43620735965619123, "grad_norm": 0.4785852090217621, "kl": 4.11083984375, "learning_rate": 1e-06, "loss": 0.0626, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7314.0, "completions/max_terminated_length": 7314.0, "completions/mean_length": 1163.4453125, "completions/mean_terminated_length": 1163.4453125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.43728176201987645, "grad_norm": 0.857703072627824, "kl": 11.4677734375, "learning_rate": 1e-06, "loss": 0.1261, "num_tokens": 154691521.0, "reward": 0.8185049295425415, "reward_std": 0.22291691601276398, "rewards/correct_answer_reward_func": 0.451171875, "rewards/format_reward_func": 0.964125394821167, "rewards/python_attempt_reward_func": 1.544921875, "rewards/python_count_reward_func": 1.337890625, "rewards/python_reward_func": 0.8856980800628662, "rewards/tool_execution_reward_func": 0.8725399971008301, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.4383561643835616, "grad_norm": 1.2240777402761436, "kl": 15.052734375, "learning_rate": 1e-06, "loss": 0.1297, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6638.0, "completions/max_terminated_length": 6638.0, "completions/mean_length": 783.486328125, "completions/mean_terminated_length": 783.486328125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.43943056674724684, "grad_norm": 1.7606379297316976, "kl": 15.983154296875, "learning_rate": 1e-06, "loss": 0.1641, "num_tokens": 155352410.0, "reward": 1.092618703842163, "reward_std": 0.15879718959331512, "rewards/correct_answer_reward_func": 0.70703125, "rewards/format_reward_func": 0.986177921295166, "rewards/python_attempt_reward_func": 1.345703125, "rewards/python_count_reward_func": 1.25, "rewards/python_reward_func": 0.9428989887237549, "rewards/tool_execution_reward_func": 0.9417597055435181, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.44050496911093207, "grad_norm": 1.459183541486716, "kl": 15.435302734375, "learning_rate": 1e-06, "loss": 0.1636, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6923.0, "completions/max_terminated_length": 6923.0, "completions/mean_length": 814.34765625, "completions/mean_terminated_length": 814.34765625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.44157937147461723, "grad_norm": 0.893179133891906, "kl": 9.367919921875, "learning_rate": 1e-06, "loss": 0.0871, "num_tokens": 156047916.0, "reward": 0.9838546514511108, "reward_std": 0.2533051371574402, "rewards/correct_answer_reward_func": 0.599609375, "rewards/format_reward_func": 0.9896372556686401, "rewards/python_attempt_reward_func": 1.2734375, "rewards/python_count_reward_func": 1.177734375, "rewards/python_reward_func": 0.9341611266136169, "rewards/tool_execution_reward_func": 0.9315894842147827, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.44265377383830246, "grad_norm": 0.7211598089725471, "kl": 8.404052734375, "learning_rate": 1e-06, "loss": 0.0862, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7540.0, "completions/max_terminated_length": 7540.0, "completions/mean_length": 1599.267578125, "completions/mean_terminated_length": 1599.267578125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.4437281762019876, "grad_norm": 0.719248162217956, "kl": 9.1875, "learning_rate": 1e-06, "loss": 0.0771, "num_tokens": 157137429.0, "reward": 0.8265061974525452, "reward_std": 0.24827668070793152, "rewards/correct_answer_reward_func": 0.466796875, "rewards/format_reward_func": 0.9662487506866455, "rewards/python_attempt_reward_func": 1.724609375, "rewards/python_count_reward_func": 1.400390625, "rewards/python_reward_func": 0.8398592472076416, "rewards/tool_execution_reward_func": 0.8322979211807251, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.44480257856567285, "grad_norm": 0.563145293467672, "kl": 8.076171875, "learning_rate": 1e-06, "loss": 0.0759, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5947.0, "completions/max_terminated_length": 5947.0, "completions/mean_length": 951.9921875, "completions/mean_terminated_length": 951.9921875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.44587698092935807, "grad_norm": 0.8399388565617641, "kl": 8.44384765625, "learning_rate": 1e-06, "loss": 0.1542, "num_tokens": 157888305.0, "reward": 0.8428256511688232, "reward_std": 0.1567794680595398, "rewards/correct_answer_reward_func": 0.470703125, "rewards/format_reward_func": 0.9769957065582275, "rewards/python_attempt_reward_func": 1.58203125, "rewards/python_count_reward_func": 1.3828125, "rewards/python_reward_func": 0.8898019194602966, "rewards/tool_execution_reward_func": 0.8836170434951782, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.44695138329304324, "grad_norm": 0.8444842352252304, "kl": 7.8857421875, "learning_rate": 1e-06, "loss": 0.1537, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7300.0, "completions/max_terminated_length": 7300.0, "completions/mean_length": 859.30859375, "completions/mean_terminated_length": 859.30859375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.44802578565672846, "grad_norm": 0.48231740037698473, "kl": 4.88330078125, "learning_rate": 1e-06, "loss": 0.1003, "num_tokens": 158590191.0, "reward": 1.0027650594711304, "reward_std": 0.22903390228748322, "rewards/correct_answer_reward_func": 0.626953125, "rewards/format_reward_func": 0.988644003868103, "rewards/python_attempt_reward_func": 1.4296875, "rewards/python_count_reward_func": 1.279296875, "rewards/python_reward_func": 0.891717791557312, "rewards/tool_execution_reward_func": 0.8904157280921936, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.4491001880204136, "grad_norm": 0.4928822298111385, "kl": 5.16650390625, "learning_rate": 1e-06, "loss": 0.1006, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7419.0, "completions/max_terminated_length": 7419.0, "completions/mean_length": 1090.10546875, "completions/mean_terminated_length": 1090.10546875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.45017459038409885, "grad_norm": 0.39739586377639624, "kl": 4.9619140625, "learning_rate": 1e-06, "loss": 0.0464, "num_tokens": 159430853.0, "reward": 0.799189031124115, "reward_std": 0.1780143678188324, "rewards/correct_answer_reward_func": 0.41796875, "rewards/format_reward_func": 0.9864081144332886, "rewards/python_attempt_reward_func": 1.798828125, "rewards/python_count_reward_func": 1.587890625, "rewards/python_reward_func": 0.9250604510307312, "rewards/tool_execution_reward_func": 0.9196932315826416, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.4512489927477841, "grad_norm": 0.3546018708823422, "kl": 5.24462890625, "learning_rate": 1e-06, "loss": 0.0467, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7255.0, "completions/max_terminated_length": 7255.0, "completions/mean_length": 972.150390625, "completions/mean_terminated_length": 972.150390625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.45232339511146924, "grad_norm": 0.38294004041285595, "kl": 6.249755859375, "learning_rate": 1e-06, "loss": 0.0706, "num_tokens": 160216402.0, "reward": 0.9644666910171509, "reward_std": 0.25093817710876465, "rewards/correct_answer_reward_func": 0.583984375, "rewards/format_reward_func": 0.9898819923400879, "rewards/python_attempt_reward_func": 1.751953125, "rewards/python_count_reward_func": 1.486328125, "rewards/python_reward_func": 0.9176246523857117, "rewards/tool_execution_reward_func": 0.912529468536377, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.45339779747515446, "grad_norm": 0.3147879525504178, "kl": 6.7257080078125, "learning_rate": 1e-06, "loss": 0.0711, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5883.0, "completions/max_terminated_length": 5883.0, "completions/mean_length": 838.033203125, "completions/mean_terminated_length": 838.033203125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.45447219983883963, "grad_norm": 0.6946736237735285, "kl": 9.8671875, "learning_rate": 1e-06, "loss": 0.1364, "num_tokens": 160915939.0, "reward": 1.0099256038665771, "reward_std": 0.17703942954540253, "rewards/correct_answer_reward_func": 0.634765625, "rewards/format_reward_func": 0.9957422018051147, "rewards/python_attempt_reward_func": 1.521484375, "rewards/python_count_reward_func": 1.298828125, "rewards/python_reward_func": 0.8858367204666138, "rewards/tool_execution_reward_func": 0.8800579309463501, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.45554660220252485, "grad_norm": 0.6845909142427387, "kl": 10.9013671875, "learning_rate": 1e-06, "loss": 0.1374, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6855.0, "completions/max_terminated_length": 6855.0, "completions/mean_length": 967.880859375, "completions/mean_terminated_length": 967.880859375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.45662100456621, "grad_norm": 0.39366024367632396, "kl": 6.6337890625, "learning_rate": 1e-06, "loss": 0.0555, "num_tokens": 161691142.0, "reward": 0.8389904499053955, "reward_std": 0.11153280735015869, "rewards/correct_answer_reward_func": 0.46484375, "rewards/format_reward_func": 0.990221381187439, "rewards/python_attempt_reward_func": 1.7265625, "rewards/python_count_reward_func": 1.509765625, "rewards/python_reward_func": 0.8831760287284851, "rewards/tool_execution_reward_func": 0.8805121183395386, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.45769540692989524, "grad_norm": 0.3695207376074621, "kl": 6.752685546875, "learning_rate": 1e-06, "loss": 0.0556, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6626.0, "completions/max_terminated_length": 6626.0, "completions/mean_length": 1108.27734375, "completions/mean_terminated_length": 1108.27734375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.45876980929358047, "grad_norm": 0.3255729069113244, "kl": 7.5546875, "learning_rate": 1e-06, "loss": 0.0596, "num_tokens": 162531924.0, "reward": 0.893315315246582, "reward_std": 0.1861579716205597, "rewards/correct_answer_reward_func": 0.525390625, "rewards/format_reward_func": 0.9915060997009277, "rewards/python_attempt_reward_func": 1.927734375, "rewards/python_count_reward_func": 1.5390625, "rewards/python_reward_func": 0.8531893491744995, "rewards/tool_execution_reward_func": 0.8481173515319824, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.45984421165726563, "grad_norm": 0.2975633605386256, "kl": 7.40625, "learning_rate": 1e-06, "loss": 0.0594, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7518.0, "completions/max_terminated_length": 7518.0, "completions/mean_length": 1338.94140625, "completions/mean_terminated_length": 1338.94140625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.46091861402095086, "grad_norm": 0.5673516741591959, "kl": 12.236328125, "learning_rate": 1e-06, "loss": 0.0608, "num_tokens": 163505654.0, "reward": 0.8565918207168579, "reward_std": 0.25661224126815796, "rewards/correct_answer_reward_func": 0.484375, "rewards/format_reward_func": 0.9898394346237183, "rewards/python_attempt_reward_func": 2.353515625, "rewards/python_count_reward_func": 1.875, "rewards/python_reward_func": 0.8745481967926025, "rewards/tool_execution_reward_func": 0.8712449073791504, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.461993016384636, "grad_norm": 0.45627472055127966, "kl": 11.447265625, "learning_rate": 1e-06, "loss": 0.06, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7293.0, "completions/max_terminated_length": 7293.0, "completions/mean_length": 826.15625, "completions/mean_terminated_length": 826.15625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.46306741874832125, "grad_norm": 0.32497860024308245, "kl": 4.6616668701171875, "learning_rate": 1e-06, "loss": 0.052, "num_tokens": 164198854.0, "reward": 0.790509819984436, "reward_std": 0.11571554839611053, "rewards/correct_answer_reward_func": 0.404296875, "rewards/format_reward_func": 0.9959378242492676, "rewards/python_attempt_reward_func": 1.591796875, "rewards/python_count_reward_func": 1.400390625, "rewards/python_reward_func": 0.9384230375289917, "rewards/tool_execution_reward_func": 0.935126781463623, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.46414182111200647, "grad_norm": 0.3269891807195216, "kl": 4.550498962402344, "learning_rate": 1e-06, "loss": 0.0519, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7401.0, "completions/max_terminated_length": 7401.0, "completions/mean_length": 1288.728515625, "completions/mean_terminated_length": 1288.728515625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.46521622347569164, "grad_norm": 0.2906372457276669, "kl": 5.00732421875, "learning_rate": 1e-06, "loss": 0.0425, "num_tokens": 165138971.0, "reward": 0.8372194170951843, "reward_std": 0.17393460869789124, "rewards/correct_answer_reward_func": 0.462890625, "rewards/format_reward_func": 0.9805840849876404, "rewards/python_attempt_reward_func": 2.22265625, "rewards/python_count_reward_func": 1.908203125, "rewards/python_reward_func": 0.8944855332374573, "rewards/tool_execution_reward_func": 0.8910598158836365, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.46629062583937686, "grad_norm": 0.27957296524653047, "kl": 5.21240234375, "learning_rate": 1e-06, "loss": 0.0427, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5792.0, "completions/max_terminated_length": 5792.0, "completions/mean_length": 980.365234375, "completions/mean_terminated_length": 980.365234375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.467365028203062, "grad_norm": 1.235209593157361, "kl": 13.6064453125, "learning_rate": 1e-06, "loss": 0.0884, "num_tokens": 165910454.0, "reward": 1.1525394916534424, "reward_std": 0.2790141701698303, "rewards/correct_answer_reward_func": 0.783203125, "rewards/format_reward_func": 0.9987890720367432, "rewards/python_attempt_reward_func": 2.072265625, "rewards/python_count_reward_func": 1.498046875, "rewards/python_reward_func": 0.8518337607383728, "rewards/tool_execution_reward_func": 0.8478926420211792, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.46843943056674725, "grad_norm": 0.8296284974023812, "kl": 12.970703125, "learning_rate": 1e-06, "loss": 0.0877, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5100.0, "completions/max_terminated_length": 5100.0, "completions/mean_length": 786.79296875, "completions/mean_terminated_length": 786.79296875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.4695138329304325, "grad_norm": 0.3252952882686684, "kl": 3.8620758056640625, "learning_rate": 1e-06, "loss": 0.0771, "num_tokens": 166587404.0, "reward": 1.1104719638824463, "reward_std": 0.22235365211963654, "rewards/correct_answer_reward_func": 0.736328125, "rewards/format_reward_func": 0.9989279508590698, "rewards/python_attempt_reward_func": 1.595703125, "rewards/python_count_reward_func": 1.419921875, "rewards/python_reward_func": 0.875285267829895, "rewards/tool_execution_reward_func": 0.8717913031578064, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.47058823529411764, "grad_norm": 0.3297332190414421, "kl": 3.87518310546875, "learning_rate": 1e-06, "loss": 0.0771, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5248.0, "completions/max_terminated_length": 5248.0, "completions/mean_length": 1125.23828125, "completions/mean_terminated_length": 1125.23828125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.47166263765780286, "grad_norm": 0.8205740182962288, "kl": 11.98828125, "learning_rate": 1e-06, "loss": 0.0615, "num_tokens": 167449990.0, "reward": 0.9037401676177979, "reward_std": 0.23192349076271057, "rewards/correct_answer_reward_func": 0.5390625, "rewards/format_reward_func": 0.9970746636390686, "rewards/python_attempt_reward_func": 2.177734375, "rewards/python_count_reward_func": 1.5390625, "rewards/python_reward_func": 0.8326869606971741, "rewards/tool_execution_reward_func": 0.8263137340545654, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.47273704002148803, "grad_norm": 0.7177822246707461, "kl": 11.3916015625, "learning_rate": 1e-06, "loss": 0.0609, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7252.0, "completions/max_terminated_length": 7252.0, "completions/mean_length": 1401.84765625, "completions/mean_terminated_length": 1401.84765625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.47381144238517325, "grad_norm": 0.7935594345444804, "kl": 12.8115234375, "learning_rate": 1e-06, "loss": 0.0742, "num_tokens": 168444792.0, "reward": 0.679136335849762, "reward_std": 0.2116124927997589, "rewards/correct_answer_reward_func": 0.314453125, "rewards/format_reward_func": 0.9954191446304321, "rewards/python_attempt_reward_func": 2.69140625, "rewards/python_count_reward_func": 1.9140625, "rewards/python_reward_func": 0.8365304470062256, "rewards/tool_execution_reward_func": 0.827997088432312, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.4748858447488584, "grad_norm": 0.5448498924361322, "kl": 11.16015625, "learning_rate": 1e-06, "loss": 0.0725, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7635.0, "completions/max_terminated_length": 7635.0, "completions/mean_length": 1326.376953125, "completions/mean_terminated_length": 1326.376953125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.47596024711254364, "grad_norm": 0.27888588868944625, "kl": 4.80126953125, "learning_rate": 1e-06, "loss": 0.0375, "num_tokens": 169408697.0, "reward": 0.7268152832984924, "reward_std": 0.22831228375434875, "rewards/correct_answer_reward_func": 0.349609375, "rewards/format_reward_func": 0.9922811388969421, "rewards/python_attempt_reward_func": 2.361328125, "rewards/python_count_reward_func": 1.98828125, "rewards/python_reward_func": 0.9005751013755798, "rewards/tool_execution_reward_func": 0.89374840259552, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.47703464947622887, "grad_norm": 0.25311751930717513, "kl": 4.2509765625, "learning_rate": 1e-06, "loss": 0.0369, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6783.0, "completions/max_terminated_length": 6783.0, "completions/mean_length": 1259.3515625, "completions/mean_terminated_length": 1259.3515625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.47810905183991403, "grad_norm": 0.2907339981818036, "kl": 5.59130859375, "learning_rate": 1e-06, "loss": 0.1172, "num_tokens": 170331021.0, "reward": 0.8922932147979736, "reward_std": 0.25376248359680176, "rewards/correct_answer_reward_func": 0.515625, "rewards/format_reward_func": 0.998828113079071, "rewards/python_attempt_reward_func": 2.453125, "rewards/python_count_reward_func": 1.91015625, "rewards/python_reward_func": 0.890878438949585, "rewards/tool_execution_reward_func": 0.8845130205154419, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.47918345420359926, "grad_norm": 0.2921299719625762, "kl": 5.3720703125, "learning_rate": 1e-06, "loss": 0.117, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7388.0, "completions/max_terminated_length": 7388.0, "completions/mean_length": 938.7890625, "completions/mean_terminated_length": 938.7890625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.4802578565672844, "grad_norm": 0.4414875575355249, "kl": 3.458740234375, "learning_rate": 1e-06, "loss": 0.0604, "num_tokens": 171100417.0, "reward": 1.0892350673675537, "reward_std": 0.17149150371551514, "rewards/correct_answer_reward_func": 0.705078125, "rewards/format_reward_func": 0.9892274141311646, "rewards/python_attempt_reward_func": 1.9296875, "rewards/python_count_reward_func": 1.68359375, "rewards/python_reward_func": 0.9356538653373718, "rewards/tool_execution_reward_func": 0.9315576553344727, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.48133225893096965, "grad_norm": 0.4474265153958768, "kl": 3.55859375, "learning_rate": 1e-06, "loss": 0.0605, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6986.0, "completions/max_terminated_length": 6986.0, "completions/mean_length": 1054.224609375, "completions/mean_terminated_length": 1054.224609375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.48240666129465487, "grad_norm": 0.6096184930049604, "kl": 5.34375, "learning_rate": 1e-06, "loss": 0.0526, "num_tokens": 171928116.0, "reward": 0.7041119337081909, "reward_std": 0.2098621279001236, "rewards/correct_answer_reward_func": 0.330078125, "rewards/format_reward_func": 0.998033881187439, "rewards/python_attempt_reward_func": 2.296875, "rewards/python_count_reward_func": 1.7890625, "rewards/python_reward_func": 0.8757758140563965, "rewards/tool_execution_reward_func": 0.8721354007720947, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.48348106365834004, "grad_norm": 0.4471162565690379, "kl": 5.298828125, "learning_rate": 1e-06, "loss": 0.0526, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7208.0, "completions/max_terminated_length": 7208.0, "completions/mean_length": 1177.15625, "completions/mean_terminated_length": 1177.15625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.48455546602202526, "grad_norm": 0.3590293986551812, "kl": 4.01519775390625, "learning_rate": 1e-06, "loss": 0.0443, "num_tokens": 172814212.0, "reward": 0.9726883172988892, "reward_std": 0.20862779021263123, "rewards/correct_answer_reward_func": 0.591796875, "rewards/format_reward_func": 0.9988281726837158, "rewards/python_attempt_reward_func": 2.421875, "rewards/python_count_reward_func": 2.01953125, "rewards/python_reward_func": 0.9108266830444336, "rewards/tool_execution_reward_func": 0.9056291580200195, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.48562986838571043, "grad_norm": 0.3474039290433552, "kl": 4.097137451171875, "learning_rate": 1e-06, "loss": 0.0444, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5633.0, "completions/max_terminated_length": 5633.0, "completions/mean_length": 876.7265625, "completions/mean_terminated_length": 876.7265625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.48670427074939565, "grad_norm": 0.351781411098542, "kl": 2.606170654296875, "learning_rate": 1e-06, "loss": 0.0314, "num_tokens": 173541624.0, "reward": 0.942267656326294, "reward_std": 0.2027713656425476, "rewards/correct_answer_reward_func": 0.552734375, "rewards/format_reward_func": 0.9976041316986084, "rewards/python_attempt_reward_func": 1.978515625, "rewards/python_count_reward_func": 1.802734375, "rewards/python_reward_func": 0.9515811204910278, "rewards/tool_execution_reward_func": 0.9500620365142822, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.4877786731130809, "grad_norm": 0.33258051538229044, "kl": 2.737548828125, "learning_rate": 1e-06, "loss": 0.0316, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5741.0, "completions/max_terminated_length": 5741.0, "completions/mean_length": 749.51953125, "completions/mean_terminated_length": 749.51953125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.48885307547676604, "grad_norm": 0.5796848128088582, "kl": 6.58984375, "learning_rate": 1e-06, "loss": 0.0792, "num_tokens": 174187426.0, "reward": 1.0876107215881348, "reward_std": 0.17974555492401123, "rewards/correct_answer_reward_func": 0.705078125, "rewards/format_reward_func": 0.9983593821525574, "rewards/python_attempt_reward_func": 1.744140625, "rewards/python_count_reward_func": 1.49609375, "rewards/python_reward_func": 0.9162194728851318, "rewards/tool_execution_reward_func": 0.9143043160438538, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.48992747784045126, "grad_norm": 0.5376775966757583, "kl": 7.2392578125, "learning_rate": 1e-06, "loss": 0.0799, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6805.0, "completions/max_terminated_length": 6805.0, "completions/mean_length": 1011.353515625, "completions/mean_terminated_length": 1011.353515625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.49100188020413643, "grad_norm": 0.6022403099057663, "kl": 8.43408203125, "learning_rate": 1e-06, "loss": 0.0648, "num_tokens": 174975543.0, "reward": 1.1389744281768799, "reward_std": 0.18891996145248413, "rewards/correct_answer_reward_func": 0.7578125, "rewards/format_reward_func": 0.998264491558075, "rewards/python_attempt_reward_func": 2.13671875, "rewards/python_count_reward_func": 1.7421875, "rewards/python_reward_func": 0.9140943288803101, "rewards/tool_execution_reward_func": 0.9075450897216797, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.49207628256782165, "grad_norm": 0.575479340533468, "kl": 8.677734375, "learning_rate": 1e-06, "loss": 0.0651, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6320.0, "completions/max_terminated_length": 6320.0, "completions/mean_length": 923.1328125, "completions/mean_terminated_length": 923.1328125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.4931506849315068, "grad_norm": 0.4903904498854708, "kl": 6.76318359375, "learning_rate": 1e-06, "loss": 0.022, "num_tokens": 175747323.0, "reward": 0.919517457485199, "reward_std": 0.16225914657115936, "rewards/correct_answer_reward_func": 0.54296875, "rewards/format_reward_func": 0.9952281713485718, "rewards/python_attempt_reward_func": 2.015625, "rewards/python_count_reward_func": 1.78125, "rewards/python_reward_func": 0.8901390433311462, "rewards/tool_execution_reward_func": 0.8875154852867126, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.49422508729519204, "grad_norm": 0.38010865407178795, "kl": 6.638671875, "learning_rate": 1e-06, "loss": 0.0219, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6824.0, "completions/max_terminated_length": 6824.0, "completions/mean_length": 1339.84765625, "completions/mean_terminated_length": 1339.84765625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.49529948965887727, "grad_norm": 0.5898639872754264, "kl": 9.0927734375, "learning_rate": 1e-06, "loss": 0.0441, "num_tokens": 176728941.0, "reward": 0.8869926333427429, "reward_std": 0.2906164526939392, "rewards/correct_answer_reward_func": 0.517578125, "rewards/format_reward_func": 0.982958197593689, "rewards/python_attempt_reward_func": 2.40625, "rewards/python_count_reward_func": 1.869140625, "rewards/python_reward_func": 0.8720191717147827, "rewards/tool_execution_reward_func": 0.8641144633293152, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.49637389202256244, "grad_norm": 0.5146544182336869, "kl": 8.482421875, "learning_rate": 1e-06, "loss": 0.0435, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6786.0, "completions/max_terminated_length": 6786.0, "completions/mean_length": 1239.328125, "completions/mean_terminated_length": 1239.328125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.49744829438624766, "grad_norm": 0.5830672998637806, "kl": 12.99560546875, "learning_rate": 1e-06, "loss": 0.1197, "num_tokens": 177634517.0, "reward": 0.8879570960998535, "reward_std": 0.2461840808391571, "rewards/correct_answer_reward_func": 0.51171875, "rewards/format_reward_func": 0.9978905916213989, "rewards/python_attempt_reward_func": 2.609375, "rewards/python_count_reward_func": 2.134765625, "rewards/python_reward_func": 0.8937438130378723, "rewards/tool_execution_reward_func": 0.8833016157150269, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.4985226967499328, "grad_norm": 0.4677498455271072, "kl": 11.43798828125, "learning_rate": 1e-06, "loss": 0.1181, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7730.0, "completions/max_terminated_length": 7730.0, "completions/mean_length": 1200.953125, "completions/mean_terminated_length": 1200.953125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.49959709911361805, "grad_norm": 0.3501721138643843, "kl": 5.062744140625, "learning_rate": 1e-06, "loss": 0.0318, "num_tokens": 178529021.0, "reward": 0.891830563545227, "reward_std": 0.19574615359306335, "rewards/correct_answer_reward_func": 0.509765625, "rewards/format_reward_func": 0.9951757788658142, "rewards/python_attempt_reward_func": 2.431640625, "rewards/python_count_reward_func": 1.982421875, "rewards/python_reward_func": 0.9173928499221802, "rewards/tool_execution_reward_func": 0.9151490926742554, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.5006715014773032, "grad_norm": 0.3312696473928615, "kl": 4.749267578125, "learning_rate": 1e-06, "loss": 0.0315, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7204.0, "completions/max_terminated_length": 7204.0, "completions/mean_length": 957.708984375, "completions/mean_terminated_length": 957.708984375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.5017459038409885, "grad_norm": 0.275040961907143, "kl": 5.71044921875, "learning_rate": 1e-06, "loss": 0.0656, "num_tokens": 179303624.0, "reward": 1.0462589263916016, "reward_std": 0.20277203619480133, "rewards/correct_answer_reward_func": 0.671875, "rewards/format_reward_func": 0.9992578029632568, "rewards/python_attempt_reward_func": 1.951171875, "rewards/python_count_reward_func": 1.525390625, "rewards/python_reward_func": 0.8801935911178589, "rewards/tool_execution_reward_func": 0.8726617097854614, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.5028203062046737, "grad_norm": 0.46569952645322155, "kl": 5.86328125, "learning_rate": 1e-06, "loss": 0.0657, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4378.0, "completions/max_terminated_length": 4378.0, "completions/mean_length": 677.48828125, "completions/mean_terminated_length": 677.48828125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.5038947085683588, "grad_norm": 0.6514100465591436, "kl": 7.1923828125, "learning_rate": 1e-06, "loss": 0.1192, "num_tokens": 179913442.0, "reward": 1.0979183912277222, "reward_std": 0.17656156420707703, "rewards/correct_answer_reward_func": 0.71875, "rewards/format_reward_func": 0.997083306312561, "rewards/python_attempt_reward_func": 1.6796875, "rewards/python_count_reward_func": 1.423828125, "rewards/python_reward_func": 0.9027816653251648, "rewards/tool_execution_reward_func": 0.898759126663208, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.504969110932044, "grad_norm": 0.5988374203077808, "kl": 7.11376953125, "learning_rate": 1e-06, "loss": 0.1192, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6767.0, "completions/max_terminated_length": 6767.0, "completions/mean_length": 1242.935546875, "completions/mean_terminated_length": 1242.935546875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.5060435132957293, "grad_norm": 0.4623499641546121, "kl": 5.009033203125, "learning_rate": 1e-06, "loss": 0.0685, "num_tokens": 180838977.0, "reward": 0.9254293441772461, "reward_std": 0.17848379909992218, "rewards/correct_answer_reward_func": 0.546875, "rewards/format_reward_func": 0.9969748258590698, "rewards/python_attempt_reward_func": 2.345703125, "rewards/python_count_reward_func": 1.90625, "rewards/python_reward_func": 0.902462363243103, "rewards/tool_execution_reward_func": 0.8957969546318054, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.5071179156594144, "grad_norm": 0.3322510335707236, "kl": 4.985107421875, "learning_rate": 1e-06, "loss": 0.0685, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5444.0, "completions/max_terminated_length": 5444.0, "completions/mean_length": 901.16015625, "completions/mean_terminated_length": 901.16015625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.5081923180230996, "grad_norm": 0.39200907870222, "kl": 4.582275390625, "learning_rate": 1e-06, "loss": 0.0538, "num_tokens": 181569971.0, "reward": 1.051785945892334, "reward_std": 0.11762835085391998, "rewards/correct_answer_reward_func": 0.666015625, "rewards/format_reward_func": 0.9994791746139526, "rewards/python_attempt_reward_func": 2.1484375, "rewards/python_count_reward_func": 1.861328125, "rewards/python_reward_func": 0.9327590465545654, "rewards/tool_execution_reward_func": 0.9293727874755859, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.5092667203867849, "grad_norm": 0.39332283581080263, "kl": 4.875244140625, "learning_rate": 1e-06, "loss": 0.0541, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6923.0, "completions/max_terminated_length": 6923.0, "completions/mean_length": 1134.69140625, "completions/mean_terminated_length": 1134.69140625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.51034112275047, "grad_norm": 0.3960849131507554, "kl": 6.9013671875, "learning_rate": 1e-06, "loss": 0.0657, "num_tokens": 182424309.0, "reward": 1.0580158233642578, "reward_std": 0.1437232792377472, "rewards/correct_answer_reward_func": 0.6796875, "rewards/format_reward_func": 0.9969990253448486, "rewards/python_attempt_reward_func": 2.41796875, "rewards/python_count_reward_func": 2.03515625, "rewards/python_reward_func": 0.8964154124259949, "rewards/tool_execution_reward_func": 0.8946428298950195, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.5114155251141552, "grad_norm": 0.3746212844333868, "kl": 7.5107421875, "learning_rate": 1e-06, "loss": 0.0663, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7537.0, "completions/max_terminated_length": 7537.0, "completions/mean_length": 1255.80078125, "completions/mean_terminated_length": 1255.80078125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.5124899274778405, "grad_norm": 0.44245190274137175, "kl": 5.5377197265625, "learning_rate": 1e-06, "loss": 0.0539, "num_tokens": 183344943.0, "reward": 1.0358142852783203, "reward_std": 0.2376510053873062, "rewards/correct_answer_reward_func": 0.658203125, "rewards/format_reward_func": 0.9928832054138184, "rewards/python_attempt_reward_func": 2.18359375, "rewards/python_count_reward_func": 1.857421875, "rewards/python_reward_func": 0.8996636867523193, "rewards/tool_execution_reward_func": 0.8951722383499146, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.5135643298415257, "grad_norm": 0.3360757072100186, "kl": 5.686126708984375, "learning_rate": 1e-06, "loss": 0.0541, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7623.0, "completions/max_terminated_length": 7623.0, "completions/mean_length": 1119.021484375, "completions/mean_terminated_length": 1119.021484375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.5146387322052108, "grad_norm": 0.6021145281161715, "kl": 6.6630859375, "learning_rate": 1e-06, "loss": 0.0763, "num_tokens": 184211450.0, "reward": 0.9173682928085327, "reward_std": 0.21335454285144806, "rewards/correct_answer_reward_func": 0.53515625, "rewards/format_reward_func": 0.9974851608276367, "rewards/python_attempt_reward_func": 2.0546875, "rewards/python_count_reward_func": 1.677734375, "rewards/python_reward_func": 0.9220237731933594, "rewards/tool_execution_reward_func": 0.9135749936103821, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.515713134568896, "grad_norm": 0.5449818060372019, "kl": 6.646484375, "learning_rate": 1e-06, "loss": 0.0763, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8098.0, "completions/max_terminated_length": 8098.0, "completions/mean_length": 934.068359375, "completions/mean_terminated_length": 934.068359375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.5167875369325813, "grad_norm": 0.6610507895375427, "kl": 8.2978515625, "learning_rate": 1e-06, "loss": 0.0261, "num_tokens": 184961117.0, "reward": 0.955240786075592, "reward_std": 0.22846873104572296, "rewards/correct_answer_reward_func": 0.572265625, "rewards/format_reward_func": 0.9979798793792725, "rewards/python_attempt_reward_func": 2.05078125, "rewards/python_count_reward_func": 1.841796875, "rewards/python_reward_func": 0.9223741292953491, "rewards/tool_execution_reward_func": 0.9168961048126221, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.5178619392962664, "grad_norm": 0.49383531040121315, "kl": 7.5234375, "learning_rate": 1e-06, "loss": 0.0253, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7150.0, "completions/max_terminated_length": 7150.0, "completions/mean_length": 1261.158203125, "completions/mean_terminated_length": 1261.158203125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.5189363416599516, "grad_norm": 2.904590999790052, "kl": 11.07421875, "learning_rate": 1e-06, "loss": 0.0596, "num_tokens": 185887502.0, "reward": 1.089902639389038, "reward_std": 0.2128956913948059, "rewards/correct_answer_reward_func": 0.708984375, "rewards/format_reward_func": 0.9933854341506958, "rewards/python_attempt_reward_func": 2.9921875, "rewards/python_count_reward_func": 2.57421875, "rewards/python_reward_func": 0.9160459041595459, "rewards/tool_execution_reward_func": 0.9112056493759155, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.5200107440236369, "grad_norm": 0.5391527313705903, "kl": 8.0146484375, "learning_rate": 1e-06, "loss": 0.0566, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7216.0, "completions/max_terminated_length": 7216.0, "completions/mean_length": 1220.154296875, "completions/mean_terminated_length": 1220.154296875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.5210851463873221, "grad_norm": 0.2995454719033442, "kl": 6.1279296875, "learning_rate": 1e-06, "loss": 0.0364, "num_tokens": 186785309.0, "reward": 0.9274625778198242, "reward_std": 0.16902463138103485, "rewards/correct_answer_reward_func": 0.548828125, "rewards/format_reward_func": 0.9949132204055786, "rewards/python_attempt_reward_func": 2.33984375, "rewards/python_count_reward_func": 2.001953125, "rewards/python_reward_func": 0.9082690477371216, "rewards/tool_execution_reward_func": 0.8982592821121216, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.5221595487510072, "grad_norm": 0.27671802309410687, "kl": 5.5458984375, "learning_rate": 1e-06, "loss": 0.0359, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6073.0, "completions/max_terminated_length": 6073.0, "completions/mean_length": 819.115234375, "completions/mean_terminated_length": 819.115234375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.5232339511146924, "grad_norm": 0.621913002801839, "kl": 6.52880859375, "learning_rate": 1e-06, "loss": 0.0541, "num_tokens": 187476440.0, "reward": 1.1041650772094727, "reward_std": 0.2689473032951355, "rewards/correct_answer_reward_func": 0.72265625, "rewards/format_reward_func": 0.9951500296592712, "rewards/python_attempt_reward_func": 1.947265625, "rewards/python_count_reward_func": 1.79296875, "rewards/python_reward_func": 0.9198249578475952, "rewards/tool_execution_reward_func": 0.912394642829895, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.5243083534783777, "grad_norm": 0.3495577855829143, "kl": 5.707763671875, "learning_rate": 1e-06, "loss": 0.0533, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4915.0, "completions/max_terminated_length": 4915.0, "completions/mean_length": 735.3828125, "completions/mean_terminated_length": 735.3828125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.5253827558420628, "grad_norm": 0.41192167293129944, "kl": 5.993377685546875, "learning_rate": 1e-06, "loss": 0.0407, "num_tokens": 188123036.0, "reward": 1.0840206146240234, "reward_std": 0.18005895614624023, "rewards/correct_answer_reward_func": 0.697265625, "rewards/format_reward_func": 0.9972265958786011, "rewards/python_attempt_reward_func": 1.681640625, "rewards/python_count_reward_func": 1.55078125, "rewards/python_reward_func": 0.9411256909370422, "rewards/tool_execution_reward_func": 0.9365482330322266, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.526457158205748, "grad_norm": 0.41655161696528004, "kl": 6.10870361328125, "learning_rate": 1e-06, "loss": 0.0408, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5883.0, "completions/max_terminated_length": 5883.0, "completions/mean_length": 1259.501953125, "completions/mean_terminated_length": 1259.501953125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.5275315605694333, "grad_norm": 0.5478688861140599, "kl": 5.99072265625, "learning_rate": 1e-06, "loss": 0.0558, "num_tokens": 189043421.0, "reward": 0.7476028203964233, "reward_std": 0.12572914361953735, "rewards/correct_answer_reward_func": 0.369140625, "rewards/format_reward_func": 0.9897526502609253, "rewards/python_attempt_reward_func": 2.68359375, "rewards/python_count_reward_func": 2.29296875, "rewards/python_reward_func": 0.9060895442962646, "rewards/tool_execution_reward_func": 0.902558445930481, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.5286059629331185, "grad_norm": 0.4331343984328076, "kl": 6.1484375, "learning_rate": 1e-06, "loss": 0.0559, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6301.0, "completions/max_terminated_length": 6301.0, "completions/mean_length": 1106.638671875, "completions/mean_terminated_length": 1106.638671875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.5296803652968036, "grad_norm": 0.24916133567732382, "kl": 5.85205078125, "learning_rate": 1e-06, "loss": 0.0723, "num_tokens": 189874916.0, "reward": 0.7989369630813599, "reward_std": 0.2625983655452728, "rewards/correct_answer_reward_func": 0.4140625, "rewards/format_reward_func": 0.995573878288269, "rewards/python_attempt_reward_func": 2.380859375, "rewards/python_count_reward_func": 2.095703125, "rewards/python_reward_func": 0.9362584352493286, "rewards/tool_execution_reward_func": 0.9287985563278198, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.5307547676604889, "grad_norm": 0.2620605089779678, "kl": 6.25830078125, "learning_rate": 1e-06, "loss": 0.0727, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7282.0, "completions/max_terminated_length": 7282.0, "completions/mean_length": 1164.169921875, "completions/mean_terminated_length": 1164.169921875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.5318291700241741, "grad_norm": 0.5408445993556799, "kl": 7.66943359375, "learning_rate": 1e-06, "loss": 0.0722, "num_tokens": 190754139.0, "reward": 0.821053147315979, "reward_std": 0.144608736038208, "rewards/correct_answer_reward_func": 0.439453125, "rewards/format_reward_func": 0.9941232800483704, "rewards/python_attempt_reward_func": 2.349609375, "rewards/python_count_reward_func": 1.998046875, "rewards/python_reward_func": 0.9270508289337158, "rewards/tool_execution_reward_func": 0.9138772487640381, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.5329035723878592, "grad_norm": 0.526278590944302, "kl": 8.3056640625, "learning_rate": 1e-06, "loss": 0.0728, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6358.0, "completions/max_terminated_length": 6358.0, "completions/mean_length": 1238.17578125, "completions/mean_terminated_length": 1238.17578125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.5339779747515444, "grad_norm": 0.4366578162813579, "kl": 10.10223388671875, "learning_rate": 1e-06, "loss": 0.068, "num_tokens": 191672885.0, "reward": 0.8804943561553955, "reward_std": 0.159805566072464, "rewards/correct_answer_reward_func": 0.498046875, "rewards/format_reward_func": 0.9949026107788086, "rewards/python_attempt_reward_func": 2.51171875, "rewards/python_count_reward_func": 2.185546875, "rewards/python_reward_func": 0.9294356107711792, "rewards/tool_execution_reward_func": 0.917334794998169, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.5350523771152297, "grad_norm": 0.393904368071649, "kl": 10.2828369140625, "learning_rate": 1e-06, "loss": 0.0682, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5299.0, "completions/max_terminated_length": 5299.0, "completions/mean_length": 840.06640625, "completions/mean_terminated_length": 840.06640625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.5361267794789148, "grad_norm": 0.5493076193324969, "kl": 13.572265625, "learning_rate": 1e-06, "loss": 0.0987, "num_tokens": 192377175.0, "reward": 1.0707019567489624, "reward_std": 0.2522921860218048, "rewards/correct_answer_reward_func": 0.69140625, "rewards/format_reward_func": 0.9977995157241821, "rewards/python_attempt_reward_func": 1.751953125, "rewards/python_count_reward_func": 1.48046875, "rewards/python_reward_func": 0.9055501222610474, "rewards/tool_execution_reward_func": 0.8986793160438538, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.5372011818426, "grad_norm": 0.5395406178327915, "kl": 13.01953125, "learning_rate": 1e-06, "loss": 0.0981, "step": 500 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 192377175, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }