{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9642857142857143, "eval_steps": 500, "global_step": 675, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 112.375, "completions/mean_terminated_length": 112.375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.23133236169815063, "epoch": 0.0014285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.5609793066978455, "learning_rate": 1e-05, "loss": -0.0326, "num_tokens": 3827.0, "reward": 3.325000047683716, "reward_std": 0.4949747622013092, "rewards/accuracy_reward/mean": 0.949999988079071, "rewards/accuracy_reward/std": 0.1414213478565216, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.875, "rewards/operation_reward/std": 0.3535533845424652, "step": 1, "step_time": 81.05236951820552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 256.5, "completions/mean_terminated_length": 256.5, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.08870352059602737, "epoch": 0.002857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.1547786295413971, "learning_rate": 9.985714285714286e-06, "loss": -0.0235, "num_tokens": 8887.0, "reward": 2.25, "reward_std": 0.4105744957923889, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4105745255947113, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 2, "step_time": 89.81628125812858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 168.25, "completions/mean_terminated_length": 168.25, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.13035762310028076, "epoch": 0.004285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.4174831807613373, "learning_rate": 9.971428571428571e-06, "loss": -0.0269, "num_tokens": 13121.0, "reward": 2.125, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 3, "step_time": 85.45075305597857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 135.375, "completions/mean_terminated_length": 135.375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.10606934130191803, "epoch": 0.005714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.30262526869773865, "learning_rate": 9.957142857142858e-06, "loss": 0.0164, "num_tokens": 17148.0, "reward": 3.0, "reward_std": 0.5345224738121033, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.5345224738121033, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 4, "step_time": 90.89193713059649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 370.625, "completions/mean_terminated_length": 370.625, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "entropy": 0.08899108320474625, "epoch": 0.007142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.1735095977783203, "learning_rate": 9.942857142857145e-06, "loss": 0.0459, "num_tokens": 23041.0, "reward": 2.0, "reward_std": 0.37796446681022644, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.37796446681022644, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 5, "step_time": 697.1950137680396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 1012.875, "completions/mean_terminated_length": 979.5, "completions/min_length": 970.0, "completions/min_terminated_length": 970.0, "entropy": 0.08677688241004944, "epoch": 0.008571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.08864881098270416, "learning_rate": 9.92857142857143e-06, "loss": 0.0175, "num_tokens": 34176.0, "reward": 1.4155627489089966, "reward_std": 0.5720135569572449, "rewards/accuracy_reward/mean": 0.29056277871131897, "rewards/accuracy_reward/std": 0.34117603302001953, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 6, "step_time": 719.0970629318617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 263.625, "completions/mean_terminated_length": 263.625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.12077358365058899, "epoch": 0.01, "frac_reward_zero_std": 0.0, "grad_norm": 0.23833218216896057, "learning_rate": 9.914285714285715e-06, "loss": 0.0212, "num_tokens": 39181.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 7, "step_time": 90.16333405300975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 178.25, "completions/mean_terminated_length": 178.25, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.11874130368232727, "epoch": 0.011428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.373715341091156, "learning_rate": 9.9e-06, "loss": -0.0447, "num_tokens": 43519.0, "reward": 1.625, "reward_std": 0.2314550280570984, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.125, "rewards/grounding_reward/std": 0.2314550280570984, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 8, "step_time": 87.87877379404381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "entropy": 0.5435105562210083, "epoch": 0.012857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.13074618577957153, "learning_rate": 9.885714285714287e-06, "loss": 0.0, "num_tokens": 54647.0, "reward": 0.07834997773170471, "reward_std": 0.0034144099336117506, "rewards/accuracy_reward/mean": 0.07834997773170471, "rewards/accuracy_reward/std": 0.0034144094679504633, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 9, "step_time": 157.09298126818612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 291.375, "completions/mean_terminated_length": 291.375, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "entropy": 0.06250333786010742, "epoch": 0.014285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.2078467607498169, "learning_rate": 9.871428571428572e-06, "loss": 0.0346, "num_tokens": 59818.0, "reward": 3.2083334922790527, "reward_std": 0.5473601222038269, "rewards/accuracy_reward/mean": 0.9583333730697632, "rewards/accuracy_reward/std": 0.117851123213768, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.75, "rewards/operation_reward/std": 0.4629100561141968, "step": 10, "step_time": 93.21766773890704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 203.375, "completions/mean_terminated_length": 203.375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.0871114432811737, "epoch": 0.015714285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.2327105551958084, "learning_rate": 9.857142857142859e-06, "loss": -0.0003, "num_tokens": 64325.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.25, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 11, "step_time": 86.87931914674118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 212.5, "completions/mean_terminated_length": 212.5, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.26864996552467346, "epoch": 0.017142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.3883359134197235, "learning_rate": 9.842857142857144e-06, "loss": 0.385, "num_tokens": 68977.0, "reward": 2.0, "reward_std": 0.9258201122283936, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 12, "step_time": 102.92130590602756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 252.5, "completions/mean_terminated_length": 142.2857208251953, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.0667385458946228, "epoch": 0.018571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.2593940198421478, "learning_rate": 9.828571428571429e-06, "loss": 1.0801, "num_tokens": 73933.0, "reward": 2.3125, "reward_std": 0.5303300619125366, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 13, "step_time": 714.8113870490342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 392.25, "completions/mean_terminated_length": 302.0, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.2337883710861206, "epoch": 0.02, "frac_reward_zero_std": 0.0, "grad_norm": 0.21450196206569672, "learning_rate": 9.814285714285716e-06, "loss": 0.1915, "num_tokens": 79959.0, "reward": 1.6878886222839355, "reward_std": 0.5297552347183228, "rewards/accuracy_reward/mean": 0.25038856267929077, "rewards/accuracy_reward/std": 0.4626714885234833, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 14, "step_time": 125.55372785916552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 197.5, "completions/mean_terminated_length": 197.5, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.11117619276046753, "epoch": 0.02142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.32561758160591125, "learning_rate": 9.800000000000001e-06, "loss": 0.0081, "num_tokens": 84627.0, "reward": 2.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 15, "step_time": 680.7911861459725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 275.625, "completions/mean_terminated_length": 275.625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.17032423615455627, "epoch": 0.022857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.23390084505081177, "learning_rate": 9.785714285714286e-06, "loss": -0.0683, "num_tokens": 89792.0, "reward": 1.589184284210205, "reward_std": 0.31382328271865845, "rewards/accuracy_reward/mean": 0.8391842842102051, "rewards/accuracy_reward/std": 0.1369878351688385, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.25, "rewards/grounding_reward/std": 0.26726123690605164, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 16, "step_time": 114.87022498482838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 229.0, "completions/mean_terminated_length": 229.0, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.1287376582622528, "epoch": 0.024285714285714285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.771428571428571e-06, "loss": 0.0, "num_tokens": 94568.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 17, "step_time": 92.99945183610544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 910.375, "completions/mean_terminated_length": 842.2000122070312, "completions/min_length": 748.0, "completions/min_terminated_length": 748.0, "entropy": 0.06268319487571716, "epoch": 0.025714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.0980449914932251, "learning_rate": 9.757142857142858e-06, "loss": 0.0038, "num_tokens": 104859.0, "reward": 1.2757670879364014, "reward_std": 0.4676734209060669, "rewards/accuracy_reward/mean": 0.21326696872711182, "rewards/accuracy_reward/std": 0.16570737957954407, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 18, "step_time": 132.60368068004027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 853.375, "completions/mean_terminated_length": 796.5, "completions/min_length": 606.0, "completions/min_terminated_length": 606.0, "entropy": 0.049537647515535355, "epoch": 0.027142857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.08340201526880264, "learning_rate": 9.742857142857143e-06, "loss": 0.0321, "num_tokens": 114654.0, "reward": 1.3232142925262451, "reward_std": 0.6274262070655823, "rewards/accuracy_reward/mean": 0.45932537317276, "rewards/accuracy_reward/std": 0.47194704413414, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.4888889193534851, "rewards/grounding_reward/std": 0.34646108746528625, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 19, "step_time": 123.64643307868391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 351.625, "completions/mean_terminated_length": 255.57144165039062, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "entropy": 0.21501144766807556, "epoch": 0.02857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2841695547103882, "learning_rate": 9.72857142857143e-06, "loss": 0.6759, "num_tokens": 120411.0, "reward": 1.312804937362671, "reward_std": 0.5294674038887024, "rewards/accuracy_reward/mean": 0.8753049969673157, "rewards/accuracy_reward/std": 0.3526906967163086, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 20, "step_time": 143.7126657171175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 270.25, "completions/mean_terminated_length": 270.25, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.09548405557870865, "epoch": 0.03, "frac_reward_zero_std": 0.0, "grad_norm": 0.2301633358001709, "learning_rate": 9.714285714285715e-06, "loss": 0.3801, "num_tokens": 125541.0, "reward": 3.25, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 21, "step_time": 99.06531806755811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 727.0, "completions/mean_terminated_length": 232.0, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2721468210220337, "epoch": 0.03142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.11527972668409348, "learning_rate": 9.7e-06, "loss": 0.4514, "num_tokens": 134277.0, "reward": 1.6885387897491455, "reward_std": 1.1312021017074585, "rewards/accuracy_reward/mean": 0.25103887915611267, "rewards/accuracy_reward/std": 0.4622691869735718, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.25, "rewards/operation_reward/std": 0.4629100561141968, "step": 22, "step_time": 116.43373018875718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 852.375, "completions/mean_terminated_length": 749.4000244140625, "completions/min_length": 581.0, "completions/min_terminated_length": 581.0, "entropy": 0.11787652969360352, "epoch": 0.032857142857142856, "frac_reward_zero_std": 0.0, "grad_norm": 0.22188031673431396, "learning_rate": 9.685714285714285e-06, "loss": 0.1046, "num_tokens": 144056.0, "reward": 2.054612159729004, "reward_std": 0.924902081489563, "rewards/accuracy_reward/mean": 0.6171120405197144, "rewards/accuracy_reward/std": 0.49517878890037537, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 23, "step_time": 124.99735841434449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 436.625, "completions/mean_terminated_length": 436.625, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "entropy": 0.10812810063362122, "epoch": 0.03428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.1546686738729477, "learning_rate": 9.671428571428572e-06, "loss": -0.0902, "num_tokens": 150549.0, "reward": 1.6666667461395264, "reward_std": 0.6172134280204773, "rewards/accuracy_reward/mean": 0.4166666865348816, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.5345224738121033, "rewards/operation_reward/mean": 0.25, "rewards/operation_reward/std": 0.4629100561141968, "step": 24, "step_time": 114.49993559718132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 389.5, "completions/mean_terminated_length": 178.0, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.5456370711326599, "epoch": 0.03571428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.3191417157649994, "learning_rate": 9.657142857142859e-06, "loss": 0.4217, "num_tokens": 156617.0, "reward": 1.6251497268676758, "reward_std": 0.7904342412948608, "rewards/accuracy_reward/mean": 0.7501497268676758, "rewards/accuracy_reward/std": 0.46263283491134644, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.375, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 25, "step_time": 135.23820763733238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 189.25, "completions/mean_terminated_length": 189.25, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.14174124598503113, "epoch": 0.037142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.2856772541999817, "learning_rate": 9.642857142857144e-06, "loss": -0.0002, "num_tokens": 161123.0, "reward": 1.546875, "reward_std": 0.09300297498703003, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.046875, "rewards/grounding_reward/std": 0.09300298243761063, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 26, "step_time": 91.84412011411041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 163.875, "completions/mean_terminated_length": 163.875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.10954196006059647, "epoch": 0.03857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.3335597813129425, "learning_rate": 9.62857142857143e-06, "loss": -0.0399, "num_tokens": 165282.0, "reward": 2.3499999046325684, "reward_std": 0.09258206933736801, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.8500000238418579, "rewards/grounding_reward/std": 0.09258200973272324, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 27, "step_time": 86.53698744438589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 157.25, "completions/mean_terminated_length": 157.25, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.1836365908384323, "epoch": 0.04, "frac_reward_zero_std": 0.0, "grad_norm": 0.35037392377853394, "learning_rate": 9.614285714285714e-06, "loss": -0.05, "num_tokens": 169460.0, "reward": 3.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 28, "step_time": 82.60879946127534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 271.125, "completions/mean_terminated_length": 271.125, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.08885609358549118, "epoch": 0.041428571428571426, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.600000000000001e-06, "loss": 0.0, "num_tokens": 174517.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 29, "step_time": 87.62849596422166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 365.875, "completions/mean_terminated_length": 365.875, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "entropy": 0.17060568928718567, "epoch": 0.04285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.1992589682340622, "learning_rate": 9.585714285714286e-06, "loss": -0.0423, "num_tokens": 180420.0, "reward": 1.542932391166687, "reward_std": 0.005239177029579878, "rewards/accuracy_reward/mean": 0.042932383716106415, "rewards/accuracy_reward/std": 0.0052391584031283855, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 30, "step_time": 98.84185803029686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 100.5, "completions/mean_terminated_length": 100.5, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.21763299405574799, "epoch": 0.04428571428571428, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.571428571428573e-06, "loss": 0.0, "num_tokens": 184104.0, "reward": 3.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 31, "step_time": 79.05830257758498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 176.625, "completions/mean_terminated_length": 176.625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.09709011018276215, "epoch": 0.045714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.3196500241756439, "learning_rate": 9.557142857142858e-06, "loss": -0.0133, "num_tokens": 188381.0, "reward": 3.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.875, "rewards/operation_reward/std": 0.3535533845424652, "step": 32, "step_time": 81.70562272053212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 646.875, "completions/mean_terminated_length": 521.1666870117188, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.3217156231403351, "epoch": 0.047142857142857146, "frac_reward_zero_std": 0.0, "grad_norm": 0.1914556622505188, "learning_rate": 9.542857142857143e-06, "loss": 0.3452, "num_tokens": 196492.0, "reward": 1.9968438148498535, "reward_std": 0.6515178680419922, "rewards/accuracy_reward/mean": 0.6218438148498535, "rewards/accuracy_reward/std": 0.4419746994972229, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 33, "step_time": 114.12832602113485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 179.625, "completions/mean_terminated_length": 179.625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.18561649322509766, "epoch": 0.04857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.47770956158638, "learning_rate": 9.528571428571429e-06, "loss": 0.0432, "num_tokens": 200921.0, "reward": 1.341723918914795, "reward_std": 0.05108967050909996, "rewards/accuracy_reward/mean": 0.34172385931015015, "rewards/accuracy_reward/std": 0.051089636981487274, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 34, "step_time": 87.17553806956857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 140.5, "completions/mean_terminated_length": 140.5, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.16631567478179932, "epoch": 0.05, "frac_reward_zero_std": 0.0, "grad_norm": 0.3586568832397461, "learning_rate": 9.514285714285715e-06, "loss": 0.0204, "num_tokens": 204893.0, "reward": 2.4761905670166016, "reward_std": 0.7741366028785706, "rewards/accuracy_reward/mean": 0.976190447807312, "rewards/accuracy_reward/std": 0.02545345015823841, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.5345224738121033, "rewards/operation_reward/mean": 0.5, "rewards/operation_reward/std": 0.5345224738121033, "step": 35, "step_time": 91.66520551871508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 281.125, "completions/mean_terminated_length": 281.125, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "entropy": 0.08494611829519272, "epoch": 0.05142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.20365875959396362, "learning_rate": 9.5e-06, "loss": 0.0486, "num_tokens": 210014.0, "reward": 2.274350643157959, "reward_std": 0.532804012298584, "rewards/accuracy_reward/mean": 0.274350643157959, "rewards/accuracy_reward/std": 0.004591604229062796, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.625, "rewards/operation_reward/std": 0.5175492167472839, "step": 36, "step_time": 97.86779625713825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 180.5, "completions/mean_terminated_length": 180.5, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.20248670876026154, "epoch": 0.05285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.31842201948165894, "learning_rate": 9.485714285714287e-06, "loss": -0.1655, "num_tokens": 214418.0, "reward": 2.9375, "reward_std": 0.4172614812850952, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 37, "step_time": 127.67231516726315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 105.5, "completions/mean_terminated_length": 105.5, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.1842021346092224, "epoch": 0.054285714285714284, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.471428571428572e-06, "loss": 0.0, "num_tokens": 218198.0, "reward": 3.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 38, "step_time": 84.15025832597166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 197.75, "completions/mean_terminated_length": 197.75, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.17668366432189941, "epoch": 0.055714285714285716, "frac_reward_zero_std": 0.0, "grad_norm": 0.22432270646095276, "learning_rate": 9.457142857142858e-06, "loss": 0.0026, "num_tokens": 222620.0, "reward": 2.5734267234802246, "reward_std": 0.3829680383205414, "rewards/accuracy_reward/mean": 0.9484266042709351, "rewards/accuracy_reward/std": 0.08324217051267624, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 39, "step_time": 98.37827914953232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 199.625, "completions/mean_terminated_length": 199.625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.08562007546424866, "epoch": 0.05714285714285714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.442857142857144e-06, "loss": 0.0, "num_tokens": 227225.0, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 40, "step_time": 93.08426951617002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 553.5, "completions/mean_terminated_length": 553.5, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "entropy": 0.1576833873987198, "epoch": 0.05857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.17112162709236145, "learning_rate": 9.42857142857143e-06, "loss": -0.0397, "num_tokens": 234669.0, "reward": 1.644230842590332, "reward_std": 0.9234774112701416, "rewards/accuracy_reward/mean": 0.7692307829856873, "rewards/accuracy_reward/std": 0.31849178671836853, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.25, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 41, "step_time": 137.63717511948198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.375, "completions/mean_terminated_length": 233.375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.07285911589860916, "epoch": 0.06, "frac_reward_zero_std": 0.0, "grad_norm": 0.19383585453033447, "learning_rate": 9.414285714285715e-06, "loss": 0.002, "num_tokens": 239408.0, "reward": 2.6136364936828613, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 0.8636363744735718, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.25, "rewards/operation_reward/std": 0.4629100561141968, "step": 42, "step_time": 96.3718024333939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 316.0, "completions/mean_terminated_length": 316.0, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "entropy": 0.11704403907060623, "epoch": 0.06142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.1829719990491867, "learning_rate": 9.4e-06, "loss": -0.0325, "num_tokens": 245168.0, "reward": 2.75, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.25, "rewards/operation_reward/std": 0.4629100561141968, "step": 43, "step_time": 93.2459330232814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 964.375, "completions/mean_terminated_length": 865.0, "completions/min_length": 780.0, "completions/min_terminated_length": 780.0, "entropy": 0.16659630835056305, "epoch": 0.06285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.10716080665588379, "learning_rate": 9.385714285714287e-06, "loss": 0.0382, "num_tokens": 256059.0, "reward": 1.0696983337402344, "reward_std": 0.6811153888702393, "rewards/accuracy_reward/mean": 0.12803159654140472, "rewards/accuracy_reward/std": 0.3523353040218353, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.7541666626930237, "rewards/grounding_reward/std": 0.351386159658432, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 44, "step_time": 163.55650261882693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 933.75, "completions/mean_terminated_length": 783.3333740234375, "completions/min_length": 517.0, "completions/min_terminated_length": 517.0, "entropy": 0.42897742986679077, "epoch": 0.06428571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.11840242892503738, "learning_rate": 9.371428571428572e-06, "loss": 0.0528, "num_tokens": 266385.0, "reward": 1.3599274158477783, "reward_std": 0.5405157208442688, "rewards/accuracy_reward/mean": 0.17242743074893951, "rewards/accuracy_reward/std": 0.3449694514274597, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 45, "step_time": 137.06672361958772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 305.125, "completions/mean_terminated_length": 305.125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "entropy": 0.0770406723022461, "epoch": 0.06571428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.15384772419929504, "learning_rate": 9.357142857142859e-06, "loss": 0.0456, "num_tokens": 271690.0, "reward": 3.0, "reward_std": 0.5345224738121033, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.5, "rewards/operation_reward/std": 0.5345224738121033, "step": 46, "step_time": 97.86804219707847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 245.875, "completions/mean_terminated_length": 245.875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.06801162660121918, "epoch": 0.06714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.25861090421676636, "learning_rate": 9.342857142857144e-06, "loss": -0.0731, "num_tokens": 276497.0, "reward": 3.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.875, "rewards/operation_reward/std": 0.3535533845424652, "step": 47, "step_time": 102.91827464476228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 285.375, "completions/mean_terminated_length": 285.375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "entropy": 0.11873869597911835, "epoch": 0.06857142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.328571428571429e-06, "loss": 0.0, "num_tokens": 281700.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 48, "step_time": 97.9573125699535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 102.0, "completions/max_terminated_length": 102.0, "completions/mean_length": 82.75, "completions/mean_terminated_length": 82.75, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.16869713366031647, "epoch": 0.07, "frac_reward_zero_std": 0.0, "grad_norm": 0.668472945690155, "learning_rate": 9.314285714285714e-06, "loss": 0.1143, "num_tokens": 285242.0, "reward": 2.625, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 49, "step_time": 117.90744758304209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 438.125, "completions/mean_terminated_length": 438.125, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "entropy": 0.06206681579351425, "epoch": 0.07142857142857142, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.3e-06, "loss": 0.0, "num_tokens": 291643.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 50, "step_time": 94.75874059461057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 903.375, "completions/mean_terminated_length": 541.5, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "entropy": 0.27943116426467896, "epoch": 0.07285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.22287119925022125, "learning_rate": 9.285714285714288e-06, "loss": 0.1028, "num_tokens": 301902.0, "reward": 0.9188361167907715, "reward_std": 0.7099953293800354, "rewards/accuracy_reward/mean": 0.16883611679077148, "rewards/accuracy_reward/std": 0.3466890752315521, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.41547447443008423, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 51, "step_time": 120.14278678875417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 798.375, "completions/mean_terminated_length": 798.375, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "entropy": 0.05044899508357048, "epoch": 0.07428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.0866091251373291, "learning_rate": 9.271428571428573e-06, "loss": 0.0476, "num_tokens": 311289.0, "reward": 2.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 52, "step_time": 132.25309958308935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 398.25, "completions/mean_terminated_length": 398.25, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "entropy": 0.1963346153497696, "epoch": 0.07571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.20512594282627106, "learning_rate": 9.257142857142858e-06, "loss": 0.0169, "num_tokens": 317491.0, "reward": 2.0399160385131836, "reward_std": 0.3346431255340576, "rewards/accuracy_reward/mean": 0.9149159789085388, "rewards/accuracy_reward/std": 0.24065397679805756, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.17251639068126678, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 53, "step_time": 132.33114616107196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 252.5, "completions/mean_terminated_length": 252.5, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.1364986002445221, "epoch": 0.07714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2639947235584259, "learning_rate": 9.242857142857143e-06, "loss": -0.0636, "num_tokens": 322487.0, "reward": 2.362628936767578, "reward_std": 0.0630231574177742, "rewards/accuracy_reward/mean": 0.8626289367675781, "rewards/accuracy_reward/std": 0.0630231648683548, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 54, "step_time": 97.81996355392039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 146.0, "completions/mean_terminated_length": 146.0, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.08144776523113251, "epoch": 0.07857142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.22857142857143e-06, "loss": 0.0, "num_tokens": 326591.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 55, "step_time": 101.88790861424059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 321.875, "completions/mean_terminated_length": 321.875, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.24485667049884796, "epoch": 0.08, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.214285714285715e-06, "loss": 0.0, "num_tokens": 332222.0, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 56, "step_time": 136.90945351403207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 852.875, "completions/mean_terminated_length": 681.75, "completions/min_length": 603.0, "completions/min_terminated_length": 603.0, "entropy": 0.18334731459617615, "epoch": 0.08142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.11860033869743347, "learning_rate": 9.200000000000002e-06, "loss": 0.1733, "num_tokens": 341981.0, "reward": 1.3192213773727417, "reward_std": 0.535727858543396, "rewards/accuracy_reward/mean": 0.4025547504425049, "rewards/accuracy_reward/std": 0.4286941587924957, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.26726123690605164, "rewards/grounding_reward/mean": 0.6666666865348816, "rewards/grounding_reward/std": 0.38832157850265503, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 57, "step_time": 142.56203333940357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 537.0, "completions/mean_terminated_length": 537.0, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "entropy": 0.2271047979593277, "epoch": 0.08285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.1726161241531372, "learning_rate": 9.185714285714287e-06, "loss": 0.0792, "num_tokens": 349197.0, "reward": 2.2757225036621094, "reward_std": 0.12304234504699707, "rewards/accuracy_reward/mean": 0.9090559482574463, "rewards/accuracy_reward/std": 0.06343474984169006, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.8666666746139526, "rewards/grounding_reward/std": 0.08357109129428864, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 58, "step_time": 144.03747341316193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 177.0, "completions/mean_terminated_length": 177.0, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.08533079922199249, "epoch": 0.08428571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.256120890378952, "learning_rate": 9.171428571428572e-06, "loss": 0.0491, "num_tokens": 353589.0, "reward": 2.3125, "reward_std": 0.2587745785713196, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.8125, "rewards/grounding_reward/std": 0.25877460837364197, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 59, "step_time": 123.08634363114834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 119.0, "completions/mean_terminated_length": 119.0, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.1712101697921753, "epoch": 0.08571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.6350891590118408, "learning_rate": 9.157142857142857e-06, "loss": -0.0089, "num_tokens": 357461.0, "reward": 3.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 60, "step_time": 79.79172434844077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 515.625, "completions/mean_terminated_length": 515.625, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "entropy": 0.06621311604976654, "epoch": 0.08714285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.10512148588895798, "learning_rate": 9.142857142857144e-06, "loss": 0.0174, "num_tokens": 364610.0, "reward": 2.4583334922790527, "reward_std": 0.11785107105970383, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.9583333730697632, "rewards/grounding_reward/std": 0.117851123213768, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 61, "step_time": 123.0260962229222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 843.375, "completions/mean_terminated_length": 301.5, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.24902333319187164, "epoch": 0.08857142857142856, "frac_reward_zero_std": 0.0, "grad_norm": 0.08440423756837845, "learning_rate": 9.128571428571429e-06, "loss": 0.2916, "num_tokens": 374301.0, "reward": 1.066637635231018, "reward_std": 0.3182269334793091, "rewards/accuracy_reward/mean": 0.004137582145631313, "rewards/accuracy_reward/std": 0.0025756575632840395, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.9375, "rewards/grounding_reward/std": 0.1767766922712326, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 62, "step_time": 126.51343337632716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 219.875, "completions/mean_terminated_length": 219.875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.09049366414546967, "epoch": 0.09, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.114285714285716e-06, "loss": 0.0, "num_tokens": 378948.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 63, "step_time": 91.7749366313219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 118.5, "completions/mean_terminated_length": 118.5, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.1459290087223053, "epoch": 0.09142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.38221633434295654, "learning_rate": 9.100000000000001e-06, "loss": 0.0432, "num_tokens": 382816.0, "reward": 2.4375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.9375, "rewards/grounding_reward/std": 0.1767766922712326, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 64, "step_time": 86.96864161081612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 533.25, "completions/mean_terminated_length": 463.14288330078125, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "entropy": 0.3073796331882477, "epoch": 0.09285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.1839277148246765, "learning_rate": 9.085714285714286e-06, "loss": 0.3396, "num_tokens": 390010.0, "reward": 1.0878241062164307, "reward_std": 0.6003710031509399, "rewards/accuracy_reward/mean": 0.00032404405646957457, "rewards/accuracy_reward/std": 0.000916534976568073, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.6499999761581421, "rewards/grounding_reward/std": 0.4869731366634369, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 65, "step_time": 161.9173893192783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 614.75, "completions/mean_terminated_length": 478.3333435058594, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.32148277759552, "epoch": 0.09428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.19237640500068665, "learning_rate": 9.071428571428573e-06, "loss": 0.3121, "num_tokens": 397808.0, "reward": 1.437806248664856, "reward_std": 0.8629680871963501, "rewards/accuracy_reward/mean": 0.00030629197135567665, "rewards/accuracy_reward/std": 0.0005681972252205014, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.8125, "rewards/grounding_reward/std": 0.3720119297504425, "rewards/operation_reward/mean": 0.25, "rewards/operation_reward/std": 0.4629100561141968, "step": 66, "step_time": 126.58065752778202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 852.5, "completions/mean_terminated_length": 749.6000366210938, "completions/min_length": 630.0, "completions/min_terminated_length": 630.0, "entropy": 0.2532240152359009, "epoch": 0.09571428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.11307408660650253, "learning_rate": 9.057142857142858e-06, "loss": 0.1385, "num_tokens": 407596.0, "reward": 1.8168237209320068, "reward_std": 0.9921762943267822, "rewards/accuracy_reward/mean": 0.6293236613273621, "rewards/accuracy_reward/std": 0.5115820169448853, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 67, "step_time": 128.3620256781578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 230.5, "completions/mean_terminated_length": 230.5, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.06792756170034409, "epoch": 0.09714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.1800834834575653, "learning_rate": 9.042857142857143e-06, "loss": 0.0399, "num_tokens": 412312.0, "reward": 2.174999952316284, "reward_std": 0.4527692496776581, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.675000011920929, "rewards/grounding_reward/std": 0.45276927947998047, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 68, "step_time": 95.23643351346254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 183.875, "completions/mean_terminated_length": 183.875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.1600443720817566, "epoch": 0.09857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2858518958091736, "learning_rate": 9.028571428571428e-06, "loss": 0.0261, "num_tokens": 416679.0, "reward": 3.0, "reward_std": 0.5345224738121033, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.5345224738121033, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 69, "step_time": 90.39904900547117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 113.625, "completions/mean_terminated_length": 113.625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.17616945505142212, "epoch": 0.1, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.014285714285715e-06, "loss": 0.0, "num_tokens": 420540.0, "reward": 3.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 70, "step_time": 86.97959507443011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 173.25, "completions/mean_terminated_length": 173.25, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.11190485209226608, "epoch": 0.10142857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.2689554691314697, "learning_rate": 9e-06, "loss": -0.0569, "num_tokens": 424878.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 71, "step_time": 88.15774825774133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 159.875, "completions/mean_terminated_length": 159.875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.16341981291770935, "epoch": 0.10285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 1.1403242349624634, "learning_rate": 8.985714285714287e-06, "loss": -0.035, "num_tokens": 429141.0, "reward": 3.25, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 72, "step_time": 100.82904171943665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 266.5, "completions/mean_terminated_length": 266.5, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "entropy": 0.0776268020272255, "epoch": 0.10428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.14891530573368073, "learning_rate": 8.971428571428572e-06, "loss": -0.0457, "num_tokens": 434217.0, "reward": 2.2291665077209473, "reward_std": 0.4537104368209839, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.7291666269302368, "rewards/grounding_reward/std": 0.4537104666233063, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 73, "step_time": 95.4099006447941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 467.375, "completions/mean_terminated_length": 387.8571472167969, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "entropy": 0.16079676151275635, "epoch": 0.10571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.12474522739648819, "learning_rate": 8.957142857142857e-06, "loss": -0.0808, "num_tokens": 440908.0, "reward": 1.0625, "reward_std": 0.4955156147480011, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.25, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 74, "step_time": 118.6404134016484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 418.125, "completions/mean_terminated_length": 418.125, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "entropy": 0.10123102366924286, "epoch": 0.10714285714285714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.942857142857142e-06, "loss": 0.0, "num_tokens": 447253.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 75, "step_time": 110.05331820715219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 156.75, "completions/mean_terminated_length": 156.75, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.18553991615772247, "epoch": 0.10857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.26509395241737366, "learning_rate": 8.92857142857143e-06, "loss": 0.0468, "num_tokens": 451523.0, "reward": 1.625, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.25, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 76, "step_time": 85.76915708836168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 710.25, "completions/mean_terminated_length": 396.5, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "entropy": 0.36522552371025085, "epoch": 0.11, "frac_reward_zero_std": 0.0, "grad_norm": 0.13628381490707397, "learning_rate": 8.914285714285716e-06, "loss": 0.2965, "num_tokens": 460269.0, "reward": 0.8761724233627319, "reward_std": 0.9151667356491089, "rewards/accuracy_reward/mean": 0.25117242336273193, "rewards/accuracy_reward/std": 0.46218761801719666, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.26726123690605164, "rewards/grounding_reward/mean": 0.375, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 77, "step_time": 113.52084034122527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 300.375, "completions/mean_terminated_length": 300.375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "entropy": 0.14609159529209137, "epoch": 0.11142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.21424590051174164, "learning_rate": 8.900000000000001e-06, "loss": 0.1014, "num_tokens": 465632.0, "reward": 0.9008945226669312, "reward_std": 0.46617382764816284, "rewards/accuracy_reward/mean": 0.15089452266693115, "rewards/accuracy_reward/std": 0.01680837944149971, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.25, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 78, "step_time": 108.75656279921532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 140.75, "completions/mean_terminated_length": 140.75, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.19741252064704895, "epoch": 0.11285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.30121487379074097, "learning_rate": 8.885714285714286e-06, "loss": 0.0195, "num_tokens": 469718.0, "reward": 2.625, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.125, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 79, "step_time": 86.29160126764327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 260.375, "completions/mean_terminated_length": 260.375, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "entropy": 0.08462002873420715, "epoch": 0.11428571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.18662147223949432, "learning_rate": 8.871428571428571e-06, "loss": -0.0054, "num_tokens": 474673.0, "reward": 1.6500000953674316, "reward_std": 0.2070196568965912, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.15000000596046448, "rewards/grounding_reward/std": 0.20701968669891357, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 80, "step_time": 88.59023492224514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 162.125, "completions/mean_terminated_length": 162.125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.1645142138004303, "epoch": 0.11571428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.40052080154418945, "learning_rate": 8.857142857142858e-06, "loss": 0.0096, "num_tokens": 478834.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 81, "step_time": 82.73973150271922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 604.0, "completions/mean_terminated_length": 464.0, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.12116430699825287, "epoch": 0.11714285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.14770115911960602, "learning_rate": 8.842857142857143e-06, "loss": 0.3833, "num_tokens": 486538.0, "reward": 1.000359296798706, "reward_std": 0.7065262198448181, "rewards/accuracy_reward/mean": 0.00035923265386372805, "rewards/accuracy_reward/std": 0.0006698482902720571, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 82, "step_time": 119.60564264282584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 125.875, "completions/mean_terminated_length": 125.875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.2559327483177185, "epoch": 0.11857142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.3326825797557831, "learning_rate": 8.82857142857143e-06, "loss": -0.0074, "num_tokens": 490465.0, "reward": 2.5, "reward_std": 0.5345224738121033, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.125, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.875, "rewards/operation_reward/std": 0.3535533845424652, "step": 83, "step_time": 97.44464872498065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 415.625, "completions/mean_terminated_length": 415.625, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "entropy": 0.05120926722884178, "epoch": 0.12, "frac_reward_zero_std": 0.0, "grad_norm": 0.12439953535795212, "learning_rate": 8.814285714285715e-06, "loss": -0.005, "num_tokens": 496686.0, "reward": 1.9749999046325684, "reward_std": 0.3845219016075134, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.4750000238418579, "rewards/grounding_reward/std": 0.3845219910144806, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 84, "step_time": 96.68272523209453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 188.125, "completions/mean_terminated_length": 188.125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.11289907246828079, "epoch": 0.12142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.3292028605937958, "learning_rate": 8.8e-06, "loss": -0.0342, "num_tokens": 501167.0, "reward": 2.0, "reward_std": 0.5345224738121033, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.5345224738121033, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 85, "step_time": 86.01116502285004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 112.875, "completions/mean_terminated_length": 112.875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.18836544454097748, "epoch": 0.12285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.4383413791656494, "learning_rate": 8.785714285714286e-06, "loss": -0.0155, "num_tokens": 504998.0, "reward": 3.0, "reward_std": 0.5345224738121033, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.5345224738121033, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 86, "step_time": 92.52169709745795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 267.875, "completions/mean_terminated_length": 159.85714721679688, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.457160621881485, "epoch": 0.12428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.3643335998058319, "learning_rate": 8.771428571428572e-06, "loss": 0.693, "num_tokens": 510045.0, "reward": 1.0627312660217285, "reward_std": 0.6227613091468811, "rewards/accuracy_reward/mean": 0.6252312660217285, "rewards/accuracy_reward/std": 0.5172303318977356, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 87, "step_time": 118.5987621517852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 177.125, "completions/mean_terminated_length": 177.125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.10162097215652466, "epoch": 0.12571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.252505362033844, "learning_rate": 8.757142857142858e-06, "loss": -0.0441, "num_tokens": 514542.0, "reward": 1.8125, "reward_std": 0.2587745785713196, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.3125, "rewards/grounding_reward/std": 0.25877460837364197, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 88, "step_time": 91.53248781617731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 422.125, "completions/mean_terminated_length": 336.14288330078125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.1335451900959015, "epoch": 0.12714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.15003372728824615, "learning_rate": 8.742857142857144e-06, "loss": 0.2166, "num_tokens": 520791.0, "reward": 0.9116697907447815, "reward_std": 0.632841944694519, "rewards/accuracy_reward/mean": 0.0991697907447815, "rewards/accuracy_reward/std": 0.182888001203537, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.375, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 89, "step_time": 108.27178625762463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 988.375, "completions/mean_terminated_length": 929.0, "completions/min_length": 860.0, "completions/min_terminated_length": 860.0, "entropy": 0.06276170164346695, "epoch": 0.12857142857142856, "frac_reward_zero_std": 0.0, "grad_norm": 0.10925080627202988, "learning_rate": 8.72857142857143e-06, "loss": 0.0251, "num_tokens": 531754.0, "reward": 1.252626657485962, "reward_std": 0.6535099148750305, "rewards/accuracy_reward/mean": 0.2526266574859619, "rewards/accuracy_reward/std": 0.4612911641597748, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.8125, "rewards/grounding_reward/std": 0.3720119297504425, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 90, "step_time": 126.27851253468543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 167.0, "completions/mean_terminated_length": 167.0, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.1819160431623459, "epoch": 0.13, "frac_reward_zero_std": 0.0, "grad_norm": 0.36465147137641907, "learning_rate": 8.714285714285715e-06, "loss": -0.068, "num_tokens": 536042.0, "reward": 1.25, "reward_std": 0.6546536684036255, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 91, "step_time": 147.1874639140442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 525.25, "completions/mean_terminated_length": 454.0000305175781, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "entropy": 0.13710999488830566, "epoch": 0.13142857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.2809256911277771, "learning_rate": 8.700000000000001e-06, "loss": 0.2385, "num_tokens": 543204.0, "reward": 1.4025330543518066, "reward_std": 0.17621482908725739, "rewards/accuracy_reward/mean": 0.00044964029802940786, "rewards/accuracy_reward/std": 0.0012717748759314418, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.9645833373069763, "rewards/grounding_reward/std": 0.0726141706109047, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 92, "step_time": 146.49298314284533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 495.25, "completions/mean_terminated_length": 495.25, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "entropy": 0.084578737616539, "epoch": 0.13285714285714287, "frac_reward_zero_std": 0.0, "grad_norm": 0.11676168441772461, "learning_rate": 8.685714285714287e-06, "loss": 0.0484, "num_tokens": 550086.0, "reward": 1.125, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 93, "step_time": 121.96319874562323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 926.125, "completions/mean_terminated_length": 241.0, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "entropy": 0.23720373213291168, "epoch": 0.13428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.1267077922821045, "learning_rate": 8.671428571428572e-06, "loss": 0.0132, "num_tokens": 560375.0, "reward": 0.4381479024887085, "reward_std": 0.4955137372016907, "rewards/accuracy_reward/mean": 0.0006478829891420901, "rewards/accuracy_reward/std": 0.000264011905528605, "rewards/format_reward/mean": 0.0625, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.375, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 94, "step_time": 112.9975449219346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 301.0, "completions/mean_terminated_length": 301.0, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.40875014662742615, "epoch": 0.1357142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.3473125696182251, "learning_rate": 8.657142857142858e-06, "loss": -0.3363, "num_tokens": 565735.0, "reward": 2.489285945892334, "reward_std": 0.014787063002586365, "rewards/accuracy_reward/mean": 0.9892857074737549, "rewards/accuracy_reward/std": 0.014787118881940842, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 95, "step_time": 106.59222892764956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "entropy": 0.26024559140205383, "epoch": 0.13714285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.2051348239183426, "learning_rate": 8.642857142857144e-06, "loss": 0.0, "num_tokens": 577039.0, "reward": 0.7549313306808472, "reward_std": 0.4626747667789459, "rewards/accuracy_reward/mean": 0.004931293427944183, "rewards/accuracy_reward/std": 0.0006177047034725547, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 96, "step_time": 133.38634660374373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 339.375, "completions/mean_terminated_length": 241.57144165039062, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.20878978073596954, "epoch": 0.13857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.17487692832946777, "learning_rate": 8.628571428571429e-06, "loss": -0.1708, "num_tokens": 582754.0, "reward": 0.8128564953804016, "reward_std": 0.45823055505752563, "rewards/accuracy_reward/mean": 0.0003565062361303717, "rewards/accuracy_reward/std": 0.0010083519155159593, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.375, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 97, "step_time": 148.55678519699723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 892.5, "completions/mean_terminated_length": 498.0, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "entropy": 0.3192614018917084, "epoch": 0.14, "frac_reward_zero_std": 0.0, "grad_norm": 0.12858900427818298, "learning_rate": 8.614285714285716e-06, "loss": 0.1037, "num_tokens": 592870.0, "reward": 0.5110350847244263, "reward_std": 0.4501999020576477, "rewards/accuracy_reward/mean": 0.05652119964361191, "rewards/accuracy_reward/std": 0.06810212880373001, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.32951390743255615, "rewards/grounding_reward/std": 0.42077064514160156, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 98, "step_time": 119.9899192256853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 231.625, "completions/mean_terminated_length": 231.625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.0758453980088234, "epoch": 0.14142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.25051748752593994, "learning_rate": 8.6e-06, "loss": 0.0036, "num_tokens": 597619.0, "reward": 1.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 99, "step_time": 118.52312322612852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 151.625, "completions/mean_terminated_length": 151.625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.5670102834701538, "epoch": 0.14285714285714285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.585714285714286e-06, "loss": 0.0, "num_tokens": 601752.0, "reward": 3.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 100, "step_time": 95.6491855494678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 180.375, "completions/mean_terminated_length": 180.375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.23758427798748016, "epoch": 0.1442857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.43120306730270386, "learning_rate": 8.571428571428571e-06, "loss": 0.0899, "num_tokens": 606179.0, "reward": 2.0235958099365234, "reward_std": 0.36950331926345825, "rewards/accuracy_reward/mean": 0.6485958099365234, "rewards/accuracy_reward/std": 0.3025321662425995, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 101, "step_time": 95.32027253229171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 133.25, "completions/mean_terminated_length": 133.25, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.06751192361116409, "epoch": 0.1457142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.29203125834465027, "learning_rate": 8.557142857142858e-06, "loss": 0.0702, "num_tokens": 610205.0, "reward": 2.0, "reward_std": 0.5345224738121033, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.5345224738121033, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 102, "step_time": 99.65379269979894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 567.25, "completions/mean_terminated_length": 415.0, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.320734441280365, "epoch": 0.14714285714285713, "frac_reward_zero_std": 0.0, "grad_norm": 0.12473223358392715, "learning_rate": 8.542857142857145e-06, "loss": 0.2879, "num_tokens": 617663.0, "reward": 0.7755331993103027, "reward_std": 0.4975958466529846, "rewards/accuracy_reward/mean": 0.000533201964572072, "rewards/accuracy_reward/std": 0.00099246297031641, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.4000000059604645, "rewards/grounding_reward/std": 0.5014265775680542, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 103, "step_time": 114.02963600400835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 279.625, "completions/mean_terminated_length": 279.625, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.05871732160449028, "epoch": 0.14857142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.16408930718898773, "learning_rate": 8.52857142857143e-06, "loss": 0.0437, "num_tokens": 622788.0, "reward": 2.2750000953674316, "reward_std": 0.4200340211391449, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.7749999761581421, "rewards/grounding_reward/std": 0.4200340509414673, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 104, "step_time": 88.240812596865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 246.125, "completions/mean_terminated_length": 246.125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.11919582635164261, "epoch": 0.15, "frac_reward_zero_std": 0.0, "grad_norm": 0.24893322587013245, "learning_rate": 8.514285714285715e-06, "loss": 0.0293, "num_tokens": 627669.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.75, "rewards/operation_reward/std": 0.4629100561141968, "step": 105, "step_time": 84.16843869443983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 170.75, "completions/mean_terminated_length": 170.75, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.08868174999952316, "epoch": 0.15142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.28486931324005127, "learning_rate": 8.5e-06, "loss": 0.0288, "num_tokens": 631939.0, "reward": 1.75, "reward_std": 0.26726123690605164, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.25, "rewards/grounding_reward/std": 0.26726123690605164, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 106, "step_time": 90.43439272232354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 148.875, "completions/mean_terminated_length": 148.875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.06419923156499863, "epoch": 0.15285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.2061934918165207, "learning_rate": 8.485714285714287e-06, "loss": -0.0145, "num_tokens": 636066.0, "reward": 0.5625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0625, "rewards/grounding_reward/std": 0.1767766922712326, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 107, "step_time": 120.31422007083893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 172.375, "completions/mean_terminated_length": 172.375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.1059049665927887, "epoch": 0.15428571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.4916057884693146, "learning_rate": 8.471428571428572e-06, "loss": 0.0197, "num_tokens": 640397.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.4375, "rewards/grounding_reward/std": 0.1767766922712326, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 108, "step_time": 89.63283574953675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 136.25, "completions/mean_terminated_length": 136.25, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.10357363522052765, "epoch": 0.15571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.2132723480463028, "learning_rate": 8.457142857142859e-06, "loss": -0.0577, "num_tokens": 644399.0, "reward": 2.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 109, "step_time": 112.59133645799011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 468.625, "completions/mean_terminated_length": 135.40000915527344, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.36064720153808594, "epoch": 0.15714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.22405728697776794, "learning_rate": 8.442857142857144e-06, "loss": 0.8277, "num_tokens": 651092.0, "reward": 1.6896615028381348, "reward_std": 1.1602227687835693, "rewards/accuracy_reward/mean": 0.6271615028381348, "rewards/accuracy_reward/std": 0.5145661234855652, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.125, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.625, "rewards/operation_reward/std": 0.5175492167472839, "step": 110, "step_time": 108.37543955724686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 325.375, "completions/mean_terminated_length": 225.57144165039062, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.05266247317194939, "epoch": 0.15857142857142856, "frac_reward_zero_std": 0.0, "grad_norm": 0.17620418965816498, "learning_rate": 8.428571428571429e-06, "loss": 0.5296, "num_tokens": 656719.0, "reward": 2.0625, "reward_std": 0.6232117414474487, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 111, "step_time": 121.34094641916454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 180.5, "completions/mean_terminated_length": 180.5, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.1378675401210785, "epoch": 0.16, "frac_reward_zero_std": 0.0, "grad_norm": 0.30466270446777344, "learning_rate": 8.414285714285714e-06, "loss": 0.0911, "num_tokens": 661131.0, "reward": 1.625, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.125, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 112, "step_time": 86.5736198509112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 289.375, "completions/mean_terminated_length": 289.375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.1874208301305771, "epoch": 0.16142857142857142, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.400000000000001e-06, "loss": 0.0, "num_tokens": 666398.0, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 113, "step_time": 93.74828237295151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 216.25, "completions/mean_terminated_length": 216.25, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.12319270521402359, "epoch": 0.16285714285714287, "frac_reward_zero_std": 0.0, "grad_norm": 0.23560529947280884, "learning_rate": 8.385714285714286e-06, "loss": 0.0714, "num_tokens": 671168.0, "reward": 2.2916667461395264, "reward_std": 0.39591163396835327, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.7916666865348816, "rewards/grounding_reward/std": 0.39591163396835327, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 114, "step_time": 97.55058407597244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 176.625, "completions/mean_terminated_length": 176.625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.10089050978422165, "epoch": 0.16428571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.21488057076931, "learning_rate": 8.371428571428573e-06, "loss": -0.034, "num_tokens": 675573.0, "reward": 2.0, "reward_std": 0.30860671401023865, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.30860671401023865, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 115, "step_time": 99.83588039502501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 269.0, "completions/mean_terminated_length": 269.0, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.10674983263015747, "epoch": 0.1657142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2118188738822937, "learning_rate": 8.357142857142858e-06, "loss": -0.0775, "num_tokens": 680581.0, "reward": 2.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 116, "step_time": 107.36115024331957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 753.0, "completions/mean_terminated_length": 301.3333435058594, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 0.6388698220252991, "epoch": 0.16714285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.15741875767707825, "learning_rate": 8.342857142857143e-06, "loss": 0.3877, "num_tokens": 689493.0, "reward": 0.6939762234687805, "reward_std": 0.7481394410133362, "rewards/accuracy_reward/mean": 0.3814762234687805, "rewards/accuracy_reward/std": 0.5121868848800659, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.125, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 117, "step_time": 162.39103727415204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 429.0, "completions/mean_terminated_length": 429.0, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "entropy": 0.07965029031038284, "epoch": 0.16857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.14059671759605408, "learning_rate": 8.32857142857143e-06, "loss": -0.0362, "num_tokens": 695805.0, "reward": 2.0, "reward_std": 0.7559289336204529, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 118, "step_time": 108.206977378577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 570.375, "completions/mean_terminated_length": 570.375, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "entropy": 0.08841745555400848, "epoch": 0.17, "frac_reward_zero_std": 0.0, "grad_norm": 0.11828936636447906, "learning_rate": 8.314285714285715e-06, "loss": 0.0245, "num_tokens": 703296.0, "reward": 0.699999988079071, "reward_std": 0.37032803893089294, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.20000000298023224, "rewards/grounding_reward/std": 0.37032803893089294, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 119, "step_time": 148.0318749975413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 216.25, "completions/mean_terminated_length": 216.25, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.10273439437150955, "epoch": 0.17142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.2176894247531891, "learning_rate": 8.3e-06, "loss": -0.011, "num_tokens": 707882.0, "reward": 2.625, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 120, "step_time": 104.2109593115747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 790.375, "completions/mean_terminated_length": 401.0, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "entropy": 0.3290751576423645, "epoch": 0.17285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.13862474262714386, "learning_rate": 8.285714285714287e-06, "loss": 0.1465, "num_tokens": 717125.0, "reward": 0.3160521984100342, "reward_std": 0.37100398540496826, "rewards/accuracy_reward/mean": 0.003552202833816409, "rewards/accuracy_reward/std": 0.003002971177920699, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.125, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 121, "step_time": 169.6139349779114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 215.5, "completions/mean_terminated_length": 215.5, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.05967307090759277, "epoch": 0.1742857142857143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.271428571428572e-06, "loss": 0.0, "num_tokens": 721761.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 122, "step_time": 118.27381069678813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 251.75, "completions/mean_terminated_length": 251.75, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.08497795462608337, "epoch": 0.1757142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2635441720485687, "learning_rate": 8.257142857142857e-06, "loss": -0.019, "num_tokens": 726663.0, "reward": 2.25, "reward_std": 0.3162277638912201, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.3162277638912201, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 123, "step_time": 97.96446728240699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 196.125, "completions/mean_terminated_length": 196.125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2566013038158417, "epoch": 0.17714285714285713, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.242857142857144e-06, "loss": 0.0, "num_tokens": 731152.0, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 124, "step_time": 92.54829493071884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 391.875, "completions/mean_terminated_length": 391.875, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "entropy": 0.06510167568922043, "epoch": 0.17857142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.14015209674835205, "learning_rate": 8.22857142857143e-06, "loss": 0.0072, "num_tokens": 737247.0, "reward": 1.875, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.375, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 125, "step_time": 98.35086017753929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 214.375, "completions/mean_terminated_length": 214.375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.12479384988546371, "epoch": 0.18, "frac_reward_zero_std": 0.0, "grad_norm": 0.6050871014595032, "learning_rate": 8.214285714285714e-06, "loss": 0.0066, "num_tokens": 741978.0, "reward": 2.0, "reward_std": 0.7559289336204529, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 126, "step_time": 93.12019778229296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 428.5, "completions/mean_terminated_length": 428.5, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "entropy": 0.07376912981271744, "epoch": 0.18142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.11926544457674026, "learning_rate": 8.2e-06, "loss": -0.0169, "num_tokens": 748406.0, "reward": 1.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 127, "step_time": 105.19307953026146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 135.5, "completions/mean_terminated_length": 135.5, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.1485872119665146, "epoch": 0.18285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.47473326325416565, "learning_rate": 8.185714285714286e-06, "loss": 0.0941, "num_tokens": 752386.0, "reward": 1.8125, "reward_std": 0.45806270837783813, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.3125, "rewards/grounding_reward/std": 0.45806270837783813, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 128, "step_time": 111.3804728500545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.625, "completions/mean_terminated_length": 222.625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.12467299401760101, "epoch": 0.18428571428571427, "frac_reward_zero_std": 0.0, "grad_norm": 0.24957238137722015, "learning_rate": 8.171428571428573e-06, "loss": 0.0227, "num_tokens": 757087.0, "reward": 3.25, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 129, "step_time": 83.63682367093861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 166.0, "completions/mean_terminated_length": 166.0, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.10671253502368927, "epoch": 0.18571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.23765110969543457, "learning_rate": 8.157142857142858e-06, "loss": 0.0, "num_tokens": 761279.0, "reward": 3.125, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 130, "step_time": 90.96835188753903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 520.625, "completions/mean_terminated_length": 520.625, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "entropy": 0.04965426027774811, "epoch": 0.18714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.1412106156349182, "learning_rate": 8.142857142857143e-06, "loss": 0.0215, "num_tokens": 768348.0, "reward": 1.5, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.37796446681022644, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 131, "step_time": 105.27328850887716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 275.125, "completions/mean_terminated_length": 275.125, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.0710904449224472, "epoch": 0.18857142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.1773725152015686, "learning_rate": 8.128571428571428e-06, "loss": -0.0429, "num_tokens": 773429.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 132, "step_time": 109.3459500707686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 417.375, "completions/mean_terminated_length": 417.375, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "entropy": 0.1292380541563034, "epoch": 0.19, "frac_reward_zero_std": 0.0, "grad_norm": 0.16048607230186462, "learning_rate": 8.114285714285715e-06, "loss": 0.1015, "num_tokens": 779712.0, "reward": 1.274999976158142, "reward_std": 0.4200340211391449, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.7749999761581421, "rewards/grounding_reward/std": 0.4200340509414673, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 133, "step_time": 152.14071056619287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 165.875, "completions/mean_terminated_length": 165.875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.09207155555486679, "epoch": 0.19142857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.22988741099834442, "learning_rate": 8.1e-06, "loss": 0.0457, "num_tokens": 783983.0, "reward": 2.125, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 134, "step_time": 108.01903111673892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 750.75, "completions/mean_terminated_length": 586.7999877929688, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "entropy": 0.13372229039669037, "epoch": 0.19285714285714287, "frac_reward_zero_std": 0.0, "grad_norm": 0.12350621819496155, "learning_rate": 8.085714285714287e-06, "loss": 0.136, "num_tokens": 792925.0, "reward": 0.5634796023368835, "reward_std": 0.6222019195556641, "rewards/accuracy_reward/mean": 0.25097960233688354, "rewards/accuracy_reward/std": 0.46230706572532654, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 135, "step_time": 144.12644540332258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 986.0, "completions/mean_terminated_length": 872.0, "completions/min_length": 784.0, "completions/min_terminated_length": 784.0, "entropy": 0.10838879644870758, "epoch": 0.19428571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.09341466426849365, "learning_rate": 8.071428571428572e-06, "loss": 0.0568, "num_tokens": 803773.0, "reward": 1.2765616178512573, "reward_std": 0.8295198082923889, "rewards/accuracy_reward/mean": 0.27656158804893494, "rewards/accuracy_reward/std": 0.4465547204017639, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 136, "step_time": 126.5005202004686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 710.5, "completions/mean_terminated_length": 397.0, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "entropy": 0.6385374069213867, "epoch": 0.1957142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.1644010841846466, "learning_rate": 8.057142857142857e-06, "loss": 0.4054, "num_tokens": 812425.0, "reward": 1.1965110301971436, "reward_std": 0.8744415044784546, "rewards/accuracy_reward/mean": 0.5090109705924988, "rewards/accuracy_reward/std": 0.5248900055885315, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.26726123690605164, "rewards/grounding_reward/mean": 0.4375, "rewards/grounding_reward/std": 0.1767766922712326, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 137, "step_time": 126.14292619004846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 376.5, "completions/mean_terminated_length": 376.5, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10121016204357147, "epoch": 0.19714285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.152639701962471, "learning_rate": 8.042857142857143e-06, "loss": -0.1793, "num_tokens": 818365.0, "reward": 2.125, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 138, "step_time": 122.59612354822457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 309.625, "completions/mean_terminated_length": 309.625, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.15650776028633118, "epoch": 0.19857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.24198485910892487, "learning_rate": 8.02857142857143e-06, "loss": 0.0577, "num_tokens": 823762.0, "reward": 1.0521926879882812, "reward_std": 0.4218154847621918, "rewards/accuracy_reward/mean": 0.5313594341278076, "rewards/accuracy_reward/std": 0.39005112648010254, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.02083333395421505, "rewards/grounding_reward/std": 0.0589255727827549, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 139, "step_time": 106.1180177712813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 106.75, "completions/mean_terminated_length": 106.75, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.18638494610786438, "epoch": 0.2, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.014285714285715e-06, "loss": 0.0, "num_tokens": 827536.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 140, "step_time": 94.13873984757811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 147.25, "completions/mean_terminated_length": 147.25, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.15946143865585327, "epoch": 0.20142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.3451526165008545, "learning_rate": 8.000000000000001e-06, "loss": -0.0558, "num_tokens": 831674.0, "reward": 3.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 141, "step_time": 120.09800847619772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "entropy": 0.0618864968419075, "epoch": 0.20285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.08004584163427353, "learning_rate": 7.985714285714286e-06, "loss": 0.0, "num_tokens": 843178.0, "reward": 0.7631186842918396, "reward_std": 0.46289578080177307, "rewards/accuracy_reward/mean": 0.013118676841259003, "rewards/accuracy_reward/std": 0.0007514850585721433, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 142, "step_time": 113.5411642594263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 286.625, "completions/mean_terminated_length": 286.625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.06652359664440155, "epoch": 0.2042857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.14460888504981995, "learning_rate": 7.971428571428572e-06, "loss": -0.0082, "num_tokens": 848367.0, "reward": 1.8499999046325684, "reward_std": 0.1414213478565216, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.3500000238418579, "rewards/grounding_reward/std": 0.1414213627576828, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 143, "step_time": 104.35038753785193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 931.75, "completions/mean_terminated_length": 839.5, "completions/min_length": 692.0, "completions/min_terminated_length": 692.0, "entropy": 0.2225959450006485, "epoch": 0.2057142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.1349080353975296, "learning_rate": 7.957142857142858e-06, "loss": 0.0867, "num_tokens": 858821.0, "reward": 0.6472489833831787, "reward_std": 0.6429103016853333, "rewards/accuracy_reward/mean": 0.3243323266506195, "rewards/accuracy_reward/std": 0.38473644852638245, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.26726123690605164, "rewards/grounding_reward/mean": 0.0729166716337204, "rewards/grounding_reward/std": 0.1368400603532791, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 144, "step_time": 113.6561745647341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 559.75, "completions/mean_terminated_length": 405.0, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "entropy": 0.4520179331302643, "epoch": 0.20714285714285716, "frac_reward_zero_std": 0.0, "grad_norm": 0.24841828644275665, "learning_rate": 7.942857142857144e-06, "loss": 0.2826, "num_tokens": 866219.0, "reward": 1.032818078994751, "reward_std": 0.4024044871330261, "rewards/accuracy_reward/mean": 0.5953180193901062, "rewards/accuracy_reward/std": 0.22659818828105927, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 145, "step_time": 127.30421171244234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 265.875, "completions/mean_terminated_length": 265.875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.10201386362314224, "epoch": 0.20857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2341623604297638, "learning_rate": 7.928571428571429e-06, "loss": -0.0731, "num_tokens": 871242.0, "reward": 1.9500000476837158, "reward_std": 0.4375254809856415, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.44999998807907104, "rewards/grounding_reward/std": 0.43752551078796387, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 146, "step_time": 110.23288121260703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 727.0, "completions/mean_terminated_length": 684.5714721679688, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "entropy": 0.20161105692386627, "epoch": 0.21, "frac_reward_zero_std": 0.0, "grad_norm": 0.1350470632314682, "learning_rate": 7.914285714285715e-06, "loss": 0.1444, "num_tokens": 879978.0, "reward": 1.3131625652313232, "reward_std": 0.5284561514854431, "rewards/accuracy_reward/mean": 0.0006625441601499915, "rewards/accuracy_reward/std": 0.0018739579245448112, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 147, "step_time": 118.27045733574778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 199.375, "completions/mean_terminated_length": 199.375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.14037369191646576, "epoch": 0.21142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.2326105684041977, "learning_rate": 7.9e-06, "loss": 0.0361, "num_tokens": 884581.0, "reward": 1.5625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0625, "rewards/grounding_reward/std": 0.1767766922712326, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 148, "step_time": 108.85961087979376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 195.0, "completions/mean_terminated_length": 195.0, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.09318720549345016, "epoch": 0.21285714285714286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.885714285714286e-06, "loss": 0.0, "num_tokens": 889173.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 149, "step_time": 128.5420345943421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 154.625, "completions/mean_terminated_length": 154.625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.18943683803081512, "epoch": 0.21428571428571427, "frac_reward_zero_std": 0.0, "grad_norm": 0.38934800028800964, "learning_rate": 7.871428571428573e-06, "loss": -0.111, "num_tokens": 893290.0, "reward": 2.120065689086914, "reward_std": 0.7181240916252136, "rewards/accuracy_reward/mean": 0.8700658082962036, "rewards/accuracy_reward/std": 0.28048568964004517, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.75, "rewards/operation_reward/std": 0.4629100561141968, "step": 150, "step_time": 93.28034245967865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 601.25, "completions/mean_terminated_length": 540.857177734375, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "entropy": 0.17506033182144165, "epoch": 0.21571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.16977496445178986, "learning_rate": 7.857142857142858e-06, "loss": 0.0041, "num_tokens": 901132.0, "reward": 0.829439640045166, "reward_std": 0.31318554282188416, "rewards/accuracy_reward/mean": 0.0005705591756850481, "rewards/accuracy_reward/std": 0.001613785163499415, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.3913690447807312, "rewards/grounding_reward/std": 0.33632269501686096, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 151, "step_time": 113.37390258070081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 166.75, "completions/mean_terminated_length": 166.75, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.059623170644044876, "epoch": 0.21714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.1786813884973526, "learning_rate": 7.842857142857143e-06, "loss": -0.0016, "num_tokens": 905362.0, "reward": 1.8499999046325684, "reward_std": 0.1414213478565216, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.3500000238418579, "rewards/grounding_reward/std": 0.1414213627576828, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 152, "step_time": 123.83584580849856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 971.75, "completions/mean_terminated_length": 606.0, "completions/min_length": 606.0, "completions/min_terminated_length": 606.0, "entropy": 0.4257845878601074, "epoch": 0.21857142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.10119715332984924, "learning_rate": 7.828571428571428e-06, "loss": 0.1276, "num_tokens": 916080.0, "reward": 0.4035584032535553, "reward_std": 0.6723901629447937, "rewards/accuracy_reward/mean": 0.16605839133262634, "rewards/accuracy_reward/std": 0.34501785039901733, "rewards/format_reward/mean": 0.0625, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.17499999701976776, "rewards/grounding_reward/std": 0.24348658323287964, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 153, "step_time": 123.94775429274887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 157.625, "completions/mean_terminated_length": 157.625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.0762934535741806, "epoch": 0.22, "frac_reward_zero_std": 0.0, "grad_norm": 0.2134830355644226, "learning_rate": 7.814285714285715e-06, "loss": -0.0036, "num_tokens": 920269.0, "reward": 2.2083332538604736, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 0.8333333134651184, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 154, "step_time": 108.55017540976405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 116.0, "completions/mean_terminated_length": 116.0, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.175050288438797, "epoch": 0.22142857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.28974586725234985, "learning_rate": 7.800000000000002e-06, "loss": 0.0305, "num_tokens": 924133.0, "reward": 2.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 155, "step_time": 107.81516058091074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 236.875, "completions/mean_terminated_length": 236.875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.11981189996004105, "epoch": 0.22285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.24609951674938202, "learning_rate": 7.785714285714287e-06, "loss": -0.116, "num_tokens": 928884.0, "reward": 3.25, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.875, "rewards/operation_reward/std": 0.3535533845424652, "step": 156, "step_time": 99.8142679752782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "entropy": 0.10481072962284088, "epoch": 0.22428571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.08602815866470337, "learning_rate": 7.771428571428572e-06, "loss": 0.0, "num_tokens": 940260.0, "reward": 0.6334397792816162, "reward_std": 0.5171712636947632, "rewards/accuracy_reward/mean": 0.008439762517809868, "rewards/accuracy_reward/std": 0.0007426533848047256, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 157, "step_time": 115.80566631164402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 147.0, "completions/mean_terminated_length": 147.0, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.21386635303497314, "epoch": 0.2257142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.41002750396728516, "learning_rate": 7.757142857142857e-06, "loss": 0.0609, "num_tokens": 944420.0, "reward": 2.875, "reward_std": 1.0606601238250732, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.75, "rewards/operation_reward/std": 0.4629100561141968, "step": 158, "step_time": 106.39868670515716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 909.125, "completions/mean_terminated_length": 909.125, "completions/min_length": 809.0, "completions/min_terminated_length": 809.0, "entropy": 0.04930857941508293, "epoch": 0.22714285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.08279015123844147, "learning_rate": 7.742857142857144e-06, "loss": 0.0038, "num_tokens": 954621.0, "reward": 2.375, "reward_std": 0.6408699750900269, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 159, "step_time": 149.22598814684898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 627.0, "completions/mean_terminated_length": 627.0, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "entropy": 0.06312626600265503, "epoch": 0.22857142857142856, "frac_reward_zero_std": 0.0, "grad_norm": 0.10042259097099304, "learning_rate": 7.72857142857143e-06, "loss": -0.0616, "num_tokens": 962597.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 160, "step_time": 111.79892319999635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 174.25, "completions/mean_terminated_length": 174.25, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.16681933403015137, "epoch": 0.23, "frac_reward_zero_std": 0.0, "grad_norm": 0.2954513430595398, "learning_rate": 7.714285714285716e-06, "loss": 0.003, "num_tokens": 966975.0, "reward": 2.9375, "reward_std": 0.5629958510398865, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.8125, "rewards/operation_reward/std": 0.25877460837364197, "step": 161, "step_time": 83.33250047639012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 233.25, "completions/mean_terminated_length": 233.25, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.1500943899154663, "epoch": 0.23142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.2104010134935379, "learning_rate": 7.7e-06, "loss": -0.011, "num_tokens": 971897.0, "reward": 3.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 162, "step_time": 135.79718682263047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 164.0, "completions/mean_terminated_length": 164.0, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.22203457355499268, "epoch": 0.23285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.4033657908439636, "learning_rate": 7.685714285714286e-06, "loss": 0.071, "num_tokens": 976057.0, "reward": 1.9206080436706543, "reward_std": 0.514500081539154, "rewards/accuracy_reward/mean": 0.9831081032752991, "rewards/accuracy_reward/std": 0.04777747765183449, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.4375, "rewards/grounding_reward/std": 0.4955156147480011, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 163, "step_time": 92.99575714301318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 721.75, "completions/mean_terminated_length": 540.4000244140625, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "entropy": 0.250613808631897, "epoch": 0.2342857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.17119565606117249, "learning_rate": 7.671428571428571e-06, "loss": 0.2296, "num_tokens": 984727.0, "reward": 1.4729119539260864, "reward_std": 0.6807180047035217, "rewards/accuracy_reward/mean": 0.3770785927772522, "rewards/accuracy_reward/std": 0.5158332586288452, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.7833333015441895, "rewards/grounding_reward/std": 0.24364949762821198, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 164, "step_time": 114.74686012603343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 388.875, "completions/mean_terminated_length": 298.14288330078125, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "entropy": 0.31181374192237854, "epoch": 0.2357142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.23113813996315002, "learning_rate": 7.657142857142858e-06, "loss": 0.3978, "num_tokens": 990838.0, "reward": 1.0971328020095825, "reward_std": 0.5871338248252869, "rewards/accuracy_reward/mean": 0.15963278710842133, "rewards/accuracy_reward/std": 0.06260015815496445, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 165, "step_time": 115.617346980609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 405.625, "completions/mean_terminated_length": 405.625, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.11043526977300644, "epoch": 0.23714285714285716, "frac_reward_zero_std": 0.0, "grad_norm": 0.178274467587471, "learning_rate": 7.642857142857143e-06, "loss": -0.108, "num_tokens": 996963.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 166, "step_time": 95.77211893815547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 525.5, "completions/mean_terminated_length": 359.3333435058594, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.17971976101398468, "epoch": 0.23857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.13971373438835144, "learning_rate": 7.62857142857143e-06, "loss": 0.2199, "num_tokens": 1004087.0, "reward": 1.1137179136276245, "reward_std": 0.926508903503418, "rewards/accuracy_reward/mean": 0.25121790170669556, "rewards/accuracy_reward/std": 0.4621632397174835, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.36250001192092896, "rewards/grounding_reward/std": 0.4274091422557831, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 167, "step_time": 131.51784649305046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 309.625, "completions/mean_terminated_length": 309.625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 0.14225251972675323, "epoch": 0.24, "frac_reward_zero_std": 0.0, "grad_norm": 0.324728786945343, "learning_rate": 7.614285714285715e-06, "loss": -0.0759, "num_tokens": 1009524.0, "reward": 1.3740233182907104, "reward_std": 0.5243081450462341, "rewards/accuracy_reward/mean": 0.24902330338954926, "rewards/accuracy_reward/std": 0.03411359712481499, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 168, "step_time": 92.62070437520742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 621.5, "completions/mean_terminated_length": 621.5, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "entropy": 0.13306055963039398, "epoch": 0.24142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.134237140417099, "learning_rate": 7.600000000000001e-06, "loss": 0.1132, "num_tokens": 1017576.0, "reward": 1.9528954029083252, "reward_std": 0.6732385754585266, "rewards/accuracy_reward/mean": 0.5778954029083252, "rewards/accuracy_reward/std": 0.4513205885887146, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 169, "step_time": 170.91024039592594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 105.375, "completions/mean_terminated_length": 105.375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.21110115945339203, "epoch": 0.24285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.3927519619464874, "learning_rate": 7.585714285714286e-06, "loss": -0.0464, "num_tokens": 1021331.0, "reward": 3.125, "reward_std": 0.7440237998962402, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.875, "rewards/operation_reward/std": 0.3535533845424652, "step": 170, "step_time": 100.34201695676893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 345.625, "completions/mean_terminated_length": 345.625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.16438893973827362, "epoch": 0.24428571428571427, "frac_reward_zero_std": 0.0, "grad_norm": 0.21379634737968445, "learning_rate": 7.571428571428572e-06, "loss": -0.0099, "num_tokens": 1027016.0, "reward": 0.9634195566177368, "reward_std": 0.13825497031211853, "rewards/accuracy_reward/mean": 0.4634195566177368, "rewards/accuracy_reward/std": 0.13825495541095734, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 171, "step_time": 97.92254608310759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 315.0, "completions/mean_terminated_length": 315.0, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "entropy": 0.0717618316411972, "epoch": 0.24571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.22420817613601685, "learning_rate": 7.557142857142857e-06, "loss": -0.0594, "num_tokens": 1032432.0, "reward": 2.174999952316284, "reward_std": 0.45276927947998047, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.675000011920929, "rewards/grounding_reward/std": 0.4527692496776581, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 172, "step_time": 93.00328980758786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 343.875, "completions/mean_terminated_length": 343.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09624980390071869, "epoch": 0.24714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.1901111751794815, "learning_rate": 7.542857142857144e-06, "loss": 0.0141, "num_tokens": 1038103.0, "reward": 1.84375, "reward_std": 0.32561755180358887, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.34375, "rewards/grounding_reward/std": 0.32561755180358887, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 173, "step_time": 93.02210504747927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 276.125, "completions/mean_terminated_length": 276.125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.2514188885688782, "epoch": 0.24857142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.25093501806259155, "learning_rate": 7.52857142857143e-06, "loss": 0.2598, "num_tokens": 1043232.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 174, "step_time": 96.45345011167228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 728.375, "completions/mean_terminated_length": 551.0, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "entropy": 0.1691139042377472, "epoch": 0.25, "frac_reward_zero_std": 0.0, "grad_norm": 0.13976016640663147, "learning_rate": 7.514285714285715e-06, "loss": 0.0896, "num_tokens": 1052019.0, "reward": 0.4884132146835327, "reward_std": 0.5375287532806396, "rewards/accuracy_reward/mean": 0.12591318786144257, "rewards/accuracy_reward/std": 0.3531864583492279, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.05000000074505806, "rewards/grounding_reward/std": 0.09258200973272324, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 175, "step_time": 131.22127964068204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 113.0, "completions/max_terminated_length": 113.0, "completions/mean_length": 102.0, "completions/mean_terminated_length": 102.0, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.21104377508163452, "epoch": 0.25142857142857145, "frac_reward_zero_std": 0.0, "grad_norm": 0.3912598490715027, "learning_rate": 7.500000000000001e-06, "loss": -0.0265, "num_tokens": 1055771.0, "reward": 3.25, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 176, "step_time": 82.53990142792463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 301.25, "completions/mean_terminated_length": 198.00001525878906, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.2937633991241455, "epoch": 0.25285714285714284, "frac_reward_zero_std": 0.0, "grad_norm": 0.4066801071166992, "learning_rate": 7.485714285714286e-06, "loss": 0.4326, "num_tokens": 1061157.0, "reward": 1.661783218383789, "reward_std": 0.5422177910804749, "rewards/accuracy_reward/mean": 0.34928324818611145, "rewards/accuracy_reward/std": 0.31116655468940735, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 177, "step_time": 113.28634999878705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 260.625, "completions/mean_terminated_length": 260.625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.05974893644452095, "epoch": 0.2542857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.2881625294685364, "learning_rate": 7.471428571428571e-06, "loss": -0.0127, "num_tokens": 1066218.0, "reward": 0.625, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.125, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 178, "step_time": 95.75408035889268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 292.5, "completions/mean_terminated_length": 292.5, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "entropy": 0.0878022313117981, "epoch": 0.2557142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2052929550409317, "learning_rate": 7.457142857142857e-06, "loss": -0.049, "num_tokens": 1071462.0, "reward": 1.875, "reward_std": 0.3918818235397339, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.375, "rewards/grounding_reward/std": 0.39188191294670105, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 179, "step_time": 92.35964590031654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 240.625, "completions/mean_terminated_length": 240.625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.37554845213890076, "epoch": 0.2571428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.3601829409599304, "learning_rate": 7.442857142857144e-06, "loss": 0.0564, "num_tokens": 1076363.0, "reward": 1.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 180, "step_time": 88.46484798379242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 424.25, "completions/mean_terminated_length": 424.25, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "entropy": 0.0824270099401474, "epoch": 0.25857142857142856, "frac_reward_zero_std": 0.0, "grad_norm": 0.11418511718511581, "learning_rate": 7.428571428571429e-06, "loss": 0.0219, "num_tokens": 1082613.0, "reward": 2.25, "reward_std": 0.7071067690849304, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.5345224738121033, "rewards/operation_reward/mean": 0.25, "rewards/operation_reward/std": 0.4629100561141968, "step": 181, "step_time": 132.4688660753891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 135.25, "completions/mean_terminated_length": 135.25, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.16730442643165588, "epoch": 0.26, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.414285714285715e-06, "loss": 0.0, "num_tokens": 1086615.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 182, "step_time": 97.80183923523873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 455.5, "completions/mean_terminated_length": 455.5, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "entropy": 0.08423888683319092, "epoch": 0.26142857142857145, "frac_reward_zero_std": 0.0, "grad_norm": 0.1169370710849762, "learning_rate": 7.4e-06, "loss": 0.0021, "num_tokens": 1093171.0, "reward": 1.9132883548736572, "reward_std": 0.4171941578388214, "rewards/accuracy_reward/mean": 0.9966216087341309, "rewards/accuracy_reward/std": 0.009555491618812084, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.4166666865348816, "rewards/grounding_reward/std": 0.41785547137260437, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 183, "step_time": 96.98696964699775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 654.125, "completions/mean_terminated_length": 432.20001220703125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.513742208480835, "epoch": 0.26285714285714284, "frac_reward_zero_std": 0.0, "grad_norm": 0.15131260454654694, "learning_rate": 7.385714285714286e-06, "loss": 0.0463, "num_tokens": 1101396.0, "reward": 1.0450974702835083, "reward_std": 0.24861082434654236, "rewards/accuracy_reward/mean": 0.4200974702835083, "rewards/accuracy_reward/std": 0.16924671828746796, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.1875, "rewards/grounding_reward/std": 0.27368009090423584, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 184, "step_time": 117.91254101507366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 250.25, "completions/mean_terminated_length": 250.25, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.15549129247665405, "epoch": 0.2642857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.23577702045440674, "learning_rate": 7.371428571428571e-06, "loss": -0.1239, "num_tokens": 1106318.0, "reward": 1.5771780014038086, "reward_std": 0.5039523243904114, "rewards/accuracy_reward/mean": 0.5771780610084534, "rewards/accuracy_reward/std": 0.08169227093458176, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 185, "step_time": 96.8809107253328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 163.5, "completions/mean_terminated_length": 163.5, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.10796267539262772, "epoch": 0.26571428571428574, "frac_reward_zero_std": 0.0, "grad_norm": 0.30049431324005127, "learning_rate": 7.357142857142858e-06, "loss": -0.0362, "num_tokens": 1110562.0, "reward": 2.0625, "reward_std": 0.4172614812850952, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5625, "rewards/grounding_reward/std": 0.4172614812850952, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 186, "step_time": 108.19057198893279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 331.125, "completions/mean_terminated_length": 331.125, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "entropy": 0.13554511964321136, "epoch": 0.2671428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.29596301913261414, "learning_rate": 7.342857142857144e-06, "loss": -0.0401, "num_tokens": 1116155.0, "reward": 1.4666666984558105, "reward_std": 0.18516398966312408, "rewards/accuracy_reward/mean": 0.6666666865348816, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.30000001192092896, "rewards/grounding_reward/std": 0.18516401946544647, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 187, "step_time": 97.14119611028582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 357.25, "completions/mean_terminated_length": 357.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.1416522115468979, "epoch": 0.26857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.3061387240886688, "learning_rate": 7.328571428571429e-06, "loss": -0.2221, "num_tokens": 1121853.0, "reward": 1.1239721775054932, "reward_std": 0.2853962779045105, "rewards/accuracy_reward/mean": 0.6239722371101379, "rewards/accuracy_reward/std": 0.2853962779045105, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 188, "step_time": 116.26321797817945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 321.75, "completions/mean_terminated_length": 321.75, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "entropy": 0.1409805417060852, "epoch": 0.27, "frac_reward_zero_std": 0.0, "grad_norm": 0.1909896731376648, "learning_rate": 7.314285714285715e-06, "loss": 0.028, "num_tokens": 1127403.0, "reward": 1.8524179458618164, "reward_std": 0.768547773361206, "rewards/accuracy_reward/mean": 0.47741788625717163, "rewards/accuracy_reward/std": 0.3233497738838196, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 189, "step_time": 110.89608141873032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 302.5, "completions/mean_terminated_length": 302.5, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.07140741497278214, "epoch": 0.2714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.19972780346870422, "learning_rate": 7.3e-06, "loss": -0.0547, "num_tokens": 1132679.0, "reward": 2.875, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.625, "rewards/operation_reward/std": 0.5175492167472839, "step": 190, "step_time": 95.97787058539689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 355.875, "completions/mean_terminated_length": 355.875, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "entropy": 0.21988117694854736, "epoch": 0.27285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.19956769049167633, "learning_rate": 7.285714285714286e-06, "loss": -0.0133, "num_tokens": 1138414.0, "reward": 2.125, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 191, "step_time": 117.28837149776518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 173.125, "completions/mean_terminated_length": 173.125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.12967602908611298, "epoch": 0.2742857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.2177973836660385, "learning_rate": 7.2714285714285715e-06, "loss": 0.0169, "num_tokens": 1142727.0, "reward": 2.0, "reward_std": 0.5345224738121033, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.5345224738121033, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 192, "step_time": 90.06448155641556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 319.75, "completions/mean_terminated_length": 319.75, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.4682021141052246, "epoch": 0.2757142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.3574574291706085, "learning_rate": 7.257142857142858e-06, "loss": -0.0952, "num_tokens": 1148253.0, "reward": 1.305624008178711, "reward_std": 0.36649584770202637, "rewards/accuracy_reward/mean": 0.6806240081787109, "rewards/accuracy_reward/std": 0.23929700255393982, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 193, "step_time": 151.8442096542567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 327.25, "completions/mean_terminated_length": 327.25, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "entropy": 0.07406872510910034, "epoch": 0.27714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.12632793188095093, "learning_rate": 7.2428571428571435e-06, "loss": -0.033, "num_tokens": 1153751.0, "reward": 1.125, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 194, "step_time": 119.87765848170966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 183.375, "completions/mean_terminated_length": 183.375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.18150988221168518, "epoch": 0.2785714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.2853614091873169, "learning_rate": 7.2285714285714294e-06, "loss": 0.0634, "num_tokens": 1158170.0, "reward": 1.875, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 195, "step_time": 120.75552370026708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 205.75, "completions/mean_terminated_length": 205.75, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.15732227265834808, "epoch": 0.28, "frac_reward_zero_std": 0.0, "grad_norm": 0.3282487690448761, "learning_rate": 7.2142857142857145e-06, "loss": 0.0636, "num_tokens": 1162928.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 196, "step_time": 109.80539019964635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 510.875, "completions/mean_terminated_length": 437.5714416503906, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.2516075074672699, "epoch": 0.2814285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.1754855066537857, "learning_rate": 7.2000000000000005e-06, "loss": 0.0876, "num_tokens": 1169975.0, "reward": 1.2597222328186035, "reward_std": 0.31223931908607483, "rewards/accuracy_reward/mean": 0.8222222328186035, "rewards/accuracy_reward/std": 0.2511350214481354, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 197, "step_time": 113.99515003710985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 304.25, "completions/mean_terminated_length": 304.25, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.08543406426906586, "epoch": 0.28285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.17975474894046783, "learning_rate": 7.185714285714286e-06, "loss": -0.1577, "num_tokens": 1175329.0, "reward": 1.6964285373687744, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 198, "step_time": 102.89266025274992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 125.75, "completions/mean_terminated_length": 125.75, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.10208296775817871, "epoch": 0.2842857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.31624987721443176, "learning_rate": 7.1714285714285725e-06, "loss": 0.0247, "num_tokens": 1179231.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 199, "step_time": 107.43766647018492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 163.625, "completions/mean_terminated_length": 163.625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.10205210000276566, "epoch": 0.2857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2608705163002014, "learning_rate": 7.1571428571428584e-06, "loss": 0.0078, "num_tokens": 1183508.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 200, "step_time": 101.84218778181821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 145.375, "completions/mean_terminated_length": 145.375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.09535986930131912, "epoch": 0.28714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.281034916639328, "learning_rate": 7.1428571428571436e-06, "loss": -0.0281, "num_tokens": 1187623.0, "reward": 2.0, "reward_std": 0.5345224738121033, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.5345224738121033, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 201, "step_time": 140.5002045566216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 301.375, "completions/mean_terminated_length": 301.375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.08243212848901749, "epoch": 0.2885714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.18037806451320648, "learning_rate": 7.1285714285714295e-06, "loss": -0.0175, "num_tokens": 1192890.0, "reward": 3.0, "reward_std": 0.5345224738121033, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.5345224738121033, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 202, "step_time": 104.05032335594296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 838.25, "completions/mean_terminated_length": 528.6666870117188, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "entropy": 0.22520610690116882, "epoch": 0.29, "frac_reward_zero_std": 0.0, "grad_norm": 0.1209401786327362, "learning_rate": 7.114285714285715e-06, "loss": 0.2383, "num_tokens": 1202492.0, "reward": 1.0873271226882935, "reward_std": 0.810232937335968, "rewards/accuracy_reward/mean": 0.19149379432201385, "rewards/accuracy_reward/std": 0.21891999244689941, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.7083333730697632, "rewards/grounding_reward/std": 0.4520675837993622, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 203, "step_time": 114.33225334715098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 938.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 614.625, "completions/mean_terminated_length": 614.625, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "entropy": 0.07396433502435684, "epoch": 0.2914285714285714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.100000000000001e-06, "loss": 0.0, "num_tokens": 1210337.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 204, "step_time": 189.59784684702754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "entropy": 0.1333886682987213, "epoch": 0.29285714285714287, "frac_reward_zero_std": 0.0, "grad_norm": 0.21549591422080994, "learning_rate": 7.085714285714286e-06, "loss": 0.0606, "num_tokens": 1215476.0, "reward": 1.7433710098266602, "reward_std": 0.763467013835907, "rewards/accuracy_reward/mean": 0.24337098002433777, "rewards/accuracy_reward/std": 0.3111148476600647, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.1875, "rewards/operation_reward/std": 0.3720119297504425, "step": 205, "step_time": 210.40307482797652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 486.75, "completions/mean_terminated_length": 486.75, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "entropy": 0.06482422351837158, "epoch": 0.29428571428571426, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.0714285714285726e-06, "loss": 0.0, "num_tokens": 1222322.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 206, "step_time": 119.07825474161655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 112.75, "completions/mean_terminated_length": 112.75, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.32007530331611633, "epoch": 0.2957142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.5273745059967041, "learning_rate": 7.057142857142858e-06, "loss": 0.2167, "num_tokens": 1226120.0, "reward": 3.25, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 207, "step_time": 94.25689273513854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 162.375, "completions/mean_terminated_length": 162.375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.11718853563070297, "epoch": 0.29714285714285715, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.042857142857144e-06, "loss": 0.0, "num_tokens": 1230347.0, "reward": 1.899999976158142, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.4000000059604645, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 208, "step_time": 98.97587581537664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 356.875, "completions/mean_terminated_length": 356.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.07306508719921112, "epoch": 0.2985714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.16013386845588684, "learning_rate": 7.028571428571429e-06, "loss": -0.0725, "num_tokens": 1236130.0, "reward": 2.2750000953674316, "reward_std": 0.4200340211391449, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.8999999761581421, "rewards/grounding_reward/std": 0.2828427255153656, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 209, "step_time": 101.90437845233828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 753.625, "completions/mean_terminated_length": 663.5, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "entropy": 0.27542930841445923, "epoch": 0.3, "frac_reward_zero_std": 0.0, "grad_norm": 0.12505589425563812, "learning_rate": 7.014285714285715e-06, "loss": 0.167, "num_tokens": 1245175.0, "reward": 0.8656542301177979, "reward_std": 0.2851469814777374, "rewards/accuracy_reward/mean": 0.09898757934570312, "rewards/accuracy_reward/std": 0.057302530854940414, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.3916666507720947, "rewards/grounding_reward/std": 0.3215784430503845, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 210, "step_time": 429.0901242066175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 173.625, "completions/mean_terminated_length": 173.625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.1313607096672058, "epoch": 0.30142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.37306874990463257, "learning_rate": 7e-06, "loss": 0.1351, "num_tokens": 1249492.0, "reward": 2.3125, "reward_std": 0.5303300619125366, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.9375, "rewards/grounding_reward/std": 0.1767766922712326, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 211, "step_time": 89.02462222985923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 199.875, "completions/mean_terminated_length": 199.875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.19250163435935974, "epoch": 0.3028571428571429, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.985714285714287e-06, "loss": 0.0, "num_tokens": 1254043.0, "reward": 3.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 212, "step_time": 88.26334451697767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 865.125, "completions/mean_terminated_length": 769.7999877929688, "completions/min_length": 667.0, "completions/min_terminated_length": 667.0, "entropy": 0.14896418154239655, "epoch": 0.30428571428571427, "frac_reward_zero_std": 0.0, "grad_norm": 0.11357002705335617, "learning_rate": 6.971428571428573e-06, "loss": 0.0888, "num_tokens": 1264076.0, "reward": 1.3555889129638672, "reward_std": 0.5871680378913879, "rewards/accuracy_reward/mean": 0.21808886528015137, "rewards/accuracy_reward/std": 0.32769495248794556, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.824999988079071, "rewards/grounding_reward/std": 0.36154431104660034, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 213, "step_time": 154.288779550232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 396.125, "completions/mean_terminated_length": 306.4285888671875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.15727941691875458, "epoch": 0.3057142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.30435577034950256, "learning_rate": 6.957142857142858e-06, "loss": 0.5558, "num_tokens": 1270149.0, "reward": 1.271369218826294, "reward_std": 0.5250977873802185, "rewards/accuracy_reward/mean": 0.8338690996170044, "rewards/accuracy_reward/std": 0.3549167215824127, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 214, "step_time": 113.47562491893768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 350.0, "completions/mean_terminated_length": 253.71429443359375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.2605278789997101, "epoch": 0.30714285714285716, "frac_reward_zero_std": 0.0, "grad_norm": 0.22135335206985474, "learning_rate": 6.942857142857144e-06, "loss": 0.4049, "num_tokens": 1275877.0, "reward": 0.9378570318222046, "reward_std": 0.5833368897438049, "rewards/accuracy_reward/mean": 0.2503569722175598, "rewards/accuracy_reward/std": 0.4626907706260681, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.25, "rewards/grounding_reward/std": 0.15430335700511932, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 215, "step_time": 131.2989306487143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 501.125, "completions/mean_terminated_length": 426.4285888671875, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "entropy": 0.29668357968330383, "epoch": 0.30857142857142855, "frac_reward_zero_std": 0.0, "grad_norm": 0.18517917394638062, "learning_rate": 6.928571428571429e-06, "loss": 0.0502, "num_tokens": 1282982.0, "reward": 2.147566795349121, "reward_std": 0.40787947177886963, "rewards/accuracy_reward/mean": 0.7725667953491211, "rewards/accuracy_reward/std": 0.42163392901420593, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.9375, "rewards/grounding_reward/std": 0.1767766922712326, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 216, "step_time": 110.42299941368401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 166.25, "completions/mean_terminated_length": 166.25, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.18551692366600037, "epoch": 0.31, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.914285714285715e-06, "loss": 0.0, "num_tokens": 1287224.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 217, "step_time": 88.14317219890654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 445.0, "completions/mean_terminated_length": 445.0, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "entropy": 0.09522602707147598, "epoch": 0.31142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.2831375002861023, "learning_rate": 6.9e-06, "loss": -0.1524, "num_tokens": 1293704.0, "reward": 2.0, "reward_std": 0.5345224738121033, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 218, "step_time": 116.86546133644879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 295.375, "completions/mean_terminated_length": 295.375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.1050739735364914, "epoch": 0.31285714285714283, "frac_reward_zero_std": 0.0, "grad_norm": 0.2180635631084442, "learning_rate": 6.885714285714287e-06, "loss": 0.0796, "num_tokens": 1298979.0, "reward": 2.75, "reward_std": 0.8864052295684814, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.625, "rewards/operation_reward/std": 0.5175492167472839, "step": 219, "step_time": 107.46769511327147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 556.625, "completions/mean_terminated_length": 489.857177734375, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "entropy": 0.22209443151950836, "epoch": 0.3142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.1835758090019226, "learning_rate": 6.871428571428572e-06, "loss": 0.1307, "num_tokens": 1306424.0, "reward": 1.3937722444534302, "reward_std": 0.760150134563446, "rewards/accuracy_reward/mean": 0.6437722444534302, "rewards/accuracy_reward/std": 0.39997974038124084, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.3125, "rewards/grounding_reward/std": 0.45806270837783813, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 220, "step_time": 138.66276363004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 522.0, "completions/mean_terminated_length": 522.0, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "entropy": 0.08762712776660919, "epoch": 0.3157142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.20099212229251862, "learning_rate": 6.857142857142858e-06, "loss": -0.0298, "num_tokens": 1313656.0, "reward": 2.3079776763916016, "reward_std": 0.3558672368526459, "rewards/accuracy_reward/mean": 0.8079777359962463, "rewards/accuracy_reward/std": 0.3558671772480011, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 221, "step_time": 115.11021490953863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 476.5, "completions/mean_terminated_length": 476.5, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "entropy": 0.04907120019197464, "epoch": 0.3171428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.10534542798995972, "learning_rate": 6.842857142857143e-06, "loss": 0.0504, "num_tokens": 1320420.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 222, "step_time": 103.15050622541457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 221.375, "completions/mean_terminated_length": 221.375, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.1106850877404213, "epoch": 0.31857142857142856, "frac_reward_zero_std": 0.0, "grad_norm": 0.20896078646183014, "learning_rate": 6.828571428571429e-06, "loss": -0.0022, "num_tokens": 1325055.0, "reward": 1.876046895980835, "reward_std": 0.7971658110618591, "rewards/accuracy_reward/mean": 0.5010469555854797, "rewards/accuracy_reward/std": 0.20195674896240234, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 223, "step_time": 94.4069345574826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 179.375, "completions/mean_terminated_length": 179.375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.1872149407863617, "epoch": 0.32, "frac_reward_zero_std": 0.0, "grad_norm": 0.42372897267341614, "learning_rate": 6.814285714285714e-06, "loss": -0.3299, "num_tokens": 1329426.0, "reward": 2.0438218116760254, "reward_std": 0.5920701622962952, "rewards/accuracy_reward/mean": 0.418821781873703, "rewards/accuracy_reward/std": 0.24391838908195496, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 224, "step_time": 100.16496253199875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 125.25, "completions/mean_terminated_length": 125.25, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.09221871942281723, "epoch": 0.32142857142857145, "frac_reward_zero_std": 0.0, "grad_norm": 0.22872525453567505, "learning_rate": 6.800000000000001e-06, "loss": 0.0021, "num_tokens": 1333324.0, "reward": 2.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 225, "step_time": 117.0215228702873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 284.125, "completions/mean_terminated_length": 284.125, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "entropy": 0.13724680244922638, "epoch": 0.32285714285714284, "frac_reward_zero_std": 0.0, "grad_norm": 0.1898733228445053, "learning_rate": 6.785714285714287e-06, "loss": -0.0576, "num_tokens": 1338477.0, "reward": 2.375553607940674, "reward_std": 0.1014266237616539, "rewards/accuracy_reward/mean": 0.8755534887313843, "rewards/accuracy_reward/std": 0.10142658650875092, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 226, "step_time": 89.40807595290244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 712.0, "completions/mean_length": 546.0, "completions/mean_terminated_length": 386.66668701171875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.19185291230678558, "epoch": 0.3242857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.16705448925495148, "learning_rate": 6.771428571428572e-06, "loss": 0.3539, "num_tokens": 1345765.0, "reward": 1.818047285079956, "reward_std": 0.6507388353347778, "rewards/accuracy_reward/mean": 0.6930473446846008, "rewards/accuracy_reward/std": 0.45387041568756104, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 227, "step_time": 147.1598353208974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 202.75, "completions/mean_terminated_length": 202.75, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.1425115466117859, "epoch": 0.32571428571428573, "frac_reward_zero_std": 0.0, "grad_norm": 0.29549986124038696, "learning_rate": 6.757142857142858e-06, "loss": -0.0821, "num_tokens": 1350251.0, "reward": 1.71875, "reward_std": 0.33905068039894104, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.21875, "rewards/grounding_reward/std": 0.33905068039894104, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 228, "step_time": 96.16343982424587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 219.0, "completions/mean_terminated_length": 219.0, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.14206786453723907, "epoch": 0.3271428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.17550525069236755, "learning_rate": 6.742857142857143e-06, "loss": -0.0646, "num_tokens": 1354851.0, "reward": 2.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 229, "step_time": 96.94461223576218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "entropy": 0.2923702001571655, "epoch": 0.32857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.06961303949356079, "learning_rate": 6.728571428571429e-06, "loss": 0.0, "num_tokens": 1366091.0, "reward": 0.874756932258606, "reward_std": 0.3641372621059418, "rewards/accuracy_reward/mean": 0.031006891280412674, "rewards/accuracy_reward/std": 0.050467599183321, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.84375, "rewards/grounding_reward/std": 0.35197150707244873, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 230, "step_time": 138.74982790090144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 863.5, "completions/mean_terminated_length": 767.2000122070312, "completions/min_length": 665.0, "completions/min_terminated_length": 665.0, "entropy": 0.2987423539161682, "epoch": 0.33, "frac_reward_zero_std": 0.0, "grad_norm": 0.14853180944919586, "learning_rate": 6.714285714285714e-06, "loss": 0.1333, "num_tokens": 1375943.0, "reward": 0.9040465950965881, "reward_std": 0.7479480504989624, "rewards/accuracy_reward/mean": 0.5915466547012329, "rewards/accuracy_reward/std": 0.49267277121543884, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 231, "step_time": 179.08875807747245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 275.125, "completions/mean_terminated_length": 275.125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.09132001549005508, "epoch": 0.3314285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.19882583618164062, "learning_rate": 6.700000000000001e-06, "loss": 0.0581, "num_tokens": 1381032.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.25, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 232, "step_time": 101.61445819307119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 124.375, "completions/mean_terminated_length": 124.375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.0848408117890358, "epoch": 0.33285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.20376913249492645, "learning_rate": 6.685714285714286e-06, "loss": -0.0002, "num_tokens": 1384947.0, "reward": 2.125, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 233, "step_time": 84.50852105673403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 903.0, "completions/mean_terminated_length": 903.0, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "entropy": 0.07216983288526535, "epoch": 0.3342857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.06549791991710663, "learning_rate": 6.671428571428572e-06, "loss": -0.0206, "num_tokens": 1395099.0, "reward": 1.375, "reward_std": 0.2314550280570984, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.2314550280570984, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 234, "step_time": 137.99700894206762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 441.375, "completions/mean_terminated_length": 441.375, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "entropy": 0.0775366723537445, "epoch": 0.3357142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.11284122616052628, "learning_rate": 6.657142857142857e-06, "loss": -0.0598, "num_tokens": 1401654.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 235, "step_time": 109.94016147404909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 335.125, "completions/mean_terminated_length": 335.125, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "entropy": 0.28198614716529846, "epoch": 0.33714285714285713, "frac_reward_zero_std": 0.0, "grad_norm": 0.3086221516132355, "learning_rate": 6.642857142857143e-06, "loss": -0.0602, "num_tokens": 1407279.0, "reward": 1.6440476179122925, "reward_std": 0.44322454929351807, "rewards/accuracy_reward/mean": 0.8690476417541504, "rewards/accuracy_reward/std": 0.2424767017364502, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.2750000059604645, "rewards/grounding_reward/std": 0.45276927947998047, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 236, "step_time": 107.80534801539034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 738.875, "completions/mean_terminated_length": 453.75, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "entropy": 0.5669732093811035, "epoch": 0.3385714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.17627164721488953, "learning_rate": 6.628571428571428e-06, "loss": 0.3323, "num_tokens": 1416126.0, "reward": 0.8562651872634888, "reward_std": 0.7342113852500916, "rewards/accuracy_reward/mean": 0.5437651872634888, "rewards/accuracy_reward/std": 0.4665282368659973, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.26726123690605164, "rewards/grounding_reward/mean": 0.0625, "rewards/grounding_reward/std": 0.1767766922712326, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 237, "step_time": 117.50217020045966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 468.125, "completions/mean_terminated_length": 468.125, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "entropy": 0.04191993176937103, "epoch": 0.34, "frac_reward_zero_std": 0.0, "grad_norm": 0.06060321629047394, "learning_rate": 6.614285714285715e-06, "loss": 0.0348, "num_tokens": 1422759.0, "reward": 2.0625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5625, "rewards/grounding_reward/std": 0.1767766922712326, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 238, "step_time": 101.57906604651362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 131.125, "completions/mean_terminated_length": 131.125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2109057605266571, "epoch": 0.3414285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.48759058117866516, "learning_rate": 6.600000000000001e-06, "loss": -0.0416, "num_tokens": 1426704.0, "reward": 1.6875, "reward_std": 0.8425089716911316, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.3125, "rewards/operation_reward/std": 0.45806270837783813, "step": 239, "step_time": 87.50367985479534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 127.25, "completions/mean_terminated_length": 127.25, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.10896190255880356, "epoch": 0.34285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.29899248480796814, "learning_rate": 6.585714285714286e-06, "loss": 0.0076, "num_tokens": 1430650.0, "reward": 2.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 240, "step_time": 96.61721079051495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 339.375, "completions/mean_terminated_length": 339.375, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "entropy": 0.13942338526248932, "epoch": 0.3442857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.23247669637203217, "learning_rate": 6.571428571428572e-06, "loss": 0.0277, "num_tokens": 1436301.0, "reward": 1.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 241, "step_time": 118.17823203187436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 234.75, "completions/mean_terminated_length": 234.75, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.07486304640769958, "epoch": 0.3457142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.22523538768291473, "learning_rate": 6.557142857142857e-06, "loss": -0.0927, "num_tokens": 1441139.0, "reward": 1.2374999523162842, "reward_std": 0.46885114908218384, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.612500011920929, "rewards/grounding_reward/std": 0.3136763870716095, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 242, "step_time": 96.342564788647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 304.25, "completions/mean_terminated_length": 304.25, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.2037595957517624, "epoch": 0.34714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.32628464698791504, "learning_rate": 6.542857142857143e-06, "loss": 0.1461, "num_tokens": 1446525.0, "reward": 3.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 243, "step_time": 107.90291623212397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 630.75, "completions/mean_terminated_length": 237.5, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.5622125267982483, "epoch": 0.3485714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.1667434126138687, "learning_rate": 6.5285714285714285e-06, "loss": 0.523, "num_tokens": 1454499.0, "reward": 1.1920902729034424, "reward_std": 1.074339747428894, "rewards/accuracy_reward/mean": 0.4733402729034424, "rewards/accuracy_reward/std": 0.47660142183303833, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.26726123690605164, "rewards/grounding_reward/mean": 0.46875, "rewards/grounding_reward/std": 0.5077524185180664, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 244, "step_time": 172.164781793952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 424.5, "completions/mean_terminated_length": 224.6666717529297, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2332962453365326, "epoch": 0.35, "frac_reward_zero_std": 0.0, "grad_norm": 0.15910933911800385, "learning_rate": 6.514285714285715e-06, "loss": 0.0814, "num_tokens": 1460903.0, "reward": 1.1256957054138184, "reward_std": 0.35327452421188354, "rewards/accuracy_reward/mean": 0.0006957199657335877, "rewards/accuracy_reward/std": 0.0012884063180536032, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.37796446681022644, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 245, "step_time": 144.0546431997791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 251.75, "completions/mean_terminated_length": 251.75, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.20346792042255402, "epoch": 0.3514285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.23391832411289215, "learning_rate": 6.5000000000000004e-06, "loss": 0.0348, "num_tokens": 1465877.0, "reward": 1.875, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.375, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 246, "step_time": 88.95531635731459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 255.125, "completions/mean_terminated_length": 255.125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.15597401559352875, "epoch": 0.35285714285714287, "frac_reward_zero_std": 0.0, "grad_norm": 0.21732300519943237, "learning_rate": 6.485714285714286e-06, "loss": -0.0376, "num_tokens": 1470774.0, "reward": 2.5, "reward_std": 0.9258201122283936, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.375, "rewards/operation_reward/std": 0.5175492167472839, "step": 247, "step_time": 107.24863768927753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 623.25, "completions/mean_terminated_length": 489.66668701171875, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "entropy": 0.31913095712661743, "epoch": 0.35428571428571426, "frac_reward_zero_std": 0.0, "grad_norm": 0.11946365237236023, "learning_rate": 6.4714285714285715e-06, "loss": -0.0225, "num_tokens": 1478752.0, "reward": 1.3541667461395264, "reward_std": 0.4403957426548004, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.1041666716337204, "rewards/grounding_reward/std": 0.19795581698417664, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 248, "step_time": 138.8202039944008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 773.25, "completions/mean_terminated_length": 689.6666870117188, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "entropy": 0.15493589639663696, "epoch": 0.3557142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.12354554980993271, "learning_rate": 6.4571428571428575e-06, "loss": 0.2019, "num_tokens": 1487802.0, "reward": 1.009792685508728, "reward_std": 0.7838221192359924, "rewards/accuracy_reward/mean": 0.47229260206222534, "rewards/accuracy_reward/std": 0.45917850732803345, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.16249999403953552, "rewards/grounding_reward/std": 0.3113909065723419, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 249, "step_time": 134.7116874055937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 153.5, "completions/mean_terminated_length": 153.5, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.12285947054624557, "epoch": 0.35714285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.4171614944934845, "learning_rate": 6.442857142857143e-06, "loss": -0.0053, "num_tokens": 1492014.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.25, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 250, "step_time": 118.44043390639126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 608.875, "completions/mean_terminated_length": 193.75, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.4640027582645416, "epoch": 0.3585714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.27646270394325256, "learning_rate": 6.4285714285714295e-06, "loss": 0.3992, "num_tokens": 1499797.0, "reward": 0.4389868974685669, "reward_std": 0.3193558156490326, "rewards/accuracy_reward/mean": 0.00148690864443779, "rewards/accuracy_reward/std": 0.0015965604688972235, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.26726123690605164, "rewards/grounding_reward/mean": 0.1875, "rewards/grounding_reward/std": 0.25877460837364197, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 251, "step_time": 167.27337414305657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 520.625, "completions/mean_terminated_length": 448.71429443359375, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "entropy": 0.18999572098255157, "epoch": 0.36, "frac_reward_zero_std": 0.0, "grad_norm": 0.22893737256526947, "learning_rate": 6.4142857142857154e-06, "loss": 0.2142, "num_tokens": 1506866.0, "reward": 1.2900638580322266, "reward_std": 0.41810306906700134, "rewards/accuracy_reward/mean": 0.35256385803222656, "rewards/accuracy_reward/std": 0.06573264300823212, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.30860671401023865, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 252, "step_time": 148.1062897928059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 431.75, "completions/mean_terminated_length": 431.75, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.1195523738861084, "epoch": 0.36142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.172006294131279, "learning_rate": 6.4000000000000006e-06, "loss": -0.0388, "num_tokens": 1513192.0, "reward": 2.545454502105713, "reward_std": 0.44536179304122925, "rewards/accuracy_reward/mean": 0.9204545617103577, "rewards/accuracy_reward/std": 0.22498852014541626, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 253, "step_time": 124.84982101246715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 402.0, "completions/mean_terminated_length": 402.0, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "entropy": 0.07699377834796906, "epoch": 0.3628571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.14132805168628693, "learning_rate": 6.3857142857142865e-06, "loss": -0.0378, "num_tokens": 1519280.0, "reward": 2.049999952316284, "reward_std": 0.4869731366634369, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.550000011920929, "rewards/grounding_reward/std": 0.4869731664657593, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 254, "step_time": 114.59101029112935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 509.375, "completions/mean_terminated_length": 337.8333435058594, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.33835235238075256, "epoch": 0.36428571428571427, "frac_reward_zero_std": 0.0, "grad_norm": 0.23019614815711975, "learning_rate": 6.371428571428572e-06, "loss": 0.5066, "num_tokens": 1526315.0, "reward": 1.4465279579162598, "reward_std": 0.9493622779846191, "rewards/accuracy_reward/mean": 0.6340280771255493, "rewards/accuracy_reward/std": 0.390634685754776, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.4375, "rewards/grounding_reward/std": 0.4172614812850952, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 255, "step_time": 147.12454421445727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 280.75, "completions/mean_terminated_length": 280.75, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.16550230979919434, "epoch": 0.3657142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.23417384922504425, "learning_rate": 6.357142857142858e-06, "loss": -0.0781, "num_tokens": 1531553.0, "reward": 0.875, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 256, "step_time": 113.23498296365142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 495.0, "completions/mean_terminated_length": 419.4285888671875, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "entropy": 0.21109943091869354, "epoch": 0.36714285714285716, "frac_reward_zero_std": 0.0, "grad_norm": 0.19288760423660278, "learning_rate": 6.342857142857143e-06, "loss": 0.3778, "num_tokens": 1538361.0, "reward": 1.31259286403656, "reward_std": 0.5300674438476562, "rewards/accuracy_reward/mean": 9.286775457439944e-05, "rewards/accuracy_reward/std": 0.00026266969507560134, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 257, "step_time": 120.44442941248417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 153.75, "completions/mean_terminated_length": 153.75, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.09607282280921936, "epoch": 0.36857142857142855, "frac_reward_zero_std": 0.0, "grad_norm": 0.1811019778251648, "learning_rate": 6.3285714285714296e-06, "loss": -0.027, "num_tokens": 1542543.0, "reward": 2.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 258, "step_time": 120.94650852214545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 348.5, "completions/mean_terminated_length": 348.5, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "entropy": 0.12151424586772919, "epoch": 0.37, "frac_reward_zero_std": 0.0, "grad_norm": 0.17685288190841675, "learning_rate": 6.314285714285715e-06, "loss": -0.0002, "num_tokens": 1548251.0, "reward": 1.3887581825256348, "reward_std": 0.2295731008052826, "rewards/accuracy_reward/mean": 0.8887581825256348, "rewards/accuracy_reward/std": 0.2295731157064438, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 259, "step_time": 134.6504706768319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 912.5, "completions/mean_terminated_length": 726.6666870117188, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "entropy": 0.3710029721260071, "epoch": 0.37142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.11894029378890991, "learning_rate": 6.300000000000001e-06, "loss": 0.1262, "num_tokens": 1558471.0, "reward": 0.8805587291717529, "reward_std": 0.40766921639442444, "rewards/accuracy_reward/mean": 0.12535038590431213, "rewards/accuracy_reward/std": 0.353412002325058, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.5677083730697632, "rewards/grounding_reward/std": 0.3347718417644501, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 260, "step_time": 170.0247854143381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 278.375, "completions/mean_terminated_length": 278.375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.09980087727308273, "epoch": 0.37285714285714283, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.285714285714286e-06, "loss": 0.0, "num_tokens": 1563698.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 261, "step_time": 143.6660772152245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 160.0, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.11465887725353241, "epoch": 0.3742857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.20133906602859497, "learning_rate": 6.271428571428572e-06, "loss": -0.0177, "num_tokens": 1567890.0, "reward": 2.4166667461395264, "reward_std": 0.23570220172405243, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.9166666865348816, "rewards/grounding_reward/std": 0.2357022762298584, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 262, "step_time": 119.66161243990064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 651.375, "completions/mean_terminated_length": 527.1666870117188, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.30259203910827637, "epoch": 0.3757142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.19265615940093994, "learning_rate": 6.257142857142857e-06, "loss": 0.193, "num_tokens": 1576021.0, "reward": 0.9575269222259521, "reward_std": 0.519311249256134, "rewards/accuracy_reward/mean": 0.5408601760864258, "rewards/accuracy_reward/std": 0.3708358407020569, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.0416666679084301, "rewards/grounding_reward/std": 0.1178511455655098, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 263, "step_time": 151.8479955671355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 169.625, "completions/mean_terminated_length": 169.625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.15238486230373383, "epoch": 0.37714285714285717, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.242857142857144e-06, "loss": 0.0, "num_tokens": 1580234.0, "reward": 3.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 264, "step_time": 94.70478617865592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 147.875, "completions/mean_terminated_length": 147.875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.15371672809123993, "epoch": 0.37857142857142856, "frac_reward_zero_std": 0.0, "grad_norm": 0.34374427795410156, "learning_rate": 6.22857142857143e-06, "loss": 0.0403, "num_tokens": 1584353.0, "reward": 1.8472223281860352, "reward_std": 0.35355344414711, "rewards/accuracy_reward/mean": 0.2222222238779068, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 265, "step_time": 128.70079882908612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 277.125, "completions/mean_terminated_length": 277.125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.07339337468147278, "epoch": 0.38, "frac_reward_zero_std": 0.0, "grad_norm": 0.18161039054393768, "learning_rate": 6.214285714285715e-06, "loss": -0.0265, "num_tokens": 1589522.0, "reward": 1.8250000476837158, "reward_std": 0.5824823379516602, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.574999988079071, "rewards/grounding_reward/std": 0.37321004271507263, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 266, "step_time": 111.84662083350122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 153.75, "completions/mean_terminated_length": 153.75, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.24232271313667297, "epoch": 0.38142857142857145, "frac_reward_zero_std": 0.0, "grad_norm": 0.3504336178302765, "learning_rate": 6.200000000000001e-06, "loss": 0.0362, "num_tokens": 1593656.0, "reward": 0.625, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.125, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 267, "step_time": 89.1805018549785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 271.5, "completions/mean_terminated_length": 271.5, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.12326209992170334, "epoch": 0.38285714285714284, "frac_reward_zero_std": 0.0, "grad_norm": 0.22101189196109772, "learning_rate": 6.185714285714286e-06, "loss": 0.0678, "num_tokens": 1598796.0, "reward": 1.8442740440368652, "reward_std": 0.04992423579096794, "rewards/accuracy_reward/mean": 0.3442741632461548, "rewards/accuracy_reward/std": 0.04992423951625824, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 268, "step_time": 113.66387099027634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 339.625, "completions/mean_terminated_length": 339.625, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "entropy": 0.1037033274769783, "epoch": 0.3842857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.25760045647621155, "learning_rate": 6.171428571428572e-06, "loss": 0.243, "num_tokens": 1604393.0, "reward": 1.3333333730697632, "reward_std": 0.30860665440559387, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.8333333730697632, "rewards/grounding_reward/std": 0.30860668420791626, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 269, "step_time": 101.67486560158432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 245.875, "completions/mean_terminated_length": 245.875, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.08907440304756165, "epoch": 0.38571428571428573, "frac_reward_zero_std": 0.0, "grad_norm": 0.2256542295217514, "learning_rate": 6.157142857142858e-06, "loss": 0.0145, "num_tokens": 1609216.0, "reward": 1.7916667461395264, "reward_std": 0.1178511381149292, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.2916666865348816, "rewards/grounding_reward/std": 0.1178511381149292, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 270, "step_time": 104.75664068199694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 298.875, "completions/mean_terminated_length": 298.875, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "entropy": 0.06289774924516678, "epoch": 0.3871428571428571, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.142857142857144e-06, "loss": 0.0, "num_tokens": 1614487.0, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 271, "step_time": 105.5651737684384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 223.5, "completions/mean_terminated_length": 223.5, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.12962238490581512, "epoch": 0.38857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2795787453651428, "learning_rate": 6.128571428571429e-06, "loss": -0.0562, "num_tokens": 1619163.0, "reward": 2.0640244483947754, "reward_std": 0.5046629905700684, "rewards/accuracy_reward/mean": 0.9390243887901306, "rewards/accuracy_reward/std": 0.06518567353487015, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 272, "step_time": 91.81545367091894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 441.625, "completions/mean_terminated_length": 358.4285888671875, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "entropy": 0.18849173188209534, "epoch": 0.39, "frac_reward_zero_std": 0.0, "grad_norm": 0.2262653112411499, "learning_rate": 6.114285714285715e-06, "loss": 0.313, "num_tokens": 1625744.0, "reward": 1.3823280334472656, "reward_std": 0.5678916573524475, "rewards/accuracy_reward/mean": 0.2573280334472656, "rewards/accuracy_reward/std": 0.09976859390735626, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.6875, "rewards/grounding_reward/std": 0.45806270837783813, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 273, "step_time": 133.67752522230148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 139.875, "completions/mean_terminated_length": 139.875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.218600332736969, "epoch": 0.3914285714285714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.1e-06, "loss": 0.0, "num_tokens": 1629807.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 274, "step_time": 103.5773759894073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 111.75, "completions/mean_terminated_length": 111.75, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.1857619434595108, "epoch": 0.39285714285714285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.085714285714286e-06, "loss": 0.0, "num_tokens": 1633597.0, "reward": 3.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 275, "step_time": 93.13557959068567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 122.125, "completions/mean_terminated_length": 122.125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.2762109339237213, "epoch": 0.3942857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.6956732869148254, "learning_rate": 6.071428571428571e-06, "loss": 0.0699, "num_tokens": 1637470.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 276, "step_time": 145.5153569066897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 592.375, "completions/mean_terminated_length": 592.375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.11088769137859344, "epoch": 0.39571428571428574, "frac_reward_zero_std": 0.0, "grad_norm": 0.14249448478221893, "learning_rate": 6.057142857142858e-06, "loss": 0.0212, "num_tokens": 1645057.0, "reward": 2.875, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.375, "rewards/operation_reward/std": 0.5175492167472839, "step": 277, "step_time": 135.13213667552918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 144.0, "completions/mean_terminated_length": 144.0, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.1943972408771515, "epoch": 0.39714285714285713, "frac_reward_zero_std": 0.0, "grad_norm": 0.2702440023422241, "learning_rate": 6.042857142857144e-06, "loss": -0.045, "num_tokens": 1649113.0, "reward": 3.25, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 278, "step_time": 92.14134633541107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 167.5, "completions/mean_terminated_length": 167.5, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.10661964118480682, "epoch": 0.3985714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.22317181527614594, "learning_rate": 6.028571428571429e-06, "loss": -0.003, "num_tokens": 1653429.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 279, "step_time": 95.79408053774387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 329.75, "completions/mean_terminated_length": 329.75, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.06693993508815765, "epoch": 0.4, "frac_reward_zero_std": 0.0, "grad_norm": 0.1617380827665329, "learning_rate": 6.014285714285715e-06, "loss": -0.1017, "num_tokens": 1658979.0, "reward": 2.0863635540008545, "reward_std": 0.47330132126808167, "rewards/accuracy_reward/mean": 0.7613636255264282, "rewards/accuracy_reward/std": 0.3698734641075134, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.824999988079071, "rewards/grounding_reward/std": 0.36154431104660034, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 280, "step_time": 391.73062446154654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 229.125, "completions/mean_terminated_length": 229.125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.06661412119865417, "epoch": 0.4014285714285714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6e-06, "loss": 0.0, "num_tokens": 1663868.0, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 281, "step_time": 134.82532940991223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 716.375, "completions/mean_terminated_length": 613.8333740234375, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "entropy": 0.1607910841703415, "epoch": 0.40285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.1706024557352066, "learning_rate": 5.985714285714286e-06, "loss": 0.2318, "num_tokens": 1672511.0, "reward": 0.3752371668815613, "reward_std": 0.23101592063903809, "rewards/accuracy_reward/mean": 0.0002371412847423926, "rewards/accuracy_reward/std": 0.0004391075926832855, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 282, "step_time": 150.32232478819788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 99.625, "completions/mean_terminated_length": 99.625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.17544737458229065, "epoch": 0.4042857142857143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.971428571428572e-06, "loss": 0.0, "num_tokens": 1676204.0, "reward": 3.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 283, "step_time": 90.77323262393475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 181.0, "completions/mean_terminated_length": 181.0, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.16574332118034363, "epoch": 0.4057142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.43985140323638916, "learning_rate": 5.957142857142858e-06, "loss": -0.0333, "num_tokens": 1680540.0, "reward": 2.362499952316284, "reward_std": 0.6610327363014221, "rewards/accuracy_reward/mean": 0.987500011920929, "rewards/accuracy_reward/std": 0.0353553481400013, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 284, "step_time": 123.59509504493326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 234.625, "completions/mean_terminated_length": 234.625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.2194938361644745, "epoch": 0.40714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.27138733863830566, "learning_rate": 5.942857142857143e-06, "loss": -0.0523, "num_tokens": 1685457.0, "reward": 2.1610403060913086, "reward_std": 0.4021814167499542, "rewards/accuracy_reward/mean": 0.4110404849052429, "rewards/accuracy_reward/std": 0.3752099573612213, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.25, "rewards/operation_reward/std": 0.4629100561141968, "step": 285, "step_time": 103.7129638036713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 308.125, "completions/mean_terminated_length": 308.125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.08261042088270187, "epoch": 0.4085714285714286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.928571428571429e-06, "loss": 0.0, "num_tokens": 1690802.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 286, "step_time": 88.81161779817194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 143.75, "completions/mean_terminated_length": 143.75, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.09792943298816681, "epoch": 0.41, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.914285714285714e-06, "loss": 0.0, "num_tokens": 1694912.0, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 287, "step_time": 80.8060501711443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 210.25, "completions/mean_terminated_length": 210.25, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.1453815996646881, "epoch": 0.4114285714285714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.9e-06, "loss": 0.0, "num_tokens": 1699706.0, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 288, "step_time": 83.3653932614252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 133.0, "completions/mean_terminated_length": 133.0, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.1393689215183258, "epoch": 0.41285714285714287, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.885714285714285e-06, "loss": 0.0, "num_tokens": 1703722.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 289, "step_time": 80.75743385776877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 601.25, "completions/mean_terminated_length": 460.3333435058594, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "entropy": 0.21861734986305237, "epoch": 0.4142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.22022299468517303, "learning_rate": 5.871428571428572e-06, "loss": 0.3276, "num_tokens": 1711524.0, "reward": 2.0114614963531494, "reward_std": 0.6971944570541382, "rewards/accuracy_reward/mean": 0.6781282424926758, "rewards/accuracy_reward/std": 0.4617396593093872, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.9583333730697632, "rewards/grounding_reward/std": 0.117851123213768, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 290, "step_time": 106.18556519318372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 477.0, "completions/mean_terminated_length": 477.0, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "entropy": 0.0925491601228714, "epoch": 0.4157142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.20097152888774872, "learning_rate": 5.857142857142858e-06, "loss": -0.1028, "num_tokens": 1718412.0, "reward": 0.875, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.375, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 291, "step_time": 94.35727734304965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 209.875, "completions/mean_terminated_length": 209.875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.07921489328145981, "epoch": 0.41714285714285715, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.842857142857143e-06, "loss": 0.0, "num_tokens": 1722963.0, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 292, "step_time": 81.5851726429537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 314.5, "completions/mean_terminated_length": 314.5, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.08620074391365051, "epoch": 0.4185714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.1729973554611206, "learning_rate": 5.828571428571429e-06, "loss": -0.0633, "num_tokens": 1728367.0, "reward": 1.6749999523162842, "reward_std": 0.8310921788215637, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.42500001192092896, "rewards/grounding_reward/std": 0.48329228162765503, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 293, "step_time": 86.53971769567579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 333.625, "completions/mean_terminated_length": 333.625, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "entropy": 0.06557262688875198, "epoch": 0.42, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.814285714285714e-06, "loss": 0.0, "num_tokens": 1733932.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 294, "step_time": 85.50759428367019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 194.125, "completions/mean_terminated_length": 194.125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.09081494808197021, "epoch": 0.42142857142857143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.8e-06, "loss": 0.0, "num_tokens": 1738469.0, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 295, "step_time": 81.57451949734241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 156.75, "completions/mean_terminated_length": 156.75, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.09056322276592255, "epoch": 0.4228571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.21764631569385529, "learning_rate": 5.785714285714286e-06, "loss": -0.0152, "num_tokens": 1742691.0, "reward": 1.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 296, "step_time": 80.8235503192991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 359.75, "completions/mean_terminated_length": 359.75, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.12046889215707779, "epoch": 0.42428571428571427, "frac_reward_zero_std": 0.0, "grad_norm": 0.31888943910598755, "learning_rate": 5.771428571428572e-06, "loss": 0.3079, "num_tokens": 1748473.0, "reward": 3.0927419662475586, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 0.9677419066429138, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.625, "rewards/operation_reward/std": 0.5175492167472839, "step": 297, "step_time": 695.5311180250719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 683.0, "completions/mean_terminated_length": 683.0, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "entropy": 0.045590389519929886, "epoch": 0.4257142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.7571428571428574e-06, "loss": 0.0, "num_tokens": 1756897.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 298, "step_time": 116.00876551680267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 238.125, "completions/mean_terminated_length": 238.125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.13814496994018555, "epoch": 0.42714285714285716, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.742857142857143e-06, "loss": 0.0, "num_tokens": 1761682.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 299, "step_time": 95.2720364490524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 119.625, "completions/mean_terminated_length": 119.625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.14185230433940887, "epoch": 0.42857142857142855, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.7285714285714285e-06, "loss": 0.0, "num_tokens": 1765511.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 300, "step_time": 85.07541698683053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 316.75, "completions/mean_terminated_length": 316.75, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "entropy": 0.13845282793045044, "epoch": 0.43, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.7142857142857145e-06, "loss": 0.0, "num_tokens": 1771005.0, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 301, "step_time": 89.39987386856228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 927.25, "completions/mean_terminated_length": 637.0, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "entropy": 0.1450492888689041, "epoch": 0.43142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.09205259382724762, "learning_rate": 5.7e-06, "loss": 0.1688, "num_tokens": 1781343.0, "reward": 0.6264548301696777, "reward_std": 0.9535268545150757, "rewards/accuracy_reward/mean": 0.12645485997200012, "rewards/accuracy_reward/std": 0.3529662489891052, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.375, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 302, "step_time": 108.41632421687245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 212.5, "completions/mean_terminated_length": 212.5, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.10106422007083893, "epoch": 0.4328571428571429, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.6857142857142865e-06, "loss": 0.0, "num_tokens": 1786227.0, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 303, "step_time": 81.66641617193818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 126.625, "completions/mean_terminated_length": 126.625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.1948394924402237, "epoch": 0.4342857142857143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.6714285714285724e-06, "loss": 0.0, "num_tokens": 1790120.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 304, "step_time": 79.1003645574674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 691.0, "completions/max_terminated_length": 691.0, "completions/mean_length": 513.5, "completions/mean_terminated_length": 513.5, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "entropy": 0.07180444151163101, "epoch": 0.4357142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.11121643334627151, "learning_rate": 5.6571428571428576e-06, "loss": 0.0774, "num_tokens": 1797172.0, "reward": 2.5, "reward_std": 0.5345224738121033, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 305, "step_time": 96.3856557207182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 329.5, "completions/mean_terminated_length": 329.5, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.09811820834875107, "epoch": 0.43714285714285717, "frac_reward_zero_std": 0.0, "grad_norm": 0.1482565999031067, "learning_rate": 5.6428571428571435e-06, "loss": 0.1775, "num_tokens": 1802824.0, "reward": 2.625, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 306, "step_time": 91.2905070213601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 1002.625, "completions/mean_terminated_length": 938.5, "completions/min_length": 928.0, "completions/min_terminated_length": 928.0, "entropy": 0.25560957193374634, "epoch": 0.43857142857142856, "frac_reward_zero_std": 0.0, "grad_norm": 0.11949700117111206, "learning_rate": 5.628571428571429e-06, "loss": 0.0313, "num_tokens": 1813805.0, "reward": 0.43509018421173096, "reward_std": 0.5328572988510132, "rewards/accuracy_reward/mean": 0.12259017676115036, "rewards/accuracy_reward/std": 0.2655349671840668, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.1875, "rewards/grounding_reward/std": 0.25877460837364197, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 307, "step_time": 108.3583048088476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 137.875, "completions/mean_terminated_length": 137.875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.24562232196331024, "epoch": 0.44, "frac_reward_zero_std": 0.0, "grad_norm": 0.39410164952278137, "learning_rate": 5.614285714285715e-06, "loss": -0.0843, "num_tokens": 1817892.0, "reward": 2.1480770111083984, "reward_std": 0.7043128609657288, "rewards/accuracy_reward/mean": 0.7730768918991089, "rewards/accuracy_reward/std": 0.42017990350723267, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 308, "step_time": 78.74011703580618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 396.125, "completions/mean_terminated_length": 186.83334350585938, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.3078053295612335, "epoch": 0.44142857142857145, "frac_reward_zero_std": 0.0, "grad_norm": 0.25609371066093445, "learning_rate": 5.600000000000001e-06, "loss": 0.6453, "num_tokens": 1824021.0, "reward": 1.4684356451034546, "reward_std": 1.1467634439468384, "rewards/accuracy_reward/mean": 0.5934355854988098, "rewards/accuracy_reward/std": 0.45240336656570435, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.5, "rewards/operation_reward/std": 0.5345224738121033, "step": 309, "step_time": 106.86584649700671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 259.0, "completions/mean_terminated_length": 259.0, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.08801992982625961, "epoch": 0.44285714285714284, "frac_reward_zero_std": 0.0, "grad_norm": 0.2394624501466751, "learning_rate": 5.5857142857142866e-06, "loss": -0.0294, "num_tokens": 1828981.0, "reward": 2.625, "reward_std": 0.6408699750900269, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.375, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.75, "rewards/operation_reward/std": 0.4629100561141968, "step": 310, "step_time": 85.19178391247988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "entropy": 0.29295244812965393, "epoch": 0.4442857142857143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.571428571428572e-06, "loss": 0.0, "num_tokens": 1834202.0, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 311, "step_time": 85.68836874049157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 576.25, "completions/mean_terminated_length": 427.0, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "entropy": 0.2588648200035095, "epoch": 0.44571428571428573, "frac_reward_zero_std": 0.0, "grad_norm": 0.18962594866752625, "learning_rate": 5.557142857142858e-06, "loss": 0.3788, "num_tokens": 1841716.0, "reward": 1.0778131484985352, "reward_std": 0.7173824310302734, "rewards/accuracy_reward/mean": 0.5778130888938904, "rewards/accuracy_reward/std": 0.4648881256580353, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.125, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 312, "step_time": 113.66553610935807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 292.125, "completions/mean_terminated_length": 292.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0637860894203186, "epoch": 0.4471428571428571, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.542857142857143e-06, "loss": 0.0, "num_tokens": 1846901.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 313, "step_time": 85.04669517371804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 241.125, "completions/mean_terminated_length": 241.125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.091190867125988, "epoch": 0.44857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.22758698463439941, "learning_rate": 5.528571428571429e-06, "loss": -0.0271, "num_tokens": 1851774.0, "reward": 1.106295108795166, "reward_std": 0.4029449224472046, "rewards/accuracy_reward/mean": 0.48129504919052124, "rewards/accuracy_reward/std": 0.21666964888572693, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 314, "step_time": 88.4146149288863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 140.625, "completions/mean_terminated_length": 140.625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.16172252595424652, "epoch": 0.45, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.514285714285714e-06, "loss": 0.0, "num_tokens": 1855811.0, "reward": 3.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 315, "step_time": 82.34942184668034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 945.75, "completions/mean_terminated_length": 815.3333740234375, "completions/min_length": 810.0, "completions/min_terminated_length": 810.0, "entropy": 0.3342733085155487, "epoch": 0.4514285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.11040376871824265, "learning_rate": 5.500000000000001e-06, "loss": 0.0802, "num_tokens": 1866329.0, "reward": 0.31435853242874146, "reward_std": 0.5290795564651489, "rewards/accuracy_reward/mean": 0.0018585438374429941, "rewards/accuracy_reward/std": 0.0016425790963694453, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.125, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 316, "step_time": 108.49847334623337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 418.875, "completions/mean_terminated_length": 418.875, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "entropy": 0.0556558333337307, "epoch": 0.45285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.1621052473783493, "learning_rate": 5.485714285714287e-06, "loss": 0.0739, "num_tokens": 1872592.0, "reward": 1.9375, "reward_std": 0.3622843623161316, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.4375, "rewards/grounding_reward/std": 0.36228442192077637, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 317, "step_time": 91.16439379006624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 691.0, "completions/max_terminated_length": 691.0, "completions/mean_length": 209.5, "completions/mean_terminated_length": 209.5, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.4460548162460327, "epoch": 0.4542857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.4177941679954529, "learning_rate": 5.471428571428572e-06, "loss": 0.5825, "num_tokens": 1877148.0, "reward": 2.875, "reward_std": 0.9161254167556763, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.625, "rewards/operation_reward/std": 0.5175492167472839, "step": 318, "step_time": 96.6501649664715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 462.375, "completions/mean_terminated_length": 462.375, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "entropy": 0.05090540647506714, "epoch": 0.45571428571428574, "frac_reward_zero_std": 0.0, "grad_norm": 0.14670976996421814, "learning_rate": 5.457142857142858e-06, "loss": -0.0529, "num_tokens": 1883727.0, "reward": 1.5625, "reward_std": 0.6781013607978821, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.8125, "rewards/grounding_reward/std": 0.3720119297504425, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 319, "step_time": 91.65309605374932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 112.75, "completions/mean_terminated_length": 112.75, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.18054519593715668, "epoch": 0.45714285714285713, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.442857142857143e-06, "loss": 0.0, "num_tokens": 1887549.0, "reward": 3.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 320, "step_time": 78.58841337263584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 203.125, "completions/mean_terminated_length": 203.125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.13449527323246002, "epoch": 0.4585714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.3636111617088318, "learning_rate": 5.428571428571429e-06, "loss": 0.2016, "num_tokens": 1892062.0, "reward": 3.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 321, "step_time": 84.68098646961153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 502.375, "completions/mean_terminated_length": 328.5, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.3684599697589874, "epoch": 0.46, "frac_reward_zero_std": 0.0, "grad_norm": 0.22353702783584595, "learning_rate": 5.414285714285715e-06, "loss": 0.3179, "num_tokens": 1898953.0, "reward": 1.1253588199615479, "reward_std": 0.5816907286643982, "rewards/accuracy_reward/mean": 0.7503588199615479, "rewards/accuracy_reward/std": 0.462246298789978, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 322, "step_time": 108.08072219602764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 246.5, "completions/mean_terminated_length": 246.5, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "entropy": 0.07807213813066483, "epoch": 0.4614285714285714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.400000000000001e-06, "loss": 0.0, "num_tokens": 1903781.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 323, "step_time": 83.87782618589699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 119.875, "completions/mean_terminated_length": 119.875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.19943636655807495, "epoch": 0.46285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.45199495553970337, "learning_rate": 5.385714285714286e-06, "loss": 0.0977, "num_tokens": 1907668.0, "reward": 3.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 324, "step_time": 80.0099050309509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 235.5, "completions/mean_terminated_length": 235.5, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.0998234748840332, "epoch": 0.4642857142857143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.371428571428572e-06, "loss": 0.0, "num_tokens": 1912424.0, "reward": 3.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 325, "step_time": 96.10579085629433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 358.375, "completions/mean_terminated_length": 358.375, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "entropy": 0.07416698336601257, "epoch": 0.4657142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.143167644739151, "learning_rate": 5.357142857142857e-06, "loss": -0.0319, "num_tokens": 1918179.0, "reward": 1.962499976158142, "reward_std": 0.7726346254348755, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.8374999761581421, "rewards/grounding_reward/std": 0.3113909065723419, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 326, "step_time": 90.75957877840847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 544.25, "completions/mean_terminated_length": 475.71429443359375, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "entropy": 0.14309974014759064, "epoch": 0.46714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.1660163700580597, "learning_rate": 5.342857142857143e-06, "loss": -0.1375, "num_tokens": 1925405.0, "reward": 1.8125, "reward_std": 0.45806270837783813, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 327, "step_time": 107.8089497871697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 874.125, "completions/mean_terminated_length": 624.3333740234375, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "entropy": 0.2873357832431793, "epoch": 0.4685714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.1177389845252037, "learning_rate": 5.328571428571428e-06, "loss": 0.1975, "num_tokens": 1935342.0, "reward": 0.9203185439109802, "reward_std": 0.3358921408653259, "rewards/accuracy_reward/mean": 0.0015685728285461664, "rewards/accuracy_reward/std": 0.0013361676828935742, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.731249988079071, "rewards/grounding_reward/std": 0.19628241658210754, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 328, "step_time": 110.411645129323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 477.25, "completions/mean_terminated_length": 295.0, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.4092617630958557, "epoch": 0.47, "frac_reward_zero_std": 0.0, "grad_norm": 0.3293648064136505, "learning_rate": 5.314285714285715e-06, "loss": 0.3643, "num_tokens": 1942128.0, "reward": 0.5003708600997925, "reward_std": 0.4624525010585785, "rewards/accuracy_reward/mean": 0.12537087500095367, "rewards/accuracy_reward/std": 0.3534041941165924, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 329, "step_time": 109.859365045093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 178.25, "completions/mean_terminated_length": 178.25, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.06908700615167618, "epoch": 0.4714285714285714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.300000000000001e-06, "loss": 0.0, "num_tokens": 1946530.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 330, "step_time": 81.65244208462536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 531.5, "completions/mean_terminated_length": 236.0, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 0.48020946979522705, "epoch": 0.47285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.23644711077213287, "learning_rate": 5.285714285714286e-06, "loss": 0.6379, "num_tokens": 1953622.0, "reward": 0.740496039390564, "reward_std": 0.49741432070732117, "rewards/accuracy_reward/mean": 0.052996061742305756, "rewards/accuracy_reward/std": 0.041794534772634506, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.375, "rewards/grounding_reward/std": 0.2314550280570984, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 331, "step_time": 107.54421783145517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 132.375, "completions/mean_terminated_length": 132.375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.16624370217323303, "epoch": 0.4742857142857143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.271428571428572e-06, "loss": 0.0, "num_tokens": 1957601.0, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 332, "step_time": 80.4516717530787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 429.625, "completions/mean_terminated_length": 429.625, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "entropy": 0.0768512561917305, "epoch": 0.4757142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.13952581584453583, "learning_rate": 5.257142857142857e-06, "loss": -0.0601, "num_tokens": 1963918.0, "reward": 2.0, "reward_std": 0.5345224738121033, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 333, "step_time": 94.04439156409353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 254.75, "completions/mean_terminated_length": 254.75, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.10065537691116333, "epoch": 0.47714285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.1716795116662979, "learning_rate": 5.242857142857143e-06, "loss": 0.0662, "num_tokens": 1968836.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.375, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 334, "step_time": 88.92750156391412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 312.875, "completions/mean_terminated_length": 312.875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.25379443168640137, "epoch": 0.4785714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.2043207734823227, "learning_rate": 5.22857142857143e-06, "loss": -0.172, "num_tokens": 1974323.0, "reward": 1.210714340209961, "reward_std": 0.23925906419754028, "rewards/accuracy_reward/mean": 0.7107142806053162, "rewards/accuracy_reward/std": 0.23925906419754028, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 335, "step_time": 90.69626473728567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 285.375, "completions/mean_terminated_length": 285.375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.2067444920539856, "epoch": 0.48, "frac_reward_zero_std": 0.0, "grad_norm": 0.20479804277420044, "learning_rate": 5.214285714285715e-06, "loss": 0.1775, "num_tokens": 1979518.0, "reward": 1.8692474365234375, "reward_std": 0.7196186780929565, "rewards/accuracy_reward/mean": 0.8275808095932007, "rewards/accuracy_reward/std": 0.055065419524908066, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.2916666865348816, "rewards/grounding_reward/std": 0.4520675837993622, "rewards/operation_reward/mean": 0.25, "rewards/operation_reward/std": 0.4629100561141968, "step": 336, "step_time": 89.7854459034279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 665.25, "completions/mean_terminated_length": 306.5, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.5223882794380188, "epoch": 0.48142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.17977525293827057, "learning_rate": 5.2e-06, "loss": 0.3859, "num_tokens": 1987936.0, "reward": 0.7761197686195374, "reward_std": 0.6990944147109985, "rewards/accuracy_reward/mean": 0.5261198282241821, "rewards/accuracy_reward/std": 0.5106872916221619, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.26726123690605164, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 337, "step_time": 109.0526313399896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 156.5, "completions/mean_terminated_length": 156.5, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.3856544494628906, "epoch": 0.4828571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.34295758605003357, "learning_rate": 5.185714285714286e-06, "loss": -0.0467, "num_tokens": 1992084.0, "reward": 0.5750000476837158, "reward_std": 0.14880475401878357, "rewards/accuracy_reward/mean": 0.07500000298023224, "rewards/accuracy_reward/std": 0.14880475401878357, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 338, "step_time": 81.71290989406407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 913.0, "completions/mean_terminated_length": 580.0, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "entropy": 0.48561716079711914, "epoch": 0.48428571428571426, "frac_reward_zero_std": 0.0, "grad_norm": 0.29187867045402527, "learning_rate": 5.171428571428571e-06, "loss": 0.1089, "num_tokens": 2002476.0, "reward": 1.3884371519088745, "reward_std": 0.8251060843467712, "rewards/accuracy_reward/mean": 0.5134371519088745, "rewards/accuracy_reward/std": 0.41921794414520264, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.26726123690605164, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.33034375309944153, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 339, "step_time": 108.43336297478527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 195.875, "completions/mean_terminated_length": 195.875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.15208344161510468, "epoch": 0.4857142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.157142857142857e-06, "loss": 0.0, "num_tokens": 2006987.0, "reward": 2.026315689086914, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5263158082962036, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 340, "step_time": 82.58900241181254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 833.125, "completions/mean_terminated_length": 718.6000366210938, "completions/min_length": 554.0, "completions/min_terminated_length": 554.0, "entropy": 0.234132319688797, "epoch": 0.48714285714285716, "frac_reward_zero_std": 0.0, "grad_norm": 0.17716288566589355, "learning_rate": 5.142857142857142e-06, "loss": 0.1209, "num_tokens": 2016652.0, "reward": 0.9798884391784668, "reward_std": 0.4409838020801544, "rewards/accuracy_reward/mean": 0.0007217368693090975, "rewards/accuracy_reward/std": 0.001024101278744638, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.6666666269302368, "rewards/grounding_reward/std": 0.300264447927475, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 341, "step_time": 108.94888549111784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 162.0, "completions/mean_terminated_length": 162.0, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.14578329026699066, "epoch": 0.48857142857142855, "frac_reward_zero_std": 0.0, "grad_norm": 0.31978365778923035, "learning_rate": 5.128571428571429e-06, "loss": 0.0342, "num_tokens": 2020948.0, "reward": 3.0, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.9375, "rewards/grounding_reward/std": 0.1767766922712326, "rewards/operation_reward/mean": 0.5625, "rewards/operation_reward/std": 0.3204349875450134, "step": 342, "step_time": 81.42138738464564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 230.25, "completions/mean_terminated_length": 230.25, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.1634679138660431, "epoch": 0.49, "frac_reward_zero_std": 0.0, "grad_norm": 0.210111603140831, "learning_rate": 5.114285714285715e-06, "loss": -0.0123, "num_tokens": 2025838.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 343, "step_time": 81.8571712821722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 347.125, "completions/mean_terminated_length": 347.125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.1720392107963562, "epoch": 0.49142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.24906319379806519, "learning_rate": 5.1e-06, "loss": -0.3776, "num_tokens": 2031543.0, "reward": 1.625, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 344, "step_time": 98.97594387549907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 572.875, "completions/mean_terminated_length": 572.875, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "entropy": 0.057159289717674255, "epoch": 0.4928571428571429, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.085714285714286e-06, "loss": 0.0, "num_tokens": 2039046.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 345, "step_time": 98.79822462052107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 805.375, "completions/mean_terminated_length": 732.5, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "entropy": 0.33927732706069946, "epoch": 0.4942857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.16156505048274994, "learning_rate": 5.071428571428571e-06, "loss": 0.1465, "num_tokens": 2048417.0, "reward": 0.375218003988266, "reward_std": 0.23105135560035706, "rewards/accuracy_reward/mean": 0.00021800250397063792, "rewards/accuracy_reward/std": 0.0004040660278405994, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 346, "step_time": 110.3149678222835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 223.5, "completions/mean_terminated_length": 223.5, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.08368082344532013, "epoch": 0.4957142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.18708279728889465, "learning_rate": 5.057142857142857e-06, "loss": 0.0297, "num_tokens": 2053157.0, "reward": 2.875, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.375, "rewards/operation_reward/std": 0.5175492167472839, "step": 347, "step_time": 83.07846222538501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 164.75, "completions/mean_terminated_length": 164.75, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.1274299919605255, "epoch": 0.49714285714285716, "frac_reward_zero_std": 0.0, "grad_norm": 0.2509407103061676, "learning_rate": 5.042857142857144e-06, "loss": -0.0424, "num_tokens": 2057443.0, "reward": 2.4375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.9375, "rewards/grounding_reward/std": 0.1767766922712326, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 348, "step_time": 82.67346658185124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 388.75, "completions/mean_terminated_length": 298.0, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.12687963247299194, "epoch": 0.49857142857142855, "frac_reward_zero_std": 0.0, "grad_norm": 0.17779722809791565, "learning_rate": 5.028571428571429e-06, "loss": 0.4407, "num_tokens": 2063545.0, "reward": 1.1245797872543335, "reward_std": 0.2437800168991089, "rewards/accuracy_reward/mean": 0.05478813126683235, "rewards/accuracy_reward/std": 0.10096567869186401, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.6322916746139526, "rewards/grounding_reward/std": 0.21017931401729584, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 349, "step_time": 110.17454860545695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 424.125, "completions/mean_terminated_length": 424.125, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "entropy": 0.08469823002815247, "epoch": 0.5, "frac_reward_zero_std": 0.0, "grad_norm": 0.2236138880252838, "learning_rate": 5.014285714285715e-06, "loss": -0.0059, "num_tokens": 2069786.0, "reward": 1.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 350, "step_time": 90.65471547655761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 338.75, "completions/mean_terminated_length": 338.75, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "entropy": 0.06973297148942947, "epoch": 0.5014285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.1639648675918579, "learning_rate": 5e-06, "loss": 0.0217, "num_tokens": 2075432.0, "reward": 1.625, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.125, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 351, "step_time": 86.52713022939861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 496.5, "completions/mean_terminated_length": 320.66668701171875, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "entropy": 0.2825261950492859, "epoch": 0.5028571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.24200797080993652, "learning_rate": 4.9857142857142855e-06, "loss": 0.5737, "num_tokens": 2082476.0, "reward": 1.1308690309524536, "reward_std": 0.6834977269172668, "rewards/accuracy_reward/mean": 0.7558690309524536, "rewards/accuracy_reward/std": 0.45204275846481323, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 352, "step_time": 107.87771362997591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 360.625, "completions/mean_terminated_length": 360.625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.275177001953125, "epoch": 0.5042857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.19696879386901855, "learning_rate": 4.971428571428572e-06, "loss": 0.0569, "num_tokens": 2088417.0, "reward": 1.6961207389831543, "reward_std": 0.35417595505714417, "rewards/accuracy_reward/mean": 0.8836206793785095, "rewards/accuracy_reward/std": 0.3291704058647156, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.3125, "rewards/grounding_reward/std": 0.25877460837364197, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 353, "step_time": 98.0072986735031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 452.0, "completions/mean_terminated_length": 452.0, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "entropy": 0.28941360116004944, "epoch": 0.5057142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.20418857038021088, "learning_rate": 4.9571428571428575e-06, "loss": -0.0048, "num_tokens": 2094961.0, "reward": 1.390625, "reward_std": 0.3435470163822174, "rewards/accuracy_reward/mean": 0.765625, "rewards/accuracy_reward/std": 0.32346823811531067, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.125, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 354, "step_time": 94.58356203138828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 621.875, "completions/mean_terminated_length": 621.875, "completions/min_length": 584.0, "completions/min_terminated_length": 584.0, "entropy": 0.06911177933216095, "epoch": 0.5071428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.08909384161233902, "learning_rate": 4.9428571428571435e-06, "loss": 0.0053, "num_tokens": 2102816.0, "reward": 2.5658602714538574, "reward_std": 0.9763643145561218, "rewards/accuracy_reward/mean": 0.8158602118492126, "rewards/accuracy_reward/std": 0.34300124645233154, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.375, "rewards/operation_reward/std": 0.5175492167472839, "step": 355, "step_time": 94.76900970004499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 185.625, "completions/mean_terminated_length": 185.625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.2002309113740921, "epoch": 0.5085714285714286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.928571428571429e-06, "loss": 0.0, "num_tokens": 2107349.0, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 356, "step_time": 80.43627527542412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 172.25, "completions/mean_terminated_length": 172.25, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.08629763126373291, "epoch": 0.51, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.9142857142857145e-06, "loss": 0.0, "num_tokens": 2111727.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 357, "step_time": 83.28634659387171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 534.625, "completions/mean_terminated_length": 534.625, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "entropy": 0.09510516375303268, "epoch": 0.5114285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.1066727340221405, "learning_rate": 4.9000000000000005e-06, "loss": -0.0308, "num_tokens": 2118932.0, "reward": 2.4728260040283203, "reward_std": 0.07685937732458115, "rewards/accuracy_reward/mean": 0.9728261232376099, "rewards/accuracy_reward/std": 0.07685943692922592, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 358, "step_time": 98.57070247177035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 208.125, "completions/mean_terminated_length": 208.125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.11337698251008987, "epoch": 0.5128571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.216094970703125, "learning_rate": 4.885714285714286e-06, "loss": 0.071, "num_tokens": 2123629.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 359, "step_time": 83.55797086283565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 548.875, "completions/mean_terminated_length": 548.875, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "entropy": 0.07197457551956177, "epoch": 0.5142857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.11595383286476135, "learning_rate": 4.871428571428572e-06, "loss": 0.0253, "num_tokens": 2130916.0, "reward": 1.5772463083267212, "reward_std": 0.3933787941932678, "rewards/accuracy_reward/mean": 0.20224624872207642, "rewards/accuracy_reward/std": 0.07613687962293625, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 360, "step_time": 99.92622360959649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 108.0, "completions/mean_terminated_length": 108.0, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.23716841638088226, "epoch": 0.5157142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.857142857142858e-06, "loss": 0.0, "num_tokens": 2134644.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 361, "step_time": 77.23853746987879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 526.5, "completions/mean_terminated_length": 526.5, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "entropy": 0.10912216454744339, "epoch": 0.5171428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.14888733625411987, "learning_rate": 4.842857142857143e-06, "loss": -0.0003, "num_tokens": 2141856.0, "reward": 1.4375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.9375, "rewards/grounding_reward/std": 0.1767766922712326, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 362, "step_time": 94.96193360071629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 157.0, "completions/mean_terminated_length": 157.0, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.11820974200963974, "epoch": 0.5185714285714286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.8285714285714295e-06, "loss": 0.0, "num_tokens": 2146016.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 363, "step_time": 79.85544044803828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 222.5, "completions/mean_terminated_length": 222.5, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.08899860084056854, "epoch": 0.52, "frac_reward_zero_std": 0.0, "grad_norm": 0.1667742133140564, "learning_rate": 4.814285714285715e-06, "loss": 0.0032, "num_tokens": 2150692.0, "reward": 1.7999999523162842, "reward_std": 0.32071343064308167, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.30000001192092896, "rewards/grounding_reward/std": 0.32071349024772644, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 364, "step_time": 81.02982580754906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 132.375, "completions/mean_terminated_length": 132.375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.2183244228363037, "epoch": 0.5214285714285715, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.800000000000001e-06, "loss": 0.0, "num_tokens": 2154703.0, "reward": 3.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 365, "step_time": 79.73928829282522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 196.5, "completions/mean_terminated_length": 196.5, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.13758637011051178, "epoch": 0.5228571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.3615933954715729, "learning_rate": 4.785714285714287e-06, "loss": -0.0215, "num_tokens": 2159163.0, "reward": 2.3125, "reward_std": 0.7039429545402527, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5625, "rewards/grounding_reward/std": 0.4172614812850952, "rewards/operation_reward/mean": 0.25, "rewards/operation_reward/std": 0.4629100561141968, "step": 366, "step_time": 81.37669351324439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 174.75, "completions/mean_terminated_length": 174.75, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.09020798653364182, "epoch": 0.5242857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.35233521461486816, "learning_rate": 4.771428571428572e-06, "loss": 0.0147, "num_tokens": 2163513.0, "reward": 1.875, "reward_std": 0.2314550280570984, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.375, "rewards/grounding_reward/std": 0.2314550280570984, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 367, "step_time": 81.48486040439457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 212.125, "completions/mean_terminated_length": 212.125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.1627700924873352, "epoch": 0.5257142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.31724971532821655, "learning_rate": 4.757142857142858e-06, "loss": -0.042, "num_tokens": 2168178.0, "reward": 2.325000047683716, "reward_std": 0.6486248970031738, "rewards/accuracy_reward/mean": 0.824999988079071, "rewards/accuracy_reward/std": 0.18708288669586182, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 368, "step_time": 82.47741038817912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 959.5, "completions/mean_terminated_length": 766.0, "completions/min_length": 533.0, "completions/min_terminated_length": 533.0, "entropy": 0.24048393964767456, "epoch": 0.5271428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.11293372511863708, "learning_rate": 4.742857142857144e-06, "loss": 0.0575, "num_tokens": 2178822.0, "reward": 1.0301704406738281, "reward_std": 0.8183145523071289, "rewards/accuracy_reward/mean": 0.15517035126686096, "rewards/accuracy_reward/std": 0.3472544252872467, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 369, "step_time": 108.31306333839893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 314.25, "completions/mean_terminated_length": 314.25, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.09037115424871445, "epoch": 0.5285714285714286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.728571428571429e-06, "loss": 0.0, "num_tokens": 2184384.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 370, "step_time": 88.12405969109386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 533.875, "completions/mean_terminated_length": 533.875, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "entropy": 0.09036929905414581, "epoch": 0.53, "frac_reward_zero_std": 0.0, "grad_norm": 0.14285658299922943, "learning_rate": 4.714285714285715e-06, "loss": -0.1187, "num_tokens": 2191575.0, "reward": 1.384270429611206, "reward_std": 0.27638399600982666, "rewards/accuracy_reward/mean": 0.8217703700065613, "rewards/accuracy_reward/std": 0.3325445055961609, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0625, "rewards/grounding_reward/std": 0.1767766922712326, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 371, "step_time": 95.40447119250894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 549.125, "completions/mean_terminated_length": 481.2857360839844, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "entropy": 0.1340697556734085, "epoch": 0.5314285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.14310255646705627, "learning_rate": 4.7e-06, "loss": 0.3057, "num_tokens": 2198936.0, "reward": 2.314953565597534, "reward_std": 0.5233904719352722, "rewards/accuracy_reward/mean": 0.8774535059928894, "rewards/accuracy_reward/std": 0.3466137945652008, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 372, "step_time": 106.84352217707783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 343.25, "completions/mean_terminated_length": 343.25, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.14163881540298462, "epoch": 0.5328571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.20691581070423126, "learning_rate": 4.685714285714286e-06, "loss": -0.0363, "num_tokens": 2204562.0, "reward": 3.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.875, "rewards/operation_reward/std": 0.3535533845424652, "step": 373, "step_time": 91.90449252352118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 126.125, "completions/mean_terminated_length": 126.125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.23959629237651825, "epoch": 0.5342857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.400040864944458, "learning_rate": 4.671428571428572e-06, "loss": -0.0928, "num_tokens": 2208419.0, "reward": 2.3712120056152344, "reward_std": 0.36426711082458496, "rewards/accuracy_reward/mean": 0.9962121248245239, "rewards/accuracy_reward/std": 0.010713729076087475, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.875, "rewards/operation_reward/std": 0.3535533845424652, "step": 374, "step_time": 79.71350055001676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 287.5, "completions/mean_terminated_length": 287.5, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.08795344829559326, "epoch": 0.5357142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.18178671598434448, "learning_rate": 4.657142857142857e-06, "loss": 0.0602, "num_tokens": 2213727.0, "reward": 1.9583333730697632, "reward_std": 0.39591166377067566, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.4583333432674408, "rewards/grounding_reward/std": 0.39591163396835327, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 375, "step_time": 84.63644297700375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 284.0, "completions/mean_terminated_length": 284.0, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.20001855492591858, "epoch": 0.5371428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.263007789850235, "learning_rate": 4.642857142857144e-06, "loss": 0.022, "num_tokens": 2218967.0, "reward": 2.2339730262756348, "reward_std": 0.5358698964118958, "rewards/accuracy_reward/mean": 0.4214729368686676, "rewards/accuracy_reward/std": 0.014228623360395432, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.5345224738121033, "rewards/operation_reward/mean": 0.8125, "rewards/operation_reward/std": 0.25877460837364197, "step": 376, "step_time": 93.72045604884624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 704.75, "completions/mean_terminated_length": 385.5, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "entropy": 0.1953766942024231, "epoch": 0.5385714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.1562398076057434, "learning_rate": 4.628571428571429e-06, "loss": 0.3612, "num_tokens": 2227813.0, "reward": 0.5602192282676697, "reward_std": 0.5114845037460327, "rewards/accuracy_reward/mean": 0.0018858867697417736, "rewards/accuracy_reward/std": 0.002016653073951602, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.26726123690605164, "rewards/grounding_reward/mean": 0.3083333373069763, "rewards/grounding_reward/std": 0.3110402524471283, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 377, "step_time": 109.61993592977524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 572.125, "completions/mean_terminated_length": 507.5714416503906, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "entropy": 0.11668851226568222, "epoch": 0.54, "frac_reward_zero_std": 0.0, "grad_norm": 0.11408373713493347, "learning_rate": 4.614285714285715e-06, "loss": 0.0598, "num_tokens": 2235334.0, "reward": 1.3130428791046143, "reward_std": 0.2632715702056885, "rewards/accuracy_reward/mean": 0.0005428881850093603, "rewards/accuracy_reward/std": 0.0015355197247117758, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.20528724789619446, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 378, "step_time": 107.66972521133721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 633.0, "completions/mean_terminated_length": 633.0, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "entropy": 0.06453396379947662, "epoch": 0.5414285714285715, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.600000000000001e-06, "loss": 0.0, "num_tokens": 2243470.0, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 379, "step_time": 99.48494122736156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 668.125, "completions/mean_terminated_length": 549.5, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "entropy": 0.21093037724494934, "epoch": 0.5428571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.15899193286895752, "learning_rate": 4.585714285714286e-06, "loss": 0.2874, "num_tokens": 2251695.0, "reward": 1.7522048950195312, "reward_std": 1.130005121231079, "rewards/accuracy_reward/mean": 0.7522048950195312, "rewards/accuracy_reward/std": 0.45882752537727356, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 380, "step_time": 108.47639122512192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 206.0, "completions/mean_terminated_length": 206.0, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.06998585909605026, "epoch": 0.5442857142857143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.571428571428572e-06, "loss": 0.0, "num_tokens": 2256351.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 381, "step_time": 86.32714539393783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "entropy": 0.383566290140152, "epoch": 0.5457142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.1196913868188858, "learning_rate": 4.557142857142858e-06, "loss": -0.0, "num_tokens": 2267511.0, "reward": 0.8769937753677368, "reward_std": 0.3535957634449005, "rewards/accuracy_reward/mean": 0.0019938317127525806, "rewards/accuracy_reward/std": 0.000248028984060511, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 382, "step_time": 106.81384201347828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 249.875, "completions/mean_terminated_length": 249.875, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.17995400726795197, "epoch": 0.5471428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.34754467010498047, "learning_rate": 4.542857142857143e-06, "loss": 0.0163, "num_tokens": 2272358.0, "reward": 2.450373649597168, "reward_std": 0.013073522597551346, "rewards/accuracy_reward/mean": 0.9503734707832336, "rewards/accuracy_reward/std": 0.013073522597551346, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 383, "step_time": 82.90536423306912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "entropy": 0.7232475280761719, "epoch": 0.5485714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.11718789488077164, "learning_rate": 4.528571428571429e-06, "loss": -0.0, "num_tokens": 2283502.0, "reward": 0.7111884951591492, "reward_std": 0.45781564712524414, "rewards/accuracy_reward/mean": 0.023688504472374916, "rewards/accuracy_reward/std": 0.0019768024794757366, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.6875, "rewards/grounding_reward/std": 0.45806270837783813, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 384, "step_time": 107.708890279755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 613.625, "completions/mean_terminated_length": 555.0, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "entropy": 0.15135657787322998, "epoch": 0.55, "frac_reward_zero_std": 0.0, "grad_norm": 0.12659938633441925, "learning_rate": 4.514285714285714e-06, "loss": -0.0026, "num_tokens": 2291331.0, "reward": 1.0625828504562378, "reward_std": 0.9297215938568115, "rewards/accuracy_reward/mean": 0.3750828504562378, "rewards/accuracy_reward/std": 0.5174806118011475, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.25, "rewards/grounding_reward/std": 0.34503278136253357, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 385, "step_time": 109.26243858970702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 490.25, "completions/mean_terminated_length": 414.0000305175781, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.22300294041633606, "epoch": 0.5514285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.19719012081623077, "learning_rate": 4.5e-06, "loss": 0.2435, "num_tokens": 2298189.0, "reward": 0.8065520524978638, "reward_std": 0.43946734070777893, "rewards/accuracy_reward/mean": 0.2503020465373993, "rewards/accuracy_reward/std": 0.46272438764572144, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.11874999850988388, "rewards/grounding_reward/std": 0.2644907832145691, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 386, "step_time": 107.46748455334455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 538.75, "completions/mean_terminated_length": 377.0, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "entropy": 0.23731738328933716, "epoch": 0.5528571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.20644159615039825, "learning_rate": 4.485714285714286e-06, "loss": 0.4345, "num_tokens": 2305507.0, "reward": 1.1120902299880981, "reward_std": 0.532241940498352, "rewards/accuracy_reward/mean": 0.35896527767181396, "rewards/accuracy_reward/std": 0.24392753839492798, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.37812501192092896, "rewards/grounding_reward/std": 0.185855895280838, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 387, "step_time": 108.47914626728743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 469.5, "completions/mean_terminated_length": 469.5, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "entropy": 0.05649639666080475, "epoch": 0.5542857142857143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.471428571428571e-06, "loss": 0.0, "num_tokens": 2312207.0, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 388, "step_time": 96.43601263407618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 147.875, "completions/mean_terminated_length": 147.875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.17408707737922668, "epoch": 0.5557142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.457142857142858e-06, "loss": 0.0, "num_tokens": 2316374.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 389, "step_time": 81.06171551253647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 673.125, "completions/mean_terminated_length": 322.25, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.2677837610244751, "epoch": 0.5571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.18285734951496124, "learning_rate": 4.442857142857143e-06, "loss": 0.2637, "num_tokens": 2324775.0, "reward": 1.0008516311645508, "reward_std": 0.4623587727546692, "rewards/accuracy_reward/mean": 0.0008516156813129783, "rewards/accuracy_reward/std": 0.0009152241982519627, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.26726123690605164, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.37796446681022644, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 390, "step_time": 107.95085728541017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 901.625, "completions/mean_terminated_length": 534.5, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "entropy": 0.17299561202526093, "epoch": 0.5585714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.3286117613315582, "learning_rate": 4.428571428571429e-06, "loss": 0.1623, "num_tokens": 2335036.0, "reward": 0.6126431226730347, "reward_std": 0.5135941505432129, "rewards/accuracy_reward/mean": 0.004309766925871372, "rewards/accuracy_reward/std": 0.0026674375403672457, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.4833333492279053, "rewards/grounding_reward/std": 0.42798903584480286, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 391, "step_time": 109.45570846274495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 337.125, "completions/mean_terminated_length": 337.125, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "entropy": 0.08514762669801712, "epoch": 0.56, "frac_reward_zero_std": 0.0, "grad_norm": 0.15474212169647217, "learning_rate": 4.414285714285715e-06, "loss": -0.0578, "num_tokens": 2340629.0, "reward": 2.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 392, "step_time": 88.97601482644677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 609.75, "completions/mean_terminated_length": 361.20001220703125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.24530304968357086, "epoch": 0.5614285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.16376376152038574, "learning_rate": 4.4e-06, "loss": 0.3893, "num_tokens": 2348451.0, "reward": 1.1152448654174805, "reward_std": 0.5247355699539185, "rewards/accuracy_reward/mean": 0.002744890982285142, "rewards/accuracy_reward/std": 0.0038212439976632595, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.800000011920929, "rewards/grounding_reward/std": 0.3545621335506439, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 393, "step_time": 109.28908981289715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 274.125, "completions/mean_terminated_length": 274.125, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 0.08011943846940994, "epoch": 0.5628571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.22764012217521667, "learning_rate": 4.385714285714286e-06, "loss": -0.1236, "num_tokens": 2353500.0, "reward": 0.625, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 394, "step_time": 86.55159978102893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 119.125, "completions/mean_terminated_length": 119.125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.17303092777729034, "epoch": 0.5642857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.3420649468898773, "learning_rate": 4.371428571428572e-06, "loss": 0.0636, "num_tokens": 2357349.0, "reward": 2.875, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.375, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 395, "step_time": 78.59192580450326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 369.875, "completions/mean_terminated_length": 369.875, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "entropy": 0.06899508833885193, "epoch": 0.5657142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2320631593465805, "learning_rate": 4.357142857142857e-06, "loss": -0.0652, "num_tokens": 2363148.0, "reward": 1.118749976158142, "reward_std": 0.6782000660896301, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.3687500059604645, "rewards/grounding_reward/std": 0.4131218492984772, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 396, "step_time": 90.7447570702061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "entropy": 0.46288907527923584, "epoch": 0.5671428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.1332860141992569, "learning_rate": 4.342857142857143e-06, "loss": 0.0, "num_tokens": 2374284.0, "reward": 0.6176455020904541, "reward_std": 0.33118659257888794, "rewards/accuracy_reward/mean": 0.0009788188617676497, "rewards/accuracy_reward/std": 0.0004910486750304699, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.6166666746139526, "rewards/grounding_reward/std": 0.3314231038093567, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 397, "step_time": 111.05817733705044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 887.875, "completions/mean_terminated_length": 751.75, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "entropy": 0.12598112225532532, "epoch": 0.5685714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.08928357809782028, "learning_rate": 4.328571428571429e-06, "loss": 0.1404, "num_tokens": 2384315.0, "reward": 0.910456120967865, "reward_std": 0.5631332397460938, "rewards/accuracy_reward/mean": 0.12712278962135315, "rewards/accuracy_reward/std": 0.35270220041275024, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.26726123690605164, "rewards/grounding_reward/mean": 0.5333333611488342, "rewards/grounding_reward/std": 0.4291002154350281, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 398, "step_time": 107.74474654719234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 288.5, "completions/mean_terminated_length": 288.5, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.14692138135433197, "epoch": 0.57, "frac_reward_zero_std": 0.0, "grad_norm": 0.272189736366272, "learning_rate": 4.314285714285714e-06, "loss": 0.1632, "num_tokens": 2389535.0, "reward": 1.25, "reward_std": 0.8864052295684814, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.25, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 399, "step_time": 97.50385406240821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 833.125, "completions/mean_terminated_length": 642.25, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "entropy": 0.13398268818855286, "epoch": 0.5714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.08597218245267868, "learning_rate": 4.3e-06, "loss": 0.1832, "num_tokens": 2399072.0, "reward": 1.0629396438598633, "reward_std": 0.41706302762031555, "rewards/accuracy_reward/mean": 0.00043958588503301144, "rewards/accuracy_reward/std": 0.0004945762921124697, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.26726123690605164, "rewards/grounding_reward/mean": 0.8125, "rewards/grounding_reward/std": 0.3720119297504425, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 400, "step_time": 105.92895916104317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 476.375, "completions/mean_terminated_length": 476.375, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "entropy": 0.1055891215801239, "epoch": 0.5728571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.13850493729114532, "learning_rate": 4.2857142857142855e-06, "loss": 0.0765, "num_tokens": 2405819.0, "reward": 3.1666667461395264, "reward_std": 0.28171801567077637, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.6666666865348816, "rewards/operation_reward/std": 0.28171807527542114, "step": 401, "step_time": 92.79643146879971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 883.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 570.625, "completions/mean_terminated_length": 570.625, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "entropy": 0.06614210456609726, "epoch": 0.5742857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.11383882910013199, "learning_rate": 4.271428571428572e-06, "loss": 0.1114, "num_tokens": 2413344.0, "reward": 2.065476179122925, "reward_std": 0.5449877381324768, "rewards/accuracy_reward/mean": 0.8779761791229248, "rewards/accuracy_reward/std": 0.18680019676685333, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.6875, "rewards/grounding_reward/std": 0.45806270837783813, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 402, "step_time": 101.44653413351625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 412.25, "completions/mean_terminated_length": 324.8571472167969, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "entropy": 0.3122481107711792, "epoch": 0.5757142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.18969614803791046, "learning_rate": 4.257142857142857e-06, "loss": 0.2833, "num_tokens": 2419602.0, "reward": 1.2859225273132324, "reward_std": 0.24043519794940948, "rewards/accuracy_reward/mean": 0.16092249751091003, "rewards/accuracy_reward/std": 0.05967669561505318, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.6875, "rewards/grounding_reward/std": 0.25877460837364197, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 403, "step_time": 106.31167941354215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 701.25, "completions/mean_terminated_length": 507.6000061035156, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "entropy": 0.16413867473602295, "epoch": 0.5771428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.13855351507663727, "learning_rate": 4.242857142857143e-06, "loss": 0.3408, "num_tokens": 2428228.0, "reward": 1.7897883653640747, "reward_std": 0.6294848322868347, "rewards/accuracy_reward/mean": 0.4772883951663971, "rewards/accuracy_reward/std": 0.37792837619781494, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 404, "step_time": 106.11315769795328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 114.0, "completions/max_terminated_length": 114.0, "completions/mean_length": 87.875, "completions/mean_terminated_length": 87.875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.22056707739830017, "epoch": 0.5785714285714286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.228571428571429e-06, "loss": 0.0, "num_tokens": 2431803.0, "reward": 3.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 405, "step_time": 77.25560043845326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 537.125, "completions/mean_terminated_length": 537.125, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "entropy": 0.05287303030490875, "epoch": 0.58, "frac_reward_zero_std": 0.0, "grad_norm": 0.09054316580295563, "learning_rate": 4.2142857142857145e-06, "loss": -0.0803, "num_tokens": 2439012.0, "reward": 0.875, "reward_std": 0.7440237998962402, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.125, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 406, "step_time": 93.93100622016937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 573.875, "completions/mean_terminated_length": 509.5714416503906, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "entropy": 0.12624946236610413, "epoch": 0.5814285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.15873903036117554, "learning_rate": 4.2000000000000004e-06, "loss": 0.1925, "num_tokens": 2446475.0, "reward": 1.1877087354660034, "reward_std": 0.5933905243873596, "rewards/accuracy_reward/mean": 0.7502087354660034, "rewards/accuracy_reward/std": 0.4625237286090851, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 407, "step_time": 105.86321049928665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 162.75, "completions/mean_terminated_length": 162.75, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.14184683561325073, "epoch": 0.5828571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.39721938967704773, "learning_rate": 4.185714285714286e-06, "loss": -0.0033, "num_tokens": 2450753.0, "reward": 1.7084869146347046, "reward_std": 0.25823524594306946, "rewards/accuracy_reward/mean": 0.5209869146347046, "rewards/accuracy_reward/std": 0.01291672233492136, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.6875, "rewards/grounding_reward/std": 0.25877460837364197, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 408, "step_time": 79.85209975577891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 616.375, "completions/mean_terminated_length": 616.375, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "entropy": 0.10375896841287613, "epoch": 0.5842857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.1394026279449463, "learning_rate": 4.1714285714285715e-06, "loss": -0.0457, "num_tokens": 2458676.0, "reward": 1.625, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 409, "step_time": 100.96210156287998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 317.375, "completions/mean_terminated_length": 317.375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "entropy": 0.10999272018671036, "epoch": 0.5857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.24979561567306519, "learning_rate": 4.1571428571428575e-06, "loss": 0.0797, "num_tokens": 2464183.0, "reward": 1.09375, "reward_std": 0.6805656552314758, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.09375, "rewards/grounding_reward/std": 0.2651650309562683, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 410, "step_time": 85.55751388706267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 904.875, "completions/mean_terminated_length": 785.75, "completions/min_length": 667.0, "completions/min_terminated_length": 667.0, "entropy": 0.3225453495979309, "epoch": 0.5871428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.3179590702056885, "learning_rate": 4.1428571428571435e-06, "loss": 0.0653, "num_tokens": 2474550.0, "reward": 1.4966744184494019, "reward_std": 0.5175410509109497, "rewards/accuracy_reward/mean": 0.24667438864707947, "rewards/accuracy_reward/std": 0.32634180784225464, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.26726123690605164, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 411, "step_time": 110.48959814663976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/max_terminated_length": 610.0, "completions/mean_length": 347.5, "completions/mean_terminated_length": 347.5, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.27008599042892456, "epoch": 0.5885714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.2720741331577301, "learning_rate": 4.128571428571429e-06, "loss": -0.0558, "num_tokens": 2480322.0, "reward": 1.4375, "reward_std": 0.4172614812850952, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0625, "rewards/grounding_reward/std": 0.1767766922712326, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 412, "step_time": 92.70118790306151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 1017.5, "completions/mean_terminated_length": 972.0, "completions/min_length": 972.0, "completions/min_terminated_length": 972.0, "entropy": 0.25669485330581665, "epoch": 0.59, "frac_reward_zero_std": 0.0, "grad_norm": 0.125342458486557, "learning_rate": 4.114285714285715e-06, "loss": 0.0158, "num_tokens": 2491606.0, "reward": 1.0815123319625854, "reward_std": 0.22179818153381348, "rewards/accuracy_reward/mean": 0.019012348726391792, "rewards/accuracy_reward/std": 0.04502149671316147, "rewards/format_reward/mean": 0.0625, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 413, "step_time": 106.39935852494091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 167.625, "completions/mean_terminated_length": 167.625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.12657801806926727, "epoch": 0.5914285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2954899072647095, "learning_rate": 4.1e-06, "loss": -0.0434, "num_tokens": 2495883.0, "reward": 2.0833334922790527, "reward_std": 0.34503281116485596, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5833333730697632, "rewards/grounding_reward/std": 0.34503278136253357, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 414, "step_time": 79.23646620288491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 417.625, "completions/mean_terminated_length": 417.625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.24249662458896637, "epoch": 0.5928571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.2639155387878418, "learning_rate": 4.0857142857142865e-06, "loss": 0.1098, "num_tokens": 2502256.0, "reward": 1.2374999523162842, "reward_std": 0.3889087438583374, "rewards/accuracy_reward/mean": 0.737500011920929, "rewards/accuracy_reward/std": 0.3889087438583374, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 415, "step_time": 95.52845700085163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 215.75, "completions/mean_terminated_length": 215.75, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.16155634820461273, "epoch": 0.5942857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.253553569316864, "learning_rate": 4.071428571428572e-06, "loss": 0.0096, "num_tokens": 2507086.0, "reward": 2.2220466136932373, "reward_std": 0.33702120184898376, "rewards/accuracy_reward/mean": 0.22204670310020447, "rewards/accuracy_reward/std": 0.3164457380771637, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.5, "rewards/operation_reward/std": 0.37796446681022644, "step": 416, "step_time": 84.80535182822496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 219.5, "completions/mean_terminated_length": 219.5, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.07419577240943909, "epoch": 0.5957142857142858, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.057142857142858e-06, "loss": 0.0, "num_tokens": 2511834.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 417, "step_time": 82.81825435999781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 205.75, "completions/mean_terminated_length": 205.75, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.25989052653312683, "epoch": 0.5971428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.30734673142433167, "learning_rate": 4.042857142857144e-06, "loss": 0.0866, "num_tokens": 2516552.0, "reward": 3.125, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.75, "rewards/operation_reward/std": 0.4629100561141968, "step": 418, "step_time": 83.80720705352724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 123.25, "completions/mean_terminated_length": 123.25, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.20412367582321167, "epoch": 0.5985714285714285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.028571428571429e-06, "loss": 0.0, "num_tokens": 2520418.0, "reward": 3.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 419, "step_time": 77.9339019022882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "entropy": 0.09846057742834091, "epoch": 0.6, "frac_reward_zero_std": 0.0, "grad_norm": 0.11642301827669144, "learning_rate": 4.014285714285715e-06, "loss": 0.0, "num_tokens": 2531650.0, "reward": 1.0056246519088745, "reward_std": 0.00039210295653901994, "rewards/accuracy_reward/mean": 0.005624622106552124, "rewards/accuracy_reward/std": 0.00039205545908771455, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 420, "step_time": 106.41973601467907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 176.375, "completions/mean_terminated_length": 176.375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.08196545392274857, "epoch": 0.6014285714285714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.000000000000001e-06, "loss": 0.0, "num_tokens": 2536069.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 421, "step_time": 80.9261044273153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 233.5, "completions/mean_terminated_length": 233.5, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.09644744545221329, "epoch": 0.6028571428571429, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.985714285714286e-06, "loss": 0.0, "num_tokens": 2540817.0, "reward": 0.6666666865348816, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 422, "step_time": 80.87486174423248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 121.125, "completions/mean_terminated_length": 121.125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.16252663731575012, "epoch": 0.6042857142857143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.971428571428572e-06, "loss": 0.0, "num_tokens": 2544778.0, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 423, "step_time": 78.36249199602753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 94.875, "completions/mean_terminated_length": 94.875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.16063174605369568, "epoch": 0.6057142857142858, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.957142857142858e-06, "loss": 0.0, "num_tokens": 2548409.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 424, "step_time": 76.67285405099392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 230.875, "completions/mean_terminated_length": 230.875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.110194131731987, "epoch": 0.6071428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.21564216911792755, "learning_rate": 3.942857142857143e-06, "loss": 0.0169, "num_tokens": 2553160.0, "reward": 2.745689630508423, "reward_std": 0.4657224714756012, "rewards/accuracy_reward/mean": 0.9956896305084229, "rewards/accuracy_reward/std": 0.012191502377390862, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.25, "rewards/operation_reward/std": 0.4629100561141968, "step": 425, "step_time": 102.16798993665725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 349.5, "completions/mean_terminated_length": 349.5, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "entropy": 0.0689125582575798, "epoch": 0.6085714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.12126858532428741, "learning_rate": 3.928571428571429e-06, "loss": -0.0964, "num_tokens": 2558844.0, "reward": 2.2750000953674316, "reward_std": 0.3105294704437256, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.7749999761581421, "rewards/grounding_reward/std": 0.31052953004837036, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 426, "step_time": 93.25992099940777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 687.0, "completions/mean_terminated_length": 638.857177734375, "completions/min_length": 536.0, "completions/min_terminated_length": 536.0, "entropy": 0.09013237059116364, "epoch": 0.61, "frac_reward_zero_std": 0.0, "grad_norm": 0.08354510366916656, "learning_rate": 3.914285714285714e-06, "loss": 0.1895, "num_tokens": 2567252.0, "reward": 1.2760090827941895, "reward_std": 0.5145934224128723, "rewards/accuracy_reward/mean": 0.8385090827941895, "rewards/accuracy_reward/std": 0.34445518255233765, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 427, "step_time": 106.64170259982347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 177.25, "completions/mean_terminated_length": 177.25, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.09189087152481079, "epoch": 0.6114285714285714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.900000000000001e-06, "loss": 0.0, "num_tokens": 2571662.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 428, "step_time": 82.51249812543392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 429.125, "completions/mean_terminated_length": 429.125, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "entropy": 0.07132823765277863, "epoch": 0.6128571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.15746505558490753, "learning_rate": 3.885714285714286e-06, "loss": 0.2123, "num_tokens": 2578055.0, "reward": 1.7946429252624512, "reward_std": 0.027109762653708458, "rewards/accuracy_reward/mean": 0.2946428656578064, "rewards/accuracy_reward/std": 0.027109716087579727, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 429, "step_time": 91.65802551526576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 209.375, "completions/mean_terminated_length": 209.375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.09781242907047272, "epoch": 0.6142857142857143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.871428571428572e-06, "loss": 0.0, "num_tokens": 2582586.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 430, "step_time": 80.05938355531543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 785.125, "completions/mean_terminated_length": 387.0, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "entropy": 0.42730191349983215, "epoch": 0.6157142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.14773210883140564, "learning_rate": 3.857142857142858e-06, "loss": 0.3325, "num_tokens": 2591899.0, "reward": 1.8136720657348633, "reward_std": 0.9601729512214661, "rewards/accuracy_reward/mean": 0.5011720657348633, "rewards/accuracy_reward/std": 0.533269464969635, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 431, "step_time": 106.64518674183637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 352.75, "completions/mean_terminated_length": 352.75, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.07932526618242264, "epoch": 0.6171428571428571, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.842857142857143e-06, "loss": 0.0, "num_tokens": 2597609.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 432, "step_time": 85.93105992581695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 539.625, "completions/mean_terminated_length": 539.625, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "entropy": 0.13671112060546875, "epoch": 0.6185714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.15509121119976044, "learning_rate": 3.828571428571429e-06, "loss": -0.0649, "num_tokens": 2604926.0, "reward": 1.084760069847107, "reward_std": 0.40342703461647034, "rewards/accuracy_reward/mean": 0.28476011753082275, "rewards/accuracy_reward/std": 0.0996980294585228, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.30000001192092896, "rewards/grounding_reward/std": 0.440778523683548, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 433, "step_time": 98.47976019885391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 351.5, "completions/mean_terminated_length": 351.5, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.12974898517131805, "epoch": 0.62, "frac_reward_zero_std": 0.0, "grad_norm": 0.21163511276245117, "learning_rate": 3.814285714285715e-06, "loss": -0.11, "num_tokens": 2610626.0, "reward": 1.5932481288909912, "reward_std": 0.6999961137771606, "rewards/accuracy_reward/mean": 0.28074806928634644, "rewards/accuracy_reward/std": 0.2004992663860321, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.6875, "rewards/grounding_reward/std": 0.45806270837783813, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 434, "step_time": 87.99226763751358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 802.0, "completions/max_terminated_length": 802.0, "completions/mean_length": 470.875, "completions/mean_terminated_length": 470.875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.18432387709617615, "epoch": 0.6214285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.41339483857154846, "learning_rate": 3.8000000000000005e-06, "loss": -0.0136, "num_tokens": 2617281.0, "reward": 2.5625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.9375, "rewards/grounding_reward/std": 0.1767766922712326, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 435, "step_time": 98.85436902660877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 752.625, "completions/mean_terminated_length": 589.7999877929688, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "entropy": 0.30721044540405273, "epoch": 0.6228571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.1290902942419052, "learning_rate": 3.785714285714286e-06, "loss": 0.2563, "num_tokens": 2626334.0, "reward": 1.4477999210357666, "reward_std": 1.2464393377304077, "rewards/accuracy_reward/mean": 0.5102999210357666, "rewards/accuracy_reward/std": 0.4659154415130615, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.5345224738121033, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 436, "step_time": 105.60218364465982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 502.625, "completions/mean_terminated_length": 189.8000030517578, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.48745569586753845, "epoch": 0.6242857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.21285411715507507, "learning_rate": 3.771428571428572e-06, "loss": 0.6711, "num_tokens": 2633283.0, "reward": 0.8284246921539307, "reward_std": 0.7343926429748535, "rewards/accuracy_reward/mean": 0.5159246921539307, "rewards/accuracy_reward/std": 0.519092857837677, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 437, "step_time": 106.28912958223373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 794.875, "completions/mean_terminated_length": 565.75, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "entropy": 0.1276673674583435, "epoch": 0.6257142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.09816066175699234, "learning_rate": 3.7571428571428575e-06, "loss": 0.1903, "num_tokens": 2642522.0, "reward": 0.5009768009185791, "reward_std": 0.6538013219833374, "rewards/accuracy_reward/mean": 0.0009767783340066671, "rewards/accuracy_reward/std": 0.0010506190592423081, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.26726123690605164, "rewards/grounding_reward/mean": 0.25, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 438, "step_time": 106.3348581539467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 519.75, "completions/mean_terminated_length": 217.1999969482422, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.2587818503379822, "epoch": 0.6271428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.16911038756370544, "learning_rate": 3.742857142857143e-06, "loss": 0.5447, "num_tokens": 2649552.0, "reward": 2.25, "reward_std": 0.26726123690605164, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.9375, "rewards/grounding_reward/std": 0.1767766922712326, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 439, "step_time": 105.54361868649721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 509.875, "completions/mean_terminated_length": 436.4285888671875, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "entropy": 0.14717884361743927, "epoch": 0.6285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.2113879770040512, "learning_rate": 3.7285714285714286e-06, "loss": 0.2593, "num_tokens": 2656511.0, "reward": 2.1875946521759033, "reward_std": 0.5936511754989624, "rewards/accuracy_reward/mean": 0.7500946521759033, "rewards/accuracy_reward/std": 0.46273481845855713, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 440, "step_time": 105.57270648144186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 281.5, "completions/mean_terminated_length": 281.5, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.19372214376926422, "epoch": 0.63, "frac_reward_zero_std": 0.0, "grad_norm": 0.23618587851524353, "learning_rate": 3.7142857142857146e-06, "loss": -0.1939, "num_tokens": 2661755.0, "reward": 1.8359192609786987, "reward_std": 0.8432210683822632, "rewards/accuracy_reward/mean": 0.2734193205833435, "rewards/accuracy_reward/std": 0.29404354095458984, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.8125, "rewards/grounding_reward/std": 0.25877460837364197, "rewards/operation_reward/mean": 0.25, "rewards/operation_reward/std": 0.4629100561141968, "step": 441, "step_time": 87.5232652137056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 972.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 577.0, "completions/mean_terminated_length": 577.0, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "entropy": 0.08543933928012848, "epoch": 0.6314285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.1378505825996399, "learning_rate": 3.7e-06, "loss": -0.1134, "num_tokens": 2669275.0, "reward": 1.4583333730697632, "reward_std": 0.5473601818084717, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.8333333730697632, "rewards/grounding_reward/std": 0.35634833574295044, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 442, "step_time": 104.27368251234293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 297.625, "completions/mean_terminated_length": 297.625, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "entropy": 0.10649323463439941, "epoch": 0.6328571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.18874596059322357, "learning_rate": 3.6857142857142857e-06, "loss": 0.0288, "num_tokens": 2674528.0, "reward": 2.1500000953674316, "reward_std": 0.4869731366634369, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.7749999761581421, "rewards/grounding_reward/std": 0.4200340509414673, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 443, "step_time": 83.30101077072322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 426.25, "completions/mean_terminated_length": 426.25, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "entropy": 0.072840616106987, "epoch": 0.6342857142857142, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.671428571428572e-06, "loss": 0.0, "num_tokens": 2680898.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 444, "step_time": 91.13552318699658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 257.75, "completions/mean_terminated_length": 257.75, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.17465141415596008, "epoch": 0.6357142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.25625380873680115, "learning_rate": 3.6571428571428576e-06, "loss": 0.0387, "num_tokens": 2685920.0, "reward": 3.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.875, "rewards/operation_reward/std": 0.3535533845424652, "step": 445, "step_time": 84.58839409518987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "entropy": 0.23154708743095398, "epoch": 0.6371428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.11044975370168686, "learning_rate": 3.642857142857143e-06, "loss": 0.0, "num_tokens": 2697112.0, "reward": 0.7285668849945068, "reward_std": 0.3147205114364624, "rewards/accuracy_reward/mean": 0.0035668641794472933, "rewards/accuracy_reward/std": 0.0008951810305006802, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.7250000238418579, "rewards/grounding_reward/std": 0.315096378326416, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 446, "step_time": 108.15048327017576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 274.125, "completions/mean_terminated_length": 274.125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.20653821527957916, "epoch": 0.6385714285714286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.628571428571429e-06, "loss": 0.0, "num_tokens": 2702313.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 447, "step_time": 84.44418363645673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 371.125, "completions/mean_terminated_length": 277.8571472167969, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.32256880402565, "epoch": 0.64, "frac_reward_zero_std": 0.0, "grad_norm": 0.18075963854789734, "learning_rate": 3.6142857142857147e-06, "loss": 0.0265, "num_tokens": 2708194.0, "reward": 1.3125, "reward_std": 0.5938674807548523, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.125, "rewards/grounding_reward/std": 0.2314550280570984, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 448, "step_time": 105.38085480034351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 206.625, "completions/mean_terminated_length": 206.625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.21347874402999878, "epoch": 0.6414285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.3562413454055786, "learning_rate": 3.6000000000000003e-06, "loss": 0.0405, "num_tokens": 2712807.0, "reward": 1.1036090850830078, "reward_std": 0.10886974632740021, "rewards/accuracy_reward/mean": 0.35360902547836304, "rewards/accuracy_reward/std": 0.10886970907449722, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.25, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 449, "step_time": 81.3046874590218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 258.25, "completions/mean_terminated_length": 258.25, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.14277318120002747, "epoch": 0.6428571428571429, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.5857142857142862e-06, "loss": 0.0, "num_tokens": 2717785.0, "reward": 3.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 450, "step_time": 83.98927503917366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 294.25, "completions/mean_terminated_length": 294.25, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "entropy": 0.08192051202058792, "epoch": 0.6442857142857142, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.5714285714285718e-06, "loss": 0.0, "num_tokens": 2723067.0, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 451, "step_time": 83.92974579986185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 549.125, "completions/mean_terminated_length": 264.20001220703125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.2757047712802887, "epoch": 0.6457142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.19910022616386414, "learning_rate": 3.5571428571428573e-06, "loss": 0.3611, "num_tokens": 2730460.0, "reward": 0.8131365180015564, "reward_std": 0.6506840586662292, "rewards/accuracy_reward/mean": 0.37563657760620117, "rewards/accuracy_reward/std": 0.5170225501060486, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 452, "step_time": 104.83814800158143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 287.25, "completions/mean_terminated_length": 287.25, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.13318473100662231, "epoch": 0.6471428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.24274957180023193, "learning_rate": 3.542857142857143e-06, "loss": 0.0505, "num_tokens": 2735654.0, "reward": 1.3145990371704102, "reward_std": 0.2939240634441376, "rewards/accuracy_reward/mean": 0.8145990967750549, "rewards/accuracy_reward/std": 0.2939240634441376, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 453, "step_time": 90.64104399085045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 355.625, "completions/mean_terminated_length": 355.625, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "entropy": 0.0732671245932579, "epoch": 0.6485714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.11395025998353958, "learning_rate": 3.528571428571429e-06, "loss": 0.0533, "num_tokens": 2741387.0, "reward": 1.7375000715255737, "reward_std": 0.1060660108923912, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.23750001192092896, "rewards/grounding_reward/std": 0.1060660183429718, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 454, "step_time": 98.71880325302482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 387.25, "completions/mean_terminated_length": 387.25, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "entropy": 0.06927551329135895, "epoch": 0.65, "frac_reward_zero_std": 0.0, "grad_norm": 0.19965992867946625, "learning_rate": 3.5142857142857144e-06, "loss": -0.0436, "num_tokens": 2747365.0, "reward": 2.0, "reward_std": 0.30860668420791626, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.30860671401023865, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 455, "step_time": 86.37264341581613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 275.25, "completions/mean_terminated_length": 275.25, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.14971210062503815, "epoch": 0.6514285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.2340022474527359, "learning_rate": 3.5e-06, "loss": -0.1034, "num_tokens": 2752567.0, "reward": 2.9123761653900146, "reward_std": 0.4603710472583771, "rewards/accuracy_reward/mean": 0.9123761057853699, "rewards/accuracy_reward/std": 0.1623847335577011, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.5, "rewards/operation_reward/std": 0.5345224738121033, "step": 456, "step_time": 88.65216525364667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "entropy": 0.26446887850761414, "epoch": 0.6528571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.10122158378362656, "learning_rate": 3.4857142857142863e-06, "loss": -0.0, "num_tokens": 2763951.0, "reward": 1.1256791353225708, "reward_std": 0.3532789647579193, "rewards/accuracy_reward/mean": 0.1256791353225708, "rewards/accuracy_reward/std": 0.3532789647579193, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 457, "step_time": 106.72257035970688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 178.625, "completions/mean_terminated_length": 178.625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.27343514561653137, "epoch": 0.6542857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.27572137117385864, "learning_rate": 3.471428571428572e-06, "loss": -0.0843, "num_tokens": 2768348.0, "reward": 1.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 458, "step_time": 115.33395291585475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 223.875, "completions/mean_terminated_length": 223.875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.1189660057425499, "epoch": 0.6557142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.33962443470954895, "learning_rate": 3.4571428571428574e-06, "loss": -0.0032, "num_tokens": 2773171.0, "reward": 1.375, "reward_std": 0.5650537610054016, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.25, "rewards/grounding_reward/std": 0.3505098521709442, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 459, "step_time": 94.01141766738147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 370.75, "completions/mean_terminated_length": 370.75, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "entropy": 0.1015327200293541, "epoch": 0.6571428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.1292719691991806, "learning_rate": 3.4428571428571434e-06, "loss": 0.0012, "num_tokens": 2779033.0, "reward": 1.4357142448425293, "reward_std": 0.18182748556137085, "rewards/accuracy_reward/mean": 0.9107142686843872, "rewards/accuracy_reward/std": 0.25253814458847046, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.02500000037252903, "rewards/grounding_reward/std": 0.0707106813788414, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 460, "step_time": 116.79500701185316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 666.75, "completions/mean_terminated_length": 309.5, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "entropy": 0.31518903374671936, "epoch": 0.6585714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.14809376001358032, "learning_rate": 3.428571428571429e-06, "loss": 0.4738, "num_tokens": 2787375.0, "reward": 0.6221996545791626, "reward_std": 0.5543262958526611, "rewards/accuracy_reward/mean": 0.0596996434032917, "rewards/accuracy_reward/std": 0.0779728963971138, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.26726123690605164, "rewards/grounding_reward/mean": 0.3125, "rewards/grounding_reward/std": 0.25877460837364197, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 461, "step_time": 110.64048377703875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 271.625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "entropy": 0.09715496748685837, "epoch": 0.66, "frac_reward_zero_std": 0.0, "grad_norm": 0.19482658803462982, "learning_rate": 3.4142857142857145e-06, "loss": -0.0274, "num_tokens": 2792436.0, "reward": 2.125, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 462, "step_time": 105.14995145890862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 402.75, "completions/mean_terminated_length": 402.75, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "entropy": 0.0976443737745285, "epoch": 0.6614285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.19287273287773132, "learning_rate": 3.4000000000000005e-06, "loss": -0.0749, "num_tokens": 2798506.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 463, "step_time": 96.70053752977401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 804.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 549.25, "completions/mean_terminated_length": 549.25, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.2077193260192871, "epoch": 0.6628571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.16541233658790588, "learning_rate": 3.385714285714286e-06, "loss": 0.0712, "num_tokens": 2805756.0, "reward": 1.6375000476837158, "reward_std": 0.47490599751472473, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.38749998807907104, "rewards/grounding_reward/std": 0.36425071954727173, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 464, "step_time": 103.76773238927126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 166.0, "completions/mean_terminated_length": 166.0, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.11285616457462311, "epoch": 0.6642857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.29615819454193115, "learning_rate": 3.3714285714285716e-06, "loss": 0.0149, "num_tokens": 2809940.0, "reward": 2.25, "reward_std": 0.7071067690849304, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.25, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.5, "rewards/operation_reward/std": 0.5345224738121033, "step": 465, "step_time": 80.81713223550469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "entropy": 0.23418359458446503, "epoch": 0.6657142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.08666067570447922, "learning_rate": 3.357142857142857e-06, "loss": 0.0, "num_tokens": 2821092.0, "reward": 0.8507270812988281, "reward_std": 0.592602014541626, "rewards/accuracy_reward/mean": 0.1257270723581314, "rewards/accuracy_reward/std": 0.3532596230506897, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.7250000238418579, "rewards/grounding_reward/std": 0.45276927947998047, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 466, "step_time": 107.3083621589467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 208.75, "completions/mean_terminated_length": 208.75, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.1919221580028534, "epoch": 0.6671428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.3368843197822571, "learning_rate": 3.342857142857143e-06, "loss": -0.0119, "num_tokens": 2825610.0, "reward": 2.096153736114502, "reward_std": 0.49978864192962646, "rewards/accuracy_reward/mean": 0.9711538553237915, "rewards/accuracy_reward/std": 0.0815892368555069, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 467, "step_time": 81.38020977471024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 775.0, "completions/mean_length": 866.25, "completions/mean_terminated_length": 603.3333740234375, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "entropy": 0.5052850842475891, "epoch": 0.6685714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.13075056672096252, "learning_rate": 3.3285714285714286e-06, "loss": 0.0851, "num_tokens": 2835588.0, "reward": 0.5752283334732056, "reward_std": 0.5606566071510315, "rewards/accuracy_reward/mean": 0.26272833347320557, "rewards/accuracy_reward/std": 0.4553877115249634, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 468, "step_time": 131.20583302341402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 332.5, "completions/mean_terminated_length": 332.5, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "entropy": 0.09410712867975235, "epoch": 0.67, "frac_reward_zero_std": 0.0, "grad_norm": 0.19796741008758545, "learning_rate": 3.314285714285714e-06, "loss": 0.0102, "num_tokens": 2841128.0, "reward": 2.5, "reward_std": 0.9258201122283936, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.375, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.625, "rewards/operation_reward/std": 0.5175492167472839, "step": 469, "step_time": 92.50178963784128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 269.875, "completions/mean_terminated_length": 269.875, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.11926531046628952, "epoch": 0.6714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.18336281180381775, "learning_rate": 3.3000000000000006e-06, "loss": -0.0478, "num_tokens": 2846207.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 470, "step_time": 86.91869411524385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 324.375, "completions/mean_terminated_length": 324.375, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "entropy": 0.08203445374965668, "epoch": 0.6728571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.1537579596042633, "learning_rate": 3.285714285714286e-06, "loss": -0.1023, "num_tokens": 2851674.0, "reward": 1.079545497894287, "reward_std": 0.49540388584136963, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5795454382896423, "rewards/grounding_reward/std": 0.495403915643692, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 471, "step_time": 89.04428469855338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 447.875, "completions/mean_terminated_length": 365.5714416503906, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.36308231949806213, "epoch": 0.6742857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.155737966299057, "learning_rate": 3.2714285714285717e-06, "loss": -0.0485, "num_tokens": 2858185.0, "reward": 1.2818405628204346, "reward_std": 0.3639170825481415, "rewards/accuracy_reward/mean": 0.0005905511789023876, "rewards/accuracy_reward/std": 0.0016703309956938028, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.84375, "rewards/grounding_reward/std": 0.35197150707244873, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 472, "step_time": 116.0222507044673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 146.875, "completions/mean_terminated_length": 146.875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.21274526417255402, "epoch": 0.6757142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2986924946308136, "learning_rate": 3.2571428571428577e-06, "loss": -0.0438, "num_tokens": 2862304.0, "reward": 3.125, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 473, "step_time": 78.41773036215454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 890.0, "completions/mean_terminated_length": 756.0, "completions/min_length": 578.0, "completions/min_terminated_length": 578.0, "entropy": 0.19243791699409485, "epoch": 0.6771428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.10617765039205551, "learning_rate": 3.242857142857143e-06, "loss": 0.1192, "num_tokens": 2872512.0, "reward": 1.4547362327575684, "reward_std": 1.054391622543335, "rewards/accuracy_reward/mean": 0.22556957602500916, "rewards/accuracy_reward/std": 0.18647626042366028, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.26726123690605164, "rewards/grounding_reward/mean": 0.4791666865348816, "rewards/grounding_reward/std": 0.3012869656085968, "rewards/operation_reward/mean": 0.5, "rewards/operation_reward/std": 0.5345224738121033, "step": 474, "step_time": 106.97755585145205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 109.375, "completions/mean_terminated_length": 109.375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.12612777948379517, "epoch": 0.6785714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.2601330280303955, "learning_rate": 3.2285714285714288e-06, "loss": -0.0205, "num_tokens": 2876355.0, "reward": 2.25, "reward_std": 0.6546536684036255, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.25, "rewards/operation_reward/std": 0.4629100561141968, "step": 475, "step_time": 82.73487896099687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 807.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 599.375, "completions/mean_terminated_length": 599.375, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "entropy": 0.05288351699709892, "epoch": 0.68, "frac_reward_zero_std": 0.0, "grad_norm": 0.086248479783535, "learning_rate": 3.2142857142857147e-06, "loss": 0.019, "num_tokens": 2884110.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 476, "step_time": 104.2616516361013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 952.625, "completions/mean_terminated_length": 881.25, "completions/min_length": 845.0, "completions/min_terminated_length": 845.0, "entropy": 0.09162838011980057, "epoch": 0.6814285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.1438758224248886, "learning_rate": 3.2000000000000003e-06, "loss": 0.0704, "num_tokens": 2894627.0, "reward": 0.73487788438797, "reward_std": 0.688805878162384, "rewards/accuracy_reward/mean": 0.0005028520245105028, "rewards/accuracy_reward/std": 0.0005442037363536656, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.26726123690605164, "rewards/grounding_reward/mean": 0.484375, "rewards/grounding_reward/std": 0.4454006254673004, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 477, "step_time": 108.55745028425008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 356.25, "completions/mean_terminated_length": 356.25, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.13762089610099792, "epoch": 0.6828571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.3009006381034851, "learning_rate": 3.185714285714286e-06, "loss": -0.1854, "num_tokens": 2900389.0, "reward": 1.0833333730697632, "reward_std": 0.49601587653160095, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0833333358168602, "rewards/grounding_reward/std": 0.2357022911310196, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 478, "step_time": 96.55238619912416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 96.5, "completions/mean_terminated_length": 96.5, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.1986573040485382, "epoch": 0.6842857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.35836267471313477, "learning_rate": 3.1714285714285714e-06, "loss": -0.1728, "num_tokens": 2904073.0, "reward": 2.375, "reward_std": 0.6408699750900269, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 479, "step_time": 91.08078520093113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 780.125, "completions/mean_terminated_length": 698.8333740234375, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "entropy": 0.20148834586143494, "epoch": 0.6857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.22984375059604645, "learning_rate": 3.1571428571428573e-06, "loss": 0.1186, "num_tokens": 2913290.0, "reward": 1.2744556665420532, "reward_std": 0.9365131855010986, "rewards/accuracy_reward/mean": 0.5244556069374084, "rewards/accuracy_reward/std": 0.39473727345466614, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.25, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 480, "step_time": 108.39425375591964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 197.5, "completions/mean_terminated_length": 197.5, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.1454208940267563, "epoch": 0.6871428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.32583582401275635, "learning_rate": 3.142857142857143e-06, "loss": -0.024, "num_tokens": 2917742.0, "reward": 2.1458334922790527, "reward_std": 0.40274354815483093, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.7708333730697632, "rewards/grounding_reward/std": 0.3204349875450134, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 481, "step_time": 86.69542696047574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 409.5, "completions/mean_terminated_length": 409.5, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "entropy": 0.17983852326869965, "epoch": 0.6885714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.19912704825401306, "learning_rate": 3.1285714285714284e-06, "loss": 0.0427, "num_tokens": 2923930.0, "reward": 2.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 482, "step_time": 92.72318123281002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "entropy": 0.11995667964220047, "epoch": 0.69, "frac_reward_zero_std": 0.0, "grad_norm": 0.09702728688716888, "learning_rate": 3.114285714285715e-06, "loss": -0.0, "num_tokens": 2935074.0, "reward": 0.25385546684265137, "reward_std": 0.4630305767059326, "rewards/accuracy_reward/mean": 0.0038554768543690443, "rewards/accuracy_reward/std": 0.00044009380508214235, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.25, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 483, "step_time": 121.50155776645988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 793.25, "completions/mean_terminated_length": 654.7999877929688, "completions/min_length": 546.0, "completions/min_terminated_length": 546.0, "entropy": 0.14225183427333832, "epoch": 0.6914285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.12097469717264175, "learning_rate": 3.1000000000000004e-06, "loss": 0.1713, "num_tokens": 2944268.0, "reward": 1.1880948543548584, "reward_std": 0.5296499729156494, "rewards/accuracy_reward/mean": 0.0005948961479589343, "rewards/accuracy_reward/std": 0.0008298495085909963, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 484, "step_time": 110.26616677828133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 241.375, "completions/mean_terminated_length": 241.375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.12826380133628845, "epoch": 0.6928571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.2506123185157776, "learning_rate": 3.085714285714286e-06, "loss": 0.0112, "num_tokens": 2949239.0, "reward": 1.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 485, "step_time": 87.90493204630911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 121.625, "completions/mean_terminated_length": 121.625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.13326449692249298, "epoch": 0.6942857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.32842281460762024, "learning_rate": 3.071428571428572e-06, "loss": -0.0832, "num_tokens": 2953124.0, "reward": 2.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 486, "step_time": 93.8507729023695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 532.125, "completions/mean_terminated_length": 368.16668701171875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.34010255336761475, "epoch": 0.6957142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.18479415774345398, "learning_rate": 3.0571428571428575e-06, "loss": 0.3683, "num_tokens": 2960261.0, "reward": 1.3755600452423096, "reward_std": 1.0930094718933105, "rewards/accuracy_reward/mean": 0.6255599856376648, "rewards/accuracy_reward/std": 0.5167767405509949, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.375, "rewards/operation_reward/std": 0.5175492167472839, "step": 487, "step_time": 110.74553132709116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 746.0, "completions/mean_terminated_length": 468.0, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "entropy": 0.21025797724723816, "epoch": 0.6971428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.1369006335735321, "learning_rate": 3.042857142857143e-06, "loss": 0.204, "num_tokens": 2969261.0, "reward": 0.8770840167999268, "reward_std": 0.6393790245056152, "rewards/accuracy_reward/mean": 0.002084063831716776, "rewards/accuracy_reward/std": 0.0022380410227924585, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.26726123690605164, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 488, "step_time": 116.96199273131788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 261.125, "completions/mean_terminated_length": 261.125, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.07755622267723083, "epoch": 0.6985714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.18958741426467896, "learning_rate": 3.028571428571429e-06, "loss": 0.0271, "num_tokens": 2974222.0, "reward": 1.5499999523162842, "reward_std": 0.09258202463388443, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.05000000074505806, "rewards/grounding_reward/std": 0.09258200973272324, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 489, "step_time": 97.49403983913362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 160.125, "completions/mean_terminated_length": 160.125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.08118271082639694, "epoch": 0.7, "frac_reward_zero_std": 0.0, "grad_norm": 0.24112239480018616, "learning_rate": 3.0142857142857145e-06, "loss": -0.0038, "num_tokens": 2978439.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 490, "step_time": 81.54848909564316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 249.75, "completions/mean_terminated_length": 249.75, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.07945085316896439, "epoch": 0.7014285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.7835097908973694, "learning_rate": 3e-06, "loss": -0.1501, "num_tokens": 2983325.0, "reward": 2.125, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 491, "step_time": 90.03516732249409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 408.875, "completions/mean_terminated_length": 408.875, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "entropy": 0.05120152607560158, "epoch": 0.7028571428571428, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.985714285714286e-06, "loss": 0.0, "num_tokens": 2989572.0, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 492, "step_time": 88.33695158362389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 135.375, "completions/mean_terminated_length": 135.375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.16690067946910858, "epoch": 0.7042857142857143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9714285714285716e-06, "loss": 0.0, "num_tokens": 2993519.0, "reward": 2.1666667461395264, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.6666666865348816, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 493, "step_time": 82.39936299063265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 383.25, "completions/mean_terminated_length": 383.25, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.2316240519285202, "epoch": 0.7057142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.23297914862632751, "learning_rate": 2.957142857142857e-06, "loss": 0.0945, "num_tokens": 2999433.0, "reward": 2.1500000953674316, "reward_std": 0.4869731366634369, "rewards/accuracy_reward/mean": 0.8999999761581421, "rewards/accuracy_reward/std": 0.2828427255153656, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 494, "step_time": 97.5556889474392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 498.25, "completions/mean_terminated_length": 498.25, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "entropy": 0.1272675096988678, "epoch": 0.7071428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.1264694780111313, "learning_rate": 2.9428571428571427e-06, "loss": -0.0267, "num_tokens": 3006427.0, "reward": 0.6979166865348816, "reward_std": 0.3505593538284302, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.1979166716337204, "rewards/grounding_reward/std": 0.35055938363075256, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 495, "step_time": 98.20441559702158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 188.875, "completions/mean_terminated_length": 188.875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.147262305021286, "epoch": 0.7085714285714285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.928571428571429e-06, "loss": 0.0, "num_tokens": 3010938.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 496, "step_time": 81.8466724678874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 161.375, "completions/mean_terminated_length": 161.375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.12062064558267593, "epoch": 0.71, "frac_reward_zero_std": 0.0, "grad_norm": 0.28424787521362305, "learning_rate": 2.9142857142857146e-06, "loss": -0.0166, "num_tokens": 3015165.0, "reward": 3.125, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 497, "step_time": 85.2387605253607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 191.125, "completions/mean_terminated_length": 191.125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.07942166924476624, "epoch": 0.7114285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.1769862323999405, "learning_rate": 2.9e-06, "loss": 0.0642, "num_tokens": 3019598.0, "reward": 2.0, "reward_std": 0.5345224738121033, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.5345224738121033, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 498, "step_time": 98.02670593187213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 489.875, "completions/mean_terminated_length": 489.875, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "entropy": 0.06401866674423218, "epoch": 0.7128571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.12220099568367004, "learning_rate": 2.885714285714286e-06, "loss": -0.0275, "num_tokens": 3026429.0, "reward": 1.0677554607391357, "reward_std": 0.47202396392822266, "rewards/accuracy_reward/mean": 0.08858879655599594, "rewards/accuracy_reward/std": 0.024792468175292015, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.3541666865348816, "rewards/grounding_reward/std": 0.2260337918996811, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 499, "step_time": 97.15374192688614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 779.5, "completions/mean_terminated_length": 698.0, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "entropy": 0.24599656462669373, "epoch": 0.7142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.1483028382062912, "learning_rate": 2.8714285714285717e-06, "loss": 0.0334, "num_tokens": 3035569.0, "reward": 0.9777964353561401, "reward_std": 0.597320556640625, "rewards/accuracy_reward/mean": 0.22779643535614014, "rewards/accuracy_reward/std": 0.2427431344985962, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.375, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 500, "step_time": 107.77393380086869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "entropy": 0.19487285614013672, "epoch": 0.7157142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.08748532831668854, "learning_rate": 2.8571428571428573e-06, "loss": 0.0, "num_tokens": 3046857.0, "reward": 0.6578405499458313, "reward_std": 0.5151792168617249, "rewards/accuracy_reward/mean": 0.032840535044670105, "rewards/accuracy_reward/std": 0.0068929726257920265, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 501, "step_time": 106.52479167468846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 526.375, "completions/mean_terminated_length": 526.375, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "entropy": 0.06813335418701172, "epoch": 0.7171428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.1839105188846588, "learning_rate": 2.8428571428571432e-06, "loss": 0.0094, "num_tokens": 3054092.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 502, "step_time": 120.20033265650272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 750.0, "completions/mean_terminated_length": 710.857177734375, "completions/min_length": 636.0, "completions/min_terminated_length": 636.0, "entropy": 0.10912986099720001, "epoch": 0.7185714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.10653160512447357, "learning_rate": 2.8285714285714288e-06, "loss": 0.0469, "num_tokens": 3063084.0, "reward": 1.9828739166259766, "reward_std": 0.7364751696586609, "rewards/accuracy_reward/mean": 0.7953739166259766, "rewards/accuracy_reward/std": 0.38858702778816223, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 503, "step_time": 110.08901645522565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 132.0, "completions/mean_terminated_length": 132.0, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.21504054963588715, "epoch": 0.72, "frac_reward_zero_std": 0.0, "grad_norm": 0.34111446142196655, "learning_rate": 2.8142857142857143e-06, "loss": 0.0274, "num_tokens": 3067140.0, "reward": 1.875, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.375, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 504, "step_time": 81.7113909330219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 233.375, "completions/mean_terminated_length": 233.375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.20844373106956482, "epoch": 0.7214285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.27730366587638855, "learning_rate": 2.8000000000000003e-06, "loss": -0.039, "num_tokens": 3072063.0, "reward": 1.8125, "reward_std": 0.3720118999481201, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.125, "rewards/grounding_reward/std": 0.2314550280570984, "rewards/operation_reward/mean": 0.1875, "rewards/operation_reward/std": 0.3720119297504425, "step": 505, "step_time": 92.19760127365589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 101.625, "completions/mean_terminated_length": 101.625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.2055923044681549, "epoch": 0.7228571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.6280041933059692, "learning_rate": 2.785714285714286e-06, "loss": -0.0643, "num_tokens": 3075788.0, "reward": 3.125, "reward_std": 0.7440237998962402, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.875, "rewards/operation_reward/std": 0.3535533845424652, "step": 506, "step_time": 86.19567226152867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 868.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 556.25, "completions/mean_terminated_length": 556.25, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "entropy": 0.3260478973388672, "epoch": 0.7242857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.16775982081890106, "learning_rate": 2.7714285714285714e-06, "loss": -0.054, "num_tokens": 3083134.0, "reward": 1.3072316646575928, "reward_std": 0.3500998020172119, "rewards/accuracy_reward/mean": 0.2322317659854889, "rewards/accuracy_reward/std": 0.032931849360466, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.574999988079071, "rewards/grounding_reward/std": 0.355567991733551, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 507, "step_time": 101.69287618156523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 966.125, "completions/mean_terminated_length": 869.6666870117188, "completions/min_length": 818.0, "completions/min_terminated_length": 818.0, "entropy": 0.0785595178604126, "epoch": 0.7257142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.10354644060134888, "learning_rate": 2.757142857142857e-06, "loss": 0.0723, "num_tokens": 3093791.0, "reward": 0.18908077478408813, "reward_std": 0.2574656903743744, "rewards/accuracy_reward/mean": 0.001580765936523676, "rewards/accuracy_reward/std": 0.0013308111811056733, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 508, "step_time": 110.72152769565582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 459.125, "completions/mean_terminated_length": 378.4285888671875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.19199898838996887, "epoch": 0.7271428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.21655672788619995, "learning_rate": 2.7428571428571433e-06, "loss": 0.422, "num_tokens": 3100320.0, "reward": 1.312583327293396, "reward_std": 0.7528043389320374, "rewards/accuracy_reward/mean": 0.750083327293396, "rewards/accuracy_reward/std": 0.46275582909584045, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.125, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 509, "step_time": 110.25396376289427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 849.25, "completions/mean_terminated_length": 674.5, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "entropy": 0.25426480174064636, "epoch": 0.7285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.11270377039909363, "learning_rate": 2.728571428571429e-06, "loss": 0.171, "num_tokens": 3109994.0, "reward": 0.6360725164413452, "reward_std": 0.8307912945747375, "rewards/accuracy_reward/mean": 0.2610725164413452, "rewards/accuracy_reward/std": 0.38338467478752136, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.26726123690605164, "rewards/grounding_reward/mean": 0.125, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 510, "step_time": 110.2974889241159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 151.125, "completions/mean_terminated_length": 151.125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.12428689002990723, "epoch": 0.73, "frac_reward_zero_std": 0.0, "grad_norm": 0.23277877271175385, "learning_rate": 2.7142857142857144e-06, "loss": 0.025, "num_tokens": 3114091.0, "reward": 2.125, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 511, "step_time": 83.12534975726157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 180.875, "completions/mean_terminated_length": 180.875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.07987839728593826, "epoch": 0.7314285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.29787537455558777, "learning_rate": 2.7000000000000004e-06, "loss": -0.0075, "num_tokens": 3118554.0, "reward": 2.125, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 512, "step_time": 83.33568498678505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 91.25, "completions/mean_terminated_length": 91.25, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.1792805939912796, "epoch": 0.7328571428571429, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.685714285714286e-06, "loss": 0.0, "num_tokens": 3122204.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 513, "step_time": 79.96782743278891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 115.0, "completions/mean_terminated_length": 115.0, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.18439806997776031, "epoch": 0.7342857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.4177195131778717, "learning_rate": 2.6714285714285715e-06, "loss": -0.0023, "num_tokens": 3126004.0, "reward": 3.25, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 514, "step_time": 79.59894938580692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 335.75, "completions/mean_terminated_length": 335.75, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.15240539610385895, "epoch": 0.7357142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.25622567534446716, "learning_rate": 2.6571428571428575e-06, "loss": -0.1049, "num_tokens": 3131586.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 515, "step_time": 92.80240829102695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 918.75, "completions/mean_terminated_length": 603.0, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "entropy": 0.5461427569389343, "epoch": 0.7371428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.1260281503200531, "learning_rate": 2.642857142857143e-06, "loss": 0.1691, "num_tokens": 3141896.0, "reward": 1.1375000476837158, "reward_std": 0.7322065830230713, "rewards/accuracy_reward/mean": 0.26249992847442627, "rewards/accuracy_reward/std": 0.35887548327445984, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.34503278136253357, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 516, "step_time": 123.41077731642872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 117.125, "completions/mean_terminated_length": 117.125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.17351779341697693, "epoch": 0.7385714285714285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6285714285714286e-06, "loss": 0.0, "num_tokens": 3145817.0, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 517, "step_time": 83.2556199785322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 558.0, "completions/mean_terminated_length": 558.0, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "entropy": 0.0571138821542263, "epoch": 0.74, "frac_reward_zero_std": 0.0, "grad_norm": 0.09565670043230057, "learning_rate": 2.614285714285715e-06, "loss": -0.001, "num_tokens": 3153193.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 518, "step_time": 100.78757618833333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 453.5, "completions/mean_terminated_length": 372.0000305175781, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "entropy": 0.1909741908311844, "epoch": 0.7414285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.25879523158073425, "learning_rate": 2.6e-06, "loss": 0.2834, "num_tokens": 3159741.0, "reward": 1.0628416538238525, "reward_std": 0.6225464344024658, "rewards/accuracy_reward/mean": 0.0003416467516217381, "rewards/accuracy_reward/std": 0.0009663229575380683, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 519, "step_time": 106.93534711096436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 236.0, "completions/mean_terminated_length": 236.0, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.1431952267885208, "epoch": 0.7428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.3304479718208313, "learning_rate": 2.5857142857142856e-06, "loss": 0.01, "num_tokens": 3164645.0, "reward": 1.5451650619506836, "reward_std": 0.5942133665084839, "rewards/accuracy_reward/mean": 0.2951650619506836, "rewards/accuracy_reward/std": 0.2540966272354126, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 520, "step_time": 91.54239757172763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 460.5, "completions/mean_terminated_length": 460.5, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "entropy": 0.05745897814631462, "epoch": 0.7442857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.10949490964412689, "learning_rate": 2.571428571428571e-06, "loss": -0.0796, "num_tokens": 3171209.0, "reward": 2.305555582046509, "reward_std": 0.37912923097610474, "rewards/accuracy_reward/mean": 0.9305555820465088, "rewards/accuracy_reward/std": 0.19641855359077454, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 521, "step_time": 115.5010206149891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 339.25, "completions/mean_terminated_length": 339.25, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.05615028738975525, "epoch": 0.7457142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.244592547416687, "learning_rate": 2.5571428571428576e-06, "loss": 0.0326, "num_tokens": 3176803.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 522, "step_time": 90.38195141777396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 405.625, "completions/mean_terminated_length": 405.625, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "entropy": 0.1840309202671051, "epoch": 0.7471428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.21851488947868347, "learning_rate": 2.542857142857143e-06, "loss": -0.0389, "num_tokens": 3183080.0, "reward": 1.2598488330841064, "reward_std": 0.3941526710987091, "rewards/accuracy_reward/mean": 0.1369321644306183, "rewards/accuracy_reward/std": 0.025792883709073067, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.6229166388511658, "rewards/grounding_reward/std": 0.3914703130722046, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 523, "step_time": 95.47892821859568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 434.0, "completions/mean_terminated_length": 237.33334350585938, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.7329496741294861, "epoch": 0.7485714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.21810629963874817, "learning_rate": 2.5285714285714287e-06, "loss": 0.3678, "num_tokens": 3189416.0, "reward": 0.9031925201416016, "reward_std": 0.5071759819984436, "rewards/accuracy_reward/mean": 0.09069249033927917, "rewards/accuracy_reward/std": 0.08078839629888535, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.4375, "rewards/grounding_reward/std": 0.4172614812850952, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 524, "step_time": 107.95662480778992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 627.75, "completions/mean_terminated_length": 627.75, "completions/min_length": 578.0, "completions/min_terminated_length": 578.0, "entropy": 0.05248197540640831, "epoch": 0.75, "frac_reward_zero_std": 0.0, "grad_norm": 0.08247967064380646, "learning_rate": 2.5142857142857147e-06, "loss": -0.0044, "num_tokens": 3197398.0, "reward": 2.25, "reward_std": 0.311677485704422, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.311677485704422, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 525, "step_time": 97.94998487643898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 977.25, "completions/mean_terminated_length": 961.6666870117188, "completions/min_length": 921.0, "completions/min_terminated_length": 921.0, "entropy": 0.0402093268930912, "epoch": 0.7514285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.07272662967443466, "learning_rate": 2.5e-06, "loss": 0.0326, "num_tokens": 3208176.0, "reward": 1.8757685422897339, "reward_std": 0.6932588219642639, "rewards/accuracy_reward/mean": 0.7507685422897339, "rewards/accuracy_reward/std": 0.46148696541786194, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 526, "step_time": 110.44125402253121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 230.0, "completions/mean_terminated_length": 230.0, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.12518037855625153, "epoch": 0.7528571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.22005563974380493, "learning_rate": 2.485714285714286e-06, "loss": 0.02, "num_tokens": 3212888.0, "reward": 2.625, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.125, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 527, "step_time": 86.27782933041453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 778.625, "completions/mean_terminated_length": 778.625, "completions/min_length": 652.0, "completions/min_terminated_length": 652.0, "entropy": 0.0858079269528389, "epoch": 0.7542857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.10158804059028625, "learning_rate": 2.4714285714285717e-06, "loss": -0.0195, "num_tokens": 3222317.0, "reward": 1.5492191314697266, "reward_std": 0.39699602127075195, "rewards/accuracy_reward/mean": 0.9242191314697266, "rewards/accuracy_reward/std": 0.10466208308935165, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.125, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 528, "step_time": 111.39795460831374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 791.5, "completions/mean_terminated_length": 559.0, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "entropy": 0.15428462624549866, "epoch": 0.7557142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.10879386216402054, "learning_rate": 2.4571428571428573e-06, "loss": 0.184, "num_tokens": 3231665.0, "reward": 1.3064357042312622, "reward_std": 0.44422560930252075, "rewards/accuracy_reward/mean": 0.181435689330101, "rewards/accuracy_reward/std": 0.19926351308822632, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.26726123690605164, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 529, "step_time": 114.2153406534344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 893.25, "completions/mean_terminated_length": 849.6666870117188, "completions/min_length": 697.0, "completions/min_terminated_length": 697.0, "entropy": 0.08948476612567902, "epoch": 0.7571428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.09926964342594147, "learning_rate": 2.442857142857143e-06, "loss": 0.0145, "num_tokens": 3241819.0, "reward": 1.5592248439788818, "reward_std": 0.6267483830451965, "rewards/accuracy_reward/mean": 0.6842248439788818, "rewards/accuracy_reward/std": 0.45791196823120117, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.5345224738121033, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 530, "step_time": 109.9113870402798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 217.875, "completions/mean_terminated_length": 217.875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.11154524981975555, "epoch": 0.7585714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.25263088941574097, "learning_rate": 2.428571428571429e-06, "loss": 0.0574, "num_tokens": 3246490.0, "reward": 2.375, "reward_std": 0.6408699750900269, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 531, "step_time": 84.39976694528013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 288.375, "completions/mean_terminated_length": 288.375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.12373130768537521, "epoch": 0.76, "frac_reward_zero_std": 0.0, "grad_norm": 0.1917319893836975, "learning_rate": 2.4142857142857148e-06, "loss": -0.1392, "num_tokens": 3251717.0, "reward": 2.012772560119629, "reward_std": 0.4965971112251282, "rewards/accuracy_reward/mean": 0.8877723813056946, "rewards/accuracy_reward/std": 0.06643346697092056, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 532, "step_time": 94.0337685495615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 397.75, "completions/mean_terminated_length": 397.75, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "entropy": 0.09697873145341873, "epoch": 0.7614285714285715, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4000000000000003e-06, "loss": 0.0, "num_tokens": 3257891.0, "reward": 2.25, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 533, "step_time": 91.84247202426195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 206.25, "completions/mean_terminated_length": 206.25, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.1920120269060135, "epoch": 0.7628571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.2624067962169647, "learning_rate": 2.385714285714286e-06, "loss": -0.0242, "num_tokens": 3262573.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 534, "step_time": 84.71482431516051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 167.75, "completions/mean_terminated_length": 167.75, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.18846726417541504, "epoch": 0.7642857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.2987704873085022, "learning_rate": 2.371428571428572e-06, "loss": -0.0133, "num_tokens": 3266875.0, "reward": 2.3278303146362305, "reward_std": 0.3755262494087219, "rewards/accuracy_reward/mean": 0.9528301954269409, "rewards/accuracy_reward/std": 0.05042664334177971, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.375, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.5, "rewards/operation_reward/std": 0.5345224738121033, "step": 535, "step_time": 83.0933573320508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 115.875, "completions/mean_terminated_length": 115.875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.19164416193962097, "epoch": 0.7657142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.4265018701553345, "learning_rate": 2.3571428571428574e-06, "loss": -0.0701, "num_tokens": 3270706.0, "reward": 2.125, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 536, "step_time": 98.22692297864705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 364.25, "completions/mean_terminated_length": 364.25, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "entropy": 0.07945331931114197, "epoch": 0.7671428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.22581487894058228, "learning_rate": 2.342857142857143e-06, "loss": -0.1575, "num_tokens": 3276524.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 537, "step_time": 90.60047293175012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 953.875, "completions/mean_terminated_length": 463.0, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "entropy": 0.4612971544265747, "epoch": 0.7685714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.11815857142210007, "learning_rate": 2.3285714285714285e-06, "loss": 0.1524, "num_tokens": 3287171.0, "reward": 0.49492496252059937, "reward_std": 0.6671184301376343, "rewards/accuracy_reward/mean": 0.11992493271827698, "rewards/accuracy_reward/std": 0.3063284754753113, "rewards/format_reward/mean": 0.0625, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.3125, "rewards/grounding_reward/std": 0.3720119297504425, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 538, "step_time": 109.95737711247057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 261.75, "completions/mean_terminated_length": 261.75, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.17556875944137573, "epoch": 0.77, "frac_reward_zero_std": 0.0, "grad_norm": 0.32330122590065, "learning_rate": 2.3142857142857145e-06, "loss": -0.1569, "num_tokens": 3292217.0, "reward": 1.9878243207931519, "reward_std": 0.7493224740028381, "rewards/accuracy_reward/mean": 0.5503243207931519, "rewards/accuracy_reward/std": 0.3954470455646515, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.3125, "rewards/operation_reward/std": 0.45806270837783813, "step": 539, "step_time": 88.69454499334097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 198.125, "completions/mean_terminated_length": 198.125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.10713588446378708, "epoch": 0.7714285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.2953065037727356, "learning_rate": 2.3000000000000004e-06, "loss": -0.1662, "num_tokens": 3296666.0, "reward": 1.7083333730697632, "reward_std": 0.17251640558242798, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.2083333432674408, "rewards/grounding_reward/std": 0.17251639068126678, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 540, "step_time": 86.21573546994478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 1021.0, "completions/mean_terminated_length": 1000.0, "completions/min_length": 1000.0, "completions/min_terminated_length": 1000.0, "entropy": 0.22368158400058746, "epoch": 0.7728571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.14813897013664246, "learning_rate": 2.285714285714286e-06, "loss": 0.006, "num_tokens": 3307778.0, "reward": 0.6907655000686646, "reward_std": 0.8823403716087341, "rewards/accuracy_reward/mean": 0.2532655596733093, "rewards/accuracy_reward/std": 0.4608948826789856, "rewards/format_reward/mean": 0.0625, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.375, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 541, "step_time": 115.45173935126513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 165.625, "completions/mean_terminated_length": 165.625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.09704980254173279, "epoch": 0.7742857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.21954797208309174, "learning_rate": 2.2714285714285715e-06, "loss": 0.024, "num_tokens": 3312071.0, "reward": 2.5, "reward_std": 0.5345224738121033, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.375, "rewards/operation_reward/std": 0.5175492167472839, "step": 542, "step_time": 84.8649103268981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 152.375, "completions/mean_terminated_length": 152.375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.15532423555850983, "epoch": 0.7757142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.257142857142857e-06, "loss": 0.0, "num_tokens": 3316290.0, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 543, "step_time": 81.3635561466217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 352.75, "completions/mean_terminated_length": 352.75, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "entropy": 0.07420556992292404, "epoch": 0.7771428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.1782972365617752, "learning_rate": 2.242857142857143e-06, "loss": -0.0603, "num_tokens": 3322104.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 544, "step_time": 92.58923375979066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 642.125, "completions/mean_terminated_length": 642.125, "completions/min_length": 559.0, "completions/min_terminated_length": 559.0, "entropy": 0.05189422890543938, "epoch": 0.7785714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.09595181792974472, "learning_rate": 2.228571428571429e-06, "loss": -0.0274, "num_tokens": 3330209.0, "reward": 1.25, "reward_std": 0.37796446681022644, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.37796446681022644, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 545, "step_time": 103.49732807278633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 338.125, "completions/mean_terminated_length": 338.125, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "entropy": 0.11848444491624832, "epoch": 0.78, "frac_reward_zero_std": 0.0, "grad_norm": 0.24472816288471222, "learning_rate": 2.2142857142857146e-06, "loss": -0.0606, "num_tokens": 3335922.0, "reward": 1.25, "reward_std": 0.8864052295684814, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.375, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 546, "step_time": 89.43348093517125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 243.875, "completions/mean_terminated_length": 243.875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.1015879362821579, "epoch": 0.7814285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.16930848360061646, "learning_rate": 2.2e-06, "loss": -0.0123, "num_tokens": 3340737.0, "reward": 1.9375, "reward_std": 0.4955156147480011, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.4375, "rewards/grounding_reward/std": 0.4955156147480011, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 547, "step_time": 87.22930338326842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 473.75, "completions/mean_terminated_length": 473.75, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "entropy": 0.09450367838144302, "epoch": 0.7828571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.15189668536186218, "learning_rate": 2.185714285714286e-06, "loss": -0.0095, "num_tokens": 3347495.0, "reward": 2.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 548, "step_time": 101.37060858681798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "entropy": 0.26184117794036865, "epoch": 0.7842857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.1283608376979828, "learning_rate": 2.1714285714285716e-06, "loss": 0.0, "num_tokens": 3358735.0, "reward": 0.008464822545647621, "reward_std": 0.0012254684697836637, "rewards/accuracy_reward/mean": 0.008464822545647621, "rewards/accuracy_reward/std": 0.001225468353368342, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 549, "step_time": 123.37321825884283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 314.125, "completions/mean_terminated_length": 314.125, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.08829907327890396, "epoch": 0.7857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.1741095781326294, "learning_rate": 2.157142857142857e-06, "loss": -0.0812, "num_tokens": 3364128.0, "reward": 1.625, "reward_std": 0.5444525480270386, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.25, "rewards/grounding_reward/std": 0.3162277638912201, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 550, "step_time": 87.62649071402848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 234.75, "completions/mean_terminated_length": 234.75, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.10517580062150955, "epoch": 0.7871428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.2650366425514221, "learning_rate": 2.1428571428571427e-06, "loss": -0.0019, "num_tokens": 3368878.0, "reward": 1.9500000476837158, "reward_std": 0.5244861245155334, "rewards/accuracy_reward/mean": 0.9499999284744263, "rewards/accuracy_reward/std": 0.028284266591072083, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.5345224738121033, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 551, "step_time": 85.06450598221272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 743.5, "completions/mean_terminated_length": 650.0, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "entropy": 0.17598091065883636, "epoch": 0.7885714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.16064083576202393, "learning_rate": 2.1285714285714287e-06, "loss": 0.1853, "num_tokens": 3377762.0, "reward": 1.1281388998031616, "reward_std": 0.7386025786399841, "rewards/accuracy_reward/mean": 0.6281388998031616, "rewards/accuracy_reward/std": 0.4381532371044159, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.125, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 552, "step_time": 114.53589423466474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 349.125, "completions/mean_terminated_length": 252.71429443359375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.37175658345222473, "epoch": 0.79, "frac_reward_zero_std": 0.0, "grad_norm": 0.16208359599113464, "learning_rate": 2.1142857142857147e-06, "loss": -0.1067, "num_tokens": 3383523.0, "reward": 1.0630179643630981, "reward_std": 0.49544310569763184, "rewards/accuracy_reward/mean": 0.0005179558065719903, "rewards/accuracy_reward/std": 0.0014650003286078572, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 553, "step_time": 110.91618509963155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 294.25, "completions/mean_terminated_length": 294.25, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "entropy": 0.0728733167052269, "epoch": 0.7914285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.15784715116024017, "learning_rate": 2.1000000000000002e-06, "loss": -0.0306, "num_tokens": 3388781.0, "reward": 1.5, "reward_std": 0.7071067690849304, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.4432026445865631, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 554, "step_time": 91.00835410598665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 883.375, "completions/mean_terminated_length": 461.5, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "entropy": 0.4925979971885681, "epoch": 0.7928571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 2.5991480350494385, "learning_rate": 2.0857142857142858e-06, "loss": 0.0941, "num_tokens": 3398832.0, "reward": 1.2433571815490723, "reward_std": 0.372700035572052, "rewards/accuracy_reward/mean": 0.14960721135139465, "rewards/accuracy_reward/std": 0.3455039858818054, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.96875, "rewards/grounding_reward/std": 0.0883883461356163, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 555, "step_time": 114.43605844862759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 268.75, "completions/mean_terminated_length": 160.85714721679688, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.34503012895584106, "epoch": 0.7942857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.5097323060035706, "learning_rate": 2.0714285714285717e-06, "loss": 0.9072, "num_tokens": 3403918.0, "reward": 2.062641143798828, "reward_std": 0.9034926891326904, "rewards/accuracy_reward/mean": 0.7501411437988281, "rewards/accuracy_reward/std": 0.46264883875846863, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 556, "step_time": 112.00676626898348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 596.375, "completions/mean_terminated_length": 535.2857666015625, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "entropy": 0.21743181347846985, "epoch": 0.7957142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.19896188378334045, "learning_rate": 2.0571428571428573e-06, "loss": 0.2522, "num_tokens": 3411705.0, "reward": 2.070880174636841, "reward_std": 0.6137511730194092, "rewards/accuracy_reward/mean": 0.7583801746368408, "rewards/accuracy_reward/std": 0.4475795030593872, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 557, "step_time": 111.63428842183203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 548.5, "completions/mean_terminated_length": 263.20001220703125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.5850458145141602, "epoch": 0.7971428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.20928727090358734, "learning_rate": 2.0428571428571433e-06, "loss": 0.5919, "num_tokens": 3418981.0, "reward": 1.331070899963379, "reward_std": 1.0796444416046143, "rewards/accuracy_reward/mean": 0.5185708999633789, "rewards/accuracy_reward/std": 0.38526687026023865, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.5345224738121033, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 558, "step_time": 107.65567400399595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 210.25, "completions/mean_terminated_length": 210.25, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.139784574508667, "epoch": 0.7985714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 1.0720775127410889, "learning_rate": 2.028571428571429e-06, "loss": -0.0257, "num_tokens": 3423647.0, "reward": 2.2291667461395264, "reward_std": 0.689706027507782, "rewards/accuracy_reward/mean": 0.8541666865348816, "rewards/accuracy_reward/std": 0.10681165754795074, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.25, "rewards/operation_reward/std": 0.4629100561141968, "step": 559, "step_time": 83.81820934824646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 119.875, "completions/mean_terminated_length": 119.875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.18954214453697205, "epoch": 0.8, "frac_reward_zero_std": 0.0, "grad_norm": 0.3506511449813843, "learning_rate": 2.0142857142857144e-06, "loss": -0.022, "num_tokens": 3427502.0, "reward": 2.25, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 560, "step_time": 79.63732129335403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 209.375, "completions/mean_terminated_length": 209.375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.18232421576976776, "epoch": 0.8014285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.17633846402168274, "learning_rate": 2.0000000000000003e-06, "loss": 0.0496, "num_tokens": 3432209.0, "reward": 2.625, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.125, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 561, "step_time": 93.12773750722408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 180.75, "completions/mean_terminated_length": 180.75, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.18128632009029388, "epoch": 0.8028571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.324119508266449, "learning_rate": 1.985714285714286e-06, "loss": -0.0854, "num_tokens": 3436647.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.125, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 562, "step_time": 82.12577256094664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 399.5, "completions/mean_terminated_length": 399.5, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "entropy": 0.051751501858234406, "epoch": 0.8042857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.08798330277204514, "learning_rate": 1.9714285714285714e-06, "loss": 0.0058, "num_tokens": 3442755.0, "reward": 1.625, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.125, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 563, "step_time": 89.22666682582349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 536.75, "completions/mean_terminated_length": 536.75, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "entropy": 0.059796303510665894, "epoch": 0.8057142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.1377439647912979, "learning_rate": 1.957142857142857e-06, "loss": -0.0069, "num_tokens": 3450217.0, "reward": 1.7268519401550293, "reward_std": 0.48712101578712463, "rewards/accuracy_reward/mean": 0.9351851940155029, "rewards/accuracy_reward/std": 0.12001372873783112, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0416666679084301, "rewards/grounding_reward/std": 0.1178511455655098, "rewards/operation_reward/mean": 0.25, "rewards/operation_reward/std": 0.4629100561141968, "step": 564, "step_time": 97.14675609208643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 376.125, "completions/mean_terminated_length": 376.125, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "entropy": 0.04391862824559212, "epoch": 0.8071428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.09359177201986313, "learning_rate": 1.942857142857143e-06, "loss": -0.0246, "num_tokens": 3456178.0, "reward": 2.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 565, "step_time": 102.46044059563428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 337.5, "completions/mean_terminated_length": 337.5, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.08913516253232956, "epoch": 0.8085714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.3188174366950989, "learning_rate": 1.928571428571429e-06, "loss": 0.0861, "num_tokens": 3461814.0, "reward": 1.774999976158142, "reward_std": 0.3807886838912964, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.4000000059604645, "rewards/grounding_reward/std": 0.1414213627576828, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 566, "step_time": 89.67789220251143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 912.875, "completions/mean_terminated_length": 727.6666870117188, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "entropy": 0.2032458484172821, "epoch": 0.81, "frac_reward_zero_std": 0.0, "grad_norm": 0.13363319635391235, "learning_rate": 1.9142857142857145e-06, "loss": 0.1641, "num_tokens": 3472077.0, "reward": 1.0690020322799683, "reward_std": 1.0454432964324951, "rewards/accuracy_reward/mean": 0.3815019726753235, "rewards/accuracy_reward/std": 0.5121651887893677, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.5345224738121033, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 567, "step_time": 107.765246655792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 391.5, "completions/mean_terminated_length": 301.14288330078125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.41496309638023376, "epoch": 0.8114285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.19735290110111237, "learning_rate": 1.9000000000000002e-06, "loss": -0.0187, "num_tokens": 3478137.0, "reward": 2.1558375358581543, "reward_std": 0.7689189910888672, "rewards/accuracy_reward/mean": 0.7808377146720886, "rewards/accuracy_reward/std": 0.138453409075737, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.6875, "rewards/grounding_reward/std": 0.4381372928619385, "rewards/operation_reward/mean": 0.25, "rewards/operation_reward/std": 0.4629100561141968, "step": 568, "step_time": 108.50415588915348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 183.625, "completions/mean_terminated_length": 183.625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.08695388585329056, "epoch": 0.8128571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.5018839836120605, "learning_rate": 1.885714285714286e-06, "loss": -0.0185, "num_tokens": 3482670.0, "reward": 2.0, "reward_std": 0.7559289336204529, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 569, "step_time": 83.88718199077994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 129.5, "completions/mean_terminated_length": 129.5, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.19881325960159302, "epoch": 0.8142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.37226420640945435, "learning_rate": 1.8714285714285715e-06, "loss": -0.0723, "num_tokens": 3486594.0, "reward": 2.625, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.125, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 570, "step_time": 81.09502493869513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 419.375, "completions/mean_terminated_length": 419.375, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "entropy": 0.15271764993667603, "epoch": 0.8157142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.19462338089942932, "learning_rate": 1.8571428571428573e-06, "loss": 0.0541, "num_tokens": 3492941.0, "reward": 0.6009831428527832, "reward_std": 0.030124209821224213, "rewards/accuracy_reward/mean": 0.10098309814929962, "rewards/accuracy_reward/std": 0.030124196782708168, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 571, "step_time": 93.48074483498931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 215.875, "completions/mean_terminated_length": 215.875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.1204833984375, "epoch": 0.8171428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.23919318616390228, "learning_rate": 1.8428571428571428e-06, "loss": 0.0478, "num_tokens": 3497588.0, "reward": 2.1500000953674316, "reward_std": 0.39641252160072327, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.6499999761581421, "rewards/grounding_reward/std": 0.3964124619960785, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 572, "step_time": 84.43433008063585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 940.5, "completions/mean_terminated_length": 940.5, "completions/min_length": 850.0, "completions/min_terminated_length": 850.0, "entropy": 0.05671112611889839, "epoch": 0.8185714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.16450929641723633, "learning_rate": 1.8285714285714288e-06, "loss": 0.034, "num_tokens": 3508088.0, "reward": 2.625, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 573, "step_time": 113.96021191962063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 335.75, "completions/mean_terminated_length": 335.75, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "entropy": 0.09792526066303253, "epoch": 0.82, "frac_reward_zero_std": 0.0, "grad_norm": 0.1661607027053833, "learning_rate": 1.8142857142857146e-06, "loss": -0.0635, "num_tokens": 3513838.0, "reward": 2.443855047225952, "reward_std": 0.5378323793411255, "rewards/accuracy_reward/mean": 0.8188550472259521, "rewards/accuracy_reward/std": 0.3354220688343048, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 574, "step_time": 98.77630173973739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 263.375, "completions/mean_terminated_length": 263.375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.45703792572021484, "epoch": 0.8214285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2642688453197479, "learning_rate": 1.8000000000000001e-06, "loss": 0.1664, "num_tokens": 3518929.0, "reward": 1.130037546157837, "reward_std": 0.5183125734329224, "rewards/accuracy_reward/mean": 0.2550375163555145, "rewards/accuracy_reward/std": 0.07532065361738205, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.375, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 575, "step_time": 137.76128007937223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 332.0, "completions/mean_terminated_length": 332.0, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.25494036078453064, "epoch": 0.8228571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.21955637633800507, "learning_rate": 1.7857142857142859e-06, "loss": 0.0352, "num_tokens": 3524457.0, "reward": 2.375, "reward_std": 0.6408699750900269, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 576, "step_time": 91.6710669323802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 952.625, "completions/mean_terminated_length": 833.6666870117188, "completions/min_length": 813.0, "completions/min_terminated_length": 813.0, "entropy": 0.12646923959255219, "epoch": 0.8242857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.10039035230875015, "learning_rate": 1.7714285714285714e-06, "loss": 0.0462, "num_tokens": 3535054.0, "reward": 0.8579236268997192, "reward_std": 0.6325905919075012, "rewards/accuracy_reward/mean": 0.26417356729507446, "rewards/accuracy_reward/std": 0.4546462595462799, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.40625, "rewards/grounding_reward/std": 0.4988826811313629, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 577, "step_time": 154.18740148376673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 275.5, "completions/mean_terminated_length": 275.5, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.05758701637387276, "epoch": 0.8257142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.1247791051864624, "learning_rate": 1.7571428571428572e-06, "loss": -0.1242, "num_tokens": 3540146.0, "reward": 1.375, "reward_std": 0.6408699750900269, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.125, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 578, "step_time": 96.01318345312029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 530.375, "completions/mean_terminated_length": 530.375, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "entropy": 0.033511094748973846, "epoch": 0.8271428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.07309036701917648, "learning_rate": 1.7428571428571432e-06, "loss": -0.0104, "num_tokens": 3547317.0, "reward": 2.0, "reward_std": 0.5345224738121033, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.5345224738121033, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 579, "step_time": 103.0358213307336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 176.0, "completions/mean_terminated_length": 176.0, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.08597812801599503, "epoch": 0.8285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.22724227607250214, "learning_rate": 1.7285714285714287e-06, "loss": 0.0014, "num_tokens": 3551749.0, "reward": 2.125, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 580, "step_time": 86.55414065159857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "entropy": 0.4793652892112732, "epoch": 0.83, "frac_reward_zero_std": 0.0, "grad_norm": 0.1393241286277771, "learning_rate": 1.7142857142857145e-06, "loss": 0.0, "num_tokens": 3562925.0, "reward": 0.4381033778190613, "reward_std": 0.4955117404460907, "rewards/accuracy_reward/mean": 0.0006033714744262397, "rewards/accuracy_reward/std": 1.4474792806140613e-05, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.4375, "rewards/grounding_reward/std": 0.4955156147480011, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 581, "step_time": 112.55461619980633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 651.125, "completions/mean_terminated_length": 597.857177734375, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "entropy": 0.13035452365875244, "epoch": 0.8314285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.19184546172618866, "learning_rate": 1.7000000000000002e-06, "loss": 0.1894, "num_tokens": 3571222.0, "reward": 1.2503631114959717, "reward_std": 0.3776912987232208, "rewards/accuracy_reward/mean": 0.0003631082072388381, "rewards/accuracy_reward/std": 0.0010270250495523214, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.8125, "rewards/grounding_reward/std": 0.3720119297504425, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 582, "step_time": 209.91500887461007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 421.375, "completions/mean_terminated_length": 421.375, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "entropy": 0.04580702632665634, "epoch": 0.8328571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.09570314735174179, "learning_rate": 1.6857142857142858e-06, "loss": 0.0106, "num_tokens": 3577497.0, "reward": 2.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 583, "step_time": 91.12449974194169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 666.125, "completions/mean_terminated_length": 451.3999938964844, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "entropy": 0.22649888694286346, "epoch": 0.8342857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.14839580655097961, "learning_rate": 1.6714285714285715e-06, "loss": 0.3891, "num_tokens": 3585834.0, "reward": 0.31278955936431885, "reward_std": 0.258374959230423, "rewards/accuracy_reward/mean": 0.0002895544748753309, "rewards/accuracy_reward/std": 0.00039968208875507116, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 584, "step_time": 708.0705030439422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 135.625, "completions/mean_terminated_length": 135.625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.11090544611215591, "epoch": 0.8357142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.657142857142857e-06, "loss": 0.0, "num_tokens": 3589823.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 585, "step_time": 120.51522484607995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 374.0, "completions/mean_terminated_length": 374.0, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "entropy": 0.148942232131958, "epoch": 0.8371428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.22637824714183807, "learning_rate": 1.642857142857143e-06, "loss": -0.0459, "num_tokens": 3595863.0, "reward": 1.5375339984893799, "reward_std": 0.5797940492630005, "rewards/accuracy_reward/mean": 0.4958673119544983, "rewards/accuracy_reward/std": 0.363790899515152, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.2916666567325592, "rewards/grounding_reward/std": 0.3646046221256256, "rewards/operation_reward/mean": 0.25, "rewards/operation_reward/std": 0.4629100561141968, "step": 586, "step_time": 107.44242190290242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 363.875, "completions/mean_terminated_length": 363.875, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "entropy": 0.19246575236320496, "epoch": 0.8385714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.18634848296642303, "learning_rate": 1.6285714285714288e-06, "loss": -0.0167, "num_tokens": 3601678.0, "reward": 1.8125, "reward_std": 0.4381372928619385, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0625, "rewards/grounding_reward/std": 0.1157275140285492, "rewards/operation_reward/mean": 0.25, "rewards/operation_reward/std": 0.4629100561141968, "step": 587, "step_time": 119.39854933414608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 551.25, "completions/mean_terminated_length": 551.25, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "entropy": 0.05632099136710167, "epoch": 0.84, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6142857142857144e-06, "loss": 0.0, "num_tokens": 3609120.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 588, "step_time": 127.2945375693962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 224.875, "completions/mean_terminated_length": 224.875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.27325764298439026, "epoch": 0.8414285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.3168981969356537, "learning_rate": 1.6000000000000001e-06, "loss": -0.1184, "num_tokens": 3613935.0, "reward": 2.0714285373687744, "reward_std": 0.4948716461658478, "rewards/accuracy_reward/mean": 0.9464285373687744, "rewards/accuracy_reward/std": 0.15152287483215332, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 589, "step_time": 163.84307133592665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 206.875, "completions/mean_terminated_length": 206.875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.09735977649688721, "epoch": 0.8428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.19848449528217316, "learning_rate": 1.5857142857142857e-06, "loss": 0.0066, "num_tokens": 3618702.0, "reward": 2.1875, "reward_std": 0.2587745785713196, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.6875, "rewards/grounding_reward/std": 0.25877460837364197, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 590, "step_time": 113.0495432857424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 701.5, "completions/mean_terminated_length": 594.0, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "entropy": 0.2137235403060913, "epoch": 0.8442857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.17485496401786804, "learning_rate": 1.5714285714285714e-06, "loss": 0.2176, "num_tokens": 3627234.0, "reward": 0.7136756181716919, "reward_std": 0.44132283329963684, "rewards/accuracy_reward/mean": 0.3386755883693695, "rewards/accuracy_reward/std": 0.22425499558448792, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 591, "step_time": 140.77468523662537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 187.125, "completions/mean_terminated_length": 187.125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.1872609704732895, "epoch": 0.8457142857142858, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5571428571428574e-06, "loss": 0.0, "num_tokens": 3631659.0, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 592, "step_time": 131.08998095802963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 619.875, "completions/mean_terminated_length": 619.875, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "entropy": 0.19786612689495087, "epoch": 0.8471428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.424671471118927, "learning_rate": 1.542857142857143e-06, "loss": 0.076, "num_tokens": 3639618.0, "reward": 2.0833334922790527, "reward_std": 0.5841830372810364, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.7083333730697632, "rewards/grounding_reward/std": 0.4520675837993622, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 593, "step_time": 156.1208164235577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 80.625, "completions/mean_terminated_length": 80.625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.21103601157665253, "epoch": 0.8485714285714285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5285714285714287e-06, "loss": 0.0, "num_tokens": 3643135.0, "reward": 3.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 594, "step_time": 89.68830676563084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 769.0, "completions/max_terminated_length": 769.0, "completions/mean_length": 490.125, "completions/mean_terminated_length": 490.125, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "entropy": 0.141521155834198, "epoch": 0.85, "frac_reward_zero_std": 0.0, "grad_norm": 0.17693236470222473, "learning_rate": 1.5142857142857145e-06, "loss": -0.0851, "num_tokens": 3650200.0, "reward": 0.8553571701049805, "reward_std": 0.43097537755966187, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.3553571403026581, "rewards/grounding_reward/std": 0.43097540736198425, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 595, "step_time": 424.1455810582265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 318.75, "completions/mean_terminated_length": 318.75, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "entropy": 0.08801719546318054, "epoch": 0.8514285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.19750981032848358, "learning_rate": 1.5e-06, "loss": 0.069, "num_tokens": 3655678.0, "reward": 2.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 596, "step_time": 104.4278645850718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 188.5, "completions/mean_terminated_length": 188.5, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.11245506256818771, "epoch": 0.8528571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.2893732190132141, "learning_rate": 1.4857142857142858e-06, "loss": -0.0234, "num_tokens": 3660042.0, "reward": 2.625, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 597, "step_time": 96.87715092487633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 935.5, "completions/mean_terminated_length": 788.0, "completions/min_length": 685.0, "completions/min_terminated_length": 685.0, "entropy": 0.3392183184623718, "epoch": 0.8542857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.1464710682630539, "learning_rate": 1.4714285714285713e-06, "loss": 0.0784, "num_tokens": 3670398.0, "reward": 1.2281572818756104, "reward_std": 0.8212672472000122, "rewards/accuracy_reward/mean": 0.373990535736084, "rewards/accuracy_reward/std": 0.398314505815506, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.6666666865348816, "rewards/grounding_reward/std": 0.4364357888698578, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 598, "step_time": 148.00744765531272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 854.25, "completions/mean_terminated_length": 345.0, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.338549941778183, "epoch": 0.8557142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.1066000834107399, "learning_rate": 1.4571428571428573e-06, "loss": 0.3218, "num_tokens": 3680296.0, "reward": 0.12637042999267578, "reward_std": 0.23060961067676544, "rewards/accuracy_reward/mean": 0.0013704305747523904, "rewards/accuracy_reward/std": 0.0009573130519129336, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 599, "step_time": 179.15729369875044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 96.625, "completions/mean_terminated_length": 96.625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.1793941855430603, "epoch": 0.8571428571428571, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.442857142857143e-06, "loss": 0.0, "num_tokens": 3683981.0, "reward": 3.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 600, "step_time": 91.18997743166983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 355.75, "completions/mean_terminated_length": 355.75, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "entropy": 0.08809037506580353, "epoch": 0.8585714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.15878631174564362, "learning_rate": 1.4285714285714286e-06, "loss": -0.0323, "num_tokens": 3689763.0, "reward": 2.495370388031006, "reward_std": 0.013094520196318626, "rewards/accuracy_reward/mean": 0.9953703880310059, "rewards/accuracy_reward/std": 0.013094563037157059, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 601, "step_time": 133.61686667334288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 144.5, "completions/mean_terminated_length": 144.5, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.23860643804073334, "epoch": 0.86, "frac_reward_zero_std": 0.0, "grad_norm": 0.47487977147102356, "learning_rate": 1.4142857142857144e-06, "loss": 0.1537, "num_tokens": 3693839.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.375, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 602, "step_time": 107.1239212602377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 467.875, "completions/mean_terminated_length": 467.875, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "entropy": 0.11554120481014252, "epoch": 0.8614285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.18509981036186218, "learning_rate": 1.4000000000000001e-06, "loss": -0.0624, "num_tokens": 3700542.0, "reward": 1.1875, "reward_std": 0.5130441188812256, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.34069257974624634, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.4898979663848877, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 603, "step_time": 132.71116832084954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 950.0, "completions/mean_terminated_length": 826.6666870117188, "completions/min_length": 773.0, "completions/min_terminated_length": 773.0, "entropy": 0.2617017328739166, "epoch": 0.8628571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.14025354385375977, "learning_rate": 1.3857142857142857e-06, "loss": 0.0797, "num_tokens": 3711406.0, "reward": 1.2730686664581299, "reward_std": 0.343873530626297, "rewards/accuracy_reward/mean": 0.08556868135929108, "rewards/accuracy_reward/std": 0.12941277027130127, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 604, "step_time": 161.90015036985278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 382.75, "completions/mean_terminated_length": 382.75, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "entropy": 0.06185469403862953, "epoch": 0.8642857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.11982530355453491, "learning_rate": 1.3714285714285717e-06, "loss": -0.1186, "num_tokens": 3717348.0, "reward": 1.5625, "reward_std": 0.4172614812850952, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.9375, "rewards/grounding_reward/std": 0.1767766922712326, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 605, "step_time": 103.8342257020995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 144.5, "completions/mean_terminated_length": 144.5, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.11948120594024658, "epoch": 0.8657142857142858, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3571428571428572e-06, "loss": 0.0, "num_tokens": 3721424.0, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 606, "step_time": 92.27454398013651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 269.5, "completions/mean_terminated_length": 269.5, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.09608647972345352, "epoch": 0.8671428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.1757800579071045, "learning_rate": 1.342857142857143e-06, "loss": 0.0746, "num_tokens": 3726452.0, "reward": 2.5, "reward_std": 0.5345224738121033, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.625, "rewards/grounding_reward/std": 0.5175492167472839, "rewards/operation_reward/mean": 0.375, "rewards/operation_reward/std": 0.5175492167472839, "step": 607, "step_time": 97.95620188955218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 250.875, "completions/mean_terminated_length": 250.875, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "entropy": 0.065525121986866, "epoch": 0.8685714285714285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3285714285714287e-06, "loss": 0.0, "num_tokens": 3731315.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 608, "step_time": 100.11250359751284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 99.5, "completions/mean_terminated_length": 99.5, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.17968899011611938, "epoch": 0.87, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3142857142857143e-06, "loss": 0.0, "num_tokens": 3735007.0, "reward": 3.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 609, "step_time": 83.0108622033149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 757.625, "completions/mean_terminated_length": 757.625, "completions/min_length": 632.0, "completions/min_terminated_length": 632.0, "entropy": 0.12643493711948395, "epoch": 0.8714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.1100219190120697, "learning_rate": 1.3e-06, "loss": -0.0515, "num_tokens": 3743988.0, "reward": 1.3225574493408203, "reward_std": 0.5189278721809387, "rewards/accuracy_reward/mean": 0.2183908075094223, "rewards/accuracy_reward/std": 0.23701564967632294, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.6041666269302368, "rewards/grounding_reward/std": 0.4001736044883728, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 610, "step_time": 128.48759962804615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 873.0, "completions/mean_terminated_length": 822.6666870117188, "completions/min_length": 736.0, "completions/min_terminated_length": 736.0, "entropy": 0.08474599570035934, "epoch": 0.8728571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.13895770907402039, "learning_rate": 1.2857142857142856e-06, "loss": 0.0934, "num_tokens": 3753948.0, "reward": 1.3759989738464355, "reward_std": 0.22960533201694489, "rewards/accuracy_reward/mean": 0.0009990455582737923, "rewards/accuracy_reward/std": 0.0018721712986007333, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 611, "step_time": 128.9161320719868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 106.25, "completions/mean_terminated_length": 106.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.16662779450416565, "epoch": 0.8742857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.3970695436000824, "learning_rate": 1.2714285714285716e-06, "loss": 0.0108, "num_tokens": 3757694.0, "reward": 2.625, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 612, "step_time": 91.57392895594239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 275.875, "completions/mean_terminated_length": 275.875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.17662130296230316, "epoch": 0.8757142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2571428571428573e-06, "loss": 0.0, "num_tokens": 3762941.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 613, "step_time": 104.47063150629401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 251.25, "completions/mean_terminated_length": 251.25, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.11665564030408859, "epoch": 0.8771428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.24567218124866486, "learning_rate": 1.242857142857143e-06, "loss": -0.0242, "num_tokens": 3767991.0, "reward": 1.3009554147720337, "reward_std": 0.36855894327163696, "rewards/accuracy_reward/mean": 0.8009554147720337, "rewards/accuracy_reward/std": 0.36855897307395935, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 614, "step_time": 112.35512618441135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 204.5, "completions/mean_terminated_length": 204.5, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.09914127737283707, "epoch": 0.8785714285714286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2285714285714286e-06, "loss": 0.0, "num_tokens": 3772571.0, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 615, "step_time": 100.0220715738833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 405.875, "completions/mean_terminated_length": 405.875, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "entropy": 0.05641509220004082, "epoch": 0.88, "frac_reward_zero_std": 0.0, "grad_norm": 0.15981511771678925, "learning_rate": 1.2142857142857144e-06, "loss": -0.0952, "num_tokens": 3778690.0, "reward": 1.1875, "reward_std": 0.45806270837783813, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0625, "rewards/grounding_reward/std": 0.1767766922712326, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 616, "step_time": 106.75668543577194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 428.125, "completions/mean_terminated_length": 229.5, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.2758728861808777, "epoch": 0.8814285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.1901431679725647, "learning_rate": 1.2000000000000002e-06, "loss": 0.4981, "num_tokens": 3785027.0, "reward": 1.1259685754776, "reward_std": 0.5813330411911011, "rewards/accuracy_reward/mean": 0.0009685674449428916, "rewards/accuracy_reward/std": 0.0017952268244698644, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 617, "step_time": 132.3820188138634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 448.375, "completions/mean_terminated_length": 256.5, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.21424739062786102, "epoch": 0.8828571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.2097054123878479, "learning_rate": 1.185714285714286e-06, "loss": 0.5599, "num_tokens": 3791550.0, "reward": 2.750986099243164, "reward_std": 0.9236918687820435, "rewards/accuracy_reward/mean": 0.8759859800338745, "rewards/accuracy_reward/std": 0.3507646322250366, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.5, "rewards/operation_reward/std": 0.5345224738121033, "step": 618, "step_time": 113.97381440736353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 278.0, "completions/mean_terminated_length": 278.0, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.07741279900074005, "epoch": 0.8842857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.22339516878128052, "learning_rate": 1.1714285714285715e-06, "loss": -0.0779, "num_tokens": 3796790.0, "reward": 2.25, "reward_std": 0.37796446681022644, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.2314550280570984, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 619, "step_time": 104.29766491334885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 689.5, "completions/mean_terminated_length": 355.0, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.6257914900779724, "epoch": 0.8857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.14163964986801147, "learning_rate": 1.1571428571428572e-06, "loss": 0.4455, "num_tokens": 3805218.0, "reward": 0.4632549583911896, "reward_std": 0.5464903712272644, "rewards/accuracy_reward/mean": 0.2132549285888672, "rewards/accuracy_reward/std": 0.33436307311058044, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.26726123690605164, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 620, "step_time": 123.09732777811587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 774.0, "completions/mean_terminated_length": 524.0, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "entropy": 0.20214331150054932, "epoch": 0.8871428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.1708899289369583, "learning_rate": 1.142857142857143e-06, "loss": 0.0794, "num_tokens": 3814402.0, "reward": 0.7516998052597046, "reward_std": 0.46300143003463745, "rewards/accuracy_reward/mean": 0.0016998035134747624, "rewards/accuracy_reward/std": 0.0018359280657023191, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.26726123690605164, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.5345224738121033, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 621, "step_time": 192.1455360017717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 110.125, "completions/mean_terminated_length": 110.125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.2173701971769333, "epoch": 0.8885714285714286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1285714285714285e-06, "loss": 0.0, "num_tokens": 3818219.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 622, "step_time": 94.81255181971937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 564.5, "completions/mean_terminated_length": 564.5, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "entropy": 0.07060421258211136, "epoch": 0.89, "frac_reward_zero_std": 0.0, "grad_norm": 0.13077156245708466, "learning_rate": 1.1142857142857145e-06, "loss": 0.0065, "num_tokens": 3825735.0, "reward": 1.78125, "reward_std": 0.41052013635635376, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.40625, "rewards/grounding_reward/std": 0.2651650309562683, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 623, "step_time": 165.0388179961592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 255.625, "completions/mean_terminated_length": 255.625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.06488080322742462, "epoch": 0.8914285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.18068405985832214, "learning_rate": 1.1e-06, "loss": -0.0187, "num_tokens": 3830636.0, "reward": 2.3125, "reward_std": 0.1157275140285492, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.8125, "rewards/grounding_reward/std": 0.1157275140285492, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 624, "step_time": 107.39220028929412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 99.0, "completions/max_terminated_length": 99.0, "completions/mean_length": 90.125, "completions/mean_terminated_length": 90.125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.10809606313705444, "epoch": 0.8928571428571429, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0857142857142858e-06, "loss": 0.0, "num_tokens": 3834269.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 625, "step_time": 104.68574695754796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 161.75, "completions/mean_terminated_length": 161.75, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.1444564014673233, "epoch": 0.8942857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.26800844073295593, "learning_rate": 1.0714285714285714e-06, "loss": 0.0912, "num_tokens": 3838515.0, "reward": 1.625, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.125, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 626, "step_time": 85.73096856009215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 189.0, "completions/mean_terminated_length": 189.0, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.24418826401233673, "epoch": 0.8957142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0571428571428573e-06, "loss": 0.0, "num_tokens": 3843147.0, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 627, "step_time": 125.40353995747864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 367.0, "completions/mean_terminated_length": 367.0, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "entropy": 0.12205956131219864, "epoch": 0.8971428571428571, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.0428571428571429e-06, "loss": 0.0, "num_tokens": 3849147.0, "reward": 1.1666667461395264, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.6666666865348816, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 628, "step_time": 130.03097821306437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 599.75, "completions/mean_terminated_length": 539.1428833007812, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "entropy": 0.0834767147898674, "epoch": 0.8985714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.15292879939079285, "learning_rate": 1.0285714285714286e-06, "loss": 0.1397, "num_tokens": 3856801.0, "reward": 1.5376805067062378, "reward_std": 0.8276376724243164, "rewards/accuracy_reward/mean": 0.7501804828643799, "rewards/accuracy_reward/std": 0.4625760018825531, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.3500000238418579, "rewards/grounding_reward/std": 0.4105744957923889, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 629, "step_time": 148.888939935714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 1005.0, "completions/mean_terminated_length": 948.0, "completions/min_length": 938.0, "completions/min_terminated_length": 938.0, "entropy": 0.09868048876523972, "epoch": 0.9, "frac_reward_zero_std": 0.0, "grad_norm": 0.10036351531744003, "learning_rate": 1.0142857142857144e-06, "loss": 0.0175, "num_tokens": 3867745.0, "reward": 0.7390735745429993, "reward_std": 0.5701162219047546, "rewards/accuracy_reward/mean": 0.1265735626220703, "rewards/accuracy_reward/std": 0.3529183864593506, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.48750001192092896, "rewards/grounding_reward/std": 0.4517821967601776, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 630, "step_time": 183.06084249075502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 257.875, "completions/mean_terminated_length": 257.875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.09734431654214859, "epoch": 0.9014285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.23746070265769958, "learning_rate": 1.0000000000000002e-06, "loss": 0.0615, "num_tokens": 3872768.0, "reward": 1.5625, "reward_std": 0.1157275140285492, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0625, "rewards/grounding_reward/std": 0.1157275140285492, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 631, "step_time": 143.52274842653424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 244.125, "completions/mean_terminated_length": 244.125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.07664600014686584, "epoch": 0.9028571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.19658316671848297, "learning_rate": 9.857142857142857e-07, "loss": -0.0548, "num_tokens": 3877777.0, "reward": 2.0625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5625, "rewards/grounding_reward/std": 0.1767766922712326, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 632, "step_time": 98.6570783527568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 233.5, "completions/mean_terminated_length": 233.5, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.13563816249370575, "epoch": 0.9042857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.20080386102199554, "learning_rate": 9.714285714285715e-07, "loss": -0.0245, "num_tokens": 3882589.0, "reward": 1.6666667461395264, "reward_std": 0.35634833574295044, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.1666666716337204, "rewards/grounding_reward/std": 0.35634833574295044, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 633, "step_time": 117.58611807972193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 492.75, "completions/mean_terminated_length": 492.75, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "entropy": 0.14118219912052155, "epoch": 0.9057142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.13480107486248016, "learning_rate": 9.571428571428572e-07, "loss": -0.0396, "num_tokens": 3889523.0, "reward": 2.625, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 634, "step_time": 127.0910409502685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 143.625, "completions/mean_terminated_length": 143.625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.1608639806509018, "epoch": 0.9071428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.3289256691932678, "learning_rate": 9.42857142857143e-07, "loss": -0.0155, "num_tokens": 3893712.0, "reward": 3.0, "reward_std": 0.5345224738121033, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.5, "rewards/grounding_reward/std": 0.5345224738121033, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 635, "step_time": 107.77789423149079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 474.5, "completions/mean_terminated_length": 474.5, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "entropy": 0.15703541040420532, "epoch": 0.9085714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.12803766131401062, "learning_rate": 9.285714285714287e-07, "loss": 0.1217, "num_tokens": 3900444.0, "reward": 2.0, "reward_std": 0.5345224738121033, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 636, "step_time": 172.24105140101165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 755.25, "completions/mean_terminated_length": 486.5, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "entropy": 0.18717136979103088, "epoch": 0.91, "frac_reward_zero_std": 0.0, "grad_norm": 0.19251684844493866, "learning_rate": 9.142857142857144e-07, "loss": 0.2691, "num_tokens": 3909406.0, "reward": 0.9516928195953369, "reward_std": 0.4643588066101074, "rewards/accuracy_reward/mean": 0.04127615690231323, "rewards/accuracy_reward/std": 0.11094573140144348, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.26726123690605164, "rewards/grounding_reward/mean": 0.6604167222976685, "rewards/grounding_reward/std": 0.30419522523880005, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 637, "step_time": 227.8966602999717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 871.625, "completions/mean_terminated_length": 617.6666870117188, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "entropy": 0.41610822081565857, "epoch": 0.9114285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.12526901066303253, "learning_rate": 9.000000000000001e-07, "loss": 0.2111, "num_tokens": 3919299.0, "reward": 0.5643890500068665, "reward_std": 0.7747595906257629, "rewards/accuracy_reward/mean": 0.37688905000686646, "rewards/accuracy_reward/std": 0.5159850716590881, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 638, "step_time": 116.11408862750977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 324.375, "completions/mean_terminated_length": 324.375, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "entropy": 0.05087653547525406, "epoch": 0.9128571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.14919981360435486, "learning_rate": 8.857142857142857e-07, "loss": -0.0366, "num_tokens": 3924774.0, "reward": 1.5499999523162842, "reward_std": 0.1414213627576828, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.05000000074505806, "rewards/grounding_reward/std": 0.1414213627576828, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 639, "step_time": 111.91056032385677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 167.625, "completions/mean_terminated_length": 167.625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.1764916479587555, "epoch": 0.9142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.37514615058898926, "learning_rate": 8.714285714285716e-07, "loss": -0.0308, "num_tokens": 3929091.0, "reward": 3.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.875, "rewards/operation_reward/std": 0.3535533845424652, "step": 640, "step_time": 98.46564865298569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 292.25, "completions/mean_terminated_length": 292.25, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "entropy": 0.085275799036026, "epoch": 0.9157142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.571428571428572e-07, "loss": 0.0, "num_tokens": 3934333.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 641, "step_time": 132.78263681475073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 561.875, "completions/mean_terminated_length": 561.875, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "entropy": 0.06200901046395302, "epoch": 0.9171428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.08396927267313004, "learning_rate": 8.428571428571429e-07, "loss": -0.0439, "num_tokens": 3941916.0, "reward": 1.412500023841858, "reward_std": 0.12963621318340302, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.9125000238418579, "rewards/grounding_reward/std": 0.1296362429857254, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 642, "step_time": 105.9493146603927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 643.375, "completions/mean_terminated_length": 262.75, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "entropy": 0.5182953476905823, "epoch": 0.9185714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.19572940468788147, "learning_rate": 8.285714285714285e-07, "loss": 0.3184, "num_tokens": 3950223.0, "reward": 1.8781601190567017, "reward_std": 0.6920881867408752, "rewards/accuracy_reward/mean": 0.6906601190567017, "rewards/accuracy_reward/std": 0.4560910165309906, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.25877460837364197, "rewards/grounding_reward/mean": 0.875, "rewards/grounding_reward/std": 0.3535533845424652, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 643, "step_time": 119.70140890777111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 155.0, "completions/mean_terminated_length": 155.0, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.2334062159061432, "epoch": 0.92, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.142857142857144e-07, "loss": 0.0, "num_tokens": 3954463.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 644, "step_time": 92.918806867674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 398.0, "completions/mean_terminated_length": 308.5714416503906, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.5403669476509094, "epoch": 0.9214285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2661222219467163, "learning_rate": 8.000000000000001e-07, "loss": 0.6345, "num_tokens": 3960663.0, "reward": 1.954427719116211, "reward_std": 0.5546976327896118, "rewards/accuracy_reward/mean": 0.5169276595115662, "rewards/accuracy_reward/std": 0.5166452527046204, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.9375, "rewards/grounding_reward/std": 0.1767766922712326, "rewards/operation_reward/mean": 0.0625, "rewards/operation_reward/std": 0.1767766922712326, "step": 645, "step_time": 136.71604956127703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 1008.75, "completions/mean_terminated_length": 963.0, "completions/min_length": 954.0, "completions/min_terminated_length": 954.0, "entropy": 0.4324718713760376, "epoch": 0.9228571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.1012440174818039, "learning_rate": 7.857142857142857e-07, "loss": 0.0245, "num_tokens": 3971605.0, "reward": 1.3801836967468262, "reward_std": 0.6911674737930298, "rewards/accuracy_reward/mean": 0.25518375635147095, "rewards/accuracy_reward/std": 0.4597133994102478, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 646, "step_time": 159.85065445303917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 229.375, "completions/mean_terminated_length": 229.375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.10555080324411392, "epoch": 0.9242857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.3889601528644562, "learning_rate": 7.714285714285715e-07, "loss": -0.0217, "num_tokens": 3976408.0, "reward": 1.3525526523590088, "reward_std": 0.45322442054748535, "rewards/accuracy_reward/mean": 0.1775527000427246, "rewards/accuracy_reward/std": 0.02165956236422062, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.675000011920929, "rewards/grounding_reward/std": 0.46521884202957153, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 647, "step_time": 399.8728101439774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 451.25, "completions/mean_terminated_length": 451.25, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "entropy": 0.12766189873218536, "epoch": 0.9257142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.13166764378547668, "learning_rate": 7.571428571428572e-07, "loss": -0.0327, "num_tokens": 3982882.0, "reward": 2.118098258972168, "reward_std": 0.3594016134738922, "rewards/accuracy_reward/mean": 0.836848258972168, "rewards/accuracy_reward/std": 0.1275881677865982, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.78125, "rewards/grounding_reward/std": 0.31160587072372437, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 648, "step_time": 99.13256026990712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 207.0, "completions/mean_terminated_length": 207.0, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.17858827114105225, "epoch": 0.9271428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.38216158747673035, "learning_rate": 7.428571428571429e-07, "loss": -0.4269, "num_tokens": 3987506.0, "reward": 1.625, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 649, "step_time": 141.39885879401118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 294.5, "completions/mean_terminated_length": 190.2857208251953, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.47129178047180176, "epoch": 0.9285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.4614579677581787, "learning_rate": 7.285714285714287e-07, "loss": 0.8756, "num_tokens": 3992726.0, "reward": 2.31333589553833, "reward_std": 0.527965784072876, "rewards/accuracy_reward/mean": 0.8758358955383301, "rewards/accuracy_reward/std": 0.35118910670280457, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 650, "step_time": 117.65587904304266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 291.25, "completions/mean_terminated_length": 186.57144165039062, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.23811176419258118, "epoch": 0.93, "frac_reward_zero_std": 0.0, "grad_norm": 0.23442266881465912, "learning_rate": 7.142857142857143e-07, "loss": -0.2967, "num_tokens": 3998032.0, "reward": 0.6880927085876465, "reward_std": 0.3725842535495758, "rewards/accuracy_reward/mean": 0.0005926979356445372, "rewards/accuracy_reward/std": 0.001676402986049652, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.25, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 651, "step_time": 132.19201297406107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 191.875, "completions/mean_terminated_length": 191.875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.08910418301820755, "epoch": 0.9314285714285714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.000000000000001e-07, "loss": 0.0, "num_tokens": 4002487.0, "reward": 1.8333333730697632, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.3333333432674408, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 652, "step_time": 135.28216077852994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 743.375, "completions/mean_terminated_length": 462.75, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "entropy": 0.20725151896476746, "epoch": 0.9328571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.12852971255779266, "learning_rate": 6.857142857142858e-07, "loss": 0.1869, "num_tokens": 4011362.0, "reward": 0.5763580799102783, "reward_std": 0.4944787323474884, "rewards/accuracy_reward/mean": 0.0013580911327153444, "rewards/accuracy_reward/std": 0.0014568098122254014, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.26726123690605164, "rewards/grounding_reward/mean": 0.32499998807907104, "rewards/grounding_reward/std": 0.4652188718318939, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 653, "step_time": 428.4561538239941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 357.75, "completions/mean_terminated_length": 357.75, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "entropy": 0.23899832367897034, "epoch": 0.9342857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.19388079643249512, "learning_rate": 6.714285714285715e-07, "loss": 0.0734, "num_tokens": 4017104.0, "reward": 1.59375, "reward_std": 0.6223156452178955, "rewards/accuracy_reward/mean": 0.84375, "rewards/accuracy_reward/std": 0.2893187701702118, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.25, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 654, "step_time": 96.52746923733503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 162.0, "completions/mean_terminated_length": 162.0, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.09294109791517258, "epoch": 0.9357142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.571428571428571e-07, "loss": 0.0, "num_tokens": 4021304.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 655, "step_time": 110.78821982815862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 289.0, "completions/mean_terminated_length": 289.0, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "entropy": 0.1180126890540123, "epoch": 0.9371428571428572, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.428571428571428e-07, "loss": 0.0, "num_tokens": 4026512.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 656, "step_time": 89.6670637363568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 543.0, "completions/mean_terminated_length": 543.0, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "entropy": 0.08458777517080307, "epoch": 0.9385714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.17080163955688477, "learning_rate": 6.285714285714287e-07, "loss": 0.0283, "num_tokens": 4033816.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 657, "step_time": 112.83000158239156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 110.0, "completions/max_terminated_length": 110.0, "completions/mean_length": 101.5, "completions/mean_terminated_length": 101.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.12947455048561096, "epoch": 0.94, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.142857142857143e-07, "loss": 0.0, "num_tokens": 4037524.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 658, "step_time": 98.62449595797807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 716.0, "completions/max_terminated_length": 716.0, "completions/mean_length": 671.875, "completions/mean_terminated_length": 671.875, "completions/min_length": 603.0, "completions/min_terminated_length": 603.0, "entropy": 0.07003167271614075, "epoch": 0.9414285714285714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.000000000000001e-07, "loss": 0.0, "num_tokens": 4045931.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 659, "step_time": 102.44575176946819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 320.125, "completions/mean_terminated_length": 320.125, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "entropy": 0.07942003756761551, "epoch": 0.9428571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.14069026708602905, "learning_rate": 5.857142857142857e-07, "loss": -0.0682, "num_tokens": 4051404.0, "reward": 2.09375, "reward_std": 0.4419417381286621, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.59375, "rewards/grounding_reward/std": 0.4419417679309845, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 660, "step_time": 112.31727853696793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 327.375, "completions/mean_terminated_length": 327.375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.06131310760974884, "epoch": 0.9442857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.16445772349834442, "learning_rate": 5.714285714285715e-07, "loss": -0.1404, "num_tokens": 4056919.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.75, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 661, "step_time": 138.23825258854777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 442.875, "completions/mean_terminated_length": 442.875, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "entropy": 0.20647843182086945, "epoch": 0.9457142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.571428571428573e-07, "loss": 0.0, "num_tokens": 4063478.0, "reward": 3.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 662, "step_time": 132.97704581916332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 286.375, "completions/mean_terminated_length": 286.375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.13734014332294464, "epoch": 0.9471428571428572, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.428571428571429e-07, "loss": 0.0, "num_tokens": 4068809.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 663, "step_time": 142.1450596805662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 361.875, "completions/mean_terminated_length": 361.875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.0880085825920105, "epoch": 0.9485714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.14750149846076965, "learning_rate": 5.285714285714287e-07, "loss": -0.0545, "num_tokens": 4074608.0, "reward": 1.2000000476837158, "reward_std": 0.41403934359550476, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.7000000476837158, "rewards/grounding_reward/std": 0.41403937339782715, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 664, "step_time": 129.49741022940725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 327.125, "completions/mean_terminated_length": 327.125, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "entropy": 0.057086534798145294, "epoch": 0.95, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.142857142857143e-07, "loss": 0.0, "num_tokens": 4080121.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 665, "step_time": 135.0190743226558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 339.25, "completions/mean_terminated_length": 339.25, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "entropy": 0.18428806960582733, "epoch": 0.9514285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.23003119230270386, "learning_rate": 5.000000000000001e-07, "loss": -0.0055, "num_tokens": 4085947.0, "reward": 2.3857924938201904, "reward_std": 0.34644651412963867, "rewards/accuracy_reward/mean": 0.7607924938201904, "rewards/accuracy_reward/std": 0.1480480581521988, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.125, "rewards/operation_reward/std": 0.3535533845424652, "step": 666, "step_time": 117.51136596407741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 405.875, "completions/mean_terminated_length": 405.875, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "entropy": 0.2629231810569763, "epoch": 0.9528571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.21465446054935455, "learning_rate": 4.857142857142857e-07, "loss": 0.032, "num_tokens": 4092202.0, "reward": 2.0, "reward_std": 0.5345224738121033, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 667, "step_time": 131.7940636239946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 366.0, "completions/mean_terminated_length": 272.0, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.4105266332626343, "epoch": 0.9542857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.18403035402297974, "learning_rate": 4.714285714285715e-07, "loss": -0.1196, "num_tokens": 4098090.0, "reward": 0.6877645254135132, "reward_std": 0.3722665309906006, "rewards/accuracy_reward/mean": 0.00026455026818439364, "rewards/accuracy_reward/std": 0.0007482611108571291, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.25, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 668, "step_time": 230.84358654543757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 355.375, "completions/mean_terminated_length": 355.375, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "entropy": 0.06317327916622162, "epoch": 0.9557142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.571428571428572e-07, "loss": 0.0, "num_tokens": 4103853.0, "reward": 2.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 669, "step_time": 117.41606339812279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 271.625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.11447387933731079, "epoch": 0.9571428571428572, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.4285714285714286e-07, "loss": 0.0, "num_tokens": 4108986.0, "reward": 3.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 1.0, "rewards/grounding_reward/std": 0.0, "rewards/operation_reward/mean": 1.0, "rewards/operation_reward/std": 0.0, "step": 670, "step_time": 134.03054324164987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 163.75, "completions/mean_terminated_length": 163.75, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.10232523828744888, "epoch": 0.9585714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.24004586040973663, "learning_rate": 4.285714285714286e-07, "loss": -0.0059, "num_tokens": 4113336.0, "reward": 2.4375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.9375, "rewards/grounding_reward/std": 0.1767766922712326, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 671, "step_time": 97.41275583021343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 223.0, "completions/mean_terminated_length": 223.0, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.10124813765287399, "epoch": 0.96, "frac_reward_zero_std": 0.0, "grad_norm": 0.19985058903694153, "learning_rate": 4.142857142857143e-07, "loss": 0.0048, "num_tokens": 4118008.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.25, "rewards/grounding_reward/std": 0.4629100561141968, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 672, "step_time": 105.05240074265748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 737.625, "completions/mean_terminated_length": 642.1666870117188, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "entropy": 0.3469061851501465, "epoch": 0.9614285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.1694260835647583, "learning_rate": 4.0000000000000003e-07, "loss": 0.2236, "num_tokens": 4127037.0, "reward": 2.644906520843506, "reward_std": 1.3412351608276367, "rewards/accuracy_reward/mean": 0.7542816400527954, "rewards/accuracy_reward/std": 0.4549823999404907, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2314550280570984, "rewards/grounding_reward/mean": 0.765625, "rewards/grounding_reward/std": 0.43526214361190796, "rewards/operation_reward/mean": 0.75, "rewards/operation_reward/std": 0.4629100561141968, "step": 673, "step_time": 214.7428671484813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 395.0, "completions/mean_terminated_length": 395.0, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "entropy": 0.09800849854946136, "epoch": 0.9628571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.20251259207725525, "learning_rate": 3.8571428571428574e-07, "loss": -0.2662, "num_tokens": 4133109.0, "reward": 1.399999976158142, "reward_std": 0.5424810647964478, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/grounding_reward/mean": 0.7749999761581421, "rewards/grounding_reward/std": 0.3240370452404022, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 674, "step_time": 125.25112904421985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 617.25, "completions/mean_terminated_length": 559.1428833007812, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.07744933664798737, "epoch": 0.9642857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.14822296798229218, "learning_rate": 3.7142857142857145e-07, "loss": 0.0742, "num_tokens": 4141007.0, "reward": 1.675531029701233, "reward_std": 0.9196880459785461, "rewards/accuracy_reward/mean": 0.7505310773849487, "rewards/accuracy_reward/std": 0.4619280993938446, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1767766922712326, "rewards/grounding_reward/mean": 0.48750001192092896, "rewards/grounding_reward/std": 0.4517821967601776, "rewards/operation_reward/mean": 0.0, "rewards/operation_reward/std": 0.0, "step": 675, "step_time": 136.65184369124472 } ], "logging_steps": 1, "max_steps": 700, "num_input_tokens_seen": 4141007, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }